In [1]:
from xmen.linkers import default_ensemble
from xmen.data import filter_and_apply_threshold
from xmen.evaluation import evaluate, evaluate_at_k, error_analysis

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset in BigBIO Format

In [2]:
# Clone forked repo until loader with SNOMED CT layer is on HF Hub
!git clone -b grascco_snomed git@github.com:phlobo/biomedical.git

fatal: destination path 'biomedical' already exists and is not an empty directory.


In [3]:
import datasets

# Arguments for the dataset loader: the loader script inside the cloned
# `biomedical` checkout, the configuration name, and the local annotation directory.
loader_script = 'biomedical/bigbio/hub/hub_repos/grascco/grascco.py'
config_name = 'grascco_snomed_bigbio_kb'
annotation_dir = '../gemtex_oncology/annotation/json/'

# Only the 'train' split of the loaded dataset is used as ground truth.
dataset_splits = datasets.load_dataset(loader_script, config_name, data_dir=annotation_dir)
ground_truth = dataset_splits['train']

## Generate Candidates Using xMEN Ensemble Linker

In [4]:
# Build the default xMEN ensemble linker on top of the local index directory;
# cuda=False is passed through exactly as before.
index_dir = 'xmen_index/index/'
linker = default_ensemble(index_dir, cuda=False)

In [5]:
from utils import handle_dates, to_map_fn

# Run the ensemble linker over the ground-truth documents, then map the
# `handle_dates` post-processing (wrapped by `to_map_fn`) over its output.
candidates_ = linker.predict_batch(ground_truth)
fix_dates = to_map_fn(handle_dates)
candidates = candidates_.map(fix_dates)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.72s/ examples]


## Evaluation

In [6]:
# Print Recall@k of the linker candidates for k = 1..4; the return value is
# assigned to `_` so it is not echoed as cell output.
candidate_ks = [1, 2, 3, 4]
_ = evaluate_at_k(ground_truth, candidates, eval_k=candidate_ks)

Recall@1 0.546875
Recall@2 0.609375
Recall@3 0.65625
Recall@4 0.671875


In [7]:
# Error analysis of the linker candidates against the gold annotations,
# restricted to the 'nen' task; display the first ten rows of the result.
ea_df = error_analysis(ground_truth, candidates, tasks=['nen'])
ea_df.head(10)

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,105,113,[4.4.1997],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
1,1,False,129,134,[19.3.],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
2,1,False,143,151,[7.5.2029],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
3,1,False,215,226,[Verbrennung],Concept,"{'db_name': 'SNOMED CT', 'db_id': '125666000'}",2,1.0,48333001,1.0,Albers.txt
4,1,False,227,229,[1.],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258351006'}",22,0.76832,258695005,1.0,Albers.txt
5,2,False,232,241,[3. Grades],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258353009'}",1,0.934421,61026006,0.934421,Albers.txt
6,1,False,243,247,[Kopf],Concept,"{'db_name': 'SNOMED CT', 'db_id': '302548004'}",5,0.806592,69536005,1.0,Albers.txt
7,1,False,250,254,[Hals],Concept,"{'db_name': 'SNOMED CT', 'db_id': '49928004'}",37,0.699021,45048000,1.0,Albers.txt
8,1,False,255,257,[5%],Concept,"{'db_name': 'SNOMED CT', 'db_id': '113341005'}",-1,,258695005,1.0,Albers.txt
9,1,True,260,263,[KOF],Concept,"{'db_name': 'SNOMED CT', 'db_id': '301898006'}",-1,,34763001,0.831992,Albers.txt


# Pre-trained Re-ranker

In [8]:
from xmen.reranking import CrossEncoderReranker
from xmen import load_kb

In [9]:
# Value passed as `k` to both CrossEncoderReranker.prepare_data and
# rr.rerank_batch in the cells below, so the two steps stay in sync.
n_candidates = 8

In [10]:
# Load the knowledge base from the JSONL file stored under xmen_index/.
# NOTE(review): file name suggests a German SNOMED CT export — confirm.
kb = load_kb('xmen_index/snomed_german.jsonl')

In [11]:
# Prepare the cross-encoder input from the candidates, the gold annotations
# and the knowledge base; k=n_candidates presumably caps the candidates kept
# per mention (TODO confirm against the xMEN documentation).
ce_dataset = CrossEncoderReranker.prepare_data(candidates, ground_truth, kb, k=n_candidates)

Context length: 128
Use NIL values: True


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.88 examples/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 111708.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 278171.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 9475.98it/s]


In [12]:
import torch

# Pick the torch device at run time instead of hard-coding 'mps', which is only
# available on Apple-silicon Macs and makes this cell fail on other machines.
# To force a specific device, overwrite `rr_device` here (e.g. 'cpu' or a GPU ID).
_mps_backend = getattr(torch.backends, 'mps', None)  # missing in older torch releases
if _mps_backend is not None and _mps_backend.is_available():
    rr_device = 'mps'
elif torch.cuda.is_available():
    rr_device = 'cuda'
else:
    rr_device = 'cpu'

# Load the pre-trained cross-encoder re-ranker checkpoint onto the chosen device.
rr = CrossEncoderReranker.load('phlobo/xmen-de-ce-medmentions', device=rr_device)

In [13]:
# Re-rank the linker candidates with the loaded cross-encoder, using the
# prepared cross-encoder dataset and the same k as in prepare_data.
reranked_ = rr.rerank_batch(candidates, ce_dataset, k=n_candidates)

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:18<00:00,  3.39it/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.99 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 97.37 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 123.39 examples/s]


In [14]:
# Map the same `handle_dates` post-processing over the re-ranked output that
# was applied to the raw linker candidates.
fix_dates_reranked = to_map_fn(handle_dates)
reranked = reranked_.map(fix_dates_reranked)

In [15]:
# Print Recall@k of the re-ranked predictions for k = 1..4; the return value
# is assigned to `_` so it is not echoed as cell output.
reranked_ks = [1, 2, 3, 4]
_ = evaluate_at_k(ground_truth, reranked, eval_k=reranked_ks)

Recall@1 0.609375
Recall@2 0.640625
Recall@3 0.65625
Recall@4 0.671875


In [18]:
# Evaluate the re-ranked predictions against the gold annotations; the bare
# call displays the returned metrics dict as the cell output.
evaluate(ground_truth, reranked)

{'strict': {'precision': 0.6842105263157895,
  'recall': 0.609375,
  'fscore': 0.6446280991735538,
  'ptp': 39,
  'fp': 18,
  'rtp': 39,
  'fn': 25,
  'n_docs_system': 1,
  'n_annos_system': 57,
  'n_docs_gold': 1,
  'n_annos_gold': 64}}

In [16]:
# Error analysis of the re-ranked predictions against the gold annotations,
# restricted to the 'nen' task; display the first ten rows of the result.
ea_df_rr_ws = error_analysis(ground_truth, reranked, tasks=['nen'])
ea_df_rr_ws.head(10)

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,105,113,[4.4.1997],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
1,1,False,129,134,[19.3.],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
2,1,False,143,151,[7.5.2029],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.0,258695005,1.0,Albers.txt
3,1,False,215,226,[Verbrennung],Concept,"{'db_name': 'SNOMED CT', 'db_id': '125666000'}",1,0.166484,48333001,0.188725,Albers.txt
4,1,False,227,229,[1.],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258351006'}",-1,,258695005,1.0,Albers.txt
5,2,False,232,241,[3. Grades],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258353009'}",0,0.141101,258353009,0.141101,Albers.txt
6,1,False,243,247,[Kopf],Concept,"{'db_name': 'SNOMED CT', 'db_id': '302548004'}",5,0.114893,69536005,0.223935,Albers.txt
7,1,False,250,254,[Hals],Concept,"{'db_name': 'SNOMED CT', 'db_id': '49928004'}",-1,,45048000,0.241662,Albers.txt
8,1,False,255,257,[5%],Concept,"{'db_name': 'SNOMED CT', 'db_id': '113341005'}",-1,,258695005,1.0,Albers.txt
9,1,True,260,263,[KOF],Concept,"{'db_name': 'SNOMED CT', 'db_id': '301898006'}",-1,,34763001,0.138157,Albers.txt


# TODO: Fine-tuned Re-ranker?

In [17]:
# Train a re-ranker once annotated data (e.g., for GraSCCo) are available