In [1]:
from xmen.linkers import default_ensemble
from xmen.data import filter_and_apply_threshold
from xmen.evaluation import evaluate, evaluate_at_k, error_analysis

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset in BigBIO Format

In [2]:
# Clone forked repo until loader with SNOMED CT layer is on HF Hub
!git clone -b grascco_snomed git@github.com:phlobo/biomedical.git

fatal: destination path 'biomedical' already exists and is not an empty directory.


In [3]:
import datasets

# Load the 'grascco_snomed_bigbio_kb' configuration through the loader script
# from the cloned repo, reading the annotations from the local JSON directory.
grascco_splits = datasets.load_dataset(
    'biomedical/bigbio/hub/hub_repos/grascco/grascco.py',
    'grascco_snomed_bigbio_kb',
    data_dir='../gemtex_oncology/annotation/json/',
)

# Only the 'train' split is used as ground truth in this notebook.
ground_truth = grascco_splits['train']

## Generate Candidates Using xMEN Ensemble Linker

In [4]:
# Build the xMEN default ensemble linker on top of the index stored under
# 'xmen_index/index/'. cuda=False is passed explicitly; presumably this keeps
# candidate generation on the CPU — confirm against the xmen documentation.
linker = default_ensemble('xmen_index/index/', cuda=False)

In [5]:
from utils import handle_dates, to_map_fn

# Raw linker output for every document in the ground-truth split.
candidates_ = linker.predict_batch(ground_truth)

# Wrap the date handler so it can be used with Dataset.map, then post-process
# the raw predictions into the candidate set used for evaluation.
fix_dates = to_map_fn(handle_dates)
candidates = candidates_.map(fix_dates)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.08s/ examples]


## Evaluation

In [6]:
# Report Recall@k for k = 1..4 of the generated candidates against the ground truth.
# The return value is bound to `_` so the notebook does not echo it in addition
# to the metrics printed by the call.
_ = evaluate_at_k(ground_truth, candidates, eval_k=[1,2,3,4])

Recall@1 0.546875
Recall@2 0.609375
Recall@3 0.65625
Recall@4 0.671875


In [7]:
# Run the per-mention error analysis for the 'nen' task and keep the result
# for inspection.
ea_df = error_analysis(ground_truth, candidates, tasks=['nen'])
# Bare last expression: the notebook renders the resulting table as cell output.
ea_df

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,gold_concept,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,105,113,[4.4.1997],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.00000,258695005,1.0,Albers.txt
1,1,False,129,134,[19.3.],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.00000,258695005,1.0,Albers.txt
2,1,False,143,151,[7.5.2029],Literal,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.00000,258695005,1.0,Albers.txt
3,1,False,215,226,[Verbrennung],Concept,"{'db_name': 'SNOMED CT', 'db_id': '125666000'}",2,1.00000,48333001,1.0,Albers.txt
4,1,False,227,229,[1.],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258351006'}",22,0.76832,258695005,1.0,Albers.txt
...,...,...,...,...,...,...,...,...,...,...,...,...
59,1,False,942,946,[Hand],Concept,"{'db_name': 'SNOMED CT', 'db_id': '85562004'}",0,1.00000,85562004,1.0,Albers.txt
60,1,False,947,957,[23.04.2029],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.00000,258695005,1.0,Albers.txt
61,1,False,1024,1034,[06.05.2029],Concept,"{'db_name': 'SNOMED CT', 'db_id': '258695005'}",0,1.00000,258695005,1.0,Albers.txt
62,1,False,2024,2028,[Hand],Concept,"{'db_name': 'SNOMED CT', 'db_id': '85562004'}",0,1.00000,85562004,1.0,Albers.txt


# Pre-trained Re-ranker

In [8]:
from xmen.reranking import CrossEncoderReranker
from xmen import load_kb

In [9]:
# Candidate cut-off for re-ranking; passed as `k` to both
# CrossEncoderReranker.prepare_data and rr.rerank_batch.
n_candidates = 64

In [10]:
# Load the knowledge base from the JSONL file; it is handed to
# CrossEncoderReranker.prepare_data together with the candidates.
kb = load_kb('xmen_index/snomed_german.jsonl')

In [11]:
# Build the cross-encoder input from the candidates, the gold annotations and
# the knowledge base, limited to k=n_candidates.
ce_dataset = CrossEncoderReranker.prepare_data(candidates, ground_truth, kb, k=n_candidates)

Context length: 128
Use NIL values: True


Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.67 examples/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 16731.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 66052.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 1947.20it/s]


In [12]:
import torch

# Pick the compute device at runtime instead of hard-coding Apple's 'mps' backend,
# which made this cell fail on any machine that is not an Apple-Silicon Mac.
# Preference order: mps (macOS M1/M2), then CUDA, then CPU. The getattr guard keeps
# the check safe on torch builds that do not ship the mps backend module.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    rr_device = 'mps'
elif torch.cuda.is_available():
    rr_device = 'cuda'
else:
    rr_device = 'cpu'

# Load the pre-trained German cross-encoder re-ranker onto the selected device.
# To force a specific device, overwrite rr_device (e.g. 'cpu' or a GPU ID) before this call.
rr = CrossEncoderReranker.load('phlobo/xmen-de-ce-medmentions', device=rr_device)

In [13]:
# Re-rank the generated candidates with the cross-encoder, using the prepared
# ce_dataset and the same cut-off k as in prepare_data.
# NOTE: this is the slow step of the notebook — the recorded run needed about
# 2.5 minutes for 64 batches.
reranked_ = rr.rerank_batch(candidates, ce_dataset, k=n_candidates)

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [02:31<00:00,  2.36s/it]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.75 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.00 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.75 examples/s]


In [14]:
reranked_ = reranked_.map(to_map_fn(handle_dates))

In [15]:
_ = evaluate_at_k(ground_truth, reranked_, eval_k=[1,2,3,4])

Recall@1 0.59375
Recall@2 0.640625
Recall@3 0.65625
Recall@4 0.671875


# TODO: Fine-tuned Re-ranker?