In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import sys
sys.path.insert(0, '..')

In [4]:
from pathlib import Path

In [5]:
from xmen.knowledge_base import CompositeKnowledgebase
from xmen.evaluation import entity_linking_error_analysis, evaluate

In [6]:
from xmen.confhelper import load_config
config = load_config('../conf/distemist.yaml')

In [7]:
base_path = Path(config.cache_dir)

In [8]:
def evaluate_at_k(ground_truth, pred, eval_k=(1, 5, 8, 20, 64, 100), silent=False):
    """Evaluate predictions at several candidate cut-offs.

    Args:
        ground_truth: Gold-standard annotations; passed unchanged to ``evaluate``.
        pred: Predictions to score; passed unchanged to ``evaluate``.
        eval_k: Iterable of cut-offs. Each value is forwarded to ``evaluate`` as
            ``top_k_predictions``. The default is an immutable tuple so it cannot
            be modified between calls.
        silent: If true, do not print the per-cut-off summary line.

    Returns:
        dict mapping every cut-off in ``eval_k`` to the result returned by
        ``evaluate`` for that cut-off.
    """
    res = {}
    for ki in eval_k:
        eval_res = evaluate(ground_truth, pred, top_k_predictions=ki)
        if not silent:
            # The label reads "Perf", but the value printed is the strict recall.
            print(f'Perf@{ki}', eval_res["strict"]['recall'])
        res[ki] = eval_res
    return res

In [9]:
distemist_gazetteer_jsonl = base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'

dict_configs = ['distemist_gazetteer', 'distemist_umls_es', 'distemist_umls_en_es', 'distemist_umls_all'] 

In [15]:
from bigbio.dataloader import BigBioConfigHelpers
import datasets

configs = BigBioConfigHelpers()
ds = configs.for_config_name('distemist_linking_bigbio_kb').load_dataset()

Found cached dataset distemist (/home/Florian.Borchert/.cache/huggingface/datasets/bigbio___distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd)


  0%|          | 0/2 [00:00<?, ?it/s]

# Preparation - Just run once

__Note__: This is a bit more involved, because we dynamically explore different combinations of DisTEMIST gazetteer and UMLS subsets.

In most normal cases, you just want to configure the target dictionary in your .yaml file and run `xmen dict` and `xmen index` with default settings.

## Download Distemist gazetteer

In [None]:
!mkdir -p ../temp; wget https://zenodo.org/record/6505583/files/dictionary_distemist.tsv?download=0 -O ../temp/dictionary_distemist.tsv

## Setup dictionaries

TODO!!!!

`xmen dict conf/distemist/distemist_gazetteer.yaml --code dicts/distemist.py` OR `xmen dict conf/distemist.yaml --key distemist_gazetteer`

In [None]:
meta_path = Path(config.dict.distemist_umls_en_es.umls.meta_path)

In [None]:
from xmen.umls import umls_utils
from collections import defaultdict
from tqdm.auto import tqdm

def read_cui2snomed_mapping(meta_path):
    """Build a CUI -> SNOMED CT code mapping from the UMLS ``MRCONSO.RRF`` file.

    Args:
        meta_path: Directory containing ``MRCONSO.RRF``; also passed to
            ``umls_utils.read_umls_file_headers`` to obtain the column names.

    Returns:
        ``defaultdict(list)`` mapping each CUI to the ``SCUI`` values of all rows
        whose ``SAB`` is ``SNOMEDCT_US`` or ``SCTSPA``. One value is appended per
        matching row, so a code can appear more than once for the same CUI.

    Raises:
        AssertionError: if a row has a different number of fields than headers.
    """
    mrconso = 'MRCONSO.RRF'
    snomed_sources = ('SNOMEDCT_US', 'SCTSPA')
    cui2snomed = defaultdict(list)
    headers = umls_utils.read_umls_file_headers(meta_path, mrconso)
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(f"{meta_path}/{mrconso}", encoding="utf-8") as fin:
        # Stream the file line by line: readlines() would hold the complete file
        # in memory only to iterate over it once.
        for line in tqdm(fin):
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            concept = dict(zip(headers, splits))
            if concept['SAB'] in snomed_sources:
                cui2snomed[concept['CUI']].append(concept['SCUI'])
    return cui2snomed

In [None]:
cui2snomed_mapping = read_cui2snomed_mapping(meta_path)

In [None]:
def cui2snomed(entry):
    """Re-key a knowledge-base entry from its CUI to the mapped SNOMED CT codes.

    Args:
        entry: dict-like record whose ``'concept_id'`` holds a CUI.

    Returns:
        A list with one shallow copy of ``entry`` per code found for the CUI in
        the module-level ``cui2snomed_mapping``, each with ``'concept_id'``
        replaced by that code. The list is empty when the CUI has no mapping.
    """
    cui = entry['concept_id']
    res = []
    # Use .get() rather than indexing: cui2snomed_mapping is a defaultdict, so
    # indexing would insert an empty list for every unmapped CUI that is looked up.
    for sctid in cui2snomed_mapping.get(cui, []):
        r = entry.copy()
        r['concept_id'] = sctid
        res.append(r)
    return res

In [None]:
import json
from tqdm.auto import tqdm

def merge_and_write_dicts(target_jsonl, added_jsonl, output_jsonl):
    """Extend every entry of a target dictionary with aliases from a second one.

    For each JSON line in ``target_jsonl`` the concept with the same id is looked
    up in a ``CompositeKnowledgebase`` built from ``added_jsonl`` (with the
    ``cui2snomed`` mapper). Its canonical name and aliases that are not already
    known for the entry are appended to the entry's ``'aliases'``, and the entry
    is written to ``output_jsonl``.

    Args:
        target_jsonl: Path of the JSON-lines dictionary that defines the output
            concepts.
        added_jsonl: Path of the JSON-lines dictionary providing extra aliases.
        output_jsonl: ``pathlib.Path`` of the file to write; missing parent
            directories are created.

    Returns:
        None. Prints the output path and a summary of concept and alias counts.

    NOTE(review): ``kb.cui_to_entity[sctid]`` is indexed directly, so a target
    concept that is absent from the added dictionary presumably raises
    (``KeyError`` for a plain dict) - confirm that full coverage is guaranteed.
    """
    print(f'Writing to {output_jsonl}')
    output_jsonl.parent.mkdir(exist_ok=True, parents=True)
    cui_count = alias_count = 0
    kb = CompositeKnowledgebase([added_jsonl], mappers=[cui2snomed])
    # Read the target inside a context manager so the handle is closed again;
    # keeping the lines in a list lets tqdm show a total.
    with open(target_jsonl, encoding='utf-8') as fi:
        target_lines = fi.readlines()
    with open(output_jsonl, 'w', encoding='utf-8') as fo:
        for l in tqdm(target_lines):
            cui_count += 1
            entry = json.loads(l)
            # The knowledge base is keyed by string ids.
            sctid = str(entry['concept_id'])
            concept = kb.cui_to_entity[sctid]
            known_aliases = [entry['canonical_name']] + entry['aliases']
            new_aliases = [c for c in [concept.canonical_name] + concept.aliases if c not in known_aliases]
            entry['aliases'] += new_aliases
            # Counts all aliases of the merged entry, not only the added ones.
            alias_count += len(entry['aliases'])
            fo.write(json.dumps(entry) + '\n')
    print(f'Written {cui_count} concepts with {alias_count} aliases')

In [None]:
for d in dict_configs[1:]:
    merge_and_write_dicts(distemist_gazetteer_jsonl, base_path / d / f'{d}.jsonl', base_path / 'distemist' / 'merged' / f'{d}.jsonl')

## Prepare TF-IDF NGram and SapBERT indices for all configurations

In [None]:
from IPython.display import Markdown, display

# Run these commands
display(Markdown(f"`xmen index conf/distemist.yaml --dict {base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'} --output {base_path / 'distemist' / 'merged' / 'distemist_gazetteer'} --all`"))
for d in dict_configs[1:]:
    display(Markdown(f"`xmen index conf/distemist.yaml --dict {base_path / 'distemist' / 'merged' / f'{d}.jsonl'} --output {base_path / 'distemist' / 'merged' / f'{d}'} --all`"))

# Candidate Generation

In [None]:
embedding_model_name = 'cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR'

In [None]:
from notebook_util import analyze
from xmen.linkers import TFIDFNGramLinker, SapBERTLinker, EnsembleLinker
from xmen.linkers.util import filter_and_apply_threshold
from datasets import DatasetDict

In [None]:

def run_linkers(dict_config):
    """Generate and persist candidates for one dictionary configuration.

    Runs the TF-IDF n-gram linker and the SapBERT linker on the module-level
    dataset ``ds``, combines both in an ensemble, and saves each prediction set
    below ``../temp/<dict_config>/``.

    Args:
        dict_config: Name of the dictionary configuration; selects the index
            directory under ``base_path / 'distemist' / 'merged'`` and the output
            sub-directory.

    Returns:
        None. Predictions are written to disk.
    """
    index_root = base_path / 'distemist' / 'merged' / dict_config / 'index'
    output_root = Path('..') / 'temp' / dict_config

    # TF-IDF n-gram candidates
    ngram_linker = TFIDFNGramLinker(index_base_path=index_root / 'ngrams', k=100)
    pred_ngram = ngram_linker.predict_batch(ds)
    pred_ngram.save_to_disk(output_root / 'pred_ngram')

    # SapBERT candidates; the singleton is cleared first to free up memory
    SapBERTLinker.clear()
    sapbert_kwargs = dict(
        embedding_model_name=embedding_model_name,
        index_base_path=index_root / 'sapbert',
        k=1000,
    )
    sapbert_linker = SapBERTLinker(**sapbert_kwargs)
    pred_sapbert = sapbert_linker.predict_batch(ds, batch_size=128)
    pred_sapbert.save_to_disk(output_root / 'pred_sapbert')

    # Ensemble of both linkers
    ensemble_linker = EnsembleLinker()
    ensemble_linker.add_linker('sapbert', sapbert_linker, k=100)
    ensemble_linker.add_linker('ngram', ngram_linker, k=100)

    # TODO: reuse_preds currently does not work with dataset dicts,
    # so every split is predicted on its own.
    pred_ensemble = DatasetDict()
    for split in ('train', 'test'):
        cached_preds = {'sapbert' : pred_sapbert[split], 'ngram' : pred_ngram[split]}
        pred_ensemble[split] = ensemble_linker.predict_batch(ds[split], 128, 100, reuse_preds=cached_preds)

    pred_ensemble.save_to_disk(output_root / 'pred_ensemble')

In [None]:
for d in dict_configs:
    print(d)
    run_linkers(d)

In [None]:
# Collect the strict recall of every linker (n-gram, SapBERT, ensemble) for each
# dictionary configuration and each cut-off k evaluated by evaluate_at_k.
import datasets
import pandas as pd

# Starts as a list of records; rebound to a DataFrame at the end of this cell.
cg_performance = []

# NOTE(review): the names assigned inside this loop (pred_ngram, pred_sapbert,
# pred_ensemble, eval_*, and the loop variables k / v) stay bound at module level
# after the loop, holding the values of the last iteration. Later cells in this
# notebook reference `pred_ngram` and `k`, so renaming them here changes those cells.
for d in dict_configs:
    print(d)
    print('Ngram')
    pred_ngram = datasets.load_from_disk(Path('..') / 'temp' / d / 'pred_ngram')
    eval_ngram = evaluate_at_k(ds['train'], pred_ngram['train'])
    for k, v in eval_ngram.items():
        cg_performance.append({'k' : k, 'linker' : 'ngram', 'dict' : d, 'recall' : v['strict']['recall']})
    print('SapBERT')
    pred_sapbert = datasets.load_from_disk(Path('..') / 'temp' / d / 'pred_sapbert')
    # SapBERT predictions go through filter_and_apply_threshold(k=100, threshold=0.0)
    # before scoring - presumably to cap them at 100 candidates; TODO confirm.
    eval_sapbert = evaluate_at_k(ds['train'], filter_and_apply_threshold(pred_sapbert['train'], k=100, threshold=0.0))
    for k, v in eval_sapbert.items():
        cg_performance.append({'k' : k, 'linker' : 'sapbert', 'dict' : d, 'recall' : v['strict']['recall']})
    print('Ensemble')
    pred_ensemble = datasets.load_from_disk(Path('..') / 'temp' / d / 'pred_ensemble')
    eval_ensemble = evaluate_at_k(ds['train'], pred_ensemble['train'])
    for k, v in eval_ensemble.items():
        cg_performance.append({'k' : k, 'linker' : 'ensemble', 'dict' : d, 'recall' : v['strict']['recall']})
        
# One row per (k, linker, dict) combination.
cg_performance = pd.DataFrame(cg_performance)

In [None]:
cg_performance.query('k < 100').sort_values('recall')

# Reranking

In [22]:
from xmen.reranking.cross_encoder import CrossEncoderReranker, CrossEncoderTrainingArgs

In [21]:
K_RERANKING = 64
CONTEXT_LENGTH = 128
CROSS_ENC_MODEL = 'cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR'
NUM_EPOCHS = 10
tmp_path = Path('..') / 'temp'

## Prepare Data for Cross-Encoder Training

In [None]:
from xmen.linkers.util import filter_and_apply_threshold

candidates = datasets.load_from_disk(tmp_path / 'distemist_umls_en_es' / 'pred_ensemble')
candidates = filter_and_apply_threshold(candidates, K_RERANKING, 0.0)

In [None]:
# TODO: make params configurable
from xmen.knowledge_base import load_kb
kb = load_kb(base_path / 'distemist' / 'merged' / 'distemist_umls_en_es.jsonl')

In [None]:
cross_enc_ds = CrossEncoderReranker.prepare_data(candidates, ds, kb, CONTEXT_LENGTH)
# TODO: get proper validation set
cross_enc_ds['validation'] = cross_enc_ds['test']

In [None]:
cross_enc_ds.save_to_disk('../temp/cross_enc_ds')

## Train Cross Encoder

In [23]:
from xmen.data.indexed_dataset import IndexedDatasetDict
cross_enc_ds = IndexedDatasetDict.load_from_disk('../temp/cross_enc_ds')
cross_enc_ds

{'train': [5136 items], 'test': [2598 items], 'validation': [2598 items]}

In [35]:
train_args = CrossEncoderTrainingArgs(
    CROSS_ENC_MODEL, 
    NUM_EPOCHS,
    score_regularization=True,
)

In [36]:
tags = ['Distemist']

In [None]:
from omegaconf import OmegaConf, SCMode
import wandb

def wandb_callback(score, epoch, steps):
    """Send the evaluation score and the epoch to ``wandb.log``.

    ``steps`` is accepted as part of the callback signature but is not logged.
    """
    metrics = {}
    metrics["eval/accuracy"] = score
    metrics["train/epoch"] = epoch
    wandb.log(metrics)
    
def eval_callback(dic):
    """Forward ``dic`` unchanged to ``wandb.log``."""
    wandb.log(dic)

# Fit the cross-encoder reranker inside a wandb run, logging through the two
# callbacks defined above in this cell.
try:
    with wandb.init(tags=tags) as run:
        #dict_config = OmegaConf.to_container(conf, structured_config_mode=SCMode.DICT_CONFIG)
        # Record the training arguments with the run.
        wandb.log(train_args.args)
        rr = CrossEncoderReranker()
        # The output directory is named after the wandb run and logged for reference.
        output_dir = f'{tmp_path}/cross_encoder/{run.name}'
        wandb.log({'output_dir' : output_dir})
        # Trains on the 'train' split and evaluates on 'validation'.
        rr.fit(cross_enc_ds['train'].dataset, cross_enc_ds['validation'].dataset, output_dir=output_dir, training_args=train_args, callback=wandb_callback, eval_callback=eval_callback)
finally:
    # Runs whether or not training raised.
    wandb.finish()

model_name := cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR
num_train_epochs := 10
fp16 := True
label_smoothing := False
score_regularization := True
train_layers := None
softmax_loss := True
_timestamp := 1683205039
_runtime := 3


Some weights of the model checkpoint at cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias

2023-05-04 14:57:23 - Use pytorch device: cuda
Using score regularization: True
Using label smoothing factor: False


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5136 [00:00<?, ?it/s]

2023-05-04 16:11:56 - EntityLinkingEvaluator: Evaluating the model on eval dataset after epoch 0:


In [None]:
#TODO

## Predict

In [None]:
#TODO

# Error Analysis

In [None]:
kb_ngram = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'])

In [None]:
# NOTE(review): `pred_ngram` is the value left behind by the candidate-generation
# evaluation loop, i.e. the predictions for the last entry of `dict_configs`, while
# `kb_ngram` is built from the plain gazetteer - confirm this pairing is intended.
ea_ngram = entity_linking_error_analysis(ds['train'], pred_ngram['train'])

In [None]:
ea_df_ngram, ea_counts_ngram = analyze(ea_ngram, kb_ngram, '', sem_group_version=None)

In [None]:
ea_counts_ngram

In [None]:
# NOTE(review): in the visible notebook `prediction` is first assigned in a later
# cell (datasets.load_from_disk('prediction')), so running top to bottom on a fresh
# kernel raises NameError here. The following cell repeats this exact call.
evaluate_at_k(ds['train'], prediction)

In [None]:
evaluate_at_k(ds['train'], prediction)

In [None]:
from xmen.linkers import EnsembleLinker

In [None]:
# NOTE(review): `sap_bert_linker` is only created in a later cell, and `ngram_linker`
# is only bound as a local variable inside run_linkers(); neither name is available
# at module level when this cell runs in top-to-bottom order.
ensemble = EnsembleLinker()
ensemble.add_linker('sap', sap_bert_linker, k=100)
ensemble.add_linker('ngram', ngram_linker, k=100)

In [None]:
ensemble_pred = ensemble.predict_batch(ds, batch_size=128, top_k=100)

In [None]:
evaluate_at_k(ds['train'], ensemble_pred['train'])

In [None]:
ensemble_pred.save_to_disk('ensemble_pred')

In [None]:
import datasets
prediction = datasets.load_from_disk('prediction')

In [None]:
ea = entity_linking_error_analysis(ds['train'], prediction)

In [None]:
kb = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist' / 'distemist.jsonl'])

In [None]:
ea_df, ea_counts = analyze(ea, kb, '', sem_group_version=None)

In [None]:
ea_df

In [None]:
ea_counts

In [None]:
# Clear singleton to free up memory
SapBERTLinker.clear()
# Initialize linker from config
# NOTE(review): no visible cell assigns `k` on purpose; the only binding is the
# loop variable leaked by the candidate-generation evaluation cell. Replace it with
# an explicit value once the intended number of candidates is confirmed.
sap_bert_linker = SapBERTLinker(
    embedding_model_name = embedding_model_name,
    index_base_path = Path(config.cache_dir) / 'distemist_gazetteer/index/sapbert',
    k = k,
)

In [None]:
prediction_monoling = sap_bert_linker.predict_batch(ds['train'], batch_size=128)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)