In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
import sys
sys.path.insert(0, '..')

In [4]:
from pathlib import Path

In [5]:
from xmen.knowledge_base import CompositeKnowledgebase
from xmen.evaluation import entity_linking_error_analysis, evaluate

In [6]:
from xmen.confhelper import load_config
config = load_config('../conf/distemist.yaml')

In [7]:
base_path = Path(config.cache_dir)

In [27]:
def evaluate_at_k(ground_truth, pred, eval_k=(1, 5, 8, 20, 64, 100), silent=False):
    """Evaluate predictions against the ground truth at several top-k cut-offs.

    ``evaluate`` is called once per cut-off with ``top_k_predictions`` set to it.

    Args:
        ground_truth: Gold-standard data, passed through to ``evaluate``.
        pred: Predictions, passed through to ``evaluate``.
        eval_k: Iterable of cut-offs to evaluate. The default is an immutable
            tuple so the shared default object cannot be modified between calls.
        silent: If true, do not print the per-cut-off result.

    Returns:
        dict mapping each cut-off to the result returned by ``evaluate`` for it.
    """
    res = {}
    for ki in eval_k:
        eval_res = evaluate(ground_truth, pred, top_k_predictions=ki)
        if not silent:
            # The value printed under the "Perf@k" label is the strict recall
            print(f'Perf@{ki}', eval_res["strict"]['recall'])
        res[ki] = eval_res
    return res

In [9]:
distemist_gazetteer_jsonl = base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'

dict_configs = ['distemist_gazetteer', 'distemist_umls_es', 'distemist_umls_en_es', 'distemist_umls_all'] 

# Preparation - Just run once

__Note__: This is a bit more involved, because we dynamically explore different combinations of DisTEMIST gazetteer and UMLS subsets.

In most normal cases, you just want to configure the target dictionary in your .yaml file and run `xmen dict` and `xmen index` with default settings.

## Download DisTEMIST gazetteer

In [None]:
!mkdir -p ../temp; wget https://zenodo.org/record/6505583/files/dictionary_distemist.tsv?download=0 -O ../temp/dictionary_distemist.tsv

## Setup dictionaries

TODO!!!!

`xmen dict conf/distemist/distemist_gazetteer.yaml --code dicts/distemist.py` OR `xmen dict conf/distemist.yaml --key distemist_gazetteer`

In [None]:
meta_path = Path(config.dict.distemist_umls_en_es.umls.meta_path)

In [None]:
from xmen.umls import umls_utils
from collections import defaultdict
from tqdm.auto import tqdm

def read_cui2snomed_mapping(meta_path):
    """Build a mapping from UMLS CUIs to SNOMED CT codes out of MRCONSO.RRF.

    Args:
        meta_path: Directory containing ``MRCONSO.RRF``. It is also passed to
            ``umls_utils.read_umls_file_headers`` to obtain the column names.

    Returns:
        defaultdict(list) mapping each ``CUI`` to the ``SCUI`` values of its rows
        whose ``SAB`` is ``SNOMEDCT_US`` or ``SCTSPA``. One code is appended per
        matching row, so a list may contain repeated codes.

    Raises:
        AssertionError: If a row does not have as many fields as there are headers.
    """
    mrconso = 'MRCONSO.RRF'
    snomed_sources = ('SNOMEDCT_US', 'SCTSPA')
    mapping = defaultdict(list)
    headers = umls_utils.read_umls_file_headers(meta_path, mrconso)
    # Read as UTF-8 explicitly instead of relying on the platform default encoding
    with open(f"{meta_path}/{mrconso}", encoding="utf-8") as fin:
        # Iterate the handle directly so the file is streamed line by line instead
        # of being loaded into memory as a whole (the progress bar has no total)
        for line in tqdm(fin):
            splits = line.strip().split("|")
            assert len(headers) == len(splits), (headers, splits)
            concept = dict(zip(headers, splits))
            if concept['SAB'] in snomed_sources:
                mapping[concept['CUI']].append(concept['SCUI'])
    return mapping

In [None]:
cui2snomed_mapping = read_cui2snomed_mapping(meta_path)

In [None]:
def cui2snomed(entry):
    """Expand an entry into one copy per SNOMED CT identifier mapped to its CUI.

    ``entry['concept_id']`` is looked up in the module-level ``cui2snomed_mapping``.
    For every identifier found, a copy of ``entry`` (via ``entry.copy()``) is
    returned with ``concept_id`` replaced by that identifier. The result is an
    empty list when the lookup yields no identifiers.
    """
    def _retarget(sctid):
        # Copy first so the caller's entry is left untouched
        clone = entry.copy()
        clone['concept_id'] = sctid
        return clone

    return [_retarget(sctid) for sctid in cui2snomed_mapping[entry['concept_id']]]

In [None]:
import json
from tqdm.auto import tqdm

def merge_and_write_dicts(target_jsonl, added_jsonl, output_jsonl):
    """Extend every entry of a target dictionary with aliases from another one.

    A ``CompositeKnowledgebase`` is built from ``added_jsonl`` using the
    ``cui2snomed`` mapper. Each line of ``target_jsonl`` is parsed as JSON, the
    concept with the same ``concept_id`` (as a string) is looked up in the
    knowledge base, and its canonical name and aliases that are not already
    known for the entry are appended to ``entry['aliases']``. The extended
    entries are written to ``output_jsonl``, one JSON object per line.

    Args:
        target_jsonl: Path of the JSONL dictionary whose entries are kept.
        added_jsonl: Path of the JSONL dictionary that contributes aliases.
        output_jsonl: ``pathlib.Path`` of the output file; parent directories
            are created if missing.

    Note:
        ``kb.cui_to_entity`` is indexed directly, so a target concept that is
        missing from the knowledge base fails at that lookup (presumably with
        a ``KeyError`` — confirm against ``CompositeKnowledgebase``).
    """
    print(f'Writing to {output_jsonl}')
    output_jsonl.parent.mkdir(exist_ok=True, parents=True)
    cui_count = alias_count = 0
    kb = CompositeKnowledgebase([added_jsonl], mappers=[cui2snomed])
    # Read inside a context manager so the input handle is closed again;
    # the lines are materialized so that tqdm can show a total
    with open(target_jsonl) as fi:
        target_lines = fi.readlines()
    with open(output_jsonl, 'w') as fo:
        for l in tqdm(target_lines):
            cui_count += 1
            entry = json.loads(l)
            sctid = str(entry['concept_id'])
            concept = kb.cui_to_entity[sctid]
            known_aliases = [entry['canonical_name']] + entry['aliases']
            new_aliases = [c for c in [concept.canonical_name] + concept.aliases if c not in known_aliases]
            entry['aliases'] += new_aliases
            # Counts all aliases of the written entry, not only the added ones
            alias_count += len(entry['aliases'])
            fo.write(json.dumps(entry) + '\n')
    print(f'Written {cui_count} concepts with {alias_count} aliases')

In [None]:
# Merge the gazetteer with every configuration except the first (the gazetteer itself)
merged_dir = base_path / 'distemist' / 'merged'
for d in dict_configs[1:]:
    merge_and_write_dicts(
        target_jsonl=distemist_gazetteer_jsonl,
        added_jsonl=base_path / d / f'{d}.jsonl',
        output_jsonl=merged_dir / f'{d}.jsonl',
    )

## Prepare TF-IDF NGram and SapBERT indices for all configurations

In [69]:
from IPython.display import Markdown, display

# Run these commands
merged_dir = base_path / 'distemist' / 'merged'
# (dictionary file, index output directory) pairs: the plain gazetteer first,
# then one pair per merged dictionary
index_jobs = [(base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl', merged_dir / 'distemist_gazetteer')]
for d in dict_configs[1:]:
    index_jobs.append((merged_dir / f'{d}.jsonl', merged_dir / f'{d}'))
for dict_path, output_path in index_jobs:
    display(Markdown(f"`xmen index conf/distemist.yaml --dict {dict_path} --output {output_path} --all`"))

`xmen index conf/distemist.yaml --dict /home/Florian.Borchert/.cache/xmen/distemist_gazetteer/distemist_gazetteer.jsonl --output /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_gazetteer --all`

`xmen index conf/distemist.yaml --dict /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_es.jsonl --output /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_es --all`

`xmen index conf/distemist.yaml --dict /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_en_es.jsonl --output /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_en_es --all`

`xmen index conf/distemist.yaml --dict /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_all.jsonl --output /home/Florian.Borchert/.cache/xmen/distemist/merged/distemist_umls_all --all`

# Load Dataset

In [None]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

In [10]:
from bigbio.dataloader import BigBioConfigHelpers

configs = BigBioConfigHelpers()
ds = configs.for_config_name('distemist_linking_bigbio_kb').load_dataset()

Found cached dataset distemist (/home/Florian.Borchert/.cache/huggingface/datasets/bigbio___distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd)


  0%|          | 0/2 [00:00<?, ?it/s]

# Candidate generation

In [23]:
embedding_model_name = 'cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR'

In [12]:
from notebook_util import analyze
from xmen.linkers import TFIDFNGramLinker, SapBERTLinker, EnsembleLinker

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [32]:
dict_config = dict_configs[2]
print(dict_config)
index_base_path = base_path / 'distemist' / 'merged' / dict_config / 'index'

distemist_umls_en_es


In [33]:
ngram_linker = TFIDFNGramLinker(index_base_path=index_base_path / 'ngrams', k=100)

In [34]:
pred_ngram = ngram_linker.predict_batch(ds)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [47]:
pred_ngram.save_to_disk(Path('..') / 'temp' / dict_config / 'pred_ngram')

Saving the dataset (0/1 shards):   0%|          | 0/583 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [35]:
eval_ngram = evaluate_at_k(ds['train'], pred_ngram['train'])

Perf@1 0.3230368440640119
Perf@5 0.6654261257908448
Perf@8 0.7058057312988463
Perf@20 0.7554893933755117
Perf@64 0.8034983252698177
Perf@100 0.8074060290286565


In [28]:
eval_ngram = evaluate_at_k(ds['train'], pred_ngram['train'])

Perf@1 0.45050241905470784
Perf@5 0.6146259769259397
Perf@8 0.6443989579456643
Perf@20 0.6940826200223298
Perf@64 0.7484183103833272
Perf@100 0.7644212876814291


In [36]:
# Clear singleton to free up memory
SapBERTLinker.clear()
# Initialize the linker with the embedding model name and the SapBERT index
# directory of the dictionary configuration selected above.
# The linker is created with k = 1000; the evaluation further below
# reduces the predictions to k=100 via filter_and_apply_threshold.
sapbert_linker = SapBERTLinker(
    embedding_model_name = embedding_model_name,
    index_base_path = index_base_path / 'sapbert',
    k = 1000
)

In [37]:
pred_sapbert = sapbert_linker.predict_batch(ds, batch_size=128)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [49]:
pred_sapbert.save_to_disk(Path('..') / 'temp' / dict_config / 'pred_sapbert')

Saving the dataset (0/1 shards):   0%|          | 0/583 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [38]:
from xmen.linkers.util import filter_and_apply_threshold

In [39]:
eval_sapbert = evaluate_at_k(ds['train'], filter_and_apply_threshold(pred_sapbert['train'], k=100, threshold=0.0))

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Perf@1 0.3796055080014886
Perf@5 0.7058057312988463
Perf@8 0.7566058801637514
Perf@20 0.8020096762188315
Perf@64 0.8418310383327131
Perf@100 0.8576479344994418


In [31]:
eval_sapbert = evaluate_at_k(ds['train'], filter_and_apply_threshold(pred_sapbert['train'], k=100, threshold=0.0))

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Perf@1 0.4869743208038705
Perf@5 0.6853368068477856
Perf@8 0.71622627465575
Perf@20 0.7646073688128023
Perf@64 0.8209899516189059
Perf@100 0.836434685522888


In [44]:
ensemble_linker = EnsembleLinker()
ensemble_linker.add_linker('sapbert', sapbert_linker, k=100)
ensemble_linker.add_linker('ngram', ngram_linker, k=100)

In [51]:
from datasets import DatasetDict
# TODO: reuse_preds currently does not work with dataset dicts
# Hence each split is predicted separately, reusing the candidates already
# generated by the individual linkers for that split
pred_ensemble = DatasetDict()
for split in ('train', 'test'):
    pred_ensemble[split] = ensemble_linker.predict_batch(
        ds[split],
        128,
        100,
        reuse_preds={'sapbert' : pred_sapbert[split], 'ngram' : pred_ngram[split]},
    )

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [56]:
pred_ensemble.save_to_disk(Path('..') / 'temp' / dict_config / 'pred_ensemble')

Saving the dataset (0/1 shards):   0%|          | 0/583 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [52]:
eval_ensemble = evaluate_at_k(ds['train'], pred_ensemble['train'])

Perf@1 0.43766282098995163
Perf@5 0.7164123557871231
Perf@8 0.7612579084480834
Perf@20 0.8122441384443617
Perf@64 0.8526237439523632
Perf@100 0.8675102344622255


# Reranking

## Train Cross Encoder

In [59]:
#TODO

## Predict

In [60]:
#TODO

# Error Analysis

In [None]:
kb_ngram = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'])

In [None]:
ea_ngram = entity_linking_error_analysis(ds['train'], pred_ngram['train'])

In [None]:
ea_df_ngram, ea_counts_ngram = analyze(ea_ngram, kb_ngram, '', sem_group_version=None)

In [None]:
ea_counts_ngram

In [None]:
# NOTE(review): `prediction` is not assigned in any earlier cell of the visible
# notebook; it is only loaded from disk in a later cell — confirm the intended order.
evaluate_at_k(ds['train'], prediction)

In [None]:
evaluate_at_k(ds['train'], prediction)

In [None]:
from xmen.linkers import EnsembleLinker

In [None]:
# Combine the SapBERT and n-gram linkers; both are registered with k=100.
# NOTE(review): `sap_bert_linker` is only created in a later cell of the visible
# notebook, so this cell fails on a fresh top-to-bottom run — confirm the intended order.
ensemble = EnsembleLinker()
ensemble.add_linker('sap', sap_bert_linker, k=100)
ensemble.add_linker('ngram', ngram_linker, k=100)

In [None]:
ensemble_pred = ensemble.predict_batch(ds, batch_size=128, top_k=100)

In [None]:
evaluate_at_k(ds['train'], ensemble_pred['train'])

In [None]:
ensemble_pred.save_to_disk('ensemble_pred')

In [None]:
import datasets
prediction = datasets.load_from_disk('prediction')

In [None]:
ea = entity_linking_error_analysis(ds['train'], prediction)

In [None]:
kb = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist' / 'distemist.jsonl'])

In [None]:
ea_df, ea_counts = analyze(ea, kb, '', sem_group_version=None)

In [None]:
ea_df

In [None]:
ea_counts

In [None]:
# Clear singleton to free up memory
SapBERTLinker.clear()
# Initialize linker from config
# The index is read from the 'distemist_gazetteer/index/sapbert' directory
# below the configured cache directory.
# NOTE(review): `k` is not assigned anywhere in the visible notebook, so this
# cell raises a NameError unless `k` is defined elsewhere — confirm the intended value.
sap_bert_linker = SapBERTLinker(
    embedding_model_name = embedding_model_name,
    index_base_path = Path(config.cache_dir) / 'distemist_gazetteer/index/sapbert',
    k = k,
)

In [None]:
prediction_monoling = sap_bert_linker.predict_batch(ds['train'], batch_size=128)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)