In [2]:
import os
# Set TOKENIZERS_PARALLELISM to 'false' for this kernel process.
# NOTE(review): presumably meant to silence the tokenizers fork/parallelism warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
from pathlib import Path

In [44]:
from xmen.knowledge_base import CompositeKnowledgebase
from xmen.evaluation import entity_linking_error_analysis

In [5]:
from xmen.confhelper import load_config
config = load_config('../conf/distemist.yaml')

In [6]:
from xmen.evaluation import evaluate

def evaluate_at_k(ground_truth, pred, eval_k=(1, 5, 8, 20, 64)):
    """Print the strict recall of `pred` against `ground_truth` at several cut-offs.

    Args:
        ground_truth: Gold-standard data, passed through unchanged to ``evaluate``.
        pred: Predictions, passed through unchanged to ``evaluate``.
        eval_k: Iterable of cut-offs; each value is forwarded to ``evaluate``
            as ``top_k_predictions``. The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        None. One line per cut-off is printed, labelled ``Perf@<k>``.
    """
    for ki in eval_k:
        # The number reported as "Perf" is the recall of the "strict" result.
        print(f'Perf@{ki}', evaluate(ground_truth, pred, top_k_predictions=ki)["strict"]['recall'])

# Preparation

## Download the DisTEMIST gazetteer

In [None]:
!mkdir -p ../temp; wget https://zenodo.org/record/6505583/files/dictionary_distemist.tsv?download=0 -O ../temp/dictionary_distemist.tsv

## Setup dictionaries
TODO: multiple dictionaries are not supported yet! Run `xmen dict conf/distemist.yaml` to create the xMEN dictionary from the DisTEMIST gazetteer.

# Commands
`xmen dict conf/distemist.yaml --custom-name distemist_umls`

In [None]:
# Locations of the dictionary files inside the configured cache directory
distemist_gazetteer_jsonl = Path(config.cache_dir, 'distemist_gazetteer', 'distemist_gazetteer.jsonl')
distemist_umls_jsonl = Path(config.cache_dir, 'distemist', 'distemist_umls.jsonl')
umls_all_jsonl = Path(config.cache_dir, 'distemist', 'umls_all', 'umls_all.jsonl')

In [None]:
# Directory holding the UMLS META files. Set the UMLS_META_PATH environment
# variable to point at a local copy; the fallback is the original author's path.
meta_path = os.environ.get('UMLS_META_PATH', '/home/Florian.Borchert/umls/2022AA/META/')
# Name of the concept file that is read from `meta_path`
mrconso = 'MRCONSO.RRF'

In [None]:
from xmen.umls import umls_utils
from collections import defaultdict
from tqdm.auto import tqdm

def read_cui2snomed_mapping(meta_path):
    """Build a mapping from UMLS CUIs to SNOMED CT identifiers.

    Reads the file named by the notebook-level ``mrconso`` variable inside
    ``meta_path`` and keeps the rows whose ``SAB`` column is ``SNOMEDCT_US``
    or ``SCTSPA``.

    Args:
        meta_path: Directory containing the UMLS META files.

    Returns:
        A ``defaultdict(list)`` mapping each ``CUI`` to the ``SCUI`` values of
        its matching rows, in file order. One value is appended per row, so a
        list may contain the same identifier more than once.

    Raises:
        AssertionError: If a row does not have as many fields as there are headers.
    """
    snomed_sources = ('SNOMEDCT_US', 'SCTSPA')
    mapping = defaultdict(list)
    headers = umls_utils.read_umls_file_headers(meta_path, mrconso)
    # UMLS RRF files are UTF-8 encoded; do not depend on the locale default.
    with open(f"{meta_path}/{mrconso}", encoding="utf-8") as fin:
        # Iterate the handle directly instead of readlines() so the whole file
        # is never held in memory at once (tqdm then shows a count without a total).
        for line in tqdm(fin):
            splits = line.strip().split("|")
            assert len(headers) == len(splits)
            concept = dict(zip(headers, splits))
            if concept['SAB'] in snomed_sources:
                mapping[concept['CUI']].append(concept['SCUI'])
    return mapping

In [None]:
cui2snomed_mapping = read_cui2snomed_mapping(meta_path)

In [None]:
def cui2snomed(entry):
    """Re-key a knowledge-base entry from its CUI to the mapped SNOMED CT identifiers.

    Args:
        entry: Mapping with a ``'concept_id'`` key holding a CUI. It is not modified.

    Returns:
        A list with one shallow copy of ``entry`` per identifier found for the
        CUI in ``cui2snomed_mapping``, each with ``'concept_id'`` replaced by
        that identifier. The list is empty when the CUI has no mapping.
    """
    cui = entry['concept_id']
    res = []
    # .get() rather than [] so that looking up an unmapped CUI does not insert
    # an empty list into the defaultdict built by read_cui2snomed_mapping.
    for sctid in cui2snomed_mapping.get(cui, ()):
        r = entry.copy()
        r['concept_id'] = sctid
        res.append(r)
    return res

In [None]:
%%time
kb = CompositeKnowledgebase([umls_all_jsonl], mappers=[cui2snomed])

In [None]:
out_path = Path(config.cache_dir) / 'distemist_all' / 'distemist_all.jsonl'

In [None]:
import json
# Extend every gazetteer entry with the names that `kb` knows for the same
# concept ID and write the result to `out_path` as JSON lines.
out_path.parent.mkdir(parents=True, exist_ok=True)  # target directory may not exist yet
# Both files are managed by the `with` statement so the input handle is closed too.
with open(distemist_gazetteer_jsonl) as fin, open(out_path, 'w') as fo:
    for line in fin:
        entry = json.loads(line)
        sctid = str(entry['concept_id'])
        concept = kb.cui_to_entity[sctid]
        known_aliases = [entry['canonical_name']] + entry['aliases']
        # Keep only the names the gazetteer entry does not list already
        new_aliases = [c for c in [concept.canonical_name] + concept.aliases if c not in known_aliases]
        entry['aliases'] += new_aliases
        fo.write(json.dumps(entry) + '\n')

In [None]:
kb = CompositeKnowledgebase([out_path])

In [None]:
kb.cui_to_entity['105000']

In [None]:
sum([len(v) for v in kb.alias_to_cuis.values()])

In [None]:
# Knowledge base built from the plain DisTEMIST gazetteer only (the original
# line was garbled and did not parse).
kb2 = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'])

In [None]:
!wc -l ../temp/dictionary_distemist.tsv

In [None]:
len(kb2.cui_to_entity)

In [None]:
# Next step: run `xmen index conf/distemist.yaml --all` to create the indices used for candidate generation.

# Load Dataset

In [9]:
from bigbio.dataloader import BigBioConfigHelpers

# Load the 'distemist_linking_bigbio_kb' configuration through the BigBio helpers
configs = BigBioConfigHelpers()
ds = configs.for_config_name('distemist_linking_bigbio_kb').load_dataset()
# Tag every row of every split with a constant corpus identifier
for k, v in ds.items():
    ds[k] = v.add_column('corpus_id', ['distemist'] * len(v))

Found cached dataset distemist (/home/Florian.Borchert/.cache/huggingface/datasets/bigbio___distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd)


  0%|          | 0/2 [00:00<?, ?it/s]

# Preprocessing

In [21]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

In [22]:
from xmen.preprocessing import AbbreviationExpander
ds = AbbreviationExpander().transform_batch(ds)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

# Candidate generation

In [46]:
import sys
sys.path.append('../../xmen_notebooks/notebooks/')
from notebook_util import analyze

In [34]:
from xmen.linkers import TFIDFNGramLinker

In [39]:
ngram_base_path = Path(config.cache_dir) / 'distemist_gazetteer' / 'index' / 'ngrams'

In [49]:
ngram_linker = TFIDFNGramLinker(index_base_path=ngram_base_path, k=100)

In [50]:
pred_ngram = ngram_linker.predict_batch(ds)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [51]:
evaluate_at_k(ds['train'], pred_ngram['train'])

Perf@1 0.45310755489393373
Perf@5 0.6185336806847785
Perf@8 0.6484927428358764
Perf@20 0.698920729438035
Perf@64 0.7534425009304057


In [43]:
kb_ngram = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'])

In [52]:
ea_ngram = entity_linking_error_analysis(ds['train'], pred_ngram['train'])

In [53]:
ea_df_ngram, ea_counts_ngram = analyze(ea_ngram, kb_ngram, '', sem_group_version=None)

In [54]:
ea_counts_ngram

error_cat
TP                                  0.453108
NOT_FOUND                           0.099367
SIMPLE_NOT_FOUND                    0.093599
TOP10_COMPLEX_ENTITY                0.084481
COMPLEX_ENTITY                      0.055266
TOP10_SAME_SYNONYMS                 0.054336
ABREV                               0.041124
TOP10_UNKNOWN_ERROR                 0.037588
TOP10_SUSPECTED_ANNOTATION_ERROR    0.029029
SUSPECTED_ANNOTATION_ERROR          0.025865
UNKNOWN_ERROR                       0.023260
MISSING_CUI_PRED                    0.001861
SAME_SYNONYMS                       0.000930
MISSING_CUI                         0.000186
Name: , dtype: float64

In [24]:
from xmen.linkers import SapBERTLinker

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [25]:
embedding_model_name = 'cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR'
index_base_path = Path(config.cache_dir) / 'distemist' / 'index' / 'sapbert'
k = 1000

In [29]:
# Drop the existing singleton first so that its memory is released
SapBERTLinker.clear()
# Create the linker from the model name, index path and k defined above
sap_bert_linker = SapBERTLinker(
    embedding_model_name=embedding_model_name,
    index_base_path=index_base_path,
    k=k,
    expand_abbreviations=True,
)

In [30]:
prediction = sap_bert_linker.predict_batch(ds['train'], batch_size=128)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

In [31]:
evaluate_at_k(ds['train'], prediction)

Perf@1 0.4183103833271306
Perf@5 0.6950130256791961
Perf@8 0.7564197990323781
Perf@20 0.8046148120580573
Perf@64 0.846483066617045


In [28]:
evaluate_at_k(ds['train'], prediction)

Perf@1 0.4162634908820246
Perf@5 0.692221808708597
Perf@8 0.7523260141421659
Perf@20 0.8005210271678452
Perf@64 0.8422032005954596


In [55]:
from xmen.linkers import EnsembleLinker

In [71]:
# Combine the SapBERT and the TF-IDF n-gram linker under the names 'sap' and 'ngram'.
# NOTE(review): k=100 is presumably the number of candidates taken from each
# linker — confirm against EnsembleLinker.add_linker.
ensemble = EnsembleLinker()
ensemble.add_linker('sap', sap_bert_linker, k=100)
ensemble.add_linker('ngram', ngram_linker, k=100)

In [72]:
ensemble_pred = ensemble.predict_batch(ds, batch_size=128, top_k=100)

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [73]:
evaluate_at_k(ds['train'], ensemble_pred['train'])

Perf@1 0.4614812058057313
Perf@5 0.7193896538890956
Perf@8 0.7640491254186825
Perf@20 0.8120580573129884
Perf@64 0.8556010420543357


In [74]:
ensemble_pred.save_to_disk('ensemble_pred')

Saving the dataset (0/1 shards):   0%|          | 0/583 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [7]:
import datasets
# Rebinds `prediction` to a dataset stored in the local directory 'prediction'.
# NOTE(review): no visible cell writes that directory (only 'ensemble_pred' is
# saved with save_to_disk) — confirm it exists before relying on this cell.
prediction = datasets.load_from_disk('prediction')


KeyboardInterrupt



In [12]:
ea = entity_linking_error_analysis(ds['train'], prediction)

In [14]:
kb = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist' / 'distemist.jsonl'])

In [15]:
ea_df, ea_counts = analyze(ea, kb, '', sem_group_version=None)

In [None]:
ea_df

In [16]:
ea_counts

error_cat
TP                                  0.416263
TOP10_SAME_SYNONYMS                 0.210644
NOT_FOUND                           0.082062
TOP10_COMPLEX_ENTITY                0.066617
COMPLEX_ENTITY                      0.054522
ABREV                               0.040752
SUSPECTED_ANNOTATION_ERROR          0.039077
TOP10_UNKNOWN_ERROR                 0.034425
TOP10_SUSPECTED_ANNOTATION_ERROR    0.021771
SIMPLE_NOT_FOUND                    0.019725
UNKNOWN_ERROR                       0.010793
SAME_SYNONYMS                       0.003163
MISSING_CUI                         0.000186
Name: , dtype: float64

In [None]:
# Release the currently loaded singleton before building a new linker
SapBERTLinker.clear()
# Same embedding model and k, but using the index stored under 'distemist_gazetteer'
sap_bert_linker = SapBERTLinker(
    embedding_model_name=embedding_model_name,
    index_base_path=Path(config.cache_dir) / 'distemist_gazetteer/index/sapbert',
    k=k,
)

In [None]:
prediction_monoling = sap_bert_linker.predict_batch(ds['train'], batch_size=128)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)

In [None]:
evaluate_at_k(ds['train'], prediction_monoling)

# Reranking