In [1]:
import os
# Runtime switches, set before the ML libraries are imported further down.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'CUDA_VISIBLE_DEVICES': '0',
})

In [2]:
import sys
# Put the parent directory first on the module search path
# (presumably so the local `xmen` checkout is importable -- confirm).
sys.path.insert(0, '..')

In [3]:
from pathlib import Path

In [4]:
from xmen.evaluation import entity_linking_error_analysis, evaluate_at_k
from xmen.linkers.util import filter_and_apply_threshold

In [5]:
from xmen.confhelper import load_config
# Load the DisTEMIST configuration; its cache_dir is the base directory for the
# dictionary and index paths built further down.
config = load_config('../conf/distemist.yaml')
base_path = Path(config.cache_dir)
# Scratch directory where predictions and cross-encoder data are saved by later cells.
tmp_path = Path('..') / 'temp'

In [7]:
import datasets

ds = datasets.load_dataset('distemist', 'distemist_linking_bigbio_kb')

# Document IDs held out as our own validation set
# (20% of the DisTEMIST training set / EL sub-track), one ID per line.
with open('distemist_validation_docs.txt', 'r') as fh:
    valid_ids = [line.strip() for line in fh]

# Partition the original training split by membership in the held-out ID list.
ds_train = ds['train'].filter(lambda doc: doc['document_id'] not in valid_ids)
ds_valid = ds['train'].filter(lambda doc: doc['document_id'] in valid_ids)

# Replace the training split and register the new validation split.
ds['train'] = ds_train
ds['validation'] = ds_valid

ds

Using the latest cached version of the module from /home/Florian.Borchert/.cache/huggingface/modules/datasets_modules/datasets/distemist/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd (last modified on Thu Apr 20 13:14:35 2023) since it couldn't be found locally at distemist., or remotely on the Hugging Face Hub.
Found cached dataset distemist (/home/Florian.Borchert/.cache/huggingface/datasets/distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd/cache-c51f74b0be31bc1c.arrow
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/distemist/distemist_linking_bigbio_kb/1.0.0/f63b2c6775932c342d7bff59d751d6139c0c52c5255f7fbb3e458d255728a8dd/cache-2b861d41b461ff1b.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 466
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 117
    })
})

# Preparation - Just run once

__Note__: This is a bit more involved, because we dynamically explore different combinations of DisTEMIST gazetteer and UMLS subsets.

In most normal cases, you just want to configure the target dictionary in your .yaml file and run `xmen dict` and `xmen index` with default settings.

## Download DisTEMIST gazetteer

In [None]:
# Create the scratch directory and fetch the DisTEMIST gazetteer from Zenodo.
# The URL is quoted because `?` is a shell glob character; `&&` skips the download if mkdir fails.
!mkdir -p ../temp && wget "https://zenodo.org/record/6505583/files/dictionary_distemist.tsv?download=0" -O ../temp/dictionary_distemist.tsv

## Setup dictionaries

Since the DisTEMIST task only covers a subset of SNOMED CT codes, we only retrieve additional synonyms from the UMLS for these codes.
To do so, we need to map CUIs to SNOMED CT IDs and merge the resulting entries with the DisTEMIST gazetteer.

In [None]:
# Dictionary configurations explored in this notebook. The first entry is the plain
# gazetteer; the remaining ones are merged with it further down.
dict_configs = [
    'distemist_gazetteer',
    'distemist_umls_es',
    'distemist_umls_en_es',
    'distemist_umls_all',
]

In [None]:
from IPython.display import Markdown, display

# Run these commands from the root folder
# (one `xmen dict` invocation is rendered per dictionary configuration)
for d in dict_configs:
    command_md = f"`xmen dict conf/distemist.yaml --code dicts/distemist.py --key {d}`"
    display(Markdown(command_md))

In [None]:
# UMLS metadata directory from the configuration; read_cui2snomed_mapping reads MRCONSO.RRF from it.
meta_path = Path(config.umls_meta_path)

In [None]:
from xmen.umls import umls_utils
from collections import defaultdict
from tqdm.auto import tqdm

def read_cui2snomed_mapping(meta_path):
    """Build a CUI -> SNOMED CT code mapping from the UMLS ``MRCONSO.RRF`` file.

    Args:
        meta_path: Directory that contains ``MRCONSO.RRF``.

    Returns:
        A ``defaultdict(list)`` mapping each CUI to the ``SCUI`` values of all rows
        whose source vocabulary (``SAB``) is ``SNOMEDCT_US`` or ``SCTSPA``. One value
        is appended per matching row, so a code can occur more than once.

    Raises:
        AssertionError: If a row does not have as many fields as there are headers.
    """
    mrconso = 'MRCONSO.RRF'
    snomed_sources = ('SNOMEDCT_US', 'SCTSPA')
    cui2snomed = defaultdict(list)
    headers = umls_utils.read_umls_file_headers(meta_path, mrconso)
    # Iterate over the file handle instead of calling readlines(): MRCONSO.RRF is
    # large, and streaming avoids holding every line in memory at once.
    # UMLS release files are UTF-8, so do not rely on the locale default encoding.
    with open(f"{meta_path}/{mrconso}", encoding='utf-8') as fin:
        for line in tqdm(fin):
            splits = line.strip().split("|")
            assert len(headers) == len(splits), \
                f"expected {len(headers)} fields, got {len(splits)}"
            concept = dict(zip(headers, splits))
            if concept['SAB'] in snomed_sources:
                cui2snomed[concept['CUI']].append(concept['SCUI'])
    return cui2snomed

In [None]:
# Parse MRCONSO.RRF once; the resulting module-level mapping is read by cui2snomed().
cui2snomed_mapping = read_cui2snomed_mapping(meta_path)

In [None]:
import json
from tqdm.auto import tqdm
from xmen.knowledge_base import CompositeKnowledgebase

def cui2snomed(entry):
    """Re-key a dictionary entry from its CUI to the mapped SNOMED CT codes.

    Args:
        entry: Mapping with a ``'concept_id'`` key holding a CUI; must support ``.copy()``.

    Returns:
        A list with one shallow copy of ``entry`` per code found for the CUI in the
        module-level ``cui2snomed_mapping``, each with ``'concept_id'`` replaced by
        that code. The list is empty when the CUI has no mapping.
    """
    res = []
    cui = entry['concept_id']
    # Use .get() rather than indexing: cui2snomed_mapping is a defaultdict, and
    # indexing would insert an empty list for every CUI without a SNOMED code.
    for sctid in cui2snomed_mapping.get(cui, ()):
        r = entry.copy()
        r['concept_id'] = sctid
        res.append(r)
    return res

def merge_and_write_dicts(target_jsonl, added_jsonl, output_jsonl):
    """Extend every entry of a target dictionary with aliases from a second dictionary.

    Args:
        target_jsonl: Path of the JSONL dictionary whose entries are kept (one JSON
            object per line with ``concept_id``, ``canonical_name`` and ``aliases``).
        added_jsonl: Path of the JSONL dictionary providing additional names; it is
            loaded through ``CompositeKnowledgebase`` with the ``cui2snomed`` mapper.
        output_jsonl: ``pathlib.Path`` of the merged JSONL file to write; missing
            parent directories are created.

    Returns:
        None. Progress and the final concept / alias counts are printed.
    """
    print(f'Writing to {output_jsonl}')
    output_jsonl.parent.mkdir(exist_ok=True, parents=True)
    cui_count = alias_count = 0
    kb = CompositeKnowledgebase([added_jsonl], mappers=[cui2snomed])
    # Read the target dictionary inside a context manager so the handle is closed
    # (the bare open(...).readlines() left it to the garbage collector).
    with open(target_jsonl) as fi:
        target_lines = fi.readlines()
    with open(output_jsonl, 'w') as fo:
        for line in tqdm(target_lines):
            cui_count += 1
            entry = json.loads(line)
            sctid = str(entry['concept_id'])
            # NOTE(review): assumes every target code is present in kb.cui_to_entity -- confirm.
            concept = kb.cui_to_entity[sctid]
            known_aliases = [entry['canonical_name']] + entry['aliases']
            # Only add names that the target entry does not already carry.
            new_aliases = [c for c in [concept.canonical_name] + concept.aliases if c not in known_aliases]
            entry['aliases'] += new_aliases
            alias_count += len(entry['aliases'])
            fo.write(json.dumps(entry) + '\n')
    print(f'Written {cui_count} concepts with {alias_count} aliases')

In [None]:
# Gazetteer dictionary that serves as the merge target. This name was never assigned
# in the notebook; the path is the same file the indexing cell passes as `--dict`.
distemist_gazetteer_jsonl = base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'

# Merge the gazetteer with every UMLS-based configuration (all but the first entry).
for d in dict_configs[1:]:
    merge_and_write_dicts(distemist_gazetteer_jsonl, base_path / d / f'{d}.jsonl', base_path / 'distemist' / 'merged' / f'{d}.jsonl')

## Prepare TF-IDF NGram and SapBERT indices for all configurations

In [None]:
# Run these commands from the root folder
merged_dir = base_path / 'distemist' / 'merged'
# (dictionary file, index output directory) pairs: the plain gazetteer first,
# followed by each merged UMLS configuration.
index_jobs = [(base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl', merged_dir / 'distemist_gazetteer')]
index_jobs += [(merged_dir / f'{d}.jsonl', merged_dir / f'{d}') for d in dict_configs[1:]]
for dict_file, out_dir in index_jobs:
    display(Markdown(f"`xmen index conf/distemist.yaml --dict {dict_file} --output {out_dir} --all`"))

# Candidate Generation

In [34]:
# Checkpoint passed to SapBERTLinker as the embedding model.
embedding_model_name = 'cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR'
# Dictionary configuration (one of dict_configs) whose indices are used for candidate generation.
dict_config = 'distemist_umls_en_es'

In [14]:
from notebook_util import analyze
from xmen.linkers import TFIDFNGramLinker, SapBERTLinker, EnsembleLinker
from xmen.linkers.util import filter_and_apply_threshold
from datasets import DatasetDict

In [15]:
# Index directory of the selected dictionary; the linkers below read its 'ngrams' and 'sapbert' sub-directories.
index_base_path = base_path / 'distemist' / 'merged' / dict_config / 'index'

#### TF-IDF over character n-grams

In [16]:
# TF-IDF n-gram linker over the pre-built 'ngrams' index.
ngram_linker = TFIDFNGramLinker(index_base_path=index_base_path / 'ngrams', k=100)
# Predict for the whole DatasetDict and persist the result; the ensemble cell re-uses pred_ngram.
pred_ngram = ngram_linker.predict_batch(ds)
pred_ngram.save_to_disk(tmp_path / dict_config / 'pred_ngram')

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/466 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

In [17]:
# Evaluate the n-gram candidates on the validation split; binding the result to `_`
# keeps the cell output to the printed metrics.
_ = evaluate_at_k(ds['validation'], pred_ngram['validation'])

Perf@1 0.28599801390268126
Perf@2 0.4141012909632572
Perf@4 0.5858987090367428
Perf@8 0.6732869910625621
Perf@16 0.7209533267130089
Perf@32 0.7616683217477657
Perf@64 0.7974180734856008


#### Dense Retrieval with SapBERT

In [18]:
# Clear singleton to free up memory
SapBERTLinker.clear()
# Initialize the linker with the embedding model and the 'sapbert' index selected above
sapbert_linker = SapBERTLinker(
    embedding_model_name = embedding_model_name,
    index_base_path = index_base_path / 'sapbert',
    k = 1000
)

# Predict for the whole DatasetDict and persist the result; the ensemble cell re-uses pred_sapbert.
pred_sapbert = sapbert_linker.predict_batch(ds, batch_size=128)
pred_sapbert.save_to_disk(tmp_path / dict_config / 'pred_sapbert')

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/466 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

In [19]:
# Evaluate the SapBERT candidates on the validation split (result bound to `_` to hide the return value).
_ = evaluate_at_k(ds['validation'], pred_sapbert['validation'])

Perf@1 0.3634558093346574
Perf@2 0.5243296921549155
Perf@4 0.6593843098311817
Perf@8 0.7437934458788481
Perf@16 0.7864945382323734
Perf@32 0.8083416087388282
Perf@64 0.8262164846077458


#### Ensemble

In [20]:
# Combine both candidate generators in one ensemble.
ensemble_linker = EnsembleLinker()
for linker_name, linker in (('sapbert', sapbert_linker), ('ngram', ngram_linker)):
    ensemble_linker.add_linker(linker_name, linker, k=100)

# Re-use predictions for efficiency
# TODO: reuse_preds currently does not work with dataset dicts
# Hence each split is processed on its own and collected in a fresh DatasetDict.
pred_ensemble = DatasetDict()
for split in ('train', 'validation', 'test'):
    cached_preds = {'sapbert' : pred_sapbert[split], 'ngram' : pred_ngram[split]}
    pred_ensemble[split] = ensemble_linker.predict_batch(ds[split], 128, 100, reuse_preds=cached_preds)

pred_ensemble.save_to_disk(tmp_path / dict_config / 'pred_ensemble')

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/466 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [21]:
# Evaluate the ensemble candidates on the validation split (result bound to `_` to hide the return value).
_ = evaluate_at_k(ds['validation'], pred_ensemble['validation'])

Perf@1 0.4270109235352532
Perf@2 0.5412115193644489
Perf@4 0.660377358490566
Perf@8 0.7487586891757696
Perf@16 0.7954319761668321
Perf@32 0.8152929493545183
Perf@64 0.8411122144985105


# Reranking

In [22]:
from xmen.reranking.cross_encoder import CrossEncoderReranker, CrossEncoderTrainingArgs

In [23]:
# Settings for the cross-encoder re-ranking stage.
K_RERANKING = 64        # passed to filter_and_apply_threshold when selecting candidates
CONTEXT_LENGTH = 128    # passed to CrossEncoderReranker.prepare_data
CROSS_ENC_MODEL = 'PlanTL-GOB-ES/roberta-base-biomedical-clinical-es'  # base model for CrossEncoderTrainingArgs
NUM_EPOCHS = 20         # number of training epochs for CrossEncoderTrainingArgs
SEM_TYPE = True         # forwarded as encode_sem_type to prepare_data

## Prepare Data for Cross-Encoder Training

In [32]:
# Dictionary used for entity representation in the cross-encoder input
# (selects which merged .jsonl file is loaded as the knowledge base below)
dict_config_rr = 'distemist_umls_es'

In [35]:
from xmen.linkers.util import filter_and_apply_threshold

# Reload the ensemble candidates that were saved by the candidate-generation step.
candidates = datasets.load_from_disk(tmp_path / dict_config / 'pred_ensemble')
# Presumably keeps the top K_RERANKING candidates with a score threshold of 0.0 -- confirm against xmen.
candidates = filter_and_apply_threshold(candidates, K_RERANKING, 0.0)

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [27]:
from xmen.knowledge_base import load_kb
# Alternative (disabled): load the gazetteer-only dictionary instead.
#kb = load_kb(base_path / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl')
# Knowledge base loaded from the merged dictionary selected by dict_config_rr.
kb = load_kb(base_path / 'distemist' / 'merged' / f'{dict_config_rr}.jsonl')

In [36]:
# Build the cross-encoder inputs from the filtered candidates, the ground-truth dataset and the knowledge base,
# then persist them under a name that records which dictionary was used.
cross_enc_ds = CrossEncoderReranker.prepare_data(candidates, ds, kb, CONTEXT_LENGTH, encode_sem_type=SEM_TYPE)
cross_enc_ds.save_to_disk(tmp_path / f'cross_enc_ds_{dict_config_rr}')

Context length: 128


  0%|          | 0/4180 [00:00<?, ?it/s]

  0%|          | 0/4180 [00:00<?, ?it/s]

  0%|          | 0/4180 [00:00<?, ?it/s]

  0%|          | 0/956 [00:00<?, ?it/s]

  0%|          | 0/956 [00:00<?, ?it/s]

  0%|          | 0/956 [00:00<?, ?it/s]

  0%|          | 0/2598 [00:00<?, ?it/s]

  0%|          | 0/2598 [00:00<?, ?it/s]

  0%|          | 0/2598 [00:00<?, ?it/s]

## Train Cross Encoder

In [37]:
from xmen.data.indexed_dataset import IndexedDatasetDict
# Reload the prepared cross-encoder data from disk (rebinds cross_enc_ds) and display its split sizes.
cross_enc_ds = IndexedDatasetDict.load_from_disk(tmp_path / f'cross_enc_ds_{dict_config_rr}')
cross_enc_ds

{'train': [4180 items], 'validation': [956 items], 'test': [2598 items]}

In [40]:
# Inspect the first ten scored examples of the first training item.
cross_enc_ds['train'].dataset[0][0:10]

[<ScoredInputExample> label: 0, score: 0.6854951977729797, texts:  dado de alta a los diez días de la intervención quirúrgica.
 El estudio anatomopatológico del apéndice cecal fue informado como  [START] apéndice cecal con signos inflamatorios [END] 
 
 ; disorder [TYPE] inflamación de apéndice epiploico [TITLE] inflamación de apéndice epiploico (trastorno) [SEP] Apendicitis epiploica,
 <ScoredInputExample> label: 0, score: 0.6809297800064087, texts:  dado de alta a los diez días de la intervención quirúrgica.
 El estudio anatomopatológico del apéndice cecal fue informado como  [START] apéndice cecal con signos inflamatorios [END] 
 
 ; disorder [TYPE] apendicitis crónica [TITLE] inflamación crónica del apéndice [SEP] apendicitis crónica (trastorno) [SEP] Apendicitis crónica,
 <ScoredInputExample> label: 0, score: 0.6728464961051941, texts:  dado de alta a los diez días de la intervención quirúrgica.
 El estudio anatomopatológico del apéndice cecal fue informado como  [START] apéndice 

In [41]:
# Training configuration: base model, number of epochs, and score regularization switched on.
train_args = CrossEncoderTrainingArgs(
    CROSS_ENC_MODEL, 
    NUM_EPOCHS,
    score_regularization=True,
)

In [42]:
# Fresh re-ranker plus the directory handed to fit() as output_dir and later to load().
rr = CrossEncoderReranker()
output_dir = f'{tmp_path}/cross_encoder_training/'

In [None]:
# Train on the training examples, passing the validation examples and the output directory.
rr.fit(cross_enc_ds['train'].dataset, cross_enc_ds['validation'].dataset, output_dir=output_dir, training_args=train_args)

model_name := PlanTL-GOB-ES/roberta-base-biomedical-clinical-es
num_train_epochs := 20
fp16 := True
label_smoothing := False
score_regularization := True
train_layers := None
softmax_loss := True


Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: 

Using score regularization: True
Using label smoothing factor: False


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4180 [00:00<?, ?it/s]

## Predict

In [None]:
# Load a re-ranker from the training output directory (rebinds rr); device=0 presumably selects the first GPU -- confirm.
rr = CrossEncoderReranker.load(output_dir, device=0)

In [None]:
# NOTE(review): `cross_encoder_predict` is not defined or imported in any visible cell --
# confirm where it comes from; as written this raises NameError on a fresh kernel.
cross_enc_pred = cross_encoder_predict(rr, cross_enc_ds['test'], candidates['test'])

# Error Analysis

In [None]:
# Knowledge base built from the gazetteer dictionary for the error analysis.
# NOTE(review): CompositeKnowledgebase is only imported in the "Preparation" section;
# this cell fails if that section was skipped.
kb_ngram = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist_gazetteer' / 'distemist_gazetteer.jsonl'])

In [None]:
# Error analysis of the n-gram predictions against the training split.
ea_ngram = entity_linking_error_analysis(ds['train'], pred_ngram['train'])

In [None]:
# Summarize the n-gram error analysis with the notebook_util helper; it returns a data frame and counts.
ea_df_ngram, ea_counts_ngram = analyze(ea_ngram, kb_ngram, '', sem_group_version=None)

In [None]:
# Display the counts returned by analyze() for the n-gram linker.
ea_counts_ngram

In [None]:
# NOTE(review): `prediction` is only assigned in a later cell (datasets.load_from_disk('prediction'));
# running top to bottom raises NameError here.
evaluate_at_k(ds['train'], prediction)

In [None]:
# NOTE(review): exact duplicate of the previous cell, with the same dependency on a later-defined `prediction`.
evaluate_at_k(ds['train'], prediction)

In [None]:
from xmen.linkers import EnsembleLinker

In [None]:
# Second ensemble, built from sap_bert_linker and the n-gram linker.
# NOTE(review): `sap_bert_linker` is only created in a later cell; the linker created
# earlier in this notebook is named `sapbert_linker`. Confirm which one is intended.
ensemble = EnsembleLinker()
ensemble.add_linker('sap', sap_bert_linker, k=100)
ensemble.add_linker('ngram', ngram_linker, k=100)

In [None]:
# Run the ensemble on the whole DatasetDict without re-using stored predictions.
ensemble_pred = ensemble.predict_batch(ds, batch_size=128, top_k=100)

In [None]:
# Evaluate the ensemble predictions on the training split.
evaluate_at_k(ds['train'], ensemble_pred['train'])

In [None]:
# Persist to 'ensemble_pred' relative to the current working directory (not under tmp_path).
ensemble_pred.save_to_disk('ensemble_pred')

In [None]:
import datasets
# Load stored predictions from 'prediction' relative to the current working directory.
# NOTE(review): no visible cell writes this path -- confirm where it is produced.
prediction = datasets.load_from_disk('prediction')

In [None]:
# Error analysis of the loaded predictions against the training split.
ea = entity_linking_error_analysis(ds['train'], prediction)

In [None]:
# Knowledge base for the error analysis, built from distemist/distemist.jsonl.
# NOTE(review): this rebinds `kb`, which the re-ranking section assigned via load_kb().
kb = CompositeKnowledgebase([Path(config.cache_dir) / 'distemist' / 'distemist.jsonl'])

In [None]:
# Summarize the error analysis with the notebook_util helper; it returns a data frame and counts.
ea_df, ea_counts = analyze(ea, kb, '', sem_group_version=None)

In [None]:
# Display the error-analysis data frame.
ea_df

In [None]:
# Display the error-analysis counts.
ea_counts

In [None]:
# Clear singleton to free up memory
SapBERTLinker.clear()
# Initialize a SapBERT linker on the gazetteer-only index
# NOTE(review): `k` is not assigned in any visible cell, so `k = k` raises NameError
# on a fresh kernel -- confirm the intended value.
sap_bert_linker = SapBERTLinker(
    embedding_model_name = embedding_model_name,
    index_base_path = Path(config.cache_dir) / 'distemist_gazetteer/index/sapbert',
    k = k,
)

In [None]:
# Predict on the training split only, using the gazetteer-index SapBERT linker.
prediction_monoling = sap_bert_linker.predict_batch(ds['train'], batch_size=128)

In [None]:
# Evaluate the gazetteer-index SapBERT predictions on the training split.
evaluate_at_k(ds['train'], prediction_monoling)

In [None]:
# NOTE(review): exact duplicate of the previous cell.
evaluate_at_k(ds['train'], prediction_monoling)