# UMLS Entity Linking

In [1]:
import datasets
from pathlib import Path
from nen_util import get_annotation_dataframe
import pandas as pd

**Note:** For demonstration purposes, this notebook uses only a small subset of GGPONC 2.0.
To use a different GGPONC release, change `data_version` in `xmen_ggponc3.yaml`.

In [2]:
from xmen import load_config, load_kb

# Read the xMEN configuration file and expose the data version it specifies;
# `version` is used to build all output paths below.
config_path = 'xmen_ggponc3.yaml'
conf = load_config(config_path)
version = conf.data_version
version

'v2.0_agreement'

## Preparation

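Before running the cells below, build the xMEN dictionary and the candidate generation indices from the command line:

`xmen dict xmen_ggponc3.yaml`

`xmen index xmen_ggponc3.yaml --all --overwrite`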

In [3]:
# Create the version-specific output directory; parents are created as needed
# and an already existing directory is not an error (same semantics as `mkdir -p`).
# pathlib is used instead of a shell escape so the cell does not need a POSIX shell.
Path(f'output/{version}').mkdir(parents=True, exist_ok=True)
#!unzip -n -q data/$version/plain_text/plain_text.zip -d output/$version/plain_text

## Read Sentences

In [4]:
from ggponc import read_sentences
# Load the sentences of the configured version from 'data' and preview the first rows
data_dir = 'data'
sent_df = read_sentences(version, data_dir)
sent_df.head()

  0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,file,sentence_id,sentence
0,01_magenkarzinom_0192,0,Vitamine und essentielle Spurenelemente sind u...
1,01_magenkarzinom_0192,1,Mehr noch als bei Gesunden sollte bei Tumorpat...
2,01_magenkarzinom_0192,2,Am ehesten werden bei Patienten mit Malignomen...
3,01_magenkarzinom_0192,3,Bei oraler und enteraler Ernährung gelten für ...
4,01_magenkarzinom_0192,4,Die Nährstoffgehalte von bilanzierten Diäten s...


## Resolve Elliptical Compounds

In [5]:
from ggponc import ECCNPResolver
# Build the elliptical-compound resolver from the `eccnp` section of the loaded config
resolver = ECCNPResolver(**conf.eccnp)



In [6]:
# Try the resolver on a short example that contains elliptical compounds
example_text = 'Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor'
resolver.transform(example_text)

'Chemotherapie und Strahlentherapie, BRAF-Inhibitor/MEK-Inhibitor,'

In [7]:
# Run the resolver over the whole sentence frame. Later cells read the resolved text
# from the `sentence_preprocessed` column (presumably added by set_df — confirm in ggponc).
sent_df = resolver.set_df(sent_df)

  0%|          | 0/374 [00:00<?, ?it/s]

In [8]:
# Persist the resolved sentences so the NER stage can reload them without re-running the resolver
sent_df.to_parquet(f'output/{version}/sentences_resolved.parquet')

## Run NER Model

In [9]:
# Reload the resolved sentences from the parquet file in the version-specific output directory
sent_df = pd.read_parquet(f'output/{version}/sentences_resolved.parquet')

In [10]:
# Reset excessively long pre-processed sentences to their original text: a
# pre-processed sentence more than MAX_LENGTH_RATIO times as long as its source
# is most likely a generation error.
MAX_LENGTH_RATIO = 2

# Character-length ratio of pre-processed vs. original sentence. No sorting is
# needed here: assigning a Series to a column aligns on the index, so any
# reordering of the right-hand side would be undone anyway.
sent_df['ratio'] = sent_df.sentence_preprocessed.str.len() / sent_df.sentence.str.len()
drop_index = sent_df.ratio > MAX_LENGTH_RATIO
print('Resetting', drop_index.sum(), '/', len(drop_index), 'docs due to likely generation errors')
# Reuse the mask computed above; only the flagged rows are overwritten
sent_df.loc[drop_index, 'sentence_preprocessed'] = sent_df.sentence

Resetting 0 / 374 docs due to likely generation errors


In [11]:
from ggponc import NERTagger
# Instantiate the NER tagger without arguments, i.e. with its defaults
ner = NERTagger()

In [12]:
# Tag the example with its elliptical compounds still unresolved and show the detected spans
unresolved_text = 'Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor'
d = ner.transform(unresolved_text)
d.spans

{'entities': [Chemo-, Strahlentherapie, BRAF-/MEK-Inhibitor]}

In [13]:
# Tag the same example with the compounds written out in full, for comparison
resolved_text = 'Chemotherapie und Strahlentherapie, BRAF-Inhibitor/MEK-Inhibitor'
d = ner.transform(resolved_text)
d.spans

{'entities': [Chemotherapie, Strahlentherapie, BRAF-Inhibitor, MEK-Inhibitor]}

In [14]:
# Run the tagger over the whole sentence frame; the merge step that follows reads
# the per-sentence results from `ner_df.spacy_ner`
ner_df = ner.set_df(sent_df)

  0%|          | 0/374 [00:00<?, ?it/s]

In [15]:
from ggponc import merge_sentence_docs
# Combine the per-sentence NER results using the `file` column
# (presumably one merged document per source file — confirm in ggponc)
docs = merge_sentence_docs(ner_df.spacy_ner, ner_df.file)

In [16]:
from spacy import displacy

# Visualise the spans stored under the 'entities' key of the first merged document
render_options = {'spans_key': 'entities'}
displacy.render(docs[0], style='span', options=render_options)

In [17]:
from xmen.data import from_spacy
# Convert the spaCy docs into a dataset, taking entity spans from the 'entities'
# span key and document IDs from 'file_name'
ds = from_spacy(docs, span_key='entities', doc_id_key='file_name')

In [18]:
# Write the NER dataset to the version-specific output directory and echo the location
fname = 'output/{}/ggponc_ner_spacy'.format(version)
ds.save_to_disk(fname)
fname

Saving the dataset (0/1 shards):   0%|          | 0/14 [00:00<?, ? examples/s]

'output/v2.0_agreement/ggponc_ner_spacy'

# Run Entity Linker

In [19]:
# Reload the NER dataset from disk so the linking stage can start from the saved artefact
ds = datasets.load_from_disk(f'output/{version}/ggponc_ner_spacy')

In [20]:
# Total number of entity mentions across all documents
sum(len(doc['entities']) for doc in ds)

1177

In [21]:
from ggponc import EntityLinker
# Configure the linker: ranking options are passed as keyword arguments, while the
# candidate generation settings are handed over as a single object
ranking_kwargs = conf.linker.ranking
linker = EntityLinker(
    **ranking_kwargs,
    candidate_generation_kwargs=conf.linker.candidate_generation,
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations




In [22]:
# Run the linker on the NER dataset; later cells read the 'candidates' and
# 'reranked' entries of the returned object
result = linker.transform(ds)

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

Batches:   0%|          | 0/1177 [00:00<?, ?it/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [23]:
# Persist the linking output to the version-specific output directory
result.save_to_disk(f'output/{version}/ggponc_xmen')

Saving the dataset (0/1 shards):   0%|          | 0/14 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14 [00:00<?, ? examples/s]

# Final Format

In [24]:
import datasets
# Reload the saved linking output so this section can be run from the stored artefact
result = datasets.load_from_disk(f'output/{version}/ggponc_xmen')

In [25]:
# Entity types kept when previewing annotations in the following cells
types = ['Clinical_Drug', 'Diagnostic', 'Therapeutic']

import random

# Shuffle the document indices with a fixed seed so that the previewed sample is
# identical on every Restart & Run All; a dedicated Random instance leaves the
# global random state untouched.
SEED = 42
show_indices = list(range(len(result['candidates'])))
random.Random(SEED).shuffle(show_indices)
show_indices[0:10]

[13, 2, 3, 4, 11, 0, 5, 12, 7, 8]

In [26]:
# Preview annotations from the 'candidates' output for the first ten shuffled
# documents, restricted to the entity types of interest
# (the positional 3 and 0.0 are presumably top-k and a confidence threshold — confirm in nen_util)
candidate_docs = result['candidates'].select(show_indices[0:10])
candidate_annotations = get_annotation_dataframe(candidate_docs, linker.kb, 3, 0.0)
candidate_annotations.query('type in @types').iloc[0:10]

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
72,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C1522449,[T061],Strahlentherapie,"[ngram, sapbert]",1.0
73,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C0031765,[T061],Phototherapie,"[ngram, sapbert]",0.88524
74,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C0034619,[T170],radiotherapy,[sapbert],0.8639
75,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C0149783,[T061],Steroidtherapie,"[ngram, sapbert]",1.0
76,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C3897353,[T061],Topical Steroid Therapy,"[ngram, sapbert]",0.813714
77,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C5667589,[T061],Pulse Steroids,"[ngram, sapbert]",0.806739
81,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C0149783,[T061],Steroidtherapie,"[ngram, sapbert]",1.0
82,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C3897353,[T061],Topical Steroid Therapy,"[ngram, sapbert]",0.813714
83,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C5667589,[T061],Pulse Steroids,"[ngram, sapbert]",0.806739
90,27_supportive-therapie_0765,Bestrahlung,Therapeutic,1516,1527,C0851346,[T070],Strahlung,"[ngram, sapbert]",0.894799


In [27]:
# Preview annotations from the 'reranked' output for the first ten shuffled
# documents, restricted to the entity types of interest
# (the positional 3 and 0.0 are presumably top-k and a confidence threshold — confirm in nen_util)
reranked_docs = result['reranked'].select(show_indices[0:10])
reranked_annotations = get_annotation_dataframe(reranked_docs, linker.kb, 3, 0.0)
reranked_annotations.query('type in @types').iloc[0:10]

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
72,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C1522449,[T061],Strahlentherapie,"[ngram, sapbert]",0.138755
73,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C0043308,[T061],Röntgenstrahlentherapie,[sapbert],0.063443
74,27_supportive-therapie_0765,Strahlentherapie,Therapeutic,1162,1178,C0948312,[T061],Radiotherapy to nose,[ngram],0.063037
75,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C0149783,[T061],Steroidtherapie,"[ngram, sapbert]",0.168302
76,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C0279025,[T061],Hormontherapie,"[ngram, sapbert]",0.063342
77,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1313,1328,C0279494,[T061],Oestrogentherapie,[sapbert],0.062963
81,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C0149783,[T061],Steroidtherapie,"[ngram, sapbert]",0.163678
82,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C0279025,[T061],Hormontherapie,"[ngram, sapbert]",0.063427
83,27_supportive-therapie_0765,Steroidtherapie,Therapeutic,1403,1418,C0279494,[T061],Oestrogentherapie,[sapbert],0.061039
90,27_supportive-therapie_0765,Bestrahlung,Therapeutic,1516,1527,C1522449,[T061],Strahlentherapie,[sapbert],0.116514


In [28]:
# Annotation table over all documents of the 'reranked' output; the third
# argument is 1 here versus 3 in the preview cells
# (presumably the number of concepts kept per mention — confirm in nen_util)
df = get_annotation_dataframe(result['reranked'], linker.kb, 1, 0.0)

  0%|          | 0/14 [00:00<?, ?it/s]

In [29]:
# Export the table as tab-separated values, omitting the DataFrame index
df.to_csv(f'output/{version}/entities_with_cuis.tsv', sep='\t', index=False)

# Analysis