# UMLS Entity Linking

In [1]:
import os
# Pin the GPU exposed to this process; this must run before any CUDA-using
# library is imported, which is why it sits in the very first cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '3'})

In [2]:
import datasets
from pathlib import Path
from nen_util import get_annotation_dataframe
import pandas as pd

In [7]:
from xmen import load_config, load_kb

# Load the xMEN configuration for this notebook. `conf` is reused below to
# configure the elliptical-compound resolver (conf.eccnp) and the entity
# linker (conf.linker).
conf = load_config('xmen_ggponc3.yaml')
# Data version identifier from the config; used to build the data/ and
# output/ paths in the cells below.
version = conf.data_version

## Preparation

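Before running the cells below, build the xMEN dictionary and the candidate-generation indices from the command line:

`xmen dict xmen_ggponc3.yaml`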

`xmen index xmen_ggponc3.yaml --all --overwrite`

In [None]:
#!unzip -n -q data/v3.0_2024_01_03/plain_text/plain_text.zip -d data/v3_plain_text
!mkdir output/$version
!unzip -n -q data/$version/plain_text/plain_text.zip -d output/$version/plain_text

## Read Sentences

In [8]:
from ggponc import read_sentences
# Read the corpus sentences for the configured data version into a DataFrame
# (columns `file`, `sentence_id`, `sentence`, as shown in the preview).
sent_df = read_sentences(version)
sent_df.head()

  0%|          | 0/11280 [00:00<?, ?it/s]

Unnamed: 0,file,sentence_id,sentence
0,00_mundhoehlenkarzinom_0000,0,Tabakkonsum ist ein wesentlicher Risikofaktor ...
1,00_mundhoehlenkarzinom_0001,0,Alkoholkonsum ist ein wesentlicher Risikofakto...
2,00_mundhoehlenkarzinom_0002,0,Hauptrisikofaktoren für das Auftreten eines Mu...
3,00_mundhoehlenkarzinom_0002,1,Bei chronischem Tabak- oder Alkoholabusus ist ...
4,00_mundhoehlenkarzinom_0002,2,Auch für das Lippenkarzinom gilt der Kontakt d...


In [9]:
# Total number of sentences (rows) that were read.
sent_df.shape[0]

97014

## Resolve Elliptical Compounds

In [10]:
from ggponc import ECCNPResolver
# Instantiate the elliptical-compound resolver; all constructor arguments
# come from the `eccnp` section of the config file.
resolver = ECCNPResolver(**conf.eccnp)



In [11]:
# Smoke test on a string containing two elliptical compounds
# ("Chemo- und Strahlentherapie", "BRAF-/MEK-Inhibitor"); the expanded
# form is shown as the cell output.
resolver.transform('Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor')

'Chemotherapie und Strahlentherapie, BRAF-Inhibitor/MEK-Inhibitor,'

In [12]:
# Run the resolver over the whole sentence DataFrame.
# NOTE(review): this rebinds `sent_df`, so re-running the cell feeds the
# already-processed frame back into the resolver — confirm this is harmless,
# or re-run from "Read Sentences" first.
sent_df = resolver.set_df(sent_df)

  0%|          | 0/97014 [00:00<?, ?it/s]

In [13]:
# Checkpoint the resolved sentences; the NER section reloads this file, so
# it can be started without repeating the resolution step.
sent_df.to_parquet(f'output/{version}/sentences_resolved.parquet')

## Run NER Model

In [14]:
# Reload the resolved-sentence checkpoint from disk so that this section can
# run independently of the resolver cells.
sent_df = pd.read_parquet(f'output/{version}/sentences_resolved.parquet')

In [15]:
from ggponc import NERTagger
# Instantiate the NER tagger with its default settings (no arguments given).
ner = NERTagger()

In [16]:
# Tag the *unresolved* example text and show its span groups; compare with
# the resolved variant in the next cell.
ner.transform('Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor').spans

{'entities': [Chemo-, Strahlentherapie, BRAF-/MEK-Inhibitor]}

In [17]:
# Tag the same example with the elliptical compounds written out in full;
# the output now contains one span per complete compound.
ner.transform('Chemotherapie und Strahlentherapie, BRAF-Inhibitor/MEK-Inhibitor').spans

{'entities': [Chemotherapie, Strahlentherapie, BRAF-Inhibitor, MEK-Inhibitor]}

In [18]:
# Run the tagger over all sentences. The resulting frame is used below via
# its `spacy_ner` and `file` columns.
ner_df = ner.set_df(sent_df)

  0%|          | 0/97014 [00:00<?, ?it/s]

In [19]:
from ggponc import merge_sentence_docs
# Combine the per-sentence spaCy results, passing the `file` column alongside
# them — presumably to produce one merged doc per source file (TODO confirm
# against ggponc.merge_sentence_docs).
docs = merge_sentence_docs(ner_df.spacy_ner, ner_df.file)

In [20]:
from spacy import displacy

# Visual sanity check: render the 'entities' span group of one merged doc.
displacy.render(docs[2], style='span', options=dict(spans_key='entities'))

In [22]:
from xmen.data import from_spacy
# Convert the spaCy docs into an xMEN dataset, taking entity mentions from
# the 'entities' span group and document IDs from the 'file_name' key.
ds = from_spacy(docs, span_key='entities', doc_id_key='file_name')

In [23]:
# Checkpoint the NER dataset; the entity-linking section reloads it from the
# same path. The trailing expression echoes the path for reference.
fname = f'output/{version}/ggponc_ner_spacy'
ds.save_to_disk(fname)
fname

Saving the dataset (0/1 shards):   0%|          | 0/11280 [00:00<?, ? examples/s]

'output/v2.1_2023_03_30/ggponc_ner_spacy'

## Run Entity Linker

In [24]:
# Reload the NER dataset checkpoint so that entity linking can start from
# here without re-running the tagger.
ds = datasets.load_from_disk(f'output/{version}/ggponc_ner_spacy')

In [26]:
# Total number of entity mentions across all documents.
sum(len(doc['entities']) for doc in ds)

258105

In [27]:
from ggponc import EntityLinker
# Build the entity linker: the ranking settings are unpacked as keyword
# arguments, the candidate-generation settings are passed through as one
# object. Both come from the `linker` section of the config.
linker = EntityLinker(
    **conf.linker.ranking,
    candidate_generation_kwargs=conf.linker.candidate_generation,
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Link all mentions in the dataset. The result is indexed below by the keys
# 'candidates' and 'reranked'.
result = linker.transform(ds)

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


In [None]:
# Checkpoint the linker output; the "Final Format" section reloads it from
# this path.
result.save_to_disk(f'output/{version}/ggponc_xmen')

## Final Format

In [30]:
# Reload the linker output from disk.
# NOTE(review): the cells below still need `linker` (for `linker.kb`) from
# the "Run Entity Linker" section, so that section's setup cell must have
# been run in this kernel.
result = datasets.load_from_disk(f'output/{version}/ggponc_xmen')

  table = cls._concat_blocks(blocks, axis=0)


In [91]:
# Entity types kept by the `type in @types` filters in the preview cells.
types = ['Clinical_Drug', 'Diagnostic', 'Therapeutic']

import random

# Shuffle the document indices with a fixed seed so that the random sample
# shown in the preview cells is reproducible after a kernel restart. A
# private Random instance is used to leave the global RNG state untouched.
show_indices = list(range(len(result['candidates'])))
random.Random(42).shuffle(show_indices)

In [92]:
# Preview the candidate-generation output for ten sampled documents,
# restricted to the selected entity types.
# NOTE(review): the positional arguments 3 and 0.0 are presumably the number
# of candidates per mention and a confidence threshold — confirm in nen_util.
(
    get_annotation_dataframe(
        result['candidates'].select(show_indices[:10]),
        linker.kb,
        3,
        0.0,
    )
    .query('type in @types')
    .iloc[:10]
)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
0,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C0948532,[T061],Constipation prophylaxis,[sapbert],0.738808
1,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C1261558,[T061],Abuse prevention (procedure),[sapbert],0.71732
2,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C0033107,[T080],preventive therapy,[sapbert],0.694419
3,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C0948532,[T061],Constipation prophylaxis,[sapbert],0.721405
4,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C1444526,[T058],Food surveillance,[sapbert],0.657961
5,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C0016471,[T058],Food Inspection,[sapbert],0.648995
9,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0085841,[T061],Artificial Feeding,"[ngram, sapbert]",0.737208
10,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0086153,[T061],Dietary Modification,"[ngram, sapbert]",0.731634
11,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0376312,[T061],Zwangsernährung,"[ngram, sapbert]",0.717698
24,20_harnblasenkarzinom_0032,diätetische Umstellungen,Therapeutic,543,567,C0086153,[T061],Dietary Modification,[sapbert],0.836387


In [93]:
# Preview the reranked output for the same ten sampled documents, restricted
# to the selected entity types, for comparison with the raw candidates.
# NOTE(review): the positional arguments 3 and 0.0 are presumably the number
# of candidates per mention and a confidence threshold — confirm in nen_util.
(
    get_annotation_dataframe(
        result['reranked'].select(show_indices[:10]),
        linker.kb,
        3,
        0.0,
    )
    .query('type in @types')
    .iloc[:10]
)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
0,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C0150358,[T061],Substance use prevention,[sapbert],0.065839
1,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C2700409,[T080],prevention,[sapbert],0.065301
2,20_harnblasenkarzinom_0032,Prävention durch Konsum,Therapeutic,0,23,C0199176,[T061],Prophylaxe,[sapbert],0.065155
3,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C1254456,[T059],Food culture,[sapbert],0.06407
4,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C1444526,[T058],Food surveillance,[sapbert],0.063674
5,20_harnblasenkarzinom_0032,Prävention durch Konsum vegetabiler Lebensmittel,Therapeutic,0,48,C0242887,[T170],Food Policy,[sapbert],0.063494
9,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0086153,[T061],Dietary Modification,"[ngram, sapbert]",0.069787
10,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0376312,[T061],Zwangsernährung,"[ngram, sapbert]",0.066339
11,20_harnblasenkarzinom_0032,aktive Ernährungsumstellung,Therapeutic,53,80,C0085841,[T061],Artificial Feeding,"[ngram, sapbert]",0.066319
24,20_harnblasenkarzinom_0032,diätetische Umstellungen,Therapeutic,543,567,C0086153,[T061],Dietary Modification,[sapbert],0.073331


In [94]:
# Build the final annotation table from the reranked output of all documents.
# NOTE(review): the arguments 1 and 0.0 presumably keep only the top-ranked
# concept per mention with no confidence cut-off — confirm in nen_util.
df = get_annotation_dataframe(result['reranked'], linker.kb, 1, 0.0)

  0%|          | 0/11280 [00:00<?, ?it/s]

In [95]:
# Export the final table as a tab-separated file, without the DataFrame index.
df.to_csv(f'output/{version}/entities_with_cuis.tsv', sep='\t', index=False)