# UMLS Entity Linking

In [2]:
import os
# Restrict CUDA to device index 3 for this process (machine-specific; adjust for your host).
# NOTE(review): presumably set here so it takes effect before any GPU library is imported — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [3]:
import datasets
from pathlib import Path
from nen_util import get_annotation_dataframe
import pandas as pd

In [4]:
from xmen import load_config, load_kb

# Load the xMEN configuration from the YAML file next to this notebook.
conf = load_config('xmen_ggponc3.yaml')
# Data version string from the config; used below to build the data/ and output/ paths.
version = conf.data_version

## Preparation

`xmen dict xmen_ggponc3.yaml`

`xmen index xmen_ggponc3.yaml --all --overwrite`

In [None]:
#!unzip -n -q data/v3.0_2024_01_03/plain_text/plain_text.zip -d data/v3_plain_text
!mkdir output/$version
!unzip -n -q data/$version/plain_text/plain_text.zip -d output/$version/plain_text

## Read Sentences

In [None]:
from ggponc import read_sentences
# Read the sentences for the configured data version and preview the first rows.
sent_df = read_sentences(version)
sent_df.head()

## Resolve Elliptical Compounds

In [None]:
from ggponc import ECCNPResolver
# Build the resolver from the `eccnp` section of the config, expanded as keyword arguments.
resolver = ECCNPResolver(**conf.eccnp)

In [None]:
# Try the resolver on a sample string that contains elliptical compounds.
resolver.transform('Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor')

In [None]:
# Apply the resolver to the whole sentence frame.
# NOTE(review): presumably this adds the `sentence_preprocessed` column used in the NER section — confirm.
sent_df = resolver.set_df(sent_df)

In [None]:
# Checkpoint the resolved sentences as parquet; the NER section reloads this file.
sent_df.to_parquet(f'output/{version}/sentences_resolved.parquet')

## Run NER Model

In [None]:
# Reload the resolved-sentence checkpoint so this section can start from here.
sent_df = pd.read_parquet(f'output/{version}/sentences_resolved.parquet')

In [None]:
# Fall back to the original sentence wherever the pre-processed text is more than twice
# as long as the original; such cases are most likely generation errors.
# (No sorting is needed: column assignment aligns on the index.)
sent_df['ratio'] = sent_df.sentence_preprocessed.str.len() / sent_df.sentence.str.len()
sent_df.loc[sent_df.ratio > 2, 'sentence_preprocessed'] = sent_df.sentence

In [None]:
from ggponc import NERTagger
# Instantiate the NER tagger with its default arguments.
ner = NERTagger()

In [None]:
# Tag the sample string in its elliptical form and show the spans on the result.
d = ner.transform('Chemo- und Strahlentherapie, BRAF-/MEK-Inhibitor')
d.spans

In [None]:
# Same sample with the compounds written out in full, for comparison with the elliptical form.
d = ner.transform('Chemotherapie und Strahlentherapie, BRAF-Inhibitor/MEK-Inhibitor')
d.spans

In [None]:
# Run the tagger over the sentence frame; the `spacy_ner` and `file` columns of the result are used next.
ner_df = ner.set_df(sent_df)

In [None]:
from ggponc import merge_sentence_docs
# Merge the per-sentence NER results, passing the `file` column alongside.
# NOTE(review): presumably this yields one document per file — confirm in ggponc.merge_sentence_docs.
docs = merge_sentence_docs(ner_df.spacy_ner, ner_df.file)

In [None]:
from spacy import displacy

# Visualise the spans stored under the 'entities' key for the third merged document.
displacy.render(
    docs[2],
    style='span',
    options={'spans_key': 'entities'},
)

In [None]:
from xmen.data import from_spacy
# Convert the docs into a dataset, reading spans from the 'entities' key and document ids from 'file_name'.
ds = from_spacy(docs, span_key='entities', doc_id_key='file_name')

In [None]:
# Persist the NER dataset to disk; the path is echoed as the cell output.
fname = f'output/{version}/ggponc_ner_spacy'
ds.save_to_disk(fname)
fname

## Run Entity Linker

In [5]:
# Reload the NER dataset saved in the previous section so linking can start from this checkpoint.
ds = datasets.load_from_disk(f'output/{version}/ggponc_ner_spacy')

In [6]:
# Total number of entity mentions across all documents in the dataset.
sum(len(doc['entities']) for doc in ds)

257325

In [7]:
# Inspect passage 26 of a single document, selected by its document_id.
ds.filter(lambda d: d['document_id'] == '18_hodgkin-lymphom_0194')[0]['passages'][26]

Filter:   0%|          | 0/11280 [00:00<?, ? examples/s]

{'id': 26,
 'offsets': [[4171, 4206]],
 'text': ['Reproduktionsmedizinische Verfahren'],
 'type': 'sentence'}

In [8]:
from ggponc import EntityLinker
# Build the linker: ranking settings are expanded as keyword arguments, while the
# candidate-generation settings are handed over as a single object.
linker = EntityLinker(**conf.linker.ranking, candidate_generation_kwargs=conf.linker.candidate_generation)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Run the linker over the NER dataset; later cells read result['candidates'] and result['reranked'].
result = linker.transform(ds)



Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/257325 [00:00<?, ?it/s]

  0%|          | 0/257325 [00:00<?, ?it/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


  0%|          | 0/257325 [00:00<?, ?it/s]

Batches:   0%|          | 0/257318 [00:00<?, ?it/s]

Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Map:   0%|          | 0/11280 [00:00<?, ? examples/s]

In [None]:
# Persist the linker output so the final-format section can reload it.
result.save_to_disk(f'output/{version}/ggponc_xmen')

## Final Format

In [None]:
# Reload the linker output saved above.
result = datasets.load_from_disk(f'output/{version}/ggponc_xmen')

In [None]:
# Entity types kept when previewing annotations below.
types = ['Clinical_Drug', 'Diagnostic', 'Therapeutic']

import random
# Shuffle the document indices with a private, seeded generator so the previewed
# sample is the same on every run and the global random state is left untouched.
show_indices = list(range(len(result['candidates'])))
random.Random(42).shuffle(show_indices)
show_indices[0:10]

In [19]:
# Preview annotations from the 'candidates' split for the first ten sampled documents,
# restricted to the entity types listed in `types`.
# NOTE(review): the positional arguments 3 and 0.0 look like a top-k and a confidence threshold — confirm.
(
    get_annotation_dataframe(result['candidates'].select(show_indices[0:10]), linker.kb, 3, 0.0)
    .query('type in @types')
    .iloc[0:10]
)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
0,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C4087363,[T061],Skin cancer excision,"[ngram, sapbert]",0.83525
1,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C0576774,[T061],Excision of skin carcinoma,[sapbert],0.807886
2,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C0578566,[T061],Excision of malignant skin tumor,[sapbert],0.753645
3,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C0025084,[T058],Medical History Taking,"[ngram, sapbert]",0.745956
4,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C0561843,[T041],"Memory, Episodic",[sapbert],0.745167
5,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C0199184,[T058],"History taking, limited",[sapbert],0.739637
6,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C1112379,[T060],Visual tracking test,[sapbert],0.744488
7,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C1565979,[T060],Whole Body Screening,"[ngram, sapbert]",0.722371
8,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C0200160,[T060],"Color vision examination, extended",[sapbert],0.704022
9,06_hautkrebspraevention_0286,Befundmitteilung,Therapeutic,234,250,C0949214,[T170],Anzeigen [Dokumenttyp],[sapbert],0.844695


In [20]:
# Preview annotations from the 'reranked' split for the same ten sampled documents,
# restricted to the entity types listed in `types`.
# NOTE(review): the positional arguments 3 and 0.0 look like a top-k and a confidence threshold — confirm.
(
    get_annotation_dataframe(result['reranked'].select(show_indices[0:10]), linker.kb, 3, 0.0)
    .query('type in @types')
    .iloc[0:10]
)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,document,text,type,start,end,cui,tuis,canonical,linker,confidence
0,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C2919584,[T061],Screening for skin cancer,[sapbert],0.101426
1,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C4087363,[T061],Skin cancer excision,"[ngram, sapbert]",0.06948
2,06_hautkrebspraevention_0286,Hautkrebs-Screening,Diagnostic,76,95,C0420052,[T060],Skin disease screening,[sapbert],0.062753
3,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C0025084,[T058],Medical History Taking,"[ngram, sapbert]",0.066949
4,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C0085970,[T170],Case Histories,[sapbert],0.065283
5,06_hautkrebspraevention_0286,gezielten Anamnese,Diagnostic,150,168,C2732279,[T058],History taken on subject,[sapbert],0.064761
6,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C0199219,[T058],Visual examination,[sapbert],0.06434
7,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C0042792,[T060],Vision Screening,[sapbert],0.063352
8,06_hautkrebspraevention_0286,visuellen standardisierten Ganzkörperinspektion,Diagnostic,177,224,C2230326,[T060],Visual Inspection of the Abdomen,[sapbert],0.063348
9,06_hautkrebspraevention_0286,Befundmitteilung,Therapeutic,234,250,C1269803,[T170],Consent report,[sapbert],0.063774


In [21]:
# Build the annotation frame for the full 'reranked' split.
# NOTE(review): the positional arguments 1 and 0.0 look like a top-k and a confidence threshold —
# confirm against nen_util.get_annotation_dataframe.
df = get_annotation_dataframe(result['reranked'], linker.kb, 1, 0.0)

  0%|          | 0/11280 [00:00<?, ?it/s]

In [22]:
# Write the annotation frame as tab-separated text, without the index column.
df.to_csv(f'output/{version}/entities_with_cuis.tsv', sep='\t', index=False)

In [23]:
import pandas as pd

In [24]:
# Read back the TSV for the configured data version (the same path the export cell writes to)
# instead of a hard-coded 'v2.1_2023_03_30' directory, which may not exist or may be stale.
tsv = pd.read_csv(f'output/{version}/entities_with_cuis.tsv', sep='\t')