# Linking Entities in German Medical Text to SNOMED CT

## Preparation

### Download NER Model

In [1]:
!huggingface-cli download phlobo/de_ggponc_medbertde de_ggponc_medbertde-any-py3-none-any.whl --local-dir ../local_files
!pip install -q ../local_files/de_ggponc_medbertde-any-py3-none-any.whl

Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
../local_files/de_ggponc_medbertde-any-py3-none-any.whl


### Prepare Dicts and Index

`xmen dict conf/snomed_german.yaml --code dicts/umls_source.py`

`xmen index conf/snomed_german.yaml --all --overwrite`

## Entity Tagging

In [2]:
import spacy
nlp = spacy.load('de_ggponc_medbertde')

In [3]:
sentences = [
    "Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist und" \
       "dient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. " \
       "allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.",
    "Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor."
]
sentences

['Cetuximab ist ein monoklonaler Antikörper, der gegen den epidermalen Wachstumsfaktorrezeptor (EGFR) gerichtet ist unddient zur Therapie des fortgeschrittenen kolorektalen Karzinoms zusammen mit Irinotecan oder in Kombination mit FOLFOX bzw. allein nach Versagen einer Behandlung mit Oxaliplatin und Irinotecan.',
 'Die HPV-Diagnostik hat beim Plattenepithelkarzinom der Mundhöhle keinen validen Nutzen als prognostischer Faktor.']

In [4]:
docs = list(nlp.pipe(sentences))

In [5]:
import pandas as pd

In [6]:
ents = []
for d in docs:
    for span in sorted(d.spans['entities'], key=lambda s: s.start):
        ents.append({'mention' : span.text, 'class' : span.label_})
pd.DataFrame(ents)

Unnamed: 0,mention,class
0,Cetuximab,Clinical_Drug
1,monoklonaler Antikörper,Clinical_Drug
2,epidermalen Wachstumsfaktorrezeptor,Nutrient_or_Body_Substance
3,EGFR,Nutrient_or_Body_Substance
4,Therapie des fortgeschrittenen kolorektalen Ka...,Therapeutic
5,fortgeschrittenen kolorektalen Karzinoms,Diagnosis_or_Pathology
6,Irinotecan,Clinical_Drug
7,FOLFOX,Therapeutic
8,Versagen einer Behandlung,Other_Finding
9,Behandlung mit Oxaliplatin und Irinotecan,Therapeutic


## Candidate Generation

In [7]:
from pathlib import Path

from xmen.data import from_spacy
from xmen.linkers import SapBERTLinker, TFIDFNGramLinker, EnsembleLinker
from xmen import load_config

In [8]:
dataset = from_spacy(docs, span_key='entities')

In [9]:
dataset

Dataset({
    features: ['id', 'document_id', 'passages', 'entities', 'coreferences', 'relations', 'events', 'corpus_id', 'lang'],
    num_rows: 2
})

In [10]:
conf = load_config('../examples/conf/snomed_german.yaml')

In [11]:
from xmen.linkers import default_ensemble
linker = default_ensemble(Path(conf.linker.candidate_generation.ngram.index_base_path).parent, cuda=False)

candidates = linker.predict_batch(dataset)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [12]:
from xmen import load_kb

kb = load_kb(Path(conf.cache_dir) / 'snomed_german' / 'snomed_german.jsonl')

In [13]:
from util import get_dataframe
get_dataframe(candidates, kb)

Unnamed: 0,mention,class,cui,canonical name,linked by,score
0,Cetuximab,Clinical_Drug,409401002,Product containing cetuximab (medicinal product),"[ngram, sapbert]",1.0
1,monoklonaler Antikörper,Clinical_Drug,49616005,Monoclonal antibody,"[ngram, sapbert]",0.982318
2,epidermalen Wachstumsfaktorrezeptor,Nutrient_or_Body_Substance,86960007,Epidermal growth factor-urogastrone receptor,[sapbert],0.937485
3,EGFR,Nutrient_or_Body_Substance,86960007,Epidermal growth factor-urogastrone receptor,"[ngram, sapbert]",1.0
4,Therapie des fortgeschrittenen kolorektalen Ka...,Therapeutic,1217692004,Metastasis from malignant neoplasm of colon an...,"[ngram, sapbert]",0.69648
5,fortgeschrittenen kolorektalen Karzinoms,Diagnosis_or_Pathology,1217692004,Metastasis from malignant neoplasm of colon an...,"[ngram, sapbert]",0.843411
6,Irinotecan,Clinical_Drug,372538008,Irinotecan,"[ngram, sapbert]",1.0
7,FOLFOX,Therapeutic,699297004,"Ohdo syndrome, Maat-Kievit-Brunner type",[sapbert],0.812118
8,Versagen einer Behandlung,Other_Finding,7058009,Noncompliance with treatment,"[ngram, sapbert]",0.881669
9,Behandlung mit Oxaliplatin und Irinotecan,Therapeutic,447053005,Oxaliplatin desensitization therapy,"[ngram, sapbert]",0.673975


## Re-Ranking

In [14]:
from xmen.reranking import CrossEncoderReranker

In [15]:
ce_candidates = CrossEncoderReranker.prepare_data(candidates, None, kb, k=64)

Context length: 128
Use NIL values: True


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [16]:
rr = CrossEncoderReranker.load("phlobo/xmen-de-ce-medmentions", device=0)

In [17]:
reranked = rr.rerank_batch(candidates, ce_candidates, k=64)

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [18]:
# After Re-ranking
get_dataframe(reranked, kb)

Unnamed: 0,mention,class,cui,canonical name,linked by,score
0,Cetuximab,Clinical_Drug,409400001,Cetuximab,"[ngram, sapbert]",0.040032
1,monoklonaler Antikörper,Clinical_Drug,49616005,Monoclonal antibody,"[ngram, sapbert]",0.043237
2,epidermalen Wachstumsfaktorrezeptor,Nutrient_or_Body_Substance,86960007,Epidermal growth factor-urogastrone receptor,[sapbert],0.022354
3,EGFR,Nutrient_or_Body_Substance,86960007,Epidermal growth factor-urogastrone receptor,"[ngram, sapbert]",0.033175
4,Therapie des fortgeschrittenen kolorektalen Ka...,Therapeutic,1217692004,Metastasis from malignant neoplasm of colon an...,"[ngram, sapbert]",0.017036
5,fortgeschrittenen kolorektalen Karzinoms,Diagnosis_or_Pathology,126837005,Tumor of large intestine,"[ngram, sapbert]",0.017528
6,Irinotecan,Clinical_Drug,372538008,Irinotecan,"[ngram, sapbert]",0.042663
7,FOLFOX,Therapeutic,461391000124102,Folfox protocol,"[ngram, sapbert]",0.01886
8,Versagen einer Behandlung,Other_Finding,266721009,Absent response to treatment,[sapbert],0.017751
9,Behandlung mit Oxaliplatin und Irinotecan,Therapeutic,447053005,Oxaliplatin desensitization therapy,"[ngram, sapbert]",0.017154
