In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

# Load the 'ggponc2_fine_long_bigbio_kb' configuration of the bigbio/ggponc2
# loader, pointing it at the local corpus directory.
ggponc_raw = load_dataset(
    'bigbio/ggponc2',
    name='ggponc2_fine_long_bigbio_kb',
    data_dir='data/v2.0_2022_03_24',
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from xmen.evaluation import error_analysis
# The raw test split is passed as both arguments, i.e. it is compared with itself.
# NOTE(review): presumably a sanity check of the error-analysis output — confirm.
ea = error_analysis(ggponc_raw['test'], ggponc_raw['test'], tasks=['ner'])

In [3]:
# NOTE(review): the star import hides where names come from. `bigbio_split_passages`
# and `bigbio_to_spacy_docbin` are not imported anywhere else in this notebook,
# so they presumably come from `ner_util` — confirm and consider importing them
# explicitly.
from ner_util import *

# NOTE(review): presumably turns every passage into its own row (the row counts
# in the output below are per split after this step) — confirm in ner_util.
ggponc_ds = bigbio_split_passages(ggponc_raw)
ggponc_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 59515
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 13714
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 12770
    })
})

In [None]:
# Total number of entity annotations over all rows of the training split.
sum(len(row_entities) for row_entities in ggponc_ds['train']['entities'])

In [None]:
# Keep only the training rows that have at least one entity whose `type`
# contains 'Substance'.
substance_examples = ggponc_ds['train'].filter(
    lambda d: any('Substance' in e['type'] for e in d['entities'])
)
substance_examples

In [None]:
import datasets

# Upsample rows containing Substance entities: they are appended twice to the
# original training split, so each of them occurs three times in total.
#
# The result is built as a new DatasetDict instead of via `ggponc_ds.copy()`:
# DatasetDict is a dict subclass and `dict.copy()` returns a plain `dict`, which
# would silently drop the DatasetDict type (repr, `.map`, `.filter`, ...).
# The test and validation splits are carried over unchanged.
ggponc_ds_upsampled = datasets.DatasetDict({
    **ggponc_ds,
    'train': datasets.concatenate_datasets(
        [ggponc_ds['train'], substance_examples, substance_examples]
    ),
})
ggponc_ds_upsampled


In [5]:
from xmen.data import from_spacy
import spacy

# Blank German pipeline; it is handed to `bigbio_to_spacy_docbin` in the cells below.
nlp = spacy.blank('de')

In [6]:
from xmen.data import Sampler

# NOTE(review): the meaning of the two Sampler arguments is not visible here.
# The progress bars below run to 100, so the second one is presumably the
# sample size per split — confirm against xmen.data.Sampler.
subset = Sampler(1, 100).transform_batch(ggponc_ds)
# A bare expression that is not the last statement of a cell is not displayed.
subset

# NOTE(review): presumably serialises each split of `subset` as a spaCy DocBin
# below the given path — confirm in ner_util.
bigbio_to_spacy_docbin('data/ggponc_subset', nlp, subset, 'entities', is_sentencized=True)

 99%|█████████▉| 99/100 [00:00<00:00, 1988.61it/s]


 99%|█████████▉| 99/100 [00:00<00:00, 2064.55it/s]
 99%|█████████▉| 99/100 [00:00<00:00, 2159.37it/s]


In [None]:
# Same conversion as for the subset, applied to the Substance-upsampled dataset.
bigbio_to_spacy_docbin('data/ggponc_spacy_up', nlp, ggponc_ds_upsampled, 'entities', is_sentencized=True)

In [None]:
# Same conversion for the full dataset. The evaluation section further down
# reads 'data/ggponc_spacy/test.spacy'.
bigbio_to_spacy_docbin('data/ggponc_spacy', nlp, ggponc_ds, 'entities', is_sentencized=True)

# Evaluate NER model

In [59]:
import spacy
# NOTE(review): `chunk_and_ngram_suggester` is never referenced by name in this
# notebook; presumably the import is needed so the suggester is registered
# before `spacy.load` deserialises the pipeline — confirm before removing it.
from spacy_ner import chunk_and_ngram_suggester
# Name of the training run directory whose `model-best` checkpoint is loaded.
run = "2023-12-22_13-23-15-cuda-3"
ner_model = spacy.load(f'spacy_ner/training/{run}/model-best/') 

In [68]:
# Smoke test on a hand-written sentence: print each predicted span in the
# 'entities' span group together with its label.
d = ner_model("Wir haben ein Versagen einer Behandlung von Covid-19 mit Oxaliplatin und Irinotecan festgestellt.")
for span in d.spans['entities']:
    print(span, span.label_)

Covid-19 Diagnosis_or_Pathology
Oxaliplatin Clinical_Drug
Irinotecan Clinical_Drug
Behandlung von Covid-19 mit Oxaliplatin und Irinotecan Therapeutic


In [61]:
from spacy.tokens import DocBin
from itertools import islice
from xmen.data import from_spacy

# Load the serialised test documents. `store_user_data=True` is set because a
# later cell reads `user_data['document_id']` from these docs.
spacy_test_db = DocBin(store_user_data=True).from_disk('data/ggponc_spacy/test.spacy')
# Materialise the docs against the vocab of the loaded NER model.
spacy_test = list(spacy_test_db.get_docs(ner_model.vocab))

In [62]:
# Convert the reference docs into a dataset, reading spans from the 'entities' group.
spacy_test_ds = from_spacy(spacy_test, span_key='entities')

In [63]:
from tqdm.auto import tqdm
# Only the raw text of each reference doc is fed to the model, so the model
# tokenises and predicts from scratch.
test_data = [d.text for d in spacy_test]
# `total` is given explicitly so the progress bar knows its length.
pred = list(tqdm(ner_model.pipe(test_data), total=len(test_data)))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 13714/13714 [05:04<00:00, 45.09it/s]


In [64]:
# Convert the predicted docs into a dataset, reading spans from the 'entities' group.
pred_ds = from_spacy(pred, span_key='entities')

In [65]:
from xmen.evaluation import evaluate, error_analysis
# Score the predictions against the reference dataset on NER only, with all metrics.
# NOTE(review): argument order is presumably (gold, system) — confirm against xmen.
evaluate(spacy_test_ds, pred_ds, ner_only=True, metrics='all')

{'strict': {'precision': 0.7357169185403613,
  'recall': 0.7545874894582255,
  'fscore': 0.7450327322843688,
  'ptp': 25948,
  'fp': 9321,
  'rtp': 25948,
  'fn': 8439,
  'n_docs_system': 13714,
  'n_annos_system': 35319,
  'n_docs_gold': 13714,
  'n_annos_gold': 34414},
 'partial': {'precision': 0.8292413421815512,
  'recall': 0.829673434405918,
  'fscore': 0.829457332020895,
  'ptp': 29287.974964510206,
  'fp': 6031.025035489794,
  'rtp': 28552.381571645263,
  'fn': 5861.618428354737,
  'n_docs_system': 13714,
  'n_annos_system': 35319,
  'n_docs_gold': 13714,
  'n_annos_gold': 34414},
 'loose': {'precision': 0.9679721496953873,
  'recall': 0.9772427730427906,
  'fscore': 0.9725853701193651,
  'ptp': 11122,
  'fp': 368,
  'rtp': 11122,
  'fn': 259,
  'n_docs_system': 13714,
  'n_annos_system': 35319,
  'n_docs_gold': 13714,
  'n_annos_gold': 34414},
 'ner_strict': {'precision': 0.7357169185403613,
  'recall': 0.7545874894582255,
  'fscore': 0.7450327322843688,
  'ptp': 25948,
  'fp':

In [31]:
# Error analysis restricted to the first 100 rows of both datasets.
ea = error_analysis(
    spacy_test_ds.select(range(100)),
    pred_ds.select(range(100)),
    tasks=['ner'],
)

In [58]:
# Second smoke test: print each predicted span in the 'entities' span group
# together with its label.
d = ner_model("""Im Gegensatz dazu zeigte sich keine Verbesserung nach Beginn einer Statintherapie bei kardiochirurgischen Patienten.""")
for span in d.spans['entities']:
    print(span, span.label_)

Beginn einer Statintherapie Therapeutic
Verbesserung nach Beginn einer Statintherapie Other_Finding


In [32]:
# Error-analysis rows for document '19' (document ids are compared as strings).
ea[ea.document_id == '19']

Unnamed: 0,_word_len,_abbrev,gt_start,gt_end,gt_text,gold_type,pred_start,pred_end,pred_text,pred_type,ner_match_type,corpus_id,document_id
63,1.0,False,5.0,14.0,[Resektion],Therapeutic,5.0,79.0,[Resektion mit einem histologisch bestätigten ...,Therapeutic,be,,19
64,6.0,False,25.0,79.0,[histologisch bestätigten Sicherheitsabstand v...,Other_Finding,5.0,79.0,[Resektion mit einem histologisch bestätigten ...,Therapeutic,lbe,,19
65,,,,,,,25.0,79.0,[histologisch bestätigten Sicherheitsabstand v...,Other_Finding,fp,,19
66,,,,,,,25.0,79.0,[histologisch bestätigten Sicherheitsabstand v...,Other_Finding,fp,,19
67,2.0,False,89.0,111.0,[knapper Resektionsrand],Other_Finding,89.0,111.0,[knapper Resektionsrand],Other_Finding,tp,,19
68,2.0,False,149.0,172.0,[sicherer Resektionsrand],Other_Finding,149.0,172.0,[sicherer Resektionsrand],Other_Finding,tp,,19


In [None]:
# Show the second reference document.
spacy_test[1]

In [None]:
# Spans stored under 'entities' for the second reference document ...
spacy_test[1].spans['entities']

In [None]:
# ... and the model's 'entities' spans for the same document index.
pred[1].spans['entities']

In [None]:
# Error-analysis rows for document '4' (document ids are compared as strings).
ea[ea.document_id == '4']

In [None]:
# NOTE(review): `snomed_spans` is not referenced by name in this notebook;
# presumably imported for a registration side effect — confirm.
from spacy_ner import snomed_spans
# NOTE(review): this rebinds `ner_model`, replacing the model evaluated above
# with a different one. Re-running earlier cells after this one would silently
# use this model; consider a distinct variable name.
ner_model = spacy.load('data/v2.0_2022_03_24/models/spacy')

In [None]:
def f(d):
    """Return True for rows belonging to document '00_mundhoehlenkarzinom_0039.tsv'."""
    return d['document_id'] == '00_mundhoehlenkarzinom_0039.tsv'


# Entities of the first test row that belongs to that document.
ggponc_ds['test'].filter(f)[0]['entities']

In [None]:
# All span groups of the first reference doc whose stored document_id matches.
# Raises IndexError if no doc matches.
[d for d in spacy_test if d.user_data['document_id'] == '00_mundhoehlenkarzinom_0039.tsv'][0].spans

In [None]:
#bigbio_spacy.filter(f)[0]['entities']

In [None]:
import spacy
from spacy.tokens import DocBin
from itertools import islice

In [None]:
# Blank German pipeline; its vocab is used to deserialise the stored docs.
nlp = spacy.blank('de')
db = DocBin(store_user_data=True).from_disk(
    'data/v2.0_2022_03_24/annotations/spacy/test.spacy'
)
docs = list(db.get_docs(nlp.vocab))

# Mark every document as exactly one sentence: the first token starts a
# sentence, every following token does not.
for doc in docs:
    first_token, remaining = doc[0], doc[1:]
    first_token.is_sent_start = True
    for token in remaining:
        token.is_sent_start = False

docs[:5]

In [None]:
# Number of docs loaded from the annotation DocBin ...
len(docs)

In [None]:
# ... for comparison with the number of docs in `spacy_test`.
len(spacy_test)

In [None]:
# Convert the reference docs again, this time reading spans from the 'snomed' group.
# NOTE(review): `spacy_test` as loaded earlier in this notebook comes from a
# DocBin written with the 'entities' key — confirm that a 'snomed' span group
# actually exists on these docs.
d2 = from_spacy(spacy_test, span_key='snomed')

In [None]:
from xmen.evaluation import evaluate
from xmen.data import from_spacy

# `d2` is passed as both arguments, i.e. it is evaluated against itself.
# NOTE(review): presumably a sanity check that should yield perfect scores — confirm.
evaluate(d2, d2, ner_only=True)

In [None]:
# Flatten the 'snomed' spans of all docs into one list, in document order.
# NOTE(review): `conv_test` is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel; it presumably refers to a list of spaCy
# docs such as `spacy_test` — confirm and fix the name.
ents_spacy = [s for d in conv_test for s in d.spans['snomed']]

In [None]:
# Total number of collected 'snomed' spans.
len(ents_spacy)

In [None]:
# Spot check of four collected spans (indices 123-126).
ents_spacy[123:127]