In [1]:
import re
from datasets import Dataset, load_dataset
import datasets
import pandas as pd
import numpy as np
from pathlib import Path
from span_marker import SpanMarkerModel
import spacy
import os
import tqdm

In [3]:
# Set the CUDA device mask for this process; the inference helpers defined
# further down also read this variable to derive their default `device`.
os.environ.update(CUDA_VISIBLE_DEVICES="2")

# Passing full sentences to SpanMarkerModel.predict()

Load all texts

In [5]:
def load_texts(text_path, tsv_filter = None, sort_keys = False):
    """Read every ``*.txt`` file of a directory into a two-column ``Dataset``.

    Args:
        text_path: Directory that contains the ``.txt`` documents.
        tsv_filter: Optional path to a tab-separated file with a ``filename``
            column. When given, only documents whose file stem appears in
            that column are kept. A falsy value (``None``, ``""``) disables
            filtering.
        sort_keys: When true, order the files by the integer found after the
            last ``_`` of each file stem. Raises ``ValueError`` if a stem
            does not end in such an integer.

    Returns:
        A ``Dataset`` with the columns ``document_id`` (file stem) and
        ``text`` (file content decoded as UTF-8).
    """
    text_path = Path(text_path)

    # A set gives O(1) membership tests and avoids relying on numpy's
    # element-wise `in` semantics.
    allowed_stems = None
    if tsv_filter:
        allowed_stems = set(pd.read_csv(tsv_filter, sep='\t').filename.unique())

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # by path makes the resulting dataset reproducible across runs.
    files = sorted(text_path.glob('*.txt'))
    if sort_keys:
        files = sorted(files, key=lambda k: int(k.stem.split('_')[-1]))

    doc_ids = []
    texts = []
    for f in files:
        if allowed_stems is None or f.stem in allowed_stems:
            doc_ids.append(f.stem)
            # read_text opens AND closes the file, so no handle is leaked.
            texts.append(f.read_text(encoding='utf-8'))

    return Dataset.from_dict({
        'document_id' : doc_ids,
        'text' : texts
    })
    
# Directory with one .txt file per document.
text_path = "../data/symptemist_train/subtask1-ner/txt"
ds = load_texts(text_path=text_path)
ds

Dataset({
    features: ['document_id', 'text'],
    num_rows: 750
})

For evaluation, keep only the texts that belonged to the eval split used during SpanMarker training

In [45]:
# Load the dataset through its local loading script and order every split by
# document id, so that positional selection below is deterministic.
data = load_dataset(
    name="symptemist_entities_bigbio_kb",
    path="../../biomedical/bigbio/hub/hub_repos/symptemist/symptemist.py",
).sort("document_id")

# Rows 600..743 of the sorted train split are the documents used for
# evaluation (presumably the same slice held out during training — confirm).
eval_list = data["train"].select(range(600, 744))["document_id"]

# Keep only the raw texts of those documents, again ordered by document id.
ds_eval = ds.filter(
    lambda example: example["document_id"] in eval_list
).sort("document_id")
ds_eval

Found cached dataset symptemist (/home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached sorted indices for dataset at /home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a/cache-a1bc75b8e00708bf.arrow


Filter:   0%|          | 0/750 [00:00<?, ? examples/s]

Dataset({
    features: ['document_id', 'text'],
    num_rows: 144
})

Run inference on the evaluation texts

In [11]:
def infer_sent(ds, checkpoint, device = int(os.environ["CUDA_VISIBLE_DEVICES"])):
    """Predict entity spans one sentence at a time and return them as a table.

    Every document is split into sentences with spaCy. Each sentence text is
    passed on its own to ``SpanMarkerModel.predict``, and the returned
    sentence-relative character indices are shifted by the sentence's start
    offset so that they refer to the full document text.

    Args:
        ds: Iterable of rows exposing the keys ``document_id`` and ``text``.
        checkpoint: Location handed to ``SpanMarkerModel.from_pretrained``.
        device: CUDA device index handed to ``.cuda()``. The default is read
            from ``CUDA_VISIBLE_DEVICES`` once, when this function is defined.
            NOTE(review): when the device mask is in effect the visible GPUs
            are usually renumbered from 0, so the physical id stored in the
            variable may not be a valid index here — confirm. The ``int()``
            conversion also fails for comma-separated device lists.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``, ``label``
        (always ``"SINTOMA"``), ``start_span``, ``end_span`` and ``text``.
    """
    sentence_splitter = spacy.load("es_core_news_sm")
    model = SpanMarkerModel.from_pretrained(checkpoint)
    model.cuda(device)

    column_names = ("filename", "label", "start_span", "end_span", "text")
    collected = {column: [] for column in column_names}

    for record in tqdm.tqdm(ds, desc = "predicted texts"):
        for sent in sentence_splitter(record["text"]).sents:
            # Offset of this sentence inside the whole document text.
            offset = sent.start_char
            for entity in model.predict(sent.text):
                collected["filename"].append(record["document_id"])
                collected["label"].append("SINTOMA")
                collected["start_span"].append(offset + entity["char_start_index"])
                collected["end_span"].append(offset + entity["char_end_index"])
                collected["text"].append(entity["span"])

    return pd.DataFrame.from_dict(collected)

# Sentence-by-sentence inference over the evaluation documents.
eval_preds = infer_sent(ds=ds_eval, checkpoint="../checkpoints/v2/checkpoint-25200")
eval_preds

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 52002. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
predicted texts:   0%|                                                                                       | 0/144 [00:00<?, ?it/s]This model was trained with document-level context: inference without document-level context may cause decreased performance.
This model was trained with document-level context: inference without document-level context may cause decreased performance.
This model was trained with document-level context: inference without document-level context may cause decreased performance.
This model was trained with document-lev

Unnamed: 0,filename,label,start_span,end_span,text
0,es-S1887-85712013000200013-1,SINTOMA,750,755,dolor
1,es-S1887-85712013000200013-1,SINTOMA,995,1031,amnesia del procedimiento quirúrgico
2,es-S1579-699X2004000400002-1,SINTOMA,637,685,niveles elevados de los metabolitos del estireno
3,es-S1579-699X2004000400002-1,SINTOMA,1106,1113,vértigo
4,es-S1579-699X2004000400002-1,SINTOMA,1115,1124,temblores
...,...,...,...,...,...
1792,es-S1139-76322009000700014-1,SINTOMA,1936,1970,condensación residual en LSD y LMD
1793,es-S1139-76322009000700014-1,SINTOMA,2020,2077,exudado faringoamigdalar: flora normal en cult...
1794,es-S1139-76322009000700014-1,SINTOMA,2261,2320,aumento de densidad en LSD de distribución par...
1795,es-S1139-76322009000700014-1,SINTOMA,2332,2362,desplazamiento de cisura menor


In [12]:
# Group the predicted spans by document.
eval_preds = eval_preds.sort_values(by="filename")
eval_preds

Unnamed: 0,filename,label,start_span,end_span,text
919,es-S1135-76062011000100006-1,SINTOMA,139,146,fallece
645,es-S1135-76062011000200009-1,SINTOMA,719,730,falleciendo
335,es-S1135-76062012000200006-1,SINTOMA,1049,1055,disnea
330,es-S1135-76062012000200006-1,SINTOMA,543,556,fallecimiento
331,es-S1135-76062012000200006-1,SINTOMA,784,801,debilidad general
...,...,...,...,...,...
1505,es-S2254-28842014000300010-1,SINTOMA,1074,1124,control bacteriológico del monitor que es nega...
1504,es-S2254-28842014000300010-1,SINTOMA,1055,1061,fiebre
1503,es-S2254-28842014000300010-1,SINTOMA,1036,1052,malestar general
1509,es-S2254-28842014000300010-1,SINTOMA,1511,1539,deterioro del estado general


Select from the gold standard the annotations of the corresponding eval documents

In [13]:
# Read the annotation table, keep only rows whose document is in the
# evaluation list, and order them by document.
gs = (
    pd.read_csv(
        "../data/symptemist_train/subtask1-ner/tsv/symptemist_tsv_train_subtask1.tsv",
        sep="\t",
    )
    .loc[lambda frame: frame["filename"].isin(eval_list)]
    .sort_values(by="filename")
)
gs

Unnamed: 0,filename,ann_id,label,start_span,end_span,text
7143,es-S1135-76062011000100006-1,T1,SINTOMA,139,146,fallece
5232,es-S1135-76062011000200009-1,T1,SINTOMA,719,730,falleciendo
6379,es-S1135-76062012000200006-1,T3,SINTOMA,784,801,debilidad general
6380,es-S1135-76062012000200006-1,T4,SINTOMA,930,937,cefalea
6378,es-S1135-76062012000200006-1,T2,SINTOMA,1460,1492,edema de extremidades inferiores
...,...,...,...,...,...,...
1985,es-S2254-28842014000300010-1,T9,SINTOMA,1074,1124,control bacteriológico del monitor que es nega...
1984,es-S2254-28842014000300010-1,T8,SINTOMA,2247,2254,afebril
1983,es-S2254-28842014000300010-1,T7,SINTOMA,1720,1760,microbiológicos positivos para S. aureus
1982,es-S2254-28842014000300010-1,T6,SINTOMA,2218,2223,dolor


Reformat and store the gold standard and the predictions so that the evaluation library can be run on them.

In [15]:
# Column renaming for the predictions is currently disabled:
#eval_preds.rename(columns={"start_span":"off0", "end_span":"off1", "text":"span"}, inplace=True)
# Write the predictions as a tab-separated file without the index column.
eval_preds.to_csv(path_or_buf="../eval_results.tsv", sep="\t", index=False)

# Renaming and export of the gold standard are currently disabled:
#gs.rename(columns={"ann_id": "mark", "start_span":"off0", "end_span":"off1", "text":"span"}, inplace=True)
#gs.to_csv("../golden_standard.tsv", sep="\t", index=False)

Results BEST checkpoint: Prec .729, Rec .741, F1 .734

Results LAST checkpoint: Prec .753, Rec .725, F1 .738

# Passing context-level info to SpanMarkerModel.predict()

In [75]:
def text2spanmarker(ds) -> Dataset:
    """Split every document into sentences, producing one row per sentence.

    Args:
        ds: Iterable of rows exposing the keys ``document_id`` and ``text``.

    Returns:
        A ``Dataset`` with the columns:
        ``filename`` (the source row's ``document_id``),
        ``document_id`` (0-based position of the document within ``ds``),
        ``sentence_id`` (0-based position of the sentence within its document),
        ``sentence_start`` (character offset of the sentence in the document
        text) and ``tokens`` (the sentence text as a single string).
    """
    nlp = spacy.load("es_core_news_sm")
    output = {
        "filename": [],
        "document_id": [],
        "sentence_id": [],
        "sentence_start": [],
        "tokens": [],
    }

    # Wrap the dataset itself rather than the enumerate object: enumerate has
    # no length, so tqdm could not display a total. tqdm appends its own ": "
    # to the description, hence no trailing colon here.
    for doc_id, row in enumerate(tqdm.tqdm(ds, desc="Document progress")):
        doc = nlp(row["text"])

        for sentence_id, sentence in enumerate(doc.sents):
            output["filename"].append(row["document_id"])
            output["document_id"].append(doc_id)
            output["sentence_id"].append(sentence_id)
            output["sentence_start"].append(sentence.start_char)
            output["tokens"].append(sentence.text)

    return Dataset.from_dict(output)

# Sentence-level view of the evaluation documents; preview the first rows.
sm = text2spanmarker(ds=ds_eval)
sm.to_pandas().head(n=5)

Document progress:: 144it [00:04, 31.37it/s]


Unnamed: 0,filename,document_id,sentence_id,sentence_start,tokens
0,es-S1135-76062011000100006-1,0,0,0,Hombre de 27 años que durante una persecución ...
1,es-S1135-76062011000100006-1,0,1,104,Trasladado a un centro asistencial fallece a l...
2,es-S1135-76062011000100006-1,0,2,204,Entre los hallazgos autópsicos se constatan do...
3,es-S1135-76062011000100006-1,0,3,402,"El orificio de entrada cutáneo, sin tatuaje ni..."
4,es-S1135-76062011000100006-1,0,4,513,"Rebatido el cuero cabelludo, se observa orific..."


In [80]:
def infer_ds(span_marker_ds: Dataset, checkpoint, device = int(os.environ["CUDA_VISIBLE_DEVICES"])):
    """Predict entity spans for a whole sentence-level dataset in one call.

    The complete dataset is handed to ``SpanMarkerModel.predict``. The
    sentence-relative character indices of each prediction are then shifted
    by the row's ``sentence_start`` so that they refer to the full document.

    Args:
        span_marker_ds: Sentence-level ``Dataset``. This function reads the
            ``filename`` and ``sentence_start`` columns itself; the remaining
            columns are consumed by ``predict`` (presumably ``tokens``,
            ``document_id`` and ``sentence_id`` — confirm against the
            SpanMarker documentation).
        checkpoint: Location handed to ``SpanMarkerModel.from_pretrained``.
        device: CUDA device index handed to ``.cuda()``. The default is read
            from ``CUDA_VISIBLE_DEVICES`` once, when this function is defined.
            NOTE(review): when the device mask is in effect the visible GPUs
            are usually renumbered from 0, so the physical id stored in the
            variable may not be a valid index here — confirm.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``, ``label``
        (always ``"SINTOMA"``), ``start_span``, ``end_span`` and ``text``.
    """
    pp = SpanMarkerModel.from_pretrained(checkpoint)
    pp.cuda(device)

    # One list of predictions per input row, in input order.
    preds = pp.predict(span_marker_ds, batch_size = 16, show_progress_bar = True)

    output = {
        "filename": [],
        "label": [],
        "start_span": [],
        "end_span": [],
        "text": [],
    }

    for row, sentence_preds in zip(span_marker_ds, preds):
        # Sentences without predictions simply contribute no rows.
        for pred in sentence_preds:
            output["filename"].append(row["filename"])
            output["label"].append("SINTOMA")
            output["start_span"].append(row["sentence_start"] + pred["char_start_index"])
            output["end_span"].append(row["sentence_start"] + pred["char_end_index"])
            output["text"].append(pred["span"])

    return pd.DataFrame.from_dict(output)

In [86]:
# Dataset-level inference over the evaluation sentences.
df = infer_ds(span_marker_ds=sm, checkpoint="../checkpoints/v2/checkpoint-5040")
df

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 52002. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Adding document-level context:   0%|          | 0/2483 [00:00<?, ?it/s]

Spreading data between multiple samples:   0%|          | 0/2483 [00:00<?, ? examples/s]

  0%|          | 0/403 [00:00<?, ?it/s]

Unnamed: 0,filename,label,start_span,end_span,text
0,es-S1135-76062011000100006-1,SINTOMA,139,146,fallece
1,es-S1135-76062011000100006-1,SINTOMA,904,949,orificio de salida cutáneo en región parietal
2,es-S1135-76062011000200009-1,SINTOMA,719,730,falleciendo
3,es-S1135-76062012000200006-1,SINTOMA,543,556,fallecimiento
4,es-S1135-76062012000200006-1,SINTOMA,583,598,cuadro catarral
...,...,...,...,...,...
1906,es-S2254-28842014000300010-1,SINTOMA,1709,1760,resultados microbiológicos positivos para S. a...
1907,es-S2254-28842014000300010-1,SINTOMA,2218,2223,dolor
1908,es-S2254-28842014000300010-1,SINTOMA,2247,2254,afebril
1909,es-S2254-28842014000300010-1,SINTOMA,2337,2367,negativización de hemocultivos


In [87]:
# Rename the span columns before export. Reassigning instead of using
# inplace=True leaves the previously displayed frame object untouched.
df = df.rename(columns={"start_span":"off0", "end_span":"off1", "text":"span"})
df.to_csv("../eval_results_withcontext_best.tsv", sep="\t", index=False)

Last results with context Prec .752 Rec .723 F1 .737

Best results with context Prec .728 Rec .745 F1 .737