In [1]:
import re
from datasets import Dataset, load_dataset
import datasets
import pandas as pd
import numpy as np
from pathlib import Path
from span_marker import SpanMarkerModel
import spacy
import os
import tqdm
from xmen.evaluation import error_analysis

In [2]:
# Restrict the CUDA devices visible to this process to physical GPU 0.
# NOTE(review): this only takes effect if it runs before CUDA is initialised — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

Load all texts

In [3]:
def load_texts(text_path, tsv_filter = None, sort_keys = False):
    """
    Read every ``*.txt`` file in a directory into a two-column Dataset.

    Parameters
    ----------
    text_path : str or Path
        Directory that is globbed (non-recursively) for ``*.txt`` files.
    tsv_filter : str or Path, optional
        Path to a tab-separated file with a ``filename`` column. When given,
        only text files whose stem appears in that column are loaded. Any
        falsy value (``None``, ``""``) disables filtering.
    sort_keys : bool, default False
        When True, files are ordered by the integer that follows the last
        ``_`` in the file stem.

    Returns
    -------
    Dataset
        Columns ``document_id`` (file stem) and ``text`` (UTF-8 file content).

    Raises
    ------
    ValueError
        If ``sort_keys`` is True and a file stem does not end in ``_<int>``.
    """
    text_path = Path(text_path)

    # A set gives direct membership tests; None means "keep every file".
    allowed_stems = None
    if tsv_filter:
        allowed_stems = set(pd.read_csv(tsv_filter, sep='\t').filename.unique())

    files = list(text_path.glob('*.txt'))
    if sort_keys:
        files = sorted(files, key=lambda k: int(k.stem.split('_')[-1]))

    doc_ids = []
    texts = []
    for f in files:
        if allowed_stems is None or f.stem in allowed_stems:
            doc_ids.append(f.stem)
            # read_text opens and closes the file, so no handle is leaked
            texts.append(f.read_text(encoding='utf-8'))

    return Dataset.from_dict({
        'document_id' : doc_ids,
        'text' : texts
    })
    
# Directory that is scanned for *.txt files; no TSV filter and no key sorting are requested here.
text_path = "../data/symptemist_train/subtask1-ner/txt"
texts = load_texts(text_path)
# Bare expression so the notebook displays the Dataset summary
texts

Dataset({
    features: ['document_id', 'text'],
    num_rows: 750
})

For the evaluation, keep only the texts that belonged to the eval split used during SpanMarker training

In [4]:
# Load the BigBIO corpus through its local loader script and order it by document id
bigbio = load_dataset(
    path="../../biomedical/bigbio/hub/hub_repos/symptemist/symptemist.py",
    name="symptemist_entities_bigbio_kb",
).sort("document_id")

# Gold standard: the documents at sorted positions 600..743 of the train split
bigbio_gs = bigbio["train"].select(range(600,744))
eval_list = bigbio_gs["document_id"]

# Keep only the raw texts of those documents, ordered by document id as well
texts_eval = texts.filter(lambda e: e["document_id"] in eval_list).sort("document_id")
texts_eval

Found cached dataset symptemist (/home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached sorted indices for dataset at /home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a/cache-a1bc75b8e00708bf.arrow


Filter:   0%|          | 0/750 [00:00<?, ? examples/s]

Dataset({
    features: ['document_id', 'text'],
    num_rows: 144
})

Transform the loaded texts into a sentence-level Dataset with document-level context info that can be passed to `SpanMarkerModel.predict()`

In [5]:
def text2spanmarker(ds) -> Dataset:
    """
    Parses the read texts into a sentence-level format with doc-level context
    useful for SpanMarkerModel.

    Parameters
    ----------
    ds : iterable of rows
        Each row must provide ``"document_id"`` and ``"text"``.

    Returns
    -------
    Dataset
        One row per spaCy sentence with the columns:
        ``filename`` (the row's original ``document_id``),
        ``document_id`` (0-based position of the document in ``ds``),
        ``sentence_id`` (0-based position of the sentence in its document),
        ``sentence_start`` (character offset of the sentence in the document text),
        ``tokens`` (the sentence text as a single string).
    """
    nlp = spacy.load("es_core_news_sm")
    output = {
        "filename": [],
        "document_id": [],
        "sentence_id": [],
        "sentence_start": [],
        "tokens": [],
    }

    # tqdm wraps the dataset itself (not the enumerate generator) so it can read
    # its length and show a total; tqdm adds its own ":" after the description.
    for doc_id, row in enumerate(tqdm.tqdm(ds, desc="Document progress")):
        doc = nlp(row["text"])

        for sentence_id, sentence in enumerate(doc.sents):
            output["filename"].append(row["document_id"])
            output["document_id"].append(doc_id)
            output["sentence_id"].append(sentence_id)
            # start_char is kept so sentence-relative spans can be mapped back to document offsets
            output["sentence_start"].append(sentence.start_char)
            output["tokens"].append(sentence.text)

    return Dataset.from_dict(output)

# Sentence-split the evaluation texts and preview the first rows as a DataFrame
span_marker_input = text2spanmarker(texts_eval)
span_marker_input.to_pandas().head()

Document progress:: 144it [00:11, 12.77it/s]


Unnamed: 0,filename,document_id,sentence_id,sentence_start,tokens
0,es-S1135-76062011000100006-1,0,0,0,Hombre de 27 años que durante una persecución ...
1,es-S1135-76062011000100006-1,0,1,104,Trasladado a un centro asistencial fallece a l...
2,es-S1135-76062011000100006-1,0,2,204,Entre los hallazgos autópsicos se constatan do...
3,es-S1135-76062011000100006-1,0,3,402,"El orificio de entrada cutáneo, sin tatuaje ni..."
4,es-S1135-76062011000100006-1,0,4,513,"Rebatido el cuero cabelludo, se observa orific..."


Load the precomputed SpanMarker predictions

In [6]:
# SECURITY: pickle.load can execute arbitrary code — only load files you created or trust.
# NOTE(review): preds.pkl presumably holds one list of predicted entities per row of
# span_marker_input, in the same order — confirm against the code that wrote it.
import pickle
with open("preds.pkl", 'rb') as file:
    span_marker_preds = pickle.load(file)

Transform the predictions back into BigBIO format to run `xmen::error_analysis`

In [7]:
def infer2bigbio(span_marker_ds: Dataset, checkpoint, preds = None, device = None):
    """
    Gets a dataset with doc-level info, makes predictions on it (unless they are
    supplied) and maps them to the BigBIO format for xmen::error_analysis.

    Parameters
    ----------
    span_marker_ds : Dataset
        Sentence-level rows; each must provide ``"filename"`` (used as the
        output document id) and ``"sentence_start"`` (character offset of the
        sentence inside its document).
    checkpoint : str
        Passed to ``SpanMarkerModel.from_pretrained``. Only used when ``preds``
        is None.
    preds : sequence, optional
        Precomputed predictions, one iterable of entity dicts per row of
        ``span_marker_ds``. Each entity dict must provide ``"label"``,
        ``"span"``, ``"char_start_index"`` and ``"char_end_index"``.
    device : int, optional
        CUDA device index handed to ``model.cuda()``. Defaults to 0, i.e. the
        first *visible* device: when CUDA_VISIBLE_DEVICES is set, the visible
        devices are renumbered from 0, so the environment value itself is not
        a valid index in general.

    Returns
    -------
    Dataset
        One row per document with ``document_id`` and ``entities``. Entity
        offsets are document-level (sentence offset + in-sentence offset).
        Documents without any prediction get a single ``DUMMY`` entity so that
        the error analysis still receives an entry for them.

    Raises
    ------
    ValueError
        If ``preds`` has a length that differs from ``span_marker_ds``;
        zipping them would otherwise silently drop or misalign rows.
    """
    if preds is None:
        if device is None:
            device = 0
        model = SpanMarkerModel.from_pretrained(checkpoint)
        model.cuda(device)
        preds = model.predict(span_marker_ds, batch_size = 16, show_progress_bar = True)

    if hasattr(preds, "__len__") and len(preds) != len(span_marker_ds):
        raise ValueError(
            f"Got {len(preds)} prediction lists for {len(span_marker_ds)} sentences; "
            "predictions must be aligned one-to-one with span_marker_ds"
        )

    # document id -> list of BigBIO-style entity dicts, in sentence order
    ents = {}
    for source, predicted_entities in zip(span_marker_ds, preds):
        doc_id = source["filename"]
        # Register the document even when this sentence has no predictions
        doc_ents = ents.setdefault(doc_id, [])
        sentence_start = source["sentence_start"]

        for pred in predicted_entities:
            # Shift sentence-relative character indices to document-level offsets
            off0 = sentence_start + pred["char_start_index"]
            off1 = sentence_start + pred["char_end_index"]
            doc_ents.append({
                "id": f"{doc_id}_{len(doc_ents)}",
                "type": pred["label"],
                "text": [pred["span"]],
                "offsets": [[off0,off1]],
                "normalized": [],
            })

    out = {"document_id": [], "entities": []}
    for doc_id, doc_ents in ents.items():
        out["document_id"].append(doc_id)
        if doc_ents:
            out["entities"].append(doc_ents)
        else:
            print(f"There are no predicted entities for doc {doc_id}, appending 1 dummy for error analysis")
            out["entities"].append([{
                'id': f'{doc_id}_dummy',
                'normalized': [],
                'offsets': [[0, 1]],
                'text': ['d'],
                'type': 'DUMMY'
            }])

    return Dataset.from_dict(out)

# Predictions are supplied, so no model is loaded and the empty checkpoint string is never used
bigbio_preds = infer2bigbio(span_marker_input, "", span_marker_preds)
bigbio_preds

There are no predicted entities for doc es-S1135-76062011000200009-1, appending 1 dummy for error analysis
There are no predicted entities for doc es-S1698-69462006000400002-1, appending 1 dummy for error analysis


Dataset({
    features: ['document_id', 'entities'],
    num_rows: 144
})

Most docs work, like number 2:

In [8]:
# Position of the document to inspect (same index is used for gold standard and predictions)
i = 2
for true_ent in bigbio_gs[i]["entities"]:
    print(true_ent)

print("\n----------GROUND TRUTH ABOVE, PREDICTION BELOW----------------\n")

for pred_ent in bigbio_preds[i]["entities"]:
    print(pred_ent)

# Run the error analysis on the documents at positions 0 and i only
error_analysis(bigbio_gs.select([0,i]),bigbio_preds.select([0,i]))

{'id': '508_es-S1135-76062012000200006-1_6377_entity_id', 'type': 'SINTOMA', 'text': ['disnea'], 'offsets': [[1049, 1055]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6378_entity_id', 'type': 'SINTOMA', 'text': ['edema de extremidades inferiores'], 'offsets': [[1460, 1492]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6379_entity_id', 'type': 'SINTOMA', 'text': ['debilidad general'], 'offsets': [[784, 801]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6380_entity_id', 'type': 'SINTOMA', 'text': ['cefalea'], 'offsets': [[930, 937]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6381_entity_id', 'type': 'SINTOMA', 'text': ['focalidad neurológica'], 'offsets': [[960, 981]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6382_entity_id', 'type': 'SINTOMA', 'text': ['cuadro respiratorio'], 'offsets': [[663, 682]], 'normalized': []}
{'id': '508_es-S1135-76062012000200006-1_6383_entity_id', 'type': 'SINTOMA', 'text': ['est

Unnamed: 0,_word_len,_abbrev,pred_start,pred_end,pred_text,gt_start,gt_end,gt_text,entity_match_type,gold_concept,gold_type,pred_index,pred_index_score,pred_top,pred_top_score,document_id
0,1,False,139,146,[fallece],139,146,[fallece],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062011000100006-1
1,1,False,543,556,[fallecimiento],543,556,[fallecimiento],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
2,2,False,702,707,[morir],663,682,[cuadro respiratorio],fn,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
3,2,False,702,707,[morir],784,801,[debilidad general],fp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
4,2,False,784,801,[debilidad general],784,801,[debilidad general],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
5,1,False,930,937,[cefalea],930,937,[cefalea],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
6,2,False,960,981,[focalidad neurológica],960,981,[focalidad neurológica],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
7,4,False,995,1000,[murió],995,1016,[murió de forma súbita],be,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
8,1,False,1049,1055,[disnea],1090,1096,[muerte],fp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1
9,1,False,1090,1096,[muerte],1090,1096,[muerte],tp,{'db_id': 'NIL'},SINTOMA,0,,NIL,,es-S1135-76062012000200006-1


Most docs work (like number 2 above), but some don't, like number 3, where `error_analysis` raises a `TypeError`:

In [9]:
# Position of the document to inspect (same index is used for gold standard and predictions)
i = 3
for true_ent in bigbio_gs[i]["entities"]:
    print(true_ent)

print("\n----------GROUND TRUTH ABOVE, PREDICTION BELOW----------------\n")

for pred_ent in bigbio_preds[i]["entities"]:
    print(pred_ent)

# Run the error analysis on the documents at positions 0 and i only.
# NOTE(review): the captured output of this cell ends in
# "TypeError: can only join an iterable"; the cause is not visible here — investigate.
error_analysis(bigbio_gs.select([0,i]),bigbio_preds.select([0,i]))

{'id': '336_es-S1135-76062012000300014-1_4097_entity_id', 'type': 'SINTOMA', 'text': ['encéfalo con notable aumento de peso'], 'offsets': [[535, 571]], 'normalized': []}
{'id': '336_es-S1135-76062012000300014-1_4098_entity_id', 'type': 'SINTOMA', 'text': ['sedimento urinario se observó crecimiento de Proteus vulgaris'], 'offsets': [[1595, 1656]], 'normalized': []}
{'id': '336_es-S1135-76062012000300014-1_4099_entity_id', 'type': 'SINTOMA', 'text': ['orina con un llamativo color púrpura'], 'offsets': [[477, 513]], 'normalized': []}
{'id': '336_es-S1135-76062012000300014-1_4100_entity_id', 'type': 'SINTOMA', 'text': ['vejiga urinaria se cuantificaron 400 cc de orina turbia'], 'offsets': [[1284, 1339]], 'normalized': []}
{'id': '336_es-S1135-76062012000300014-1_4101_entity_id', 'type': 'SINTOMA', 'text': ['Análisis toxicológico negativo en muestras de sangre, orina y humor vítreo'], 'offsets': [[1474, 1548]], 'normalized': []}
{'id': '336_es-S1135-76062012000300014-1_4102_entity_id', 'typ

TypeError: can only join an iterable