In [3]:
import re
from datasets import Dataset, load_dataset
import datasets
import pandas as pd
import numpy as np
from pathlib import Path
from span_marker import SpanMarkerModel
import spacy
import os
import tqdm
from xmen.evaluation import error_analysis

In [5]:
# Root directory of the SympTEMIST data release; the training and background-set
# text folders loaded below are resolved relative to this path.
base_path = Path('../data/symptemist-train_all_subtasks+gazetteer+multilingual+test_all_subtasks+bg_231006/')

# Load Raw txt Files

In [6]:
def load_texts(text_path, tsv_filter = None, sort_keys = False):
    """
    Read every ``*.txt`` file of a directory into a two-column Dataset.

    Args:
        text_path: Directory containing the ``.txt`` files (not searched recursively).
        tsv_filter: Optional path to a tab-separated file with a ``filename`` column.
            When given, only files whose stem occurs in that column are loaded.
            ``None`` (or any other falsy value) disables filtering.
        sort_keys: When True, order the files by the integer that follows the last
            ``_`` in the file stem. Stems without such a numeric suffix are placed
            after the numeric ones, ordered by name, instead of raising.

    Returns:
        Dataset with the columns ``document_id`` (file stem) and ``text``
        (file content decoded as UTF-8).
    """
    text_path = Path(text_path)

    # A set gives O(1) membership tests per file; None means "keep every file".
    allowed_ids = None
    if tsv_filter:
        allowed_ids = set(pd.read_csv(tsv_filter, sep='\t').filename.unique())

    def _suffix_sort_key(path):
        # Numeric suffixes sort first and numerically; everything else falls back
        # to the stem so that a non-numeric file name does not abort the load.
        try:
            return (0, int(path.stem.split('_')[-1]), path.stem)
        except ValueError:
            return (1, 0, path.stem)

    files = list(text_path.glob('*.txt'))
    if sort_keys:
        files = sorted(files, key=_suffix_sort_key)

    doc_ids = []
    texts = []
    for f in files:
        if allowed_ids is None or f.stem in allowed_ids:
            doc_ids.append(f.stem)
            # read_text opens and closes the file, so no handle is leaked.
            texts.append(f.read_text(encoding='utf-8'))
    return Dataset.from_dict({
        'document_id' : doc_ids,
        'text' : texts
    })
    
# Load the raw training texts of the NER subtask (no TSV filter, files in glob order).
texts = load_texts(base_path / "symptemist_train/subtask1-ner/txt")
texts

Dataset({
    features: ['document_id', 'text'],
    num_rows: 750
})

# Load BigBIO Dataset with Ground-Truth Labels


In [7]:
bigbio = load_dataset(
    path="../../biomedical/bigbio/hub/hub_repos/symptemist/symptemist.py",
    name="symptemist_entities_bigbio_kb"
)
# Sort so that the positional slice below always selects the same documents.
bigbio = bigbio.sort("document_id")

# get the docs used for eval: positions 600-743 of the sorted training split
bigbio_gs = bigbio["train"].select(range(600,744))
eval_list = bigbio_gs["document_id"]

# Set lookup instead of scanning the id list once per row.
eval_ids = set(eval_list)
texts_eval = texts.filter(lambda e: e["document_id"] in eval_ids)
# Every gold document needs its raw text, otherwise gold and predictions would
# cover different documents and the evaluation below would be skewed.
assert len(texts_eval) == len(eval_ids), "missing raw text for some evaluation documents"
texts_eval = texts_eval.sort("document_id")
texts_eval

Filter: 100%|██████████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 129640.55 examples/s]


Dataset({
    features: ['document_id', 'text'],
    num_rows: 144
})

Transform the loaded texts into a sentence-level Dataset with document-level context information that can be passed to `SpanMarkerModel.predict()`.

In [8]:
def text2spanmarker(ds) -> Dataset:
    """
    Split the loaded texts into sentences, keeping the doc-level context that is
    useful for SpanMarkerModel.

    Args:
        ds: Dataset with a ``document_id`` and a ``text`` column.

    Returns:
        Dataset with one row per sentence and the columns
        ``filename`` (the document's original ``document_id``),
        ``document_id`` (running index of the document within ``ds``),
        ``sentence_id`` (index of the sentence within its document),
        ``sentence_start`` (character offset of the sentence in the document text)
        and ``tokens`` (the sentence text as a single string).
    """

    nlp = spacy.load("es_core_news_sm")
    output = {
        "filename": [],
        "document_id": [],
        "sentence_id": [],
        "sentence_start": [],
        "tokens": [],
    }

    # Only the id column is needed per document, so read it once instead of
    # materialising every full row (including its text) while iterating.
    filenames = ds["document_id"]
    docs = nlp.pipe(ds["text"], disable=["ner"])

    # tqdm appends its own ": " to the description, hence no trailing colon here.
    progress = tqdm.tqdm(enumerate(zip(filenames, docs)), desc="Document progress", total=len(ds))
    for doc_id, (filename, doc) in progress:
        for sentence_id, sentence in enumerate(doc.sents):
            output["filename"].append(filename)
            output["document_id"].append(doc_id)
            output["sentence_id"].append(sentence_id)
            output["sentence_start"].append(sentence.start_char)
            output["tokens"].append(sentence.text)

    return datasets.Dataset.from_dict(output)

# Sentence-split the evaluation texts. "document_id" in the result is the integer
# document index assigned by text2spanmarker, so the sort orders rows by document.
span_marker_input = text2spanmarker(texts_eval).sort("document_id")
# Preview the first rows as a DataFrame.
span_marker_input.to_pandas().head()

span-marker is already registered. Overwriting pipeline for task span-marker...
Document progress:: 100%|█████████████████████████████████████████████████████████████████████████████████| 144/144 [00:02<00:00, 56.21it/s]


Unnamed: 0,filename,document_id,sentence_id,sentence_start,tokens
0,es-S1135-76062011000100006-1,0,0,0,Hombre de 27 años que durante una persecución ...
1,es-S1135-76062011000100006-1,0,1,104,Trasladado a un centro asistencial fallece a l...
2,es-S1135-76062011000100006-1,0,2,204,Entre los hallazgos autópsicos se constatan do...
3,es-S1135-76062011000100006-1,0,3,402,"El orificio de entrada cutáneo, sin tatuaje ni..."
4,es-S1135-76062011000100006-1,0,4,513,"Rebatido el cuero cabelludo, se observa orific..."


In [9]:
# Load the SpanMarker checkpoint from disk.
model = SpanMarkerModel.from_pretrained('../checkpoints/v2/checkpoint-24390')
# Move the model to CUDA device 0. As the last expression of the cell, the
# return value is displayed, which prints the full module structure.
model.cuda(0)

SpanMarkerModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52008, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [None]:
# Run NER over the sentence-level dataset. `preds` is used below as one list of
# predicted spans per input row.
preds = model.predict(span_marker_input, batch_size = 16, show_progress_bar = True)

Spreading data between multiple samples: 100%|█████████████████████████████████████████████████| 2269/2269 [00:00<00:00, 2585.90 examples/s]
 80%|████████████████████████████████████████████████████████████████████████████████▌                    | 312/391 [00:55<00:13,  5.78it/s]

### Evaluation

Transform back into BigBIO format to use xMEN evaluation and error analysis

In [None]:
from xmen.data import from_spans

# Convert the sentence-level predictions back into BigBIO format, passing the
# sentence texts, source file names and sentence start offsets alongside them.
pred_bigbio_gs = from_spans(
    preds,
    span_marker_input['tokens'],
    span_marker_input['filename'],
    span_marker_input['sentence_start'],
)

In [None]:
from xmen.evaluation import error_analysis, evaluate

In [None]:
# Score the predictions against the gold-standard annotations.
# NOTE(review): ner_only=True presumably restricts scoring to span detection
# (no concept linking) - confirm against the xMEN documentation.
evaluate(bigbio_gs, pred_bigbio_gs, ner_only=True)

In [None]:
# Build the error-analysis table comparing gold and predicted annotations;
# its `ner_match_type` column is inspected in the following cells.
ea_df = error_analysis(bigbio_gs, pred_bigbio_gs)

In [None]:
# Number of rows per NER match type; the 'tp' and 'be' counts are reused below
# to recover precision and recall.
ea_df.ner_match_type.value_counts()

In [None]:
# Recover strict precision and recall from error analysis
# .get(..., 0) instead of ['tp'] so that a run without any exact match yields
# 0.0 scores instead of raising a KeyError.
tp_count_strict = ea_df.ner_match_type.value_counts().get('tp', 0)
(
    # precision: exact matches / number of predicted spans
    tp_count_strict / sum(len(p) for p in preds),
    # recall: exact matches / number of gold-standard entities
    tp_count_strict / sum(len(d) for d in bigbio_gs['entities']),
)

In [None]:
# Recover relaxed precision and recall from error analysis
# Relaxed matching also counts the 'be' rows as hits. .get(..., 0) keeps the
# cell working when one of the two match types does not occur at all.
tp_count_relaxed = (
    ea_df.ner_match_type.value_counts().get('tp', 0)
    + ea_df.ner_match_type.value_counts().get('be', 0)
)
(
    # precision: relaxed matches / number of predicted spans
    tp_count_relaxed / sum(len(p) for p in preds),
    # recall: relaxed matches / number of gold-standard entities
    tp_count_relaxed / sum(len(d) for d in bigbio_gs['entities']),
)

### Get Predictions on Background Set

In [None]:
def infer_ds(span_marker_ds: "Dataset", preds, label: str = "SINTOMA"):
    """
    Flatten sentence-level predictions into one row per predicted span, with
    character offsets relative to the whole document.

    Args:
        span_marker_ds: Sentence-level rows; each row must provide ``filename``
            and ``sentence_start``.
        preds: One list of predictions per row of ``span_marker_ds``. Each
            prediction is a mapping with ``span``, ``char_start_index`` and
            ``char_end_index``; the two indices are relative to the sentence.
        label: Entity label written to every output row.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``label``,
        ``start_span``, ``end_span`` and ``text``. Rows without predictions
        contribute nothing.

    Raises:
        ValueError: If ``span_marker_ds`` and ``preds`` differ in length.
    """
    if len(span_marker_ds) != len(preds):
        raise ValueError(
            f"got {len(preds)} prediction lists for {len(span_marker_ds)} sentences"
        )

    output = {
        "filename": [],
        "label": [],
        "start_span": [],
        "end_span": [],
        "text": [],
    }

    for row, row_preds in zip(span_marker_ds, preds):
        # Shift sentence-relative offsets to document-level offsets.
        offset = row["sentence_start"]
        for pred in row_preds:
            output["filename"].append(row["filename"])
            output["label"].append(label)
            output["start_span"].append(offset + pred["char_start_index"])
            output["end_span"].append(offset + pred["char_end_index"])
            output["text"].append(pred["span"])

    return pd.DataFrame.from_dict(output)

In [None]:
# Load all raw texts of the background set (no TSV filter, files in glob order).
background_text = load_texts(base_path / "symptemist_background-set/all_txt")

In [None]:
# Sentence-split the background set; the sort orders rows by the integer
# document index that text2spanmarker assigns.
background_spanmarker = text2spanmarker(background_text).sort("document_id")

In [None]:
# Persist the sentence-level background set to disk.
# NOTE(review): presumably this is the input read by ../src/run_ner.py - confirm.
background_spanmarker.save_to_disk('output/background_spanmarker')

Inference on the full background set takes a couple of days, so run it outside the notebook with [../src/run_ner.py](../src/run_ner.py) instead of executing the cell below.

In [None]:
# preds_background = model.predict(background_spanmarker, batch_size = 128, show_progress_bar = True)