In [1]:
import sys
sys.path.append('../scripts')

import os
# Disable weights and biases (if installed)
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
from pathlib import Path
import pandas as pd

import transformers
import datasets
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline, DataCollatorForTokenClassification, EarlyStoppingCallback, trainer_utils

In [3]:
import huggingface_utils
from run_ner_training import get_train_args

from xl_bel.el_datasets import load_distemist_entities
from xl_bel.evaluation import evaluate

import distemist_util

In [4]:
ds_entities_train = load_distemist_entities()["train"]

Reusing dataset dis_temist (/home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01)


  0%|          | 0/2 [00:00<?, ?it/s]

# Run training prediction

### Post-processing function (cleaning up entity boundaries)

In [5]:
import re

def clean_up(entities):
    entities = entities.copy()
    result = {k: [] for k in entities.keys()}
    
    def append(i):
        for k in result.keys():
            result[k].append(entities[k][i])
    
    for i, t in enumerate(entities['text']):
        start = entities['spans_start'][i][0]
        end = entities['spans_end'][i][-1]
        if len(t) == 1:
            continue
        append(i)
        m = re.match(r'([\w\s]*)\W+$', t)
        if m:
            result['text'][-1] = m.group(1)
            result['spans_end'][-1] = [end - len(t) + len(m.group(1))]
            t = m.group(1)
            end = entities['spans_end'][i][-1]
        m = re.match(r'^([^\n]+)\n+.*', t)
        if m:
            print(m)
            result['text'][-1] = m.group(1)
            result['spans_end'][-1] = [end - len(t) + len(m.group(1))]
            t = m.group(1)
            end = entities['spans_end'][i][-1]
    return result

In [6]:
from transformers.pipelines.token_classification import AggregationStrategy

def run_ner_pipeline(checkpoint_path, post_process, predict_train=True, predict_test=True, agg_strategy=AggregationStrategy.FIRST):
    metrics = {}
    
    # Own validation set (20% of distemist linking)
    with open('../test_docs.txt', 'r') as fh:
        test_ids = [l.strip() for l in fh.readlines()]
    
    print('Loading NER model checkpoint')
    model_checkpoint = Path(checkpoint_path)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
    assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
    
    pipe = pipeline('ner', model=model, tokenizer=tokenizer, device=0, aggregation_strategy=agg_strategy)
    
    all_training_texts = distemist_util.load_distemist_texts('../data/distemist/training/text_files', sort_keys=False)
           
    def add_metrics(key, eval):
        _keys = ['precision', 'recall', 'fscore']
        metrics[key + '_strict'] = {k:v for k,v in eval['ner_strict'].items() if k in _keys}
        metrics[key + '_loose'] = {k:v for k,v in eval['ner_partial'].items() if k in _keys}
        
    if predict_train:
        X = all_training_texts.filter(lambda r: r['document_id'] not in test_ids)

        print('Running NER model on training data')
        all_training_pred = distemist_util.run_ner_pipeline(X, pipe)

        eval_train = evaluate(ds_entities_train.filter(lambda r: r['document_id'] not in test_ids), all_training_pred, ner_only=True)
        add_metrics('train', eval_train)
    
        out_file = Path('../results') / 'x-entities_training.tsv'
        distemist_util.write_dataset_to_tsv(all_training_pred, out_file, False)
    
    X_valid = all_training_texts.filter(lambda r: r['document_id'] in test_ids)

    print('Running NER model on validation data')
    validation_pred = distemist_util.run_ner_pipeline(X_valid, pipe)

    eval_valid = evaluate(ds_entities_train.filter(lambda r: r['document_id'] in test_ids), validation_pred, ner_only=True)
    add_metrics('valid', eval_valid)
    
    out_file = Path('../results') / 'x-entities_valid.tsv'
    distemist_util.write_dataset_to_tsv(validation_pred, out_file, False)
    
    if post_process:
        print('Cleaning up')
        valid_clean = validation_pred.map(lambda row: {'entities' : clean_up(row['entities'])})
        eval_valid_clean = evaluate(ds_entities_train.filter(lambda r: r['document_id'] in test_ids), valid_clean, ner_only=True)
        add_metrics('valid_clean', eval_valid_clean)
        
        out_file = Path('../results') / 'x-entities_valid_clean.tsv'
        distemist_util.write_dataset_to_tsv(valid_clean, out_file, False)
    
    if predict_test:
        print('Running NER model on test data')
        all_test_texts = distemist_util.load_distemist_texts('../data/distemist/test_background/text_files', sort_keys=True)
        all_test_pred = distemist_util.run_ner_pipeline(all_test_texts, pipe)
        
        if post_process:
            all_test_pred = all_test_pred.map(lambda row: {'entities' : clean_up(row['entities'])})
    
    return metrics, all_test_pred if predict_test else validation_pred

In [7]:
%%time
m, valid_pred_1 = run_ner_pipeline('../models/model_1/best_cp/', post_process=True, predict_test=False, predict_train=False)
print(m)

Loading NER model checkpoint


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on validation data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/117 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Cleaning up


  0%|          | 0/117 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


{'valid_strict': {'precision': 0.74435318275154, 'recall': 0.7583682008368201, 'fscore': 0.7512953367875648}, 'valid_loose': {'precision': 0.8618814269814896, 'recall': 0.8775996152323885, 'fscore': 0.8696695052591388}, 'valid_clean_strict': {'precision': 0.7564234326824255, 'recall': 0.7698744769874477, 'fscore': 0.7630896837739761}, 'valid_clean_loose': {'precision': 0.8639885318513207, 'recall': 0.8775996152323885, 'fscore': 0.8707408859983137}}
CPU times: user 23.2 s, sys: 1.5 s, total: 24.7 s
Wall time: 23.1 s


In [8]:
m, test_pred_1 = run_ner_pipeline('../models/model_1/best_cp/', post_process=False)
m

Loading NER model checkpoint


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on training data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/633 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-add7b3366a304a74.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on validation data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/117 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Running NER model on test data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/3000 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()


{'train_strict': {'precision': 0.9701189440092834,
  'recall': 0.9407792938528625,
  'fscore': 0.9552238805970149},
 'train_loose': {'precision': 0.9974991519333785,
  'recall': 0.9855193613522019,
  'fscore': 0.9914730705503741},
 'valid_strict': {'precision': 0.74435318275154,
  'recall': 0.7583682008368201,
  'fscore': 0.7512953367875648},
 'valid_loose': {'precision': 0.8618814269814896,
  'recall': 0.8775996152323885,
  'fscore': 0.8696695052591388}}

In [9]:
out_file = Path('../submission') / 'subtrack1_entities' / '1-roberta-clinical-es-linear-lr.tsv'
distemist_util.write_dataset_to_tsv(test_pred_1, out_file, False)

Unnamed: 0,filename,mark,label,off0,off1,span
0,distemist_test_1,T1,ENFERMEDAD,81,86,ángor
1,distemist_test_1,T2,ENFERMEDAD,316,334,disfunción eréctil
2,distemist_test_1,T3,ENFERMEDAD,803,829,adenocarcinoma de próstata
3,distemist_test_1,T4,ENFERMEDAD,1214,1260,arterias pudendas accesorias (APA) bilaterales
4,distemist_test_2,T1,ENFERMEDAD,117,132,urticaria aguda
...,...,...,...,...,...,...
48777,distemist_test_3000,T6,ENFERMEDAD,2312,2352,carcinoma neuroendocrino indiferenciado:
48778,distemist_test_3000,T7,ENFERMEDAD,2624,2658,feocromocitoma maligno no secretor
48779,distemist_test_3000,T8,ENFERMEDAD,2842,2913,feocromocitoma maligno no secretor con metásta...
48780,distemist_test_3000,T9,ENFERMEDAD,3019,3024,tumor


In [10]:
m, test_pred_2 = run_ner_pipeline('../models/model_2/best_cp', post_process=False)
m

Loading NER model checkpoint


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on training data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/633 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-add7b3366a304a74.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on validation data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/117 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Running NER model on test data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/3000 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()


{'train_strict': {'precision': 0.9695475638051044,
  'recall': 0.9404979603319735,
  'fscore': 0.9548018564798286},
 'train_loose': {'precision': 0.9970064081192465,
  'recall': 0.9848118410392517,
  'fscore': 0.9908716066417864},
 'valid_strict': {'precision': 0.7455497382198953,
  'recall': 0.7447698744769874,
  'fscore': 0.7451596023024595},
 'valid_loose': {'precision': 0.8673886504603883,
  'recall': 0.8648043401658576,
  'fscore': 0.866094567508388}}

In [11]:
out_file = Path('../submission') / 'subtrack1_entities' / '2-roberta-clinical-es-constant-lr.tsv'
distemist_util.write_dataset_to_tsv(test_pred_2, out_file, False)

Unnamed: 0,filename,mark,label,off0,off1,span
0,distemist_test_1,T1,ENFERMEDAD,81,94,ángor estable
1,distemist_test_1,T2,ENFERMEDAD,316,334,disfunción eréctil
2,distemist_test_1,T3,ENFERMEDAD,803,829,adenocarcinoma de próstata
3,distemist_test_1,T4,ENFERMEDAD,1214,1260,arterias pudendas accesorias (APA) bilaterales
4,distemist_test_2,T1,ENFERMEDAD,117,132,urticaria aguda
...,...,...,...,...,...,...
48463,distemist_test_3000,T5,ENFERMEDAD,2312,2352,carcinoma neuroendocrino indiferenciado:
48464,distemist_test_3000,T6,ENFERMEDAD,2624,2658,feocromocitoma maligno no secretor
48465,distemist_test_3000,T7,ENFERMEDAD,2842,2913,feocromocitoma maligno no secretor con metásta...
48466,distemist_test_3000,T8,ENFERMEDAD,3019,3024,tumor


In [12]:
m, test_pred_3 = run_ner_pipeline('../models/model_1/best_cp', post_process=True)
m

Loading NER model checkpoint


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on training data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/633 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-add7b3366a304a74.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on validation data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/117 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Cleaning up


  0%|          | 0/117 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Running NER model on test data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/3000 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()


  0%|          | 0/3000 [00:00<?, ?ex/s]

<re.Match object; span=(0, 34), match='enfermedad de Rendu-Osler-Weber:\n1'>
<re.Match object; span=(0, 34), match='granulocítico ovárico\n\nTratamiento'>
<re.Match object; span=(0, 43), match='neumonías extrahospitalarias de repetición\n'>
<re.Match object; span=(0, 46), match='lesiones cerebrales por toxoplasmosis y/o TB\n-'>
<re.Match object; span=(0, 9), match='verrugas\n'>
<re.Match object; span=(0, 28), match='hiperglucémica\nPielonefritis'>
<re.Match object; span=(0, 36), match='urticaria generalizada\n\nAntecedentes'>
<re.Match object; span=(0, 31), match='dermatitis atópica\n\nExploración'>
<re.Match object; span=(0, 36), match='Urticaria inducida por infecciones\nb'>
<re.Match object; span=(0, 57), match='Alergia a las proteínas de la leche de vaca IgE m>
<re.Match object; span=(0, 42), match='Urticaria por picadura de insectos\n\nAunque'>
<re.Match object; span=(0, 27), match='infiltración por LAL-T.\n\nEl'>
<re.Match object; span=(0, 30), match='infiltración masiva LAL-T.\n

{'train_strict': {'precision': 0.9701189440092834,
  'recall': 0.9407792938528625,
  'fscore': 0.9552238805970149},
 'train_loose': {'precision': 0.9974991519333785,
  'recall': 0.9855193613522019,
  'fscore': 0.9914730705503741},
 'valid_strict': {'precision': 0.74435318275154,
  'recall': 0.7583682008368201,
  'fscore': 0.7512953367875648},
 'valid_loose': {'precision': 0.8618814269814896,
  'recall': 0.8775996152323885,
  'fscore': 0.8696695052591388},
 'valid_clean_strict': {'precision': 0.7564234326824255,
  'recall': 0.7698744769874477,
  'fscore': 0.7630896837739761},
 'valid_clean_loose': {'precision': 0.8639885318513207,
  'recall': 0.8775996152323885,
  'fscore': 0.8707408859983137}}

In [13]:
out_file = Path('../submission') / 'subtrack1_entities' / '3-roberta-clinical-es-linear-lr-post-process.tsv'
distemist_util.write_dataset_to_tsv(test_pred_3, out_file, False)

Unnamed: 0,filename,mark,label,off0,off1,span
0,distemist_test_1,T1,ENFERMEDAD,81,86,ángor
1,distemist_test_1,T2,ENFERMEDAD,316,334,disfunción eréctil
2,distemist_test_1,T3,ENFERMEDAD,803,829,adenocarcinoma de próstata
3,distemist_test_1,T4,ENFERMEDAD,1214,1260,arterias pudendas accesorias (APA) bilaterales
4,distemist_test_2,T1,ENFERMEDAD,117,132,urticaria aguda
...,...,...,...,...,...,...
48739,distemist_test_3000,T6,ENFERMEDAD,2312,2351,carcinoma neuroendocrino indiferenciado
48740,distemist_test_3000,T7,ENFERMEDAD,2624,2658,feocromocitoma maligno no secretor
48741,distemist_test_3000,T8,ENFERMEDAD,2842,2913,feocromocitoma maligno no secretor con metásta...
48742,distemist_test_3000,T9,ENFERMEDAD,3019,3024,tumor


In [14]:
m, test_pred_4 = run_ner_pipeline('../models/model_2/best_cp', post_process=True)
m

Loading NER model checkpoint


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on training data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/633 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-add7b3366a304a74.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Running NER model on validation data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/117 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Cleaning up


  0%|          | 0/117 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/dis_temist/subtrack1_entities/3.0.1/a1915148dbf2c746ae734644cb098356da71f63ed6fa586d9b24ac3cfb42df01/cache-06f2f21492335cfd.arrow


Running NER model on test data


  preds = np.array(pipeline([s.text for s in spacy_sents]))


  0%|          | 0/3000 [00:00<?, ?ex/s]

  sents = np.array(spacy_sents)[np.array(idx) == i].ravel()


  0%|          | 0/3000 [00:00<?, ?ex/s]

<re.Match object; span=(0, 34), match='enfermedad de Rendu-Osler-Weber:\n1'>
<re.Match object; span=(0, 22), match='Epistaxis recurrente\n2'>
<re.Match object; span=(0, 34), match='granulocítico ovárico\n\nTratamiento'>
<re.Match object; span=(0, 43), match='neumonías extrahospitalarias de repetición\n'>
<re.Match object; span=(0, 46), match='lesiones cerebrales por toxoplasmosis y/o TB\n-'>
<re.Match object; span=(0, 9), match='verrugas\n'>
<re.Match object; span=(0, 21), match='víricas:\n\nINFECCIONES'>
<re.Match object; span=(0, 28), match='hiperglucémica\nPielonefritis'>
<re.Match object; span=(0, 36), match='urticaria generalizada\n\nAntecedentes'>
<re.Match object; span=(0, 31), match='dermatitis atópica\n\nExploración'>
<re.Match object; span=(0, 36), match='Urticaria inducida por infecciones\nb'>
<re.Match object; span=(0, 57), match='Alergia a las proteínas de la leche de vaca IgE m>
<re.Match object; span=(0, 42), match='Urticaria por picadura de insectos\n\nAunque'>
<re.Matc

{'train_strict': {'precision': 0.9695475638051044,
  'recall': 0.9404979603319735,
  'fscore': 0.9548018564798286},
 'train_loose': {'precision': 0.9970064081192465,
  'recall': 0.9848118410392517,
  'fscore': 0.9908716066417864},
 'valid_strict': {'precision': 0.7455497382198953,
  'recall': 0.7447698744769874,
  'fscore': 0.7451596023024595},
 'valid_loose': {'precision': 0.8673886504603883,
  'recall': 0.8648043401658576,
  'fscore': 0.866094567508388},
 'valid_clean_strict': {'precision': 0.7578616352201258,
  'recall': 0.7562761506276151,
  'fscore': 0.7570680628272252},
 'valid_clean_loose': {'precision': 0.8695434935021332,
  'recall': 0.8648043401658576,
  'fscore': 0.867167441900332}}

In [15]:
out_file = Path('../submission') / 'subtrack1_entities' / '4-roberta-clinical-es-constant-lr-post-process.tsv'
distemist_util.write_dataset_to_tsv(test_pred_4, out_file, False)

Unnamed: 0,filename,mark,label,off0,off1,span
0,distemist_test_1,T1,ENFERMEDAD,81,94,ángor estable
1,distemist_test_1,T2,ENFERMEDAD,316,334,disfunción eréctil
2,distemist_test_1,T3,ENFERMEDAD,803,829,adenocarcinoma de próstata
3,distemist_test_1,T4,ENFERMEDAD,1214,1260,arterias pudendas accesorias (APA) bilaterales
4,distemist_test_2,T1,ENFERMEDAD,117,132,urticaria aguda
...,...,...,...,...,...,...
48440,distemist_test_3000,T5,ENFERMEDAD,2312,2351,carcinoma neuroendocrino indiferenciado
48441,distemist_test_3000,T6,ENFERMEDAD,2624,2658,feocromocitoma maligno no secretor
48442,distemist_test_3000,T7,ENFERMEDAD,2842,2913,feocromocitoma maligno no secretor con metásta...
48443,distemist_test_3000,T8,ENFERMEDAD,3019,3024,tumor
