In [48]:
!pip install torch pandas numpy transformers accelerate datasets tokenizers seqeval evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


## Token classification

In [10]:
import os
import pandas as pd
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from datasets import load_dataset
from collections import defaultdict
from datasets import Dataset, DatasetDict
import pandas as pd

In [51]:
data_files = {"train": "./datasets/preprocessed_NER/biored/train.csv", "validation": "./datasets/preprocessed_NER/biored/dev.csv", "test": "./datasets/preprocessed_NER/biored/test.csv"}
raw_dataset = load_dataset("csv", data_files=data_files)

print(raw_dataset["train"][0])
raw_dataset["train"].features["labels"]

{'words': 'Late-onset', 'sentence_id': 0, 'labels': 'O'}


Value(dtype='string', id=None)

In [None]:
grouped_datasets = {}

for split in ["train", "validation", "test"]:
    dataset = raw_dataset[split]
    grouped = defaultdict(lambda: {"words": [], "labels": []})

    for example in dataset:
        sid = example["sentence_id"]
        grouped[sid]["words"].append(example["words"])
        grouped[sid]["labels"].append(example["labels"])

    grouped_list = []
    for sid, data in grouped.items():
        grouped_list.append({
            "sentence_id": sid,
            "words": data["words"],
            "labels": data["labels"]
        })
    grouped_datasets[split] = Dataset.from_list(grouped_list)

all_labels = set()
for example in grouped_datasets["train"]:
    all_labels.update(example["labels"])
label_list = sorted(list(all_labels))

label_feature = ClassLabel(names=label_list)

features = Features({
    "sentence_id": Value("int32"),
    "words": Sequence(Value("string")),
    "labels": Sequence(label_feature),
})

final_datasets = DatasetDict()
for split in ["train", "validation", "test"]:
    grouped_datasets[split] = grouped_datasets[split].cast(features)

final_datasets = DatasetDict(grouped_datasets)

print(final_datasets["train"][0])
print("List of labels:", final_datasets["train"].features["labels"].feature.names)

Casting the dataset:   0%|          | 0/4342 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1096 [00:00<?, ? examples/s]

{'sentence_id': 0, 'words': ['Late-onset', 'metachromatic', 'leukodystrophy', ':', 'molecular', 'pathology', 'in', 'two', 'siblings', '.'], 'labels': [12, 2, 8, 12, 12, 12, 12, 12, 12, 12]}
Lista de etiquetas: ['B-CellLine', 'B-ChemicalEntity', 'B-DiseaseOrPhenotypicFeature', 'B-GeneOrGeneProduct', 'B-OrganismTaxon', 'B-SequenceVariant', 'I-CellLine', 'I-ChemicalEntity', 'I-DiseaseOrPhenotypicFeature', 'I-GeneOrGeneProduct', 'I-OrganismTaxon', 'I-SequenceVariant', 'O']


In [61]:
final_datasets.shape

{'train': (4342, 3), 'validation': (1127, 3), 'test': (1096, 3)}

In [59]:
final_datasets["train"][0]

{'sentence_id': 0,
 'words': ['Late-onset',
  'metachromatic',
  'leukodystrophy',
  ':',
  'molecular',
  'pathology',
  'in',
  'two',
  'siblings',
  '.'],
 'labels': [12, 2, 8, 12, 12, 12, 12, 12, 12, 12]}

In [63]:
final_datasets["train"].features["labels"]

Sequence(feature=ClassLabel(names=['B-CellLine', 'B-ChemicalEntity', 'B-DiseaseOrPhenotypicFeature', 'B-GeneOrGeneProduct', 'B-OrganismTaxon', 'B-SequenceVariant', 'I-CellLine', 'I-ChemicalEntity', 'I-DiseaseOrPhenotypicFeature', 'I-GeneOrGeneProduct', 'I-OrganismTaxon', 'I-SequenceVariant', 'O'], id=None), length=-1, id=None)

In [67]:
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased") 

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [71]:
example_text = final_datasets['train'][0]
tokenized_input = tokenizer(example_text["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()
print(word_ids)

[None, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, None]


In [73]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'late',
 '-',
 'onset',
 'meta',
 '##ch',
 '##romatic',
 'le',
 '##uk',
 '##od',
 '##yst',
 '##rop',
 '##hy',
 ':',
 'molecular',
 'pathology',
 'in',
 'two',
 'siblings',
 '.',
 '[SEP]']

In [75]:
len(example_text['labels']), len(tokenized_input["input_ids"])

(10, 21)

In [None]:
def tokenize_and_align_labels(examples):
    texts = [[str(token) for token in sent] for sent in examples["words"]]
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels_all = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(word_labels[word_idx])
            else:
                # We decide whether sub-tokens inherit the label or are ignored.
                # Ignore -> -100 ; if not -> word_labels[word_idx].
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels_all.append(label_ids)
    
    tokenized_inputs["labels"] = labels_all
    return tokenized_inputs

In [143]:
final_datasets['train'][4:5]

{'sentence_id': [4],
 'words': [['A',
   'comparison',
   'of',
   'genotypes',
   ',',
   'ARSA',
   'activities',
   ',',
   'and',
   'clinical',
   'data',
   'on',
   '4',
   'individuals',
   'carrying',
   'the',
   'allele',
   'of',
   '81',
   'patients',
   'with',
   'MLD',
   'examined',
   ',',
   'further',
   'validates',
   'the',
   'concept',
   'that',
   'different',
   'degrees',
   'of',
   'residual',
   'ARSA',
   'activity',
   'are',
   'the',
   'basis',
   'of',
   'phenotypical',
   'variation',
   'in',
   'MLD',
   '..']],
 'labels': [[12,
   12,
   12,
   12,
   12,
   3,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   4,
   12,
   2,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   3,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   2,
   12]]}

In [145]:
q = tokenize_and_align_labels(final_datasets['train'][4:5]) 
print(q) 

{'input_ids': [[101, 1037, 7831, 1997, 8991, 26305, 2015, 1010, 29393, 2050, 3450, 1010, 1998, 6612, 2951, 2006, 1018, 3633, 4755, 1996, 2035, 12260, 1997, 6282, 5022, 2007, 19875, 2094, 8920, 1010, 2582, 9398, 8520, 1996, 4145, 2008, 2367, 5445, 1997, 21961, 29393, 2050, 4023, 2024, 1996, 3978, 1997, 6887, 16515, 27086, 8386, 1999, 19875, 2094, 1012, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mas

In [147]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}") 

[CLS]___________________________________ -100
a_______________________________________ 12
comparison______________________________ 12
of______________________________________ 12
gen_____________________________________ 12
##otype_________________________________ -100
##s_____________________________________ -100
,_______________________________________ 12
ars_____________________________________ 3
##a_____________________________________ -100
activities______________________________ 12
,_______________________________________ 12
and_____________________________________ 12
clinical________________________________ 12
data____________________________________ 12
on______________________________________ 12
4_______________________________________ 12
individuals_____________________________ 12
carrying________________________________ 12
the_____________________________________ 12
all_____________________________________ 12
##ele___________________________________ -100
of_____________________

In [151]:
tokenized_datasets = final_datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4342 [00:00<?, ? examples/s]

Map:   0%|          | 0/1127 [00:00<?, ? examples/s]

Map:   0%|          | 0/1096 [00:00<?, ? examples/s]

In [254]:
print(str(tokenized_datasets['train'][0]))

{'sentence_id': 0, 'words': ['Late-onset', 'metachromatic', 'leukodystrophy', ':', 'molecular', 'pathology', 'in', 'two', 'siblings', '.'], 'labels': [-100, 12, -100, -100, 2, -100, -100, 8, -100, -100, -100, -100, -100, 12, 12, 12, 12, 12, 12, 12, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [101, 2397, 1011, 14447, 18804, 2818, 23645, 3393, 6968, 7716, 27268, 18981, 10536, 1024,

## Defining model

In [190]:
num_labels = len(label_list) 
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [192]:
from transformers import TrainingArguments, Trainer 

args = TrainingArguments( 
    "test-ner",
    eval_strategy = "epoch", 
    learning_rate=2e-5, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16, 
    num_train_epochs=3, 
    weight_decay=0.01, 
) 

In [194]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [196]:
import evaluate

metric = evaluate.load("seqeval") 

## Test dataset

In [199]:
example = final_datasets['train'][0]

In [201]:
label_list = final_datasets["train"].features["labels"].feature.names 

label_list

['B-CellLine',
 'B-ChemicalEntity',
 'B-DiseaseOrPhenotypicFeature',
 'B-GeneOrGeneProduct',
 'B-OrganismTaxon',
 'B-SequenceVariant',
 'I-CellLine',
 'I-ChemicalEntity',
 'I-DiseaseOrPhenotypicFeature',
 'I-GeneOrGeneProduct',
 'I-OrganismTaxon',
 'I-SequenceVariant',
 'O']

In [203]:
for i in example["labels"]:
  print(i)

12
2
8
12
12
12
12
12
12
12


In [205]:
labels = [label_list[i] for i in example["labels"]] 
labels

['O',
 'B-DiseaseOrPhenotypicFeature',
 'I-DiseaseOrPhenotypicFeature',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [207]:
metric.compute(predictions=[labels], references=[labels]) 

{'DiseaseOrPhenotypicFeature': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

### Compute Metrics

In [211]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds 
    pred_ids = np.argmax(pred_logits, axis=2) 

    predictions = [ 
        [label_list[p] for (p, l) in zip(pred, label) if l != -100] 
        for pred, label in zip(pred_ids, labels)
    ] 

    true_labels = [ 
        [label_list[l] for (p, l) in zip(pred, label) if l != -100] 
        for pred, label in zip(pred_ids, labels)
    ]

    results = metric.compute(predictions=predictions, references=true_labels)

    return { 
          "precision": results["overall_precision"], 
          "recall": results["overall_recall"], 
          "f1": results["overall_f1"], 
          "accuracy": results["overall_accuracy"], 
    }

## Training

In [214]:
trainer = Trainer( 
   model, 
   args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

  trainer = Trainer(


In [216]:
trainer.train() 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.172772,0.761509,0.807885,0.784011,0.949325
2,0.239900,0.16197,0.783987,0.829656,0.806175,0.955184
3,0.239900,0.159738,0.787762,0.840836,0.813434,0.95652


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=816, training_loss=0.17959202504625507, metrics={'train_runtime': 4628.2381, 'train_samples_per_second': 2.814, 'train_steps_per_second': 0.176, 'total_flos': 850997511876096.0, 'train_loss': 0.17959202504625507, 'epoch': 3.0})

## Save

In [228]:
model.save_pretrained("ner_model")

In [230]:
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [232]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [234]:
id2label

{'0': 'B-CellLine',
 '1': 'B-ChemicalEntity',
 '2': 'B-DiseaseOrPhenotypicFeature',
 '3': 'B-GeneOrGeneProduct',
 '4': 'B-OrganismTaxon',
 '5': 'B-SequenceVariant',
 '6': 'I-CellLine',
 '7': 'I-ChemicalEntity',
 '8': 'I-DiseaseOrPhenotypicFeature',
 '9': 'I-GeneOrGeneProduct',
 '10': 'I-OrganismTaxon',
 '11': 'I-SequenceVariant',
 '12': 'O'}

In [236]:
label2id

{'B-CellLine': '0',
 'B-ChemicalEntity': '1',
 'B-DiseaseOrPhenotypicFeature': '2',
 'B-GeneOrGeneProduct': '3',
 'B-OrganismTaxon': '4',
 'B-SequenceVariant': '5',
 'I-CellLine': '6',
 'I-ChemicalEntity': '7',
 'I-DiseaseOrPhenotypicFeature': '8',
 'I-GeneOrGeneProduct': '9',
 'I-OrganismTaxon': '10',
 'I-SequenceVariant': '11',
 'O': '12'}

## Loading model and prediction

In [239]:
import json

In [241]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [251]:
from transformers import BertForTokenClassification, BertTokenizerFast

In [249]:
model = BertForTokenClassification.from_pretrained("ner_model")
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")
tokenized_test = final_datasets["test"].map(tokenize_and_align_labels, batched=True)
results = trainer.evaluate(eval_dataset=tokenized_test)
print(results)

Map:   0%|          | 0/1096 [00:00<?, ? examples/s]

{'eval_loss': 0.13821668922901154, 'eval_precision': 0.7817133443163097, 'eval_recall': 0.8341634925285673, 'eval_f1': 0.8070871722182849, 'eval_accuracy': 0.9584826656257762, 'eval_runtime': 107.5048, 'eval_samples_per_second': 10.195, 'eval_steps_per_second': 0.642, 'epoch': 3.0}
