In [1]:
!pip install torch pandas numpy transformers accelerate datasets tokenizers seqeval evaluate



## Token classification

In [2]:
import os
import pandas as pd
import datasets 
import numpy as np 
from transformers import BertTokenizerFast, RobertaTokenizerFast, AutoTokenizer
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from datasets import load_dataset
from collections import defaultdict
from datasets import Dataset, DatasetDict, Features, Value, Sequence, ClassLabel
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_to_use = "dmis-lab/biobert-base-cased-v1.2"
dataset_to_use = "biored"

In [4]:
data_files = {"train": "./datasets/preprocessed_NER/"+dataset_to_use+"/train.csv", "validation": "./datasets/preprocessed_NER/"+dataset_to_use+"/dev.csv", "test": "./datasets/preprocessed_NER/"+dataset_to_use+"/test.csv"}
raw_dataset = load_dataset("csv", data_files=data_files)

print(raw_dataset["train"][0])
raw_dataset["train"].features["labels"]

{'words': 'Late-onset', 'sentence_id': 0, 'labels': 'O'}


Value(dtype='string', id=None)

In [5]:
grouped_datasets = {}

for split in ["train", "validation", "test"]:
    dataset = raw_dataset[split]
    grouped = defaultdict(lambda: {"words": [], "labels": []})

    for example in dataset:
        sid = example["sentence_id"]
        grouped[sid]["words"].append(example["words"])
        grouped[sid]["labels"].append(example["labels"])

    grouped_list = []
    for sid, data in grouped.items():
        grouped_list.append({
            "sentence_id": sid,
            "words": data["words"],
            "labels": data["labels"]
        })
    grouped_datasets[split] = Dataset.from_list(grouped_list)

all_labels = set()
for example in grouped_datasets["train"]:
    all_labels.update(example["labels"])
label_list = sorted(list(all_labels))

label_feature = ClassLabel(names=label_list)

features = Features({
    "sentence_id": Value("int32"),
    "words": Sequence(Value("string")),
    "labels": Sequence(label_feature),
})

final_datasets = DatasetDict()
for split in ["train", "validation", "test"]:
    grouped_datasets[split] = grouped_datasets[split].cast(features)

final_datasets = DatasetDict(grouped_datasets)

print(final_datasets["train"][0])
print("List of labels:", final_datasets["train"].features["labels"].feature.names)

Casting the dataset: 100%|███████████████████████████████████████████████████████████████████| 4342/4342 [00:00<00:00, 14587.36 examples/s]
Casting the dataset: 100%|███████████████████████████████████████████████████████████████████| 1127/1127 [00:00<00:00, 34354.80 examples/s]
Casting the dataset: 100%|███████████████████████████████████████████████████████████████████| 1096/1096 [00:00<00:00, 34930.22 examples/s]

{'sentence_id': 0, 'words': ['Late-onset', 'metachromatic', 'leukodystrophy', ':', 'molecular', 'pathology', 'in', 'two', 'siblings', '.'], 'labels': [12, 2, 8, 12, 12, 12, 12, 12, 12, 12]}
List of labels: ['B-CellLine', 'B-ChemicalEntity', 'B-DiseaseOrPhenotypicFeature', 'B-GeneOrGeneProduct', 'B-OrganismTaxon', 'B-SequenceVariant', 'I-CellLine', 'I-ChemicalEntity', 'I-DiseaseOrPhenotypicFeature', 'I-GeneOrGeneProduct', 'I-OrganismTaxon', 'I-SequenceVariant', 'O']





In [6]:
final_datasets.shape

{'train': (4342, 3), 'validation': (1127, 3), 'test': (1096, 3)}

In [7]:
final_datasets["train"][0]

{'sentence_id': 0,
 'words': ['Late-onset',
  'metachromatic',
  'leukodystrophy',
  ':',
  'molecular',
  'pathology',
  'in',
  'two',
  'siblings',
  '.'],
 'labels': [12, 2, 8, 12, 12, 12, 12, 12, 12, 12]}

In [8]:
final_datasets["train"].features["labels"]

Sequence(feature=ClassLabel(names=['B-CellLine', 'B-ChemicalEntity', 'B-DiseaseOrPhenotypicFeature', 'B-GeneOrGeneProduct', 'B-OrganismTaxon', 'B-SequenceVariant', 'I-CellLine', 'I-ChemicalEntity', 'I-DiseaseOrPhenotypicFeature', 'I-GeneOrGeneProduct', 'I-OrganismTaxon', 'I-SequenceVariant', 'O'], id=None), length=-1, id=None)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_to_use) 

In [10]:
example_text = final_datasets['train'][0]
tokenized_input = tokenizer(example_text["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()
print(word_ids)

[None, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, None]


In [11]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'late',
 '-',
 'onset',
 'meta',
 '##ch',
 '##romatic',
 'le',
 '##uk',
 '##ody',
 '##stro',
 '##phy',
 ':',
 'molecular',
 'path',
 '##ology',
 'in',
 'two',
 'siblings',
 '.',
 '[SEP]']

In [12]:
len(example_text['labels']), len(tokenized_input["input_ids"])

(10, 21)

In [13]:
def tokenize_and_align_labels(examples):
    texts = [[str(token) for token in sent] for sent in examples["words"]]
    
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128
    )
    
    labels_all = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(word_labels[word_idx])
            else:
                # We decide whether sub-tokens inherit the label or are ignored.
                # Ignore -> -100 ; if not -> word_labels[word_idx].
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels_all.append(label_ids)
    
    tokenized_inputs["labels"] = labels_all
    return tokenized_inputs

In [14]:
final_datasets['train'][4:5]

{'sentence_id': [4],
 'words': [['A',
   'comparison',
   'of',
   'genotypes',
   ',',
   'ARSA',
   'activities',
   ',',
   'and',
   'clinical',
   'data',
   'on',
   '4',
   'individuals',
   'carrying',
   'the',
   'allele',
   'of',
   '81',
   'patients',
   'with',
   'MLD',
   'examined',
   ',',
   'further',
   'validates',
   'the',
   'concept',
   'that',
   'different',
   'degrees',
   'of',
   'residual',
   'ARSA',
   'activity',
   'are',
   'the',
   'basis',
   'of',
   'phenotypical',
   'variation',
   'in',
   'MLD',
   '..']],
 'labels': [[12,
   12,
   12,
   12,
   12,
   3,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   4,
   12,
   2,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   3,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   12,
   2,
   12]]}

In [15]:
q = tokenize_and_align_labels(final_datasets['train'][4:5]) 
print(q) 

{'input_ids': [[101, 170, 7577, 1104, 176, 26601, 15177, 1116, 117, 170, 24129, 2619, 117, 1105, 7300, 2233, 1113, 125, 2833, 4004, 1103, 1155, 11194, 1104, 5615, 4420, 1114, 182, 5253, 8600, 117, 1748, 9221, 5430, 1103, 3400, 1115, 1472, 4842, 1104, 25399, 170, 24129, 3246, 1132, 1103, 3142, 1104, 185, 10436, 27202, 1348, 8516, 1107, 182, 5253, 119, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1,

In [16]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}") 

[CLS]___________________________________ -100
a_______________________________________ 12
comparison______________________________ 12
of______________________________________ 12
g_______________________________________ 12
##eno___________________________________ 12
##type__________________________________ 12
##s_____________________________________ 12
,_______________________________________ 12
a_______________________________________ 3
##rsa___________________________________ 3
activities______________________________ 12
,_______________________________________ 12
and_____________________________________ 12
clinical________________________________ 12
data____________________________________ 12
on______________________________________ 12
4_______________________________________ 12
individuals_____________________________ 12
carrying________________________________ 12
the_____________________________________ 12
all_____________________________________ 12
##ele___________________________

In [17]:
tokenized_datasets = final_datasets.map(tokenize_and_align_labels, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████| 4342/4342 [00:00<00:00, 8683.80 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 1127/1127 [00:00<00:00, 11165.13 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 1096/1096 [00:00<00:00, 11323.00 examples/s]


In [18]:
print(str(tokenized_datasets['train'][0]))

{'sentence_id': 0, 'words': ['Late-onset', 'metachromatic', 'leukodystrophy', ':', 'molecular', 'pathology', 'in', 'two', 'siblings', '.'], 'labels': [-100, 12, 12, 12, 2, 2, 2, 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [101, 1523, 118, 15415, 27154, 1732, 16341, 5837, 7563, 22320, 21216, 22192, 131, 9546, 3507, 4807, 1107, 1160, 9

## Defining model

In [19]:
num_labels = len(label_list) 
model = AutoModelForTokenClassification.from_pretrained(model_to_use, num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import TrainingArguments, Trainer 

args = TrainingArguments(
    output_dir="output-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    #warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    fp16=True 
)

In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [22]:
import evaluate

metric = evaluate.load("seqeval") 

## Test dataset

In [23]:
example = final_datasets['train'][0]

In [24]:
label_list = final_datasets["train"].features["labels"].feature.names 

label_list

['B-CellLine',
 'B-ChemicalEntity',
 'B-DiseaseOrPhenotypicFeature',
 'B-GeneOrGeneProduct',
 'B-OrganismTaxon',
 'B-SequenceVariant',
 'I-CellLine',
 'I-ChemicalEntity',
 'I-DiseaseOrPhenotypicFeature',
 'I-GeneOrGeneProduct',
 'I-OrganismTaxon',
 'I-SequenceVariant',
 'O']

In [25]:
for i in example["labels"]:
  print(i)

12
2
8
12
12
12
12
12
12
12


In [26]:
labels = [label_list[i] for i in example["labels"]] 
labels

['O',
 'B-DiseaseOrPhenotypicFeature',
 'I-DiseaseOrPhenotypicFeature',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [27]:
metric.compute(predictions=[labels], references=[labels]) 

{'DiseaseOrPhenotypicFeature': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

### Compute Metrics

In [28]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds 
    pred_ids = np.argmax(pred_logits, axis=2) 

    predictions = [ 
        [label_list[p] for (p, l) in zip(pred, label) if l != -100] 
        for pred, label in zip(pred_ids, labels)
    ] 

    true_labels = [ 
        [label_list[l] for (p, l) in zip(pred, label) if l != -100] 
        for pred, label in zip(pred_ids, labels)
    ]

    results = metric.compute(predictions=predictions, references=true_labels)

    return { 
          "precision": results["overall_precision"], 
          "recall": results["overall_recall"], 
          "f1": results["overall_f1"], 
          "accuracy": results["overall_accuracy"], 
    }

## Training

In [29]:
trainer = Trainer( 
   model, 
   args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

  trainer = Trainer(


In [30]:
from transformers import get_cosine_schedule_with_warmup
import torch

train_batch_size = args.per_device_train_batch_size
total_steps = (len(tokenized_datasets["train"]) // train_batch_size) * args.num_train_epochs
warmup_steps = int(0.1 * total_steps)

optimizer = torch.optim.AdamW(trainer.model.parameters(), lr=args.learning_rate)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler

In [31]:
trainer.train() 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.197,0.246652,0.84256,0.874394,0.858182,0.934655
2,0.1185,0.255503,0.865478,0.879334,0.872351,0.937596
3,0.0934,0.273095,0.889857,0.878053,0.883916,0.943389
4,0.0654,0.317662,0.883496,0.894246,0.888838,0.945817
5,0.0109,0.325094,0.882635,0.892325,0.887453,0.945282


TrainOutput(global_step=5430, training_loss=0.15150197805291382, metrics={'train_runtime': 126.7621, 'train_samples_per_second': 171.266, 'train_steps_per_second': 42.836, 'total_flos': 1418329186460160.0, 'train_loss': 0.15150197805291382, 'epoch': 5.0})

## Save

In [32]:
model.save_pretrained("ner_model")

In [33]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [34]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [35]:
id2label

{'0': 'B-CellLine',
 '1': 'B-ChemicalEntity',
 '2': 'B-DiseaseOrPhenotypicFeature',
 '3': 'B-GeneOrGeneProduct',
 '4': 'B-OrganismTaxon',
 '5': 'B-SequenceVariant',
 '6': 'I-CellLine',
 '7': 'I-ChemicalEntity',
 '8': 'I-DiseaseOrPhenotypicFeature',
 '9': 'I-GeneOrGeneProduct',
 '10': 'I-OrganismTaxon',
 '11': 'I-SequenceVariant',
 '12': 'O'}

In [36]:
label2id

{'B-CellLine': '0',
 'B-ChemicalEntity': '1',
 'B-DiseaseOrPhenotypicFeature': '2',
 'B-GeneOrGeneProduct': '3',
 'B-OrganismTaxon': '4',
 'B-SequenceVariant': '5',
 'I-CellLine': '6',
 'I-ChemicalEntity': '7',
 'I-DiseaseOrPhenotypicFeature': '8',
 'I-GeneOrGeneProduct': '9',
 'I-OrganismTaxon': '10',
 'I-SequenceVariant': '11',
 'O': '12'}

## Loading model and prediction

In [37]:
import json

In [38]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [39]:
from transformers import BertForTokenClassification, BertTokenizerFast

In [40]:
model = AutoModelForTokenClassification.from_pretrained("ner_model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
tokenized_test = final_datasets["test"].map(tokenize_and_align_labels, batched=True)
results = trainer.evaluate(eval_dataset=tokenized_test)
print(results)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 1096/1096 [00:00<00:00, 10680.96 examples/s]


{'eval_loss': 0.2692228853702545, 'eval_precision': 0.8720277851990382, 'eval_recall': 0.8941649164459866, 'eval_f1': 0.8829576194770062, 'eval_accuracy': 0.9487617037519366, 'eval_runtime': 2.0217, 'eval_samples_per_second': 542.129, 'eval_steps_per_second': 135.532, 'epoch': 5.0}
