In [1]:
!pip install torch pandas numpy transformers accelerate datasets tokenizers seqeval evaluate



## Token classification

In [2]:
import os
import pandas as pd
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from datasets import load_dataset
from collections import defaultdict
from datasets import Dataset, DatasetDict, Features, Value, Sequence, ClassLabel
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_files = {"train": "./datasets/preprocessed_NER/ncbi-disease/train.csv", "validation": "./datasets/preprocessed_NER/ncbi-disease/dev.csv", "test": "./datasets/preprocessed_NER/ncbi-disease/test.csv"}
raw_dataset = load_dataset("csv", data_files=data_files)

print(raw_dataset["train"][0])
raw_dataset["train"].features["labels"]

{'words': 'Glucose', 'sentence_id': 0, 'labels': 'O'}


Value(dtype='string', id=None)

In [4]:
grouped_datasets = {}

for split in ["train", "validation", "test"]:
    dataset = raw_dataset[split]
    grouped = defaultdict(lambda: {"words": [], "labels": []})

    for example in dataset:
        sid = example["sentence_id"]
        grouped[sid]["words"].append(example["words"])
        grouped[sid]["labels"].append(example["labels"])

    grouped_list = []
    for sid, data in grouped.items():
        grouped_list.append({
            "sentence_id": sid,
            "words": data["words"],
            "labels": data["labels"]
        })
    grouped_datasets[split] = Dataset.from_list(grouped_list)

all_labels = set()
for example in grouped_datasets["train"]:
    all_labels.update(example["labels"])
label_list = sorted(list(all_labels))

label_feature = ClassLabel(names=label_list)

features = Features({
    "sentence_id": Value("int32"),
    "words": Sequence(Value("string")),
    "labels": Sequence(label_feature),
})

final_datasets = DatasetDict()
for split in ["train", "validation", "test"]:
    grouped_datasets[split] = grouped_datasets[split].cast(features)

final_datasets = DatasetDict(grouped_datasets)

print(final_datasets["train"][0])
print("List of labels:", final_datasets["train"].features["labels"].feature.names)

Casting the dataset: 100%|████████████████████████████| 5729/5729 [00:01<00:00, 5588.20 examples/s]
Casting the dataset: 100%|█████████████████████████████| 947/947 [00:00<00:00, 28981.53 examples/s]
Casting the dataset: 100%|█████████████████████████████| 978/978 [00:00<00:00, 29296.86 examples/s]

{'sentence_id': 0, 'words': ['Glucose', '6-phosphate', 'dehydrogenase', 'variants', ':', 'Gd', '(', '+', ')', 'Alexandra', 'associated', 'with', 'neonatal', 'jaundice', 'and', 'Gd', '(', '-', ')', 'Camperdown', 'in', 'a', 'young', 'man', 'with', 'lamellar', 'cataracts', '.'], 'labels': [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]}
List of labels: ['B-CompositeMention', 'B-DiseaseClass', 'B-Modifier', 'B-SpecificDisease', 'I-CompositeMention', 'I-DiseaseClass', 'I-Modifier', 'I-SpecificDisease', 'O']





In [5]:
final_datasets.shape

{'train': (5729, 3), 'validation': (947, 3), 'test': (978, 3)}

In [6]:
final_datasets["train"][0]

{'sentence_id': 0,
 'words': ['Glucose',
  '6-phosphate',
  'dehydrogenase',
  'variants',
  ':',
  'Gd',
  '(',
  '+',
  ')',
  'Alexandra',
  'associated',
  'with',
  'neonatal',
  'jaundice',
  'and',
  'Gd',
  '(',
  '-',
  ')',
  'Camperdown',
  'in',
  'a',
  'young',
  'man',
  'with',
  'lamellar',
  'cataracts',
  '.'],
 'labels': [8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  3,
  7,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8,
  8]}

In [7]:
final_datasets["train"].features["labels"]

Sequence(feature=ClassLabel(names=['B-CompositeMention', 'B-DiseaseClass', 'B-Modifier', 'B-SpecificDisease', 'I-CompositeMention', 'I-DiseaseClass', 'I-Modifier', 'I-SpecificDisease', 'O'], id=None), length=-1, id=None)

In [8]:
tokenizer = BertTokenizerFast.from_pretrained("dmis-lab/biobert-base-cased-v1.2") 

In [9]:
example_text = final_datasets['train'][0]
tokenized_input = tokenizer(example_text["words"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()
print(word_ids)

[None, 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 9, 10, 11, 12, 12, 12, 13, 13, 13, 13, 14, 15, 15, 16, 17, 18, 19, 19, 19, 20, 21, 22, 23, 24, 25, 25, 26, 26, 26, 27, None]


In [10]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'glucose',
 '6',
 '-',
 'phosphate',
 'de',
 '##hy',
 '##dr',
 '##ogen',
 '##ase',
 'variants',
 ':',
 'g',
 '##d',
 '(',
 '+',
 ')',
 'ale',
 '##xa',
 '##ndra',
 'associated',
 'with',
 'neon',
 '##ata',
 '##l',
 'j',
 '##au',
 '##ndi',
 '##ce',
 'and',
 'g',
 '##d',
 '(',
 '-',
 ')',
 'camp',
 '##erd',
 '##own',
 'in',
 'a',
 'young',
 'man',
 'with',
 'lame',
 '##llar',
 'cat',
 '##ara',
 '##cts',
 '.',
 '[SEP]']

In [11]:
len(example_text['labels']), len(tokenized_input["input_ids"])

(28, 50)

In [12]:
def tokenize_and_align_labels(examples):
    texts = [[str(token) for token in sent] for sent in examples["words"]]
    label_all_tokens=False
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=128,
        return_tensors='pt'
    )
    
    labels_all = []
    for i, word_ids in enumerate(tokenized_inputs.word_ids(batch_index=i) for i in range(len(examples["words"]))):
        word_labels = examples["labels"][i]
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(word_labels[word_idx])
            else:
                label_ids.append(word_labels[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels_all.append(label_ids)

    tokenized_inputs["labels"] = labels_all
    return tokenized_inputs

In [13]:
print(str(final_datasets['train'][4:5]))

{'sentence_id': [4], 'words': [['Although', 'this', 'association', 'may', 'be', 'coincidental', ',', 'it', 'prompts', 'further', 'attention', 'to', 'the', 'possibility', 'that', 'under', 'certain', 'circumstances', 'G6PD', 'deficiency', 'may', 'favor', 'cataract', 'formation', '.']], 'labels': [[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 7, 8, 8, 2, 8, 8]]}


In [14]:
q = tokenize_and_align_labels(final_datasets['train'][4:5]) 
print(q) 

{'input_ids': tensor([[  101,  1780,  1142,  3852,  1336,  1129, 21439, 21739,   117,  1122,
          5250, 18378,  1116,  1748,  2209,  1106,  1103,  5417,  1115,  1223,
          2218,  5607,   176,  1545,  1643,  1181, 21344,  1336,  5010,  5855,
          4626,  5822,  3855,   119,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [15]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]): 
    print(f"{token:_<40} {label}") 

[CLS]___________________________________ -100
although________________________________ 8
this____________________________________ 8
association_____________________________ 8
may_____________________________________ 8
be______________________________________ 8
coincide________________________________ 8
##ntal__________________________________ -100
,_______________________________________ 8
it______________________________________ 8
pro_____________________________________ 8
##mpt___________________________________ -100
##s_____________________________________ -100
further_________________________________ 8
attention_______________________________ 8
to______________________________________ 8
the_____________________________________ 8
possibility_____________________________ 8
that____________________________________ 8
under___________________________________ 8
certain_________________________________ 8
circumstances___________________________ 8
g_______________________________________ 3

In [16]:
tokenized_datasets = final_datasets.map(tokenize_and_align_labels, batched=True)

Map: 100%|████████████████████████████████████████████| 5729/5729 [00:01<00:00, 4367.26 examples/s]
Map: 100%|██████████████████████████████████████████████| 947/947 [00:00<00:00, 4662.90 examples/s]
Map: 100%|██████████████████████████████████████████████| 978/978 [00:00<00:00, 4811.98 examples/s]


In [17]:
print(str(tokenized_datasets['train'][0]))

{'sentence_id': 0, 'words': ['Glucose', '6-phosphate', 'dehydrogenase', 'variants', ':', 'Gd', '(', '+', ')', 'Alexandra', 'associated', 'with', 'neonatal', 'jaundice', 'and', 'Gd', '(', '-', ')', 'Camperdown', 'in', 'a', 'young', 'man', 'with', 'lamellar', 'cataracts', '.'], 'labels': [-100, 8, 8, -100, -100, 8, -100, -100, -100, -100, 8, 8, 8, -100, 8, 8, 8, 8, -100, -100, 8, 8, 3, -100, -100, 7, -100, -100, -100, 8, 8, -100, 8, 8, 8, 8, -100, -100, 8, 8, 8, 8, 8, 8, -100, 8, -100, -100, 8, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [101, 20636, 1

## Defining model

In [18]:
num_labels = len(label_list) 
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.2", num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import TrainingArguments, Trainer 

args = TrainingArguments( 
    "test-ner",
    eval_strategy ="epoch", 
    save_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [21]:
import evaluate

metric = evaluate.load("seqeval") 

## Test dataset

In [22]:
example = final_datasets['train'][0]

In [23]:
label_list = final_datasets["train"].features["labels"].feature.names 

label_list

['B-CompositeMention',
 'B-DiseaseClass',
 'B-Modifier',
 'B-SpecificDisease',
 'I-CompositeMention',
 'I-DiseaseClass',
 'I-Modifier',
 'I-SpecificDisease',
 'O']

In [24]:
for i in example["labels"]:
  print(i)

8
8
8
8
8
8
8
8
8
8
8
8
3
7
8
8
8
8
8
8
8
8
8
8
8
8
8
8


In [25]:
labels = [label_list[i] for i in example["labels"]] 
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-SpecificDisease',
 'I-SpecificDisease',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [26]:
metric.compute(predictions=[labels], references=[labels]) 

{'SpecificDisease': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

### Compute Metrics

In [27]:
label_list = final_datasets["train"].features["labels"].feature.names

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Training

In [28]:
trainer = Trainer( 
   model, 
   args, 
   train_dataset=tokenized_datasets["train"], 
   eval_dataset=tokenized_datasets["validation"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 

  trainer = Trainer(


In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1521,0.133177,0.451777,0.465359,0.458467,0.960367
2,0.1023,0.088941,0.64799,0.695425,0.67087,0.976176
3,0.0296,0.07754,0.721166,0.743791,0.732304,0.979497
4,0.0321,0.074903,0.717365,0.783007,0.74875,0.980028
5,0.0402,0.073153,0.718343,0.793464,0.754037,0.979807
6,0.0248,0.074668,0.74105,0.811765,0.774797,0.980693
7,0.0498,0.074617,0.755152,0.814379,0.783648,0.981357
8,0.0362,0.075496,0.749395,0.80915,0.778127,0.981401


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2872, training_loss=0.1270265408213092, metrics={'train_runtime': 451.2898, 'train_samples_per_second': 101.558, 'train_steps_per_second': 6.364, 'total_flos': 2994127714768896.0, 'train_loss': 0.1270265408213092, 'epoch': 8.0})

## Save

In [30]:
model.save_pretrained("ner_model")

In [31]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [32]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [33]:
id2label

{'0': 'B-CompositeMention',
 '1': 'B-DiseaseClass',
 '2': 'B-Modifier',
 '3': 'B-SpecificDisease',
 '4': 'I-CompositeMention',
 '5': 'I-DiseaseClass',
 '6': 'I-Modifier',
 '7': 'I-SpecificDisease',
 '8': 'O'}

In [34]:
label2id

{'B-CompositeMention': '0',
 'B-DiseaseClass': '1',
 'B-Modifier': '2',
 'B-SpecificDisease': '3',
 'I-CompositeMention': '4',
 'I-DiseaseClass': '5',
 'I-Modifier': '6',
 'I-SpecificDisease': '7',
 'O': '8'}

## Loading model and prediction

In [35]:
import json

In [36]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [37]:
from transformers import BertForTokenClassification, BertTokenizerFast

In [38]:
model = BertForTokenClassification.from_pretrained("ner_model")
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")
tokenized_test = final_datasets["test"].map(tokenize_and_align_labels, batched=True)
results = trainer.evaluate(eval_dataset=tokenized_test)
print(results)

Map: 100%|██████████████████████████████████████████████| 978/978 [00:00<00:00, 4683.57 examples/s]


{'eval_loss': 0.08291115611791611, 'eval_precision': 0.7431102362204725, 'eval_recall': 0.8040468583599574, 'eval_f1': 0.7723785166240409, 'eval_accuracy': 0.9782458264855981, 'eval_runtime': 3.101, 'eval_samples_per_second': 315.381, 'eval_steps_per_second': 19.993, 'epoch': 8.0}


In [39]:
results_df = pd.DataFrame([results])
results_df.to_csv("evaluation_results.csv", index=False)