In [1]:
!pip install torch pandas numpy transformers accelerate datasets tokenizers seqeval evaluate



## Token classification

In [2]:
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
MODEL_REGISTRY = {
    "bert": "google-bert/bert-base-uncased",
    "bert-large": "google-bert/bert-large-uncased",
    "biobert": "dmis-lab/biobert-base-cased-v1.2",
    "bluebert": "bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16",
    "clinical-bert": "emilyalsentzer/Bio_ClinicalBERT",
    "biomed_roberta": "allenai/biomed_roberta_base",
    "pubmedbert": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
}


model_to_use = "ner_model/new_model"
tokenizer_path = "ner_model/tokenizer"
dataset_to_use = "combined"

In [4]:
from collections import Counter

def filter_classes_from_dataset(dataset_dict, min_examples=25):
    label_counts = Counter(dataset_dict["train"]["relation"])
    allowed_classes = {label for label, count in label_counts.items() if count >= min_examples}
    def is_valid(example):
        return example["relation"] in allowed_classes

    filtered_dataset = dataset_dict.filter(is_valid)
    unique_labels = sorted(list(allowed_classes))
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}
    
    def map_label(example):
        example["label"] = label2id[example["relation"]]
        return example
        
    filtered_dataset = filtered_dataset.map(map_label)
    return filtered_dataset, label2id, id2label

In [5]:
from datasets import load_dataset

data_files = {
    "train": f"datasets/preprocessed_RE/{dataset_to_use}/{dataset_to_use}_train.csv",
    "validation": f"datasets/preprocessed_RE/{dataset_to_use}/{dataset_to_use}_dev.csv",
    "test": f"datasets/preprocessed_RE/{dataset_to_use}/{dataset_to_use}_test.csv"
}
raw_dataset = load_dataset("csv", data_files=data_files, delimiter=",")

print(raw_dataset["train"][0])

{'text': '<e1>Hepatocyte nuclear factor-6</e1>: associations between genetic variability and <e2>type II diabetes</e2> and between genetic variability and estimates of insulin secretion.', 'relation': 'Association', 'source': 'biored'}


In [6]:
from collections import Counter

relation_counts = Counter(raw_dataset["train"]["relation"])

for relation, count in relation_counts.most_common():
    print(f"{relation:20} → {count}")

Association          → 7283
Positive_Correlation → 4007
Negative_Correlation → 3013
Downregulator        → 2251
Regulator            → 1652
Upregulator          → 774
Substrate            → 727
Part_of              → 307
Not                  → 240
Antagonist           → 229
Bind                 → 217
Agonist              → 173
Comparison           → 171
Cotreatment          → 154
Drug_Interaction     → 54
Cofactor             → 34
Modulator            → 29
Conversion           → 3
Undefined            → 1


In [7]:
raw_dataset, label2id, id2label = filter_classes_from_dataset(raw_dataset, min_examples=55)

Filter: 100%|█████████████████████████████████████████████████████| 21319/21319 [00:00<00:00, 432369.65 examples/s]
Filter: 100%|███████████████████████████████████████████████████████| 8284/8284 [00:00<00:00, 408786.36 examples/s]
Filter: 100%|███████████████████████████████████████████████████████| 9515/9515 [00:00<00:00, 419995.40 examples/s]
Map: 100%|█████████████████████████████████████████████████████████| 21198/21198 [00:00<00:00, 29062.06 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 8261/8261 [00:00<00:00, 34615.64 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 9442/9442 [00:00<00:00, 34858.82 examples/s]


In [8]:
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'relation', 'source', 'label'],
        num_rows: 21198
    })
    validation: Dataset({
        features: ['text', 'relation', 'source', 'label'],
        num_rows: 8261
    })
    test: Dataset({
        features: ['text', 'relation', 'source', 'label'],
        num_rows: 9442
    })
})


In [9]:
from sklearn.utils.class_weight import compute_class_weight

unique_labels = list(set(raw_dataset["train"]["relation"]))
label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
id2label = {i: label for label, i in label2id.items()}

def encode_labels(example):
    example["label"] = label2id[example["relation"]]
    return example

raw_dataset = raw_dataset.map(encode_labels)

train_labels = raw_dataset["train"]["label"]

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

Map: 100%|█████████████████████████████████████████████████████████| 21198/21198 [00:00<00:00, 51973.92 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 8261/8261 [00:00<00:00, 48459.67 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 9442/9442 [00:00<00:00, 51103.05 examples/s]


In [10]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

def tokenize_example(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = raw_dataset.map(tokenize_example, batched=True)

Map: 100%|█████████████████████████████████████████████████████████| 21198/21198 [00:01<00:00, 17420.69 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 8261/8261 [00:00<00:00, 14670.08 examples/s]
Map: 100%|███████████████████████████████████████████████████████████| 9442/9442 [00:00<00:00, 19273.46 examples/s]


In [11]:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_to_use,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ner_model/new_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ner_model/new_model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([13]) in the checkpoint and torch.Size([14]) in the model instantiated
- classifier.weight: found shape torch.Size([13, 1024]) in the checkpoint and torch.Size([14, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True
)

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro', zero_division=0)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [15]:
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = WeightedTrainer(


In [16]:
from transformers import get_cosine_schedule_with_warmup
import torch

train_batch_size = training_args.per_device_train_batch_size
train_dataset_size = len(tokenized_dataset["train"])
gradient_accumulation = training_args.gradient_accumulation_steps
epochs = training_args.num_train_epochs

total_steps = (train_dataset_size // (train_batch_size * gradient_accumulation)) * epochs
warmup_steps = int(0.1 * total_steps)

optimizer = torch.optim.AdamW(
    trainer.model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler

In [17]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,1.361,1.474852,0.53105,0.452839,0.508118,0.430853
1000,0.8263,1.295058,0.625953,0.541039,0.592242,0.550196
1500,0.53,1.41927,0.623169,0.593897,0.608342,0.581833
2000,0.4504,1.535952,0.638906,0.588572,0.629814,0.59005
2500,0.3577,1.717069,0.633458,0.558779,0.622119,0.572448
3000,0.2938,1.767856,0.660211,0.638648,0.631362,0.613194
3500,0.2775,1.803636,0.654521,0.618453,0.634846,0.617383
4000,0.3669,1.803435,0.67401,0.620006,0.640935,0.6177
4500,0.3056,1.885846,0.678731,0.633225,0.63005,0.61515
5000,0.2376,1.979587,0.664447,0.654285,0.627582,0.610436


TrainOutput(global_step=6625, training_loss=0.5045039020034502, metrics={'train_runtime': 458.2361, 'train_samples_per_second': 231.3, 'train_steps_per_second': 14.458, 'total_flos': 2.469485252371968e+16, 'train_loss': 0.5045039020034502, 'epoch': 5.0})

In [18]:
model_path = "./RE_model/model"
tokenizer_path = "./RE_model/tokenizer"

trainer.save_model(model_path)
tokenizer.save_pretrained(tokenizer_path)


('./RE_model/tokenizer/tokenizer_config.json',
 './RE_model/tokenizer/special_tokens_map.json',
 './RE_model/tokenizer/vocab.txt',
 './RE_model/tokenizer/added_tokens.json',
 './RE_model/tokenizer/tokenizer.json')

## Test dataset

## Loading model and prediction

In [19]:
import json

config = json.load(open("RE_model/model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("RE_model/model/config.json","w"))

In [20]:
def tokenize_example(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained("RE_model/model/")
tokenizer = AutoTokenizer.from_pretrained("RE_model/tokenizer")
tokenized_test = raw_dataset["test"].map(tokenize_example, batched=True)
results = trainer.evaluate(eval_dataset=tokenized_test)
print(results)

Map: 100%|███████████████████████████████████████████████████████████| 9442/9442 [00:00<00:00, 16399.22 examples/s]


{'eval_loss': 1.638489842414856, 'eval_accuracy': 0.6725270069900445, 'eval_precision': 0.6885509555521117, 'eval_recall': 0.677392440515314, 'eval_f1': 0.6689876118928787, 'eval_runtime': 7.6069, 'eval_samples_per_second': 1241.236, 'eval_steps_per_second': 77.692, 'epoch': 5.0}


In [22]:
from sklearn.metrics import classification_report

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

id2label = model.config.id2label

present_labels = sorted(set(labels) | set(preds))
target_names = [id2label[i] for i in present_labels]

print("id2label:", id2label)
print("Unique labels in dataset:", sorted(set(labels)))

print(classification_report(
    labels,
    preds,
    labels=present_labels,
    target_names=target_names,
    zero_division=0
))

id2label: {0: 'Agonist', 1: 'Antagonist', 2: 'Association', 3: 'Bind', 4: 'Comparison', 5: 'Cotreatment', 6: 'Downregulator', 7: 'Negative_Correlation', 8: 'Not', 9: 'Part_of', 10: 'Positive_Correlation', 11: 'Regulator', 12: 'Substrate', 13: 'Upregulator'}
Unique labels in dataset: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13)]
                      precision    recall  f1-score   support

             Agonist       0.57      0.78      0.66       193
          Antagonist       0.76      0.80      0.78       293
         Association       0.75      0.67      0.71      1796
                Bind       0.50      0.71      0.59        17
          Comparison       0.71      0.81      0.76        37
         Cotreatment       0.95      0.41      0.57        44
       Downregulator       0.75      0.78      0.76      1666
Negative_Correlation       0.52 