In [1]:
!pip install torch pandas numpy transformers accelerate datasets tokenizers seqeval evaluate



## Relation extraction (sequence classification)

In [2]:
import os
import re
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Short alias -> Hugging Face Hub checkpoint id for every encoder this
# notebook can fine-tune.
MODEL_REGISTRY = {
    "bert": "google-bert/bert-base-uncased",
    "bert-large": "google-bert/bert-large-uncased",
    "biobert": "dmis-lab/biobert-base-cased-v1.2",
    "bluebert": "bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16",
    "clinical-bert": "emilyalsentzer/Bio_ClinicalBERT",
    "biomed_roberta": "allenai/biomed_roberta_base",
    "pubmedbert": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
}

# Pick the checkpoint by alias so the registry is the single source of truth;
# change MODEL_KEY to switch encoders.
MODEL_KEY = "bert-large"
model_to_use = MODEL_REGISTRY[MODEL_KEY]

# Root folder holding the numbered sub-folders with train.tsv / test.tsv.
base_path = Path("./datasets/preprocessed_RE/biored")

In [4]:
# Per-folder DataFrames gathered by the loop further down in this cell;
# each list is concatenated once after the loop finishes.
all_train_rows, all_test_rows = [], []

def preprocess_text(text: str) -> str:
    """
    Rewrite every ``@TYPE$`` placeholder as ``<eN>TYPE</eN>``.

    ``TYPE`` is a run of uppercase ASCII letters and ``N`` is the 1-based
    position of the placeholder in ``text`` (first match -> ``e1``, second ->
    ``e2``, ...). Everything outside the placeholders is copied verbatim, and
    text without placeholders is returned unchanged.
    """
    pieces = []
    cursor = 0  # index in `text` just past the previous placeholder

    for idx, match in enumerate(re.finditer(r'@([A-Z]+)\$', text)):
        entity_type = match.group(1)
        # Untouched text between the previous placeholder and this one.
        pieces.append(text[cursor:match.start()])
        pieces.append(f"<e{idx+1}>{entity_type}</e{idx+1}>")
        cursor = match.end()

    # Tail after the last placeholder (the whole string when none matched).
    pieces.append(text[cursor:])
    return "".join(pieces)

# Merge the train/test splits found in sub-folders 1..10 of `base_path` into
# `train_accumulated.tsv` / `test_accumulated.tsv` (tab-separated, no header).
#
# NOTE(review): if the numbered sub-folders are cross-validation folds of one
# corpus, every fold's test rows also appear in the other folds' train files,
# so the accumulated train set would contain the accumulated test set and any
# score computed on test_accumulated.tsv would be inflated. The recorded run
# (3195 train vs 355 test rows, exactly 9:1) is consistent with that --
# confirm before trusting the reported metrics.
for i in range(1, 11):
    subfolder = base_path / str(i)
    train_file = subfolder / "train.tsv"
    test_file = subfolder / "test.tsv"

    if train_file.exists():
        df = pd.read_csv(train_file, sep="\t", header=None)
        if df.shape[1] > 2:
            # Keep only the last two columns (presumably text and label).
            df = df.iloc[:, -2:]
        # Work on a copy (the slice above may be a view) and relabel the
        # columns positionally: pd.concat aligns on column *labels*, so frames
        # labelled [0, 1] and [1, 2] would otherwise be NaN-padded.
        df = df.copy()
        df.columns = pd.RangeIndex(df.shape[1])
        df[0] = df[0].apply(preprocess_text)
        all_train_rows.append(df)

    if test_file.exists():
        # Treat the first line as a header only when one of its tab-separated
        # cells is exactly "index"; a plain substring test would also fire on a
        # header-less file whose first sentence contains the word "index".
        with open(test_file, encoding="utf-8") as f:
            first_fields = [field.strip() for field in f.readline().split("\t")]
        has_header = "index" in first_fields
        df = pd.read_csv(test_file, sep="\t", header=0 if has_header else None)
        if df.shape[1] == 3:
            # Drop the leading column, keeping the last two.
            df = df.iloc[:, 1:]
        # Same normalisation as for the train split (header names vs. integer
        # labels would otherwise misalign in pd.concat).
        df = df.copy()
        df.columns = pd.RangeIndex(df.shape[1])
        df[0] = df[0].apply(preprocess_text)
        all_test_rows.append(df)

if all_train_rows:
    train_accumulated = pd.concat(all_train_rows, ignore_index=True)
    train_accumulated.to_csv(base_path / "train_accumulated.tsv", sep="\t", index=False, header=False)
else:
    print("train.tsv not found")

if all_test_rows:
    test_accumulated = pd.concat(all_test_rows, ignore_index=True)
    test_accumulated.to_csv(base_path / "test_accumulated.tsv", sep="\t", index=False, header=False)
else:
    print("test.tsv not found")

In [5]:
# Split name -> path of the accumulated TSV for that split.
data_files = dict(
    train=str(base_path/"train_accumulated.tsv"),
    test=str(base_path/"test_accumulated.tsv"),
)

# Column names are passed explicitly to the CSV loader; the accumulated files
# are written without a header row.
loader_options = {"delimiter": "\t", "column_names": ["text", "label"]}
raw_dataset = load_dataset("csv", data_files=data_files, **loader_options)

# Quick sanity check: show the first training example.
print(raw_dataset["train"][0])

Generating train split: 3195 examples [00:00, 463679.50 examples/s]
Generating test split: 355 examples [00:00, 176064.55 examples/s]

{'text': 'In humans, loss-of-function of the cilia-centrosomal protein <e1>GENE</e1> is associated with Joubert and <e2>DISEASE</e2>, whereas hypomorphic mutations result in Leber congenital amaurosis (LCA), a form of early-onset retinal dystrophy.', 'label': 1}





In [6]:
# Tokenizer belonging to the checkpoint selected in `model_to_use`.
tokenizer = AutoTokenizer.from_pretrained(model_to_use)

def tokenize_example(example):
    """Tokenize ``example["text"]``, truncating and padding to 128 tokens.

    Used with ``map(..., batched=True)`` below, so ``example`` is a batch
    (one list of values per column) rather than a single row.
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["text"], **tokenizer_options)

# Adds the tokenizer's output fields as new columns on every split.
tokenized_dataset = raw_dataset.map(tokenize_example, batched=True)

Map: 100%|████████████████████████| 3195/3195 [00:00<00:00, 18514.33 examples/s]
Map: 100%|██████████████████████████| 355/355 [00:00<00:00, 18159.37 examples/s]


In [7]:
# The raw string column is dropped once tokenized, and the remaining columns
# are exposed as torch tensors.
# NOTE: re-running this cell without re-running the tokenization cell will
# fail, because "text" has already been removed.
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format(type="torch")

In [8]:
# Selected encoder with a 2-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_to_use, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation: switched off during training ---
    eval_strategy="no",
    load_best_model_at_end=False,
    # NOTE(review): presumably inert while evaluation is off and
    # load_best_model_at_end is False -- confirm or remove.
    metric_for_best_model="accuracy",
    # --- checkpointing ---
    output_dir="./results",
    save_strategy="epoch",
    save_total_limit=2,
    # --- logging ---
    logging_dir="./logs",
    logging_steps=100,
)

  trainer = Trainer(


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    arg-max over the last axis of ``logits``; precision, recall and F1 are
    computed with ``average='binary'``.

    Returns a dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    # Tuple of (precision, recall, f1, support); support is not reported.
    scores = precision_recall_fscore_support(labels, predictions, average='binary')
    return dict(
        accuracy=accuracy_score(labels, predictions),
        precision=scores[0],
        recall=scores[1],
        f1=scores[2],
    )

In [None]:
# Fine-tuning driver: trains `model` on the tokenized train split using the
# hyper-parameters in `training_args`.
# NOTE(review): no eval_dataset is passed and eval_strategy is "no", so
# `compute_metrics` is only used if trainer.evaluate(...) is called explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases
    # (scheduled for removal in v5); `processing_class=` is its replacement.
    # Requires a transformers version that already accepts `processing_class`.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [10]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
100,0.5084
200,0.3112
300,0.1709
400,0.0922
500,0.0384
600,0.0372


TrainOutput(global_step=600, training_loss=0.19304501334826152, metrics={'train_runtime': 80.5001, 'train_samples_per_second': 119.068, 'train_steps_per_second': 7.453, 'total_flos': 2233140529374720.0, 'train_loss': 0.19304501334826152, 'epoch': 3.0})

In [11]:
# Output folders for the fine-tuned weights and for the tokenizer files.
model_path = "./RE_model/model"
tokenizer_path = "./RE_model/tokenizer"

# Persist the trained model, then the tokenizer; the last call's return value
# (the written file paths) is displayed as the cell result.
trainer.save_model(model_path)
tokenizer.save_pretrained(tokenizer_path)


('./RE_model/tokenizer/tokenizer_config.json',
 './RE_model/tokenizer/special_tokens_map.json',
 './RE_model/tokenizer/vocab.txt',
 './RE_model/tokenizer/added_tokens.json',
 './RE_model/tokenizer/tokenizer.json')

In [12]:
#from transformers import AutoModelForSequenceClassification, AutoTokenizer

#model = AutoModelForSequenceClassification.from_pretrained("./RE_model/model")
#tokenizer = AutoTokenizer.from_pretrained("./RE_model/tokenizer")

## Test dataset

## Loading model and prediction

In [13]:
# Reload the saved checkpoint from disk and score it on test_accumulated.tsv.
# NOTE(review): if the accumulated train file was built from overlapping
# cross-validation folds, these test rows were also seen during training and
# the numbers printed below are not a held-out estimate -- confirm.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("./RE_model/model")
model = model.eval().to(target_device)
tokenizer = AutoTokenizer.from_pretrained("./RE_model/tokenizer")
device = next(model.parameters()).device

df = pd.read_csv(
    "./datasets/preprocessed_RE/biored/test_accumulated.tsv",
    sep="\t",
    header=None,
    names=["text", "label"],
)
dataset = Dataset.from_pandas(df)

def _tokenize_batch(e):
    """Tokenize a batch of texts with the same settings used for training."""
    return tokenizer(e["text"], padding="max_length", truncation=True, max_length=128)

tokenized = dataset.map(_tokenize_batch, batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Only these fields are forwarded to the model; "label" stays on the CPU side.
model_input_keys = ["input_ids", "attention_mask"]
test_loader = torch.utils.data.DataLoader(tokenized, batch_size=16)

preds, labels = [], []
for batch in test_loader:
    inputs = {k: v.to(device) for k, v in batch.items() if k in model_input_keys}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Predicted class = arg-max over the class dimension.
    preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
    labels.extend(batch["label"].numpy())

accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

print("Evaluation over test_accumulated.tsv")
print(f"Accuracy : {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall   : {recall:.3f}")
print(f"F1 Score : {f1:.3f}")

Map: 100%|██████████████████████████| 355/355 [00:00<00:00, 17566.99 examples/s]


Evaluation over test_accumulated.tsv
Accuracy : 0.992
Precision: 1.000
Recall   : 0.989
F1 Score : 0.994
