In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np

In [2]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
print('Finsihed loading libraries')

Finsihed loading libraries


In [11]:
# Load the parquet files
train_df = pd.read_parquet('trainBio.parquet')
test_df = pd.read_parquet('testBio.parquet')

# Display the first few rows to understand the structure
print(train_df.head())
print(test_df.head())

                                              tokens  \
0  [Selegiline, -, induced, postural, hypotension...   
1  [OBJECTIVES, :, The, United, Kingdom, Parkinso...   
2  [CONCLUSION, :, This, study, confirms, our, pr...   
3  [High, doses, of, vitamin, D, are, known, to, ...   
4  [We, then, used, hyperbaric, oxygen, at, an, a...   

                                                tags  sentence_id  
0  [0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, ...     BC5CDR-0  
1  [0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, ...     BC5CDR-1  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    BC5CDR-10  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, ...   BC5CDR-100  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  BC5CDR-1000  
                                              tokens  \
0  [Torsade, de, pointes, ventricular, tachycardi...   
1  [The, authors, describe, the, case, of, a, 56,...   
2  [Risk, of, transient, hyperammonemic, encephal...   
3  [The, use, of, anorexigens, 

In [12]:
from datasets import Dataset

def convert_to_hf_dataset(df):
    dataset = Dataset.from_pandas(df)
    return dataset

train_dataset = convert_to_hf_dataset(train_df)
test_dataset = convert_to_hf_dataset(test_df)

# Check the dataset format
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['tokens', 'tags', 'sentence_id'],
    num_rows: 15488
})
Dataset({
    features: ['tokens', 'tags', 'sentence_id'],
    num_rows: 5737
})


In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding="max_length", max_length=77)
    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # If the word index is None, it means it's a special token (e.g., [CLS], [SEP])
                # We'll assign a special label (-100) to it
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # If the word index changed, we'll assign the corresponding label to the current token
                label_ids.append(label[word_idx])
            else:
                # If the word index is the same as the previous one, it means it's a subword of the same token
                # We'll also assign a special label (-100) to it
                label_ids.append(-100)
            previous_word_idx = word_idx
        # Adjust the length of label_ids to match the number of tokens
        # If the number of tokens is less than the number of labels, truncate the labels
        label_ids = label_ids[:len(tokenized_inputs['input_ids'][i])]
        # Pad the label_ids if the number of tokens is more than the number of labels
        label_ids = label_ids + [-100] * (len(tokenized_inputs['input_ids'][i]) - len(label_ids))
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the function to the datasets
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/15488 [00:00<?, ? examples/s]

Map:   0%|          | 0/5737 [00:00<?, ? examples/s]

In [21]:
# Mapping of tags to IDs
tag2id = {'O': 0, 'B-Disease': 1, 'I-Disease': 2}

# Load the model with the appropriate number of labels
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=len(tag2id))


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0849,0.042024
2,0.0268,0.044297
3,0.0132,0.050491


TrainOutput(global_step=2904, training_loss=0.03456401299510777, metrics={'train_runtime': 13562.607, 'train_samples_per_second': 3.426, 'train_steps_per_second': 0.214, 'total_flos': 1825892919518976.0, 'train_loss': 0.03456401299510777, 'epoch': 3.0})

In [22]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.050490591675043106, 'eval_runtime': 416.9459, 'eval_samples_per_second': 13.76, 'eval_steps_per_second': 0.861, 'epoch': 3.0}
