# Import Libraries

In [1]:
import re
import html
import torch
import numpy as np

from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

# Load Dataset

In [2]:
# Small IMDB sample stored as a raw CSV on GitHub. load_dataset reads it with the
# "csv" builder; since no split is named, everything lands in a single 'train' split.
dataset_url = "https://raw.githubusercontent.com/ktxdev/transfomers-hf/refs/heads/master/data/imdb_small.csv"
dataset = load_dataset("csv", data_files=dataset_url)

# Inspect Dataset

## Dataset dictionary

In [3]:
# Show the DatasetDict summary: available splits, column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review', 'sentiment'],
        num_rows: 1000
    })
})

## Single record inspection

In [4]:
# Look at one raw record (arbitrary row 34) to see what needs cleaning,
# e.g. the literal "<br />" tags embedded in the review text
dataset['train'][34]

{'Unnamed: 0': 76,
 'review': "The Last Hard Men finds James Coburn an outlaw doing a long sentence breaking free from a chain gang. Do he and his friends head for the Mexican border from jail and safety. No they don't because Coburn has a mission of revenge. To kill the peace officer who brought him in and in the process killed his woman.<br /><br />That peace officer is Charlton Heston who is now retired and he knows what Coburn is after. As he explains it to his daughter, Barbara Hershey, Coburn was holed up in a shack and was involved in a Waco like standoff. His Indian woman was killed in the hail of bullets fired. It's not something he's proud of, she was a collateral casualty in a manhunt.<br /><br />Lest we feel sorry for Coburn he lets us know full well what an evil man he truly is. Heston is his usual stalwart hero, but the acting honors in The Last Hard Men go to James Coburn. He blows everyone else off the screen when he's on. <br /><br />Coburn gets the bright idea of maki

# Remove unwanted columns

In [5]:
# 'Unnamed: 0' looks like a leftover index column from the CSV export and is not
# used for training. Drop it only while it is still present, so that re-running
# this cell does not raise on the already-cleaned dataset.
unwanted_columns = [
    column for column in ['Unnamed: 0']
    if column in dataset['train'].column_names
]
if unwanted_columns:
    dataset = dataset.remove_columns(unwanted_columns)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 1000
    })
})

# Data Pre-processing

In [6]:
def preprocess_text(example):
    """Clean the raw review text of a single record.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;`` -> ``&``).
      2. Replace every ``<br>`` / ``<br/>`` / ``<br />`` tag with a newline.
         Matching is case-insensitive because HTML tag names are, so ``<BR>``
         is handled as well.
      3. Strip leading and trailing whitespace.

    Only line-break tags are rewritten; any other markup is left untouched.

    Args:
        example: Mapping with a ``'review'`` string entry.

    Returns:
        The same ``example`` object with ``'review'`` replaced by the cleaned text.
    """
    text = html.unescape(example['review'])
    # Unescape first so that entity-encoded breaks ("&lt;br /&gt;") are caught too.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    text = text.strip()
    example['review'] = text
    return example

# Run the cleanup over every record (one example per call), then display the
# first record to check that the <br /> tags have become newlines
dataset = dataset.map(preprocess_text)
dataset['train'][0]

{'review': "One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.\n\nThe first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.\n\nIt is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.\n\nI would say the main appeal of the show is due to the fact that it go

# Tokenize dataset

In [7]:
# The same checkpoint name is reused further down to load the model,
# which keeps tokenizer and model weights matched.
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Maps the string values of the 'sentiment' column to integer class ids
label2id = {'negative': 0, 'positive': 1}

def tokenize_dataset(examples):
    """Tokenize a batch of reviews and attach integer class labels.

    Args:
        examples: Batch mapping with a ``'review'`` list of strings and a
            ``'sentiment'`` list of label names.

    Returns:
        The tokenizer output for the reviews (truncation enabled) with an
        extra ``'labels'`` entry holding one class id per example.

    Raises:
        KeyError: If a sentiment value is not a key of ``label2id``.
    """
    encoded = tokenizer(examples['review'], truncation=True)
    # Look every sentiment name up in label2id, preserving batch order
    encoded['labels'] = list(map(label2id.__getitem__, examples['sentiment']))
    return encoded

# batched=True hands tokenize_dataset lists of column values instead of single records
tokenized_dataset = dataset.map(tokenize_dataset, batched=True)

# Split dataset into train, validation and test

In [8]:
# train_test_split shuffles before splitting. Without a fixed seed the three
# splits (and every metric reported below) change on each kernel restart.
SPLIT_SEED = 42

# Hold out 30% of the data; the remaining 70% is the training set
train_test = tokenized_dataset['train'].train_test_split(test_size=0.3, seed=SPLIT_SEED)
# Cut the held-out part in half: 15% validation, 15% test
val_test = train_test['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Assemble the final three-way split
final_dataset = DatasetDict({
    "train": train_test["train"],
    "validation": val_test["train"],
    "test": val_test["test"],
})
final_dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
    test: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
})

# Define evaluation metrics

In [9]:
def compute_metrics(eval_pred):
    """Score model outputs with accuracy and weighted precision, recall and F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores, which are reduced with an argmax over axis 1;
            ``labels`` holds the true class ids.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each rounded to 3 decimal places.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    results = {"accuracy": accuracy_score(labels, predicted)}
    # The remaining metrics share the same call shape and weighted averaging
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        results[metric_name] = metric_fn(labels, predicted, average="weighted")

    return {metric_name: np.round(value, 3) for metric_name, value in results.items()}

# Load Pre-trained model

In [10]:
# Inverse of label2id, stored in the model config so predictions carry readable names
id2label = {0: 'negative', 1: 'positive'}

# Pre-trained encoder plus a new two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Use the hardware accelerator when one is available, otherwise stay on the CPU
if torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
else:
    device = "cpu"
model.to(device)
print(f"Using {device} device")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using mps device


# Create the training arguments

In [11]:
# Hyperparameters for the fine-tuning run
learning_rate = 2e-4
batch_size = 32
num_epochs = 5

# Log, evaluate and checkpoint once per epoch. Evaluation and saving share the
# same cadence, which load_best_model_at_end relies on.
epoch_cadence = {
    "logging_strategy": "epoch",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
}

training_args = TrainingArguments(
    output_dir="bert-imdb-sentiment-analyzer",
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    **epoch_cadence,
)

# Freeze some parameters to reduce computational costs

In [12]:
# Freeze every parameter of the base model
model.base_model.requires_grad_(False)

# Re-enable gradients for the last 2 transformer layers
model.base_model.encoder.layer[-2:].requires_grad_(True)

# Keep the classification head trainable
model.classifier.requires_grad_(True)

# Fine-tune the model

In [13]:
# Pads the examples of each batch to a common length at collation time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the train split; the validation split is scored after every epoch
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=final_dataset['train'],
    eval_dataset=final_dataset['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5411,0.232654,0.927,0.928,0.927,0.926
2,0.2893,0.199887,0.927,0.931,0.927,0.927
3,0.1901,0.198141,0.92,0.924,0.92,0.92
4,0.1033,0.261396,0.933,0.939,0.933,0.934
5,0.0583,0.293878,0.933,0.939,0.933,0.934


TrainOutput(global_step=110, training_loss=0.23642236427827315, metrics={'train_runtime': 178.1988, 'train_samples_per_second': 19.641, 'train_steps_per_second': 0.617, 'total_flos': 920724249350400.0, 'train_loss': 0.23642236427827315, 'epoch': 5.0})

# Evaluate on the held-out test set

In [14]:
# Run the fine-tuned model over the test split, which was not used during training
predictions = trainer.predict(final_dataset['test'])

# Score the raw outputs against the true labels with the same metric function
# that was used for the per-epoch validation
logits = predictions.predictions
labels = predictions.label_ids
metrics = compute_metrics((logits, labels))
print(metrics)

{'accuracy': np.float64(0.893), 'precision': np.float64(0.897), 'recall': np.float64(0.893), 'f1': np.float64(0.893)}
