In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [1]:
!pip install transformers datasets torch scikit-learn

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-1

In [4]:
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import load_dataset, DatasetDict
import torch
import numpy as np
import random
from sklearn.metrics import f1_score

Load and Preprocess IMDB Data

In [5]:
# Download directly using the datasets library
dataset = load_dataset("imdb")
# Split for small subset (for finetuning comparison)
small_train = dataset["train"].shuffle(seed=42).select(range(5000))
small_val = dataset["test"].shuffle(seed=42).select(range(1000))

# Define train/val subsets for initial experiments
data_splits = DatasetDict({
    "train": small_train,
    "val": small_val,
    "full_train": dataset["train"],
    "test": dataset["test"]
})

Tokenization Helper

In [6]:
def tokenize_function(examples, tokenizer):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

def preprocess(data_split, tokenizer):
    return data_split.map(lambda x: tokenize_function(x, tokenizer), batched=True)


Custom F1 Score Function

In [8]:
def custom_f1_score(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    tp = ((preds == 1) & (labels == 1)).sum()
    fp = ((preds == 1) & (labels == 0)).sum()
    fn = ((preds == 0) & (labels == 1)).sum()
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    return {"f1": f1}


 Model Training Loop for 5 Models

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Define maximum sequence length per model
max_len_dict = {
    "microsoft/deberta-v3-base": 512,
    "bert-base-uncased": 512,
    "roberta-base": 512,
    "google/electra-base-discriminator": 512,  # Update to actual ModernBERT model ID if needed
    "distilbert-base-uncased": 512         # Update if Ettin has a lower max length
}

# Updated preprocess function to accept max_length
def preprocess(data_split, tokenizer, max_length=512):
    return data_split.map(
        lambda examples: tokenizer(
            examples["text"],
            padding="max_length",     # pad to fixed length
            truncation=True,          # truncate longer sequences
            max_length=max_length
        ),
        batched=True
    )

MODELS = [
    "microsoft/deberta-v3-base",
    "bert-base-uncased",
    "roberta-base",
    "google/electra-base-discriminator",  # ModernBERT placeholder
    "distilbert-base-uncased"         # Ettin placeholder
]

results = {}

for model_name in MODELS:
    print(f"\nFinetuning: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_length = max_len_dict.get(model_name, 512)

    # Tokenize with fixed padding/truncation
    train_data = preprocess(data_splits["train"], tokenizer, max_length=max_length)
    val_data = preprocess(data_splits["val"], tokenizer, max_length=max_length)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.replace('/', '_')}",
        eval_strategy="epoch",  # for latest transformers
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        logging_steps=100,
        save_strategy="no",
        report_to="none"
    )

    # Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        data_collator=data_collator,
        compute_metrics=custom_f1_score
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    print(f"F1 Score: {metrics['eval_f1']:.4f}")
    results[model_name] = metrics['eval_f1']

print("\n=== Final Results ===")
for model, f1 in results.items():
    print(f"{model}: {f1:.4f}")



Finetuning: microsoft/deberta-v3-base




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3248,0.2573,0.921941
2,0.1187,0.241321,0.939734


F1 Score: 0.9397

Finetuning: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.375,0.272588,0.898551
2,0.1327,0.350289,0.912351


F1 Score: 0.9124

Finetuning: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3778,0.247628,0.920078
2,0.1945,0.290851,0.930653


F1 Score: 0.9307

Finetuning: facebook/modern-bert-base


OSError: facebook/modern-bert-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Define maximum sequence length per model
max_len_dict = {
    "microsoft/deberta-v3-base": 512,
    "bert-base-uncased": 512,
    "roberta-base": 512,
    "google/electra-base-discriminator": 512,  # Update to actual ModernBERT model ID if needed
    "distilbert-base-uncased": 512         # Update if Ettin has a lower max length
}

# Updated preprocess function to accept max_length
def preprocess(data_split, tokenizer, max_length=512):
    return data_split.map(
        lambda examples: tokenizer(
            examples["text"],
            padding="max_length",     # pad to fixed length
            truncation=True,          # truncate longer sequences
            max_length=max_length
        ),
        batched=True
    )

MODELS = [
    "microsoft/deberta-v3-base",
    "bert-base-uncased",
    "roberta-base",
    "google/electra-base-discriminator",  # ModernBERT placeholder
    "distilbert-base-uncased"         # Ettin placeholder
]

results = {}

for model_name in MODELS:
    print(f"\nFinetuning: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_length = max_len_dict.get(model_name, 512)

    # Tokenize with fixed padding/truncation
    train_data = preprocess(data_splits["train"], tokenizer, max_length=max_length)
    val_data = preprocess(data_splits["val"], tokenizer, max_length=max_length)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name.replace('/', '_')}",
        eval_strategy="epoch",  # for latest transformers
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        logging_steps=100,
        save_strategy="no",
        report_to="none"
    )

    # Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        data_collator=data_collator,
        compute_metrics=custom_f1_score
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    print(f"F1 Score: {metrics['eval_f1']:.4f}")
    results[model_name] = metrics['eval_f1']

print("\n=== Final Results ===")
for model, f1 in results.items():
    print(f"{model}: {f1:.4f}")



Finetuning: google/electra-base-discriminator


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,0.2936,0.298136,0.923383
2,0.0954,0.269734,0.93441


F1 Score: 0.9344

Finetuning: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.357,0.323324,0.888889
2,0.147,0.374198,0.904714


F1 Score: 0.9047

=== Final Results ===
google/electra-base-discriminator: 0.9344
distilbert-base-uncased: 0.9047


Find Best Model

In [14]:
best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name} with F1 Score: {results[best_model_name]:.4f}")



Best model: google/electra-base-discriminator with F1 Score: 0.9344


In [16]:
best_model_name = 'microsoft/deberta-v3-base'

 Finetune the Best Model on Full Training Data

In [18]:
tokenizer = AutoTokenizer.from_pretrained(best_model_name)
full_train_data = preprocess(data_splits["full_train"], tokenizer)
val_data = preprocess(data_splits["val"], tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir=f"./results_{best_model_name.replace('/', '_')}_full",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=200,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_data,
    eval_dataset=val_data,
    compute_metrics=custom_f1_score
)

trainer.train()
final_metrics = trainer.evaluate()
print(f"Final F1 on validation: {final_metrics['eval_f1']:.4f}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2029,0.245012,0.922574
2,0.1258,0.202089,0.950153


Final F1 on validation: 0.9502


Run Inference on 10 Random Test Samples

In [20]:
import torch

# Prepare 10 sample reviews from shuffled test set
test_data = data_splits["test"].shuffle(seed=42)
samples = test_data.select(range(10))

# Make sure you use the same tokenizer and max_length as during training
max_length = 512  # or whatever you used for training

model.eval()
device = next(model.parameters()).device  # Will be cuda or cpu based on your setup

table = "| Review # | Text (truncated) | Prediction | Confidence |\n|---|---|---|---|\n"
for idx, sample in enumerate(samples):
    text = sample['text']
    # Tokenize single review
    inputs_encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_length
    )
    # Move tensors to model device (cuda or cpu)
    inputs_encoded = {k: v.to(device) for k, v in inputs_encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs_encoded)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
        pred = probs.argmax()

    label = "Positive" if pred == 1 else "Negative"
    conf = f"{probs[pred]*100:.1f}%"
    table += f"| {idx+1} | {text[:50].replace('|',' ')}... | {label} | {conf} |\n"

print(table)


| Review # | Text (truncated) | Prediction | Confidence |
|---|---|---|---|
| 1 | <br /><br />When I unsuspectedly rented A Thousand... | Positive | 99.9% |
| 2 | This is the latest entry in the long series of fil... | Positive | 99.3% |
| 3 | This movie was so frustrating. Everything seemed e... | Negative | 99.9% |
| 4 | I was truly and wonderfully surprised at "O' Broth... | Positive | 99.9% |
| 5 | This movie spends most of its time preaching that ... | Negative | 99.8% |
| 6 | After a very long time Marathi cinema has come wit... | Positive | 99.9% |
| 7 | This is a really sad, and touching movie! It deals... | Positive | 99.9% |
| 8 | Don't pay any attention to the rave reviews of thi... | Negative | 99.9% |
| 9 | Porn legend Gregory Dark directs this cheesy horro... | Negative | 99.9% |
| 10 | This was a great movie. Something not only for Bla... | Positive | 99.9% |

