In [8]:
import pandas as pd
import numpy as np
import re
import random
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-08-10 06:26:21.779838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754807181.797786     970 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754807181.803165     970 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
pip install --upgrade transformers accelerate datasets evaluate

In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',encoding="ISO-8859-1")

In [9]:
# Remove HTML tags
def remove_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

# Remove special characters & digits
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

# Lowercase text
def to_lowercase(text):
    return text.lower()

# Remove stopwords (optional for transformer models, more important for classical ML)
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    return ' '.join([word for word in words if word not in stop_words])

# Apply cleaning
df['review'] = df['review'].apply(remove_html)
df['review'] = df['review'].apply(to_lowercase)
df['review'] = df['review'].apply(remove_special_chars)
# Uncomment below if you want stopword removal
df['review'] = df['review'].apply(remove_stopwords)

# Map sentiment to numeric
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
df.head()

In [10]:
X = df['review']
y = df['label']
import pandas as pd

train_df, test_df = train_test_split(df.drop('sentiment',axis=1), test_size=0.2, random_state=42, stratify=df["label"])


In [11]:
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

In [12]:


# We won't fix model_name here, so tokenization can be redone quickly per model
def tokenize_data(tokenizer, train_dataset, test_dataset, max_length=256):
    def tokenize_fn(examples):
        return tokenizer(
            examples["review"],  # column name from CSV
            truncation=True,
            padding="max_length",
            max_length=max_length
        )

    train_tok = train_dataset.map(tokenize_fn, batched=True)
    test_tok = test_dataset.map(tokenize_fn, batched=True)

    # Keep only the required columns
    train_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    test_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    return train_tok, test_tok

In [13]:

def compute_f1(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, preds, average="weighted")  # weighted handles imbalance
    return {"f1": f1}

In [14]:

def finetune_model(model_name, train_dataset, test_dataset, epochs=2):
    print(f"\n--- Training {model_name} ---")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Tokenize datasets for this model
    train_tok, test_tok = tokenize_data(tokenizer, train_dataset, test_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_dir=f"./logs_{model_name}",
        report_to="none",
        no_cuda=not torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=test_tok,
        tokenizer=tokenizer,
        compute_metrics=compute_f1
    )

    trainer.train()
    metrics = trainer.evaluate()
    return metrics["eval_f1"], model, tokenizer


In [15]:
models_to_try = [
    "bert-base-uncased"
    "roberta-base",
    "google/electra-base-discriminator",
    "microsoft/deberta-base",
    "distilbert-base-uncased"
]

results = {}
trained_models = {}

# Small subset for quick comparison
train_subset = train_dataset.shuffle(seed=42).select(range(5000))
test_subset = test_dataset.shuffle(seed=42).select(range(2000))

for model_name in models_to_try:
    f1, model, tokenizer = finetune_model(model_name, train_subset, test_subset, epochs=2)
    results[model_name] = f1
    trained_models[model_name] = (model, tokenizer)

print("\nModel performance on subset:")
for name, score in results.items():
    print(f"{name}: F1 = {score:.4f}")

best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name}")



--- Training bert-base-uncased ---


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



--- Training roberta-base ---


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



--- Training google/electra-base-discriminator ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Step,Training Loss



--- Training microsoft/deberta-base ---


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]



Step,Training Loss



--- Training distilbert-base-uncased ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss



Model performance on subset:
bert-base-uncased: F1 = 0.8710
roberta-base: F1 = 0.8939
google/electra-base-discriminator: F1 = 0.9125
microsoft/deberta-base: F1 = 0.8980
distilbert-base-uncased: F1 = 0.8740

Best model: google/electra-base-discriminator
