In [1]:
# Install the transformers library by Hugging Face
!pip install transformers[torch] pandas scikit-learn matplotlib seaborn

print("Libraries installed successfully!")

Libraries installed successfully!


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import io

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "IMDB Dataset.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
First 5 records:                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# --- Assume 'df' is your pre-loaded DataFrame with 50,000 reviews ---
# For demonstration purposes, I will create a dummy 'df'.
# In your environment, you would already have this loaded.
# Example: df = pd.read_csv('IMDB Dataset.csv')

# --- Start of the actual cell ---

# 1. Map labels to integers
# We will map 'positive' to 1 and 'negative' to 0.
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 2. Check the data
print("--- Data Head ---")
print(df.head())
print("\n--- Data Info ---")
df.info()
print("\n--- Label Distribution ---")
print(df['label'].value_counts())


# 3. Split the dataset into training and validation sets (e.g., 90% train, 10% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    random_state=42,
    stratify=df['label'] # Stratify to maintain the same class distribution
)

print(f"\nTraining samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

--- Data Head ---
                                              review sentiment  label
0  One of the other reviewers has mentioned that ...  positive      1
1  A wonderful little production. <br /><br />The...  positive      1
2  I thought this was a wonderful way to spend ti...  positive      1
3  Basically there's a family where a little boy ...  negative      0
4  Petter Mattei's "Love in the Time of Money" is...  positive      1

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
 2   label      50000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB

--- Label Distribution ---
label
1    25000
0    25000
Name: count, dtype: int64

Training samples: 45000
Validation samples: 5000


In [6]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Ensure you have a GPU available
if not torch.cuda.is_available():
    print("Warning: CUDA (GPU) is not available. Training will be very slow.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

Using device: cuda


In [7]:
# Load the pre-trained BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the training and validation texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

print("Tokenization complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenization complete.


In [8]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # The encodings dictionary contains 'input_ids', 'token_type_ids', 'attention_mask'
        # We create a dictionary item for each key and convert the list to a tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the Dataset objects
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

print("PyTorch Datasets created successfully.")

PyTorch Datasets created successfully.


In [9]:
# Load the model and move it to the GPU
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

print("BERT model loaded and moved to GPU.")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT model loaded and moved to GPU.


In [10]:
# Define the function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define the Training Arguments
# This class contains all the hyperparameters and settings for the training loop
training_args = TrainingArguments(
    output_dir='./results',              # Directory to save the model and logs
    num_train_epochs=2,                  # A smaller number of epochs is often enough for fine-tuning
    per_device_train_batch_size=16,       # Batch size per device during training (reduce if you get CUDA out-of-memory errors)
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=500,                    # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir='./logs',                # Directory for storing logs
    logging_steps=100,                   # Log every 100 steps
    eval_strategy="steps",         # Evaluate at specified step intervals
    eval_steps=500,                      # Run evaluation every 500 steps
    save_strategy="steps",               # Save checkpoint every 'save_steps'
    save_steps=500,
    load_best_model_at_end=True,         # Load the best model found during training at the end
    report_to="none"                     # Disable integrations like wandb if not needed
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("Trainer initialized.")

Trainer initialized.


In [11]:
# Start the fine-tuning process
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning finished!")

Starting fine-tuning...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.2798,0.23828,0.9086,0.911486,0.88359,0.9412
1000,0.2323,0.247567,0.9168,0.915413,0.930935,0.9004
1500,0.2278,0.224812,0.9206,0.922597,0.899962,0.9464
2000,0.2572,0.173036,0.9326,0.932855,0.929337,0.9364
2500,0.201,0.186504,0.9382,0.937411,0.949528,0.9256
3000,0.1079,0.226765,0.9388,0.939976,0.922248,0.9584
3500,0.1049,0.218803,0.941,0.942009,0.926169,0.9584
4000,0.1285,0.235126,0.9386,0.937487,0.954791,0.9208
4500,0.1177,0.181802,0.9452,0.945309,0.943426,0.9472
5000,0.1187,0.188702,0.944,0.944067,0.942937,0.9452


Fine-tuning finished!


In [12]:
# Run the final evaluation
final_evaluation_results = trainer.evaluate()

print("\n--- Final Evaluation Results ---")
for key, value in final_evaluation_results.items():
    print(f"{key}: {value:.4f}")


--- Final Evaluation Results ---
eval_loss: 0.1730
eval_accuracy: 0.9326
eval_f1: 0.9329
eval_precision: 0.9293
eval_recall: 0.9364
eval_runtime: 31.5945
eval_samples_per_second: 158.2560
eval_steps_per_second: 9.9070
epoch: 2.0000


In [30]:
from transformers import pipeline

# Create a prediction pipeline
# The trainer.model is the best model from our training
sentiment_analyzer = pipeline("sentiment-analysis", model=trainer.model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Example predictions
new_review_1 = "This movie was absolutely brilliant. The acting, the story, everything was perfect!"
new_review_2 = "It was a complete and utter waste of my time. The plot made no sense."
new_review_3 = "The film was okay, not great but not terrible either."
new_review_4 = "It was a really nice film actaully"
new_review_5 = "perfect"


print(f"Review: '{new_review_1}'")
print(f"Predicted Sentiment: {sentiment_analyzer(new_review_1)[0]}")
print("-" * 30)

print(f"Review: '{new_review_2}'")
print(f"Predicted Sentiment: {sentiment_analyzer(new_review_2)[0]}")
print("-" * 30)

print(f"Review: '{new_review_3}'")
print(f"Predicted Sentiment: {sentiment_analyzer(new_review_3)[0]}")
print("-" * 30)

print(f"Review: '{new_review_4}'")
print(f"Predicted Sentiment: {sentiment_analyzer(new_review_4)[0]}")
print("-" * 30)

print(f"Review: '{new_review_5}'")
print(f"Predicted Sentiment: {sentiment_analyzer(new_review_5)[0]}")
print("-" * 30)


# The pipeline output will be in the format {'label': 'LABEL_1', 'score': 0.99...}
# LABEL_1 corresponds to 'positive' and LABEL_0 to 'negative'

Device set to use cuda:0


Review: 'This movie was absolutely brilliant. The acting, the story, everything was perfect!'
Predicted Sentiment: {'label': 'LABEL_1', 'score': 0.988531231880188}
------------------------------
Review: 'It was a complete and utter waste of my time. The plot made no sense.'
Predicted Sentiment: {'label': 'LABEL_0', 'score': 0.9926389455795288}
------------------------------
Review: 'The film was okay, not great but not terrible either.'
Predicted Sentiment: {'label': 'LABEL_0', 'score': 0.7362467050552368}
------------------------------
Review: 'It was a really nice film actaully'
Predicted Sentiment: {'label': 'LABEL_1', 'score': 0.9725638031959534}
------------------------------
Review: 'perfect'
Predicted Sentiment: {'label': 'LABEL_1', 'score': 0.8860015869140625}
------------------------------
