Importing Libraries

In [None]:

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from tqdm import tqdm
from torch.optim import AdamW

 Load Dataset

In [None]:
# Read the review corpus. Lines the Python parser cannot handle are skipped
# (on_bad_lines='skip'), so the row count may be lower than the raw file's.
# Rows missing either field are dropped up front: pandas represents empty
# cells as float NaN, which is not valid text for the tokenizer and cannot
# be mixed with string labels when they are encoded.
dataset = (
    pd.read_csv('/content/big_data.csv', engine='python', on_bad_lines='skip')
    .dropna(subset=['review', 'label'])
    .reset_index(drop=True)
)
# Coerce to str so a cell that pandas parsed as a number is still tokenizable.
texts = dataset['review'].astype(str).tolist()
labels = dataset['label'].tolist()

 Encode Labels

In [None]:
# Learn the label vocabulary first, then map every label to its integer id.
encoder = LabelEncoder()
encoder.fit(labels)
y = encoder.transform(labels)

# The number of distinct labels sizes the classifier's output layer later on.
num_classes = len(encoder.classes_)

print(y)
print("Detected classes:", encoder.classes_)

[1 1 1 ... 0 0 0]
Detected classes: ['Social Media' 'book' 'movie' 'sports']


In [None]:
# Sanity check: there must be exactly one encoded label per review text.
print(len(texts), len(y), sep="\n")

121362
121362


 Split Dataset

In [None]:
# Hold out 20% of the data for testing. Stratifying on the encoded labels keeps
# the class proportions the same in both parts; the fixed random_state makes
# the split repeatable.
train_texts, test_texts, y_train, y_test = train_test_split(
    texts,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Report how many reviews ended up on each side of the split.
print(f"Train set size: {len(train_texts)}")
print(f"Test set size: {len(test_texts)}")

Train set size: 97089
Test set size: 24273


 Speed Optimization Settings

In [None]:
train_fraction = 0.1  # Train on 10% of the data for faster runs

# Keep a reference to the untouched training split so that re-running this cell
# always samples from the full split rather than from an already-shrunk one.
# The split is recognised as untouched when train + test still add up to the
# whole dataset, which is also what refreshes the reference after a new split.
_fresh_split = len(train_texts) + len(test_texts) == len(texts)
if _fresh_split or "train_texts_full" not in globals():
    train_texts_full, y_train_full = train_texts, y_train

sample_size = int(train_fraction * len(train_texts_full))
# train_test_split was called with its default shuffling, so a prefix of the
# split is a random sample (though not a stratified one).
train_texts = train_texts_full[:sample_size]
y_train = y_train_full[:sample_size]
print(f"Using {sample_size} samples for training ({train_fraction*100:.0f}% of data)")

BATCH_SIZE = 10
EPOCHS = 2

Using 9708 samples for training (10% of data)


 Load BERT Model & Tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the encoder so that both
# are loaded from the same pretrained model.
model_name = "bert-base-uncased"
# Tokenizer that turns raw review text into model inputs.
tokenizer = BertTokenizer.from_pretrained(model_name)
# Pretrained encoder; it is wrapped by the classifier defined further down.
bert = BertModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 Model Definition

In [None]:
class BertClassifier(nn.Module):
    """Encoder followed by dropout and a single linear classification layer.

    Args:
        bert: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask`` keyword arguments,
            return an object with a ``last_hidden_state`` attribute.
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, bert, num_classes):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenized sequences."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis holds the [CLS] token's state,
        # which is used as the representation of the whole sequence.
        cls_vector = encoded.last_hidden_state[:, 0]
        regularized = self.dropout(cls_vector)
        return self.fc(regularized)

# Seed torch's global RNG before the classification head is randomly
# initialised, so that the initial weights start from a fixed state on every
# fresh run. 42 mirrors the random_state used for the train/test split.
# NOTE(review): some GPU kernels may still be non-deterministic, so results
# can differ slightly between runs even with the seed.
torch.manual_seed(42)
model = BertClassifier(bert, num_classes)

 Custom Dataset

In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        texts: Indexable collection of review texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``. It is
            expected to return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors that carry a leading batch dimension
            of size 1.
        max_len: Length every sequence is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized review at ``idx`` together with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output without its batch dimension) and ``"labels"`` (a 0-d
            ``torch.long`` tensor).
        """
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes only the batch dimension. A bare squeeze()
            # would also collapse the sequence dimension when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }


 Training Setup

In [None]:
# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Wrap both splits so that reviews are tokenized lazily, one item at a time.
train_dataset = ReviewDataset(texts=train_texts, labels=y_train, tokenizer=tokenizer)
test_dataset = ReviewDataset(texts=test_texts, labels=y_test, tokenizer=tokenizer)

# Only the training batches are shuffled; the test loader keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# Cross-entropy over the logits, optimised with AdamW over all model parameters
# (the encoder is fine-tuned together with the classification head).
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

 Training Loop

In [None]:
# Fine-tune for EPOCHS passes over the training loader and report the mean
# per-batch loss after each pass.
for epoch in range(EPOCHS):
    model.train()  # enable dropout for training
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Deliberately not named `labels`: this loop runs at notebook scope, and
        # that name would overwrite the `labels` list built when the dataset was
        # loaded, breaking a later re-run of the label-encoding cell.
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # .item() extracts a Python float, so the running total does not keep
        # the autograd graph alive.
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Avg Training Loss: {avg_loss:.4f}")

Epoch 1/2 Training: 100%|██████████| 971/971 [04:05<00:00,  3.95it/s]


Epoch 1/2, Avg Training Loss: 0.0704


Epoch 2/2 Training: 100%|██████████| 971/971 [04:09<00:00,  3.90it/s]

Epoch 2/2, Avg Training Loss: 0.0195





 Evaluation Function with Metrics

In [None]:
def evaluate_model(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and compute classification metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per example.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the inputs are moved to. When omitted, the device of the
            model's first parameter is used (CPU if the model has none), so the
            inputs always end up where the model is.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are weighted averages over the classes.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # disable dropout for evaluation
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=1)

            preds.extend(predictions.cpu().tolist())
            # The labels are only compared on the host, so they are read
            # directly instead of being moved to the device and back.
            true_labels.extend(batch["labels"].tolist())

    acc = accuracy_score(true_labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')
    return acc, precision, recall, f1

Evaluate on Test Data

In [None]:
# Score the fine-tuned model on the held-out test split.
acc, precision, recall, f1 = evaluate_model(model, test_loader)

print("\n Evaluation Metrics:")
# Each "<name>:" label is left-justified to 11 characters so the values line up.
_metric_rows = (
    ("Accuracy", acc),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
)
print("\n".join(f"{name + ':':<11}{score:.4f}" for name, score in _metric_rows))

Evaluating: 100%|██████████| 2428/2428 [03:30<00:00, 11.53it/s]


 Evaluation Metrics:
Accuracy:  0.9895
Precision: 0.9895
Recall:    0.9895
F1-Score:  0.9893





 Prediction Function

In [None]:
def classify_review(review):
    """Predict the class label of a single review.

    Relies on the notebook-level ``model``, ``tokenizer``, ``encoder`` and
    ``device`` objects.

    Args:
        review: Review text to classify.

    Returns:
        The original (decoded) label of the highest-scoring class.
    """
    model.eval()  # disable dropout for inference

    tokenized = tokenizer(
        review,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(
            tokenized["input_ids"].to(device),
            tokenized["attention_mask"].to(device),
        )

    predicted_index = logits.argmax(dim=1).cpu().item()
    # Map the integer class id back to the label it was encoded from.
    decoded = encoder.inverse_transform([predicted_index])
    return decoded[0]

Example Predictions

In [None]:
print("\nExample Predictions:")
# One hand-written example aimed at each of the detected classes.
sample_reviews = [
    "This movie was absolutely fantastic!",
    "The author describe the story too good",
    "Accidentally sent a love letter to the wrong person. Love note fail: Maximum embarrassment! ",
    "cricket players are too good.",
]
for rev in sample_reviews:
    print(f"Review: {rev}")
    # The classifier predicts which domain a review belongs to (the detected
    # classes are 'Social Media', 'book', 'movie', 'sports'), not a sentiment,
    # so the output is labelled as a category.
    print("Predicted Category:", classify_review(rev))
    print("-" * 50)


Example Predictions:
Review: This movie was absolutely fantastic!
Predicted Sentiment: movie
--------------------------------------------------
Review: The author describe the story too good
Predicted Sentiment: book
--------------------------------------------------
Review: Accidentally sent a love letter to the wrong person. Love note fail: Maximum embarrassment! 
Predicted Sentiment: book
--------------------------------------------------
Review: cricket players are too good.
Predicted Sentiment: sports
--------------------------------------------------
