<a href="https://colab.research.google.com/github/harshit1441/NLP/blob/main/NLP_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Install required packages and download dataset from Google Drive
!pip install -q gdown
!pip install -q torch torchvision torchaudio
!pip install -q scikit-learn matplotlib tqdm

# Download dataset from the provided Google Drive link
# Replace the file id if needed. File id from your link: 1A8mqzrqeTUj8Rbh52w0mru_GxONjHVJv
import gdown
url = "https://drive.google.com/uc?id=1A8mqzrqeTUj8Rbh52w0mru_GxONjHVJv"
output = "reviews_dataset.csv"
gdown.download(url, output, quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1A8mqzrqeTUj8Rbh52w0mru_GxONjHVJv
To: /content/reviews_dataset.csv
100%|██████████| 66.2M/66.2M [00:00<00:00, 93.2MB/s]


'reviews_dataset.csv'

In [2]:
# Cell 2: imports and basic config
import os
import random
import re
import math
from collections import Counter
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


Device: cpu


In [3]:
# Cell 3: load dataset (assumes CSV with columns "review" and "label" or similar)
df = pd.read_csv("reviews_dataset.csv")

# If the CSV structure is different, inspect first few rows
print("Columns:", df.columns.tolist())
df.head()


Columns: ['review', 'sentiment']


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Cell 4: preprocessing functions
def clean_text(text: str) -> str:
    # Basic cleaning: lowercase, remove URLs, remove non-alphanumeric (keep spaces), collapse spaces
    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # remove urls
    text = re.sub(r'[^a-z0-9\s]', ' ', text)        # remove punctuation but keep alphanum
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize(text: str) -> List[str]:
    return text.split()

# Apply cleaning and tokenization
TEXT_COL = 'review'   # change if your csv uses a different column
LABEL_COL = 'label'   # change if different

# try to auto-detect if columns differ
if TEXT_COL not in df.columns:
    # attempt guesses
    possible_text = [c for c in df.columns if 'review' in c.lower() or 'text' in c.lower()]
    if possible_text:
        TEXT_COL = possible_text[0]
    else:
        raise ValueError("Couldn't find text column. Update TEXT_COL manually.")

if LABEL_COL not in df.columns:
    possible_label = [c for c in df.columns if 'label' in c.lower() or 'sentiment' in c.lower() or 'target' in c.lower()]
    if possible_label:
        LABEL_COL = possible_label[0]
    else:
        raise ValueError("Couldn't find label column. Update LABEL_COL manually.")

df['clean'] = df[TEXT_COL].astype(str).apply(clean_text)
df['tokens'] = df['clean'].apply(tokenize)

# If labels are strings like 'pos'/'neg' or 'positive'/'negative', convert to 0/1
if df[LABEL_COL].dtype == object:
    # naive map
    df[LABEL_COL] = df[LABEL_COL].str.lower().map({'positive':1, 'pos':1, 'p':1, 'negative':0, 'neg':0, 'n':0})
    if df[LABEL_COL].isnull().any():
        # try another mapping if values are '1'/'0' strings
        df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors='coerce').fillna(0).astype(int)

print("Sample cleaned text & tokens:")
display(df[['clean','tokens', LABEL_COL]].head())


Sample cleaned text & tokens:


Unnamed: 0,clean,tokens,sentiment
0,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...",1
1,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...",1
2,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...",1
3,basically there s a family where a little boy ...,"[basically, there, s, a, family, where, a, lit...",0
4,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, in, the, time, of, m...",1


In [5]:
# Cell 5: Build vocab (indexing starts from 1; 0 reserved for PAD)
MIN_FREQ = 2    # words with freq < MIN_FREQ -> OOV (tunable)
MAX_VOCAB = 30000

counter = Counter()
for tokens in df['tokens']:
    counter.update(tokens)

# keep tokens with freq >= MIN_FREQ
vocab_list = [w for w, c in counter.most_common(MAX_VOCAB) if c >= MIN_FREQ]
# Reserve indices:
# 0 -> PAD
# 1 -> UNK (out-of-vocab)  (we'll put UNK at index 1)
word2idx = {word: idx+2 for idx, word in enumerate(vocab_list)}  # start at 2
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1
idx2word = {idx: w for w, idx in word2idx.items()}

vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

def encode_tokens(tokens: List[str], word2idx: Dict[str,int], max_len:int) -> Tuple[List[int], int]:
    seq = []
    for t in tokens[:max_len]:
        seq.append(word2idx.get(t, word2idx['<UNK>']))
    length = len(seq)
    if length < max_len:
        seq = seq + [word2idx['<PAD>']] * (max_len - length)
    return seq, length

# set max sequence length
MAX_LEN = 100   # tune: 100-300 depending on average review length
df['length'] = df['tokens'].apply(lambda t: min(len(t), MAX_LEN))
df['encoded'] = df['tokens'].apply(lambda t: encode_tokens(t, word2idx, MAX_LEN)[0])
df['seq_len'] = df['tokens'].apply(lambda t: min(len(t), MAX_LEN))

# Quick check
print(df[['tokens','encoded','seq_len']].head(2))


Vocab size: 30002
                                              tokens  \
0  [one, of, the, other, reviewers, has, mentione...   
1  [a, wonderful, little, production, br, br, the...   

                                             encoded  seq_len  
0  [30, 5, 2, 79, 2063, 48, 1065, 13, 102, 151, 4...      100  
1  [4, 396, 122, 356, 8, 8, 2, 1384, 2981, 7, 55,...      100  


In [6]:
# Cell 6: split dataset and create PyTorch Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=SEED, stratify=df[LABEL_COL])
train_df, val_df = train_test_split(train_df, test_size=0.1765, random_state=SEED, stratify=train_df[LABEL_COL])
# 0.1765 * 0.85 ~= 0.15 -> val ~15%, test ~15%, train ~70%

print("Sizes: train, val, test:", len(train_df), len(val_df), len(test_df))

class ReviewsDataset(Dataset):
    def __init__(self, df):
        self.seqs = df['encoded'].tolist()
        self.labels = df[LABEL_COL].astype(int).tolist()
        self.lengths = df['seq_len'].tolist()
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return torch.tensor(self.seqs[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float), self.lengths[idx]

def collate_fn(batch):
    # batch: list of (seq_tensor, label_tensor, length)
    batch_sorted = sorted(batch, key=lambda x: x[2], reverse=True)
    seqs = torch.stack([item[0] for item in batch_sorted])
    labels = torch.stack([item[1] for item in batch_sorted])
    lengths = torch.tensor([item[2] for item in batch_sorted], dtype=torch.long)
    return seqs.to(DEVICE), labels.to(DEVICE), lengths.to(DEVICE)

BATCH_SIZE = 64
train_loader = DataLoader(ReviewsDataset(train_df), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(ReviewsDataset(val_df), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(ReviewsDataset(test_df), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


Sizes: train, val, test: 34998 7502 7500


In [7]:
# Cell 7: Model definitions

class BaseRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, hidden_size=128, num_layers=1, rnn_type='lstm', bidirectional=False, dropout=0.3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn_type = rnn_type.lower()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.num_directions = 2 if bidirectional else 1

        if self.rnn_type == 'lstm':
            self.rnn = nn.LSTM(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=(dropout if num_layers>1 else 0.0))
        elif self.rnn_type == 'gru':
            self.rnn = nn.GRU(embed_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=(dropout if num_layers>1 else 0.0))
        elif self.rnn_type == 'rnn':
            self.rnn = nn.RNN(embed_dim, hidden_size, num_layers=num_layers, nonlinearity='tanh', batch_first=True, bidirectional=bidirectional, dropout=(dropout if num_layers>1 else 0.0))
        else:
            raise ValueError("Invalid rnn_type")

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size * self.num_directions, 1)  # binary

    def forward(self, x, lengths):
        # x: (batch, seq_len)
        emb = self.embed(x)  # (batch, seq_len, embed_dim)
        # pack
        packed = pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=True)
        packed_out, hidden = self.rnn(packed)
        # handle hidden depending on rnn type
        if self.rnn_type == 'lstm':
            h_n, c_n = hidden
        else:
            h_n = hidden
        # h_n shape: (num_layers * num_directions, batch, hidden_size)
        # take the last layer's hidden states
        last_h = h_n[-self.num_directions:]  # (num_directions, batch, hidden_size)
        if self.num_directions == 2:
            # concat forward and backward
            last_h = torch.cat([last_h[0], last_h[1]], dim=1)  # (batch, hidden_size*2)
        else:
            last_h = last_h.squeeze(0)  # (batch, hidden_size)

        out = self.dropout(last_h)
        logits = self.classifier(out).squeeze(1)  # (batch,)
        return logits  # raw logits (use BCEWithLogitsLoss)

# convenience builders
def build_rnn(vocab_size, **kwargs):
    return BaseRNNClassifier(vocab_size, rnn_type='rnn', **kwargs)

def build_lstm(vocab_size, **kwargs):
    return BaseRNNClassifier(vocab_size, rnn_type='lstm', **kwargs)

def build_bilstm(vocab_size, **kwargs):
    return BaseRNNClassifier(vocab_size, rnn_type='lstm', bidirectional=True, **kwargs)


In [8]:
# Cell 8: training and evaluation utilities
import time

def train_epoch(model, dataloader, optimizer, criterion, clip=1.0):
    model.train()
    running_loss = 0.0
    for seqs, labels, lengths in dataloader:
        optimizer.zero_grad()
        logits = model(seqs, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item() * seqs.size(0)
    return running_loss / len(dataloader.dataset)

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    all_logits = []
    all_labels = []
    for seqs, labels, lengths in dataloader:
        logits = model(seqs, lengths)
        loss = criterion(logits, labels)
        running_loss += loss.item() * seqs.size(0)
        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())
    avg_loss = running_loss / len(dataloader.dataset)
    all_logits = np.concatenate(all_logits)
    all_labels = np.concatenate(all_labels)
    # convert logits to probabilities with sigmoid
    probs = 1 / (1 + np.exp(-all_logits))
    preds = (probs >= 0.5).astype(int)
    acc = accuracy_score(all_labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, preds, average='binary', zero_division=0)
    return {'loss': avg_loss, 'acc': acc, 'precision': precision, 'recall': recall, 'f1': f1, 'probs': probs, 'preds': preds, 'labels': all_labels}


In [None]:
# Cell 9: hyperparameters and training both models
EMBED_DIM = 150
HIDDEN_SIZE = 128
NUM_LAYERS = 1
DROPOUT = 0.3
LR = 1e-3
NUM_EPOCHS = 5

models = {
    'RNN': build_rnn(vocab_size, embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, bidirectional=False, dropout=DROPOUT).to(DEVICE),
    'LSTM': build_lstm(vocab_size, embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, bidirectional=False, dropout=DROPOUT).to(DEVICE),
    'BiLSTM': build_bilstm(vocab_size, embed_dim=EMBED_DIM, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT).to(DEVICE)
}

hist = {}
criterion = nn.BCEWithLogitsLoss()

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    best_val_loss = float('inf')
    hist[name] = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
    for epoch in range(1, NUM_EPOCHS+1):
        t0 = time.time()
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_metrics = evaluate(model, val_loader, criterion)
        hist[name]['train_loss'].append(train_loss)
        hist[name]['val_loss'].append(val_metrics['loss'])
        hist[name]['val_acc'].append(val_metrics['acc'])
        hist[name]['val_f1'].append(val_metrics['f1'])
        elapsed = time.time() - t0
        print(f"Epoch {epoch}/{NUM_EPOCHS} | train_loss: {train_loss:.4f} | val_loss: {val_metrics['loss']:.4f} | val_acc: {val_metrics['acc']:.4f} | val_f1: {val_metrics['f1']:.4f} | {elapsed:.0f}s")
        # save best
        if val_metrics['loss'] < best_val_loss:
            best_val_loss = val_metrics['loss']
            torch.save(model.state_dict(), f"best_{name}.pt")
    # load best model back
    model.load_state_dict(torch.load(f"best_{name}.pt"))



=== Training RNN ===
Epoch 1/5 | train_loss: 0.6977 | val_loss: 0.6876 | val_acc: 0.5429 | val_f1: 0.5335 | 50s
Epoch 2/5 | train_loss: 0.6595 | val_loss: 0.6686 | val_acc: 0.6049 | val_f1: 0.5733 | 51s
Epoch 3/5 | train_loss: 0.6141 | val_loss: 0.6401 | val_acc: 0.6637 | val_f1: 0.6608 | 55s
Epoch 4/5 | train_loss: 0.5645 | val_loss: 0.6004 | val_acc: 0.7103 | val_f1: 0.7208 | 51s
Epoch 5/5 | train_loss: 0.5125 | val_loss: 0.5798 | val_acc: 0.7353 | val_f1: 0.7325 | 50s

=== Training LSTM ===


In [None]:
# Cell 10: Plot training & validation loss for all models
plt.figure(figsize=(10,6))
for name in hist:
    plt.plot(hist[name]['train_loss'], label=f'{name} train')
    plt.plot(hist[name]['val_loss'], '--', label=f'{name} val')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Train & Val Loss')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Cell 11: Evaluate each model on the test set and show metrics
results = {}
for name, model in models.items():
    print(f"\n--- Test evaluation: {name} ---")
    metrics = evaluate(model, test_loader, criterion)
    results[name] = metrics
    print(f"Loss: {metrics['loss']:.4f} | Acc: {metrics['acc']:.4f} | Precision: {metrics['precision']:.4f} | Recall: {metrics['recall']:.4f} | F1: {metrics['f1']:.4f}")
    print("Classification report:")
    preds = metrics['preds']
    labels = metrics['labels']
    print(classification_report(labels, preds, digits=4))
    cm = confusion_matrix(labels, preds)
    print("Confusion matrix:\n", cm)


In [None]:
# Cell 12: Ensemble predictions (majority voting & average probs)
# Get probs from each model for test set
probs_by_model = {}
preds_by_model = {}
labels = None

for name, model in models.items():
    m = model
    m.eval()
    all_probs = []
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for seqs, labs, lengths in test_loader:
            logits = m(seqs, lengths)
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            all_probs.append(probs)
            all_preds.append(preds)
            all_labels.append(labs.cpu().numpy())
    probs_by_model[name] = np.concatenate(all_probs)
    preds_by_model[name] = np.concatenate(all_preds)
    labels = np.concatenate(all_labels)

# Average probabilities
avg_probs = np.mean(np.stack([probs_by_model[n] for n in probs_by_model]), axis=0)
avg_preds = (avg_probs >= 0.5).astype(int)

# Majority voting
votes = np.sum(np.stack([preds_by_model[n] for n in preds_by_model]), axis=0)
maj_preds = (votes >= 2).astype(int)  # at least 2 of 3

def print_metrics(y_true, y_pred, title):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    print(f"{title} -> Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

print("\n--- Ensemble: Average Probabilities ---")
print_metrics(labels, avg_preds, "AvgProb Ensemble")

print("\n--- Ensemble: Majority Voting ---")
print_metrics(labels, maj_preds, "Majority Vote Ensemble")


In [None]:
# Cell 13: Build a summary table and plot bar chart for Acc/F1
import pandas as pd

rows = []
for name, m in results.items():
    rows.append({
        'model': name,
        'test_loss': results[name]['loss'],
        'accuracy': results[name]['acc'],
        'precision': results[name]['precision'],
        'recall': results[name]['recall'],
        'f1': results[name]['f1']
    })
# add ensembles
def metrics_from_preds(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    return acc, prec, rec, f1

acc, prec, rec, f1 = metrics_from_preds(labels, avg_preds)
rows.append({'model':'Ensemble_AvgProb', 'test_loss':np.nan, 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1})
acc, prec, rec, f1 = metrics_from_preds(labels, maj_preds)
rows.append({'model':'Ensemble_Majority', 'test_loss':np.nan, 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1})

summary_df = pd.DataFrame(rows).sort_values('f1', ascending=False).reset_index(drop=True)
display(summary_df)

# bar plot for accuracy & f1
plt.figure(figsize=(10,5))
x = np.arange(len(summary_df))
width = 0.35
plt.bar(x - width/2, summary_df['accuracy'], width, label='Accuracy')
plt.bar(x + width/2, summary_df['f1'], width, label='F1')
plt.xticks(x, summary_df['model'], rotation=45)
plt.ylim(0,1)
plt.ylabel('Score')
plt.title('Test Accuracy & F1 by Model')
plt.legend()
plt.grid(axis='y')
plt.show()


In [None]:
# Cell 14: Save models and print a short report summary
for name, model in models.items():
    torch.save(model.state_dict(), f"{name}_final.pt")
print("Saved model weights.")

# Short textual summary (you can copy-paste into report)
print("\n=== Quick Report Summary ===\n")
print("Preprocessing:")
print(f"- Lowercased, removed URLs and non-alphanumeric characters; tokenized by whitespace.")
print(f"- Built vocab of size {vocab_size} (min_freq={MIN_FREQ}); index 0 reserved for PAD, 1 for UNK.")
print(f"- Sequences padded/truncated to MAX_LEN = {MAX_LEN}.\n")

print("Model architectures:")
print("- RNN: embedding -> RNN -> dropout -> linear (BCEWithLogits).")
print("- LSTM: embedding -> LSTM -> dropout -> linear.")
print("- BiLSTM: same as LSTM but bidirectional; final hidden is concatenation of forward+backward.\n")

print("Training:")
print(f"- Criterion: BCEWithLogitsLoss; Optimizer: Adam (lr={LR}).")
print(f"- Trained for {NUM_EPOCHS} epochs. Monitor training & validation loss for overfitting.\n")

print("Evaluation insights:")
print("- See summary table above for final test accuracy / precision / recall / f1 for each model and ensembles.")
print("- Compare RNN vs LSTM: LSTM/BiLSTM typically perform better on longer sequences due to gating handling long-term dependencies.")
print("- Ensembles: average probabilities or majority voting may improve metrics if models make complementary errors.\n")

print("Recommended next steps:")
print("- Try pre-trained embeddings (GloVe/FastText) and freeze/unfreeze embedding layers.")
print("- Tune MAX_LEN, embed_dim, hidden_size, and dropout; try 2-layer LSTM or attention mechanism.")
print("- Use class weights or balanced sampling if dataset is imbalanced.")
