# Homework 3

In [None]:
!pip install tokenizers transformers torchmetrics

In [39]:
import numpy as np
import sklearn
import torch
import os
import pandas as pd
import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchmetrics

In [40]:
device = 'cpu'

In [None]:
if not os.path.exists('IMDB-Dataset.csv'):
  !wget -O IMDB-Dataset.csv -q "https://www.dropbox.com/scl/fi/0c7zc2adk1mgwgut5w80w/IMDB-Dataset.csv?rlkey=1drfg4zw36mhu32ndy2ihnygw&dl=1"

In [42]:
df = pd.read_csv('IMDB-Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [43]:
text = list(df['review'].str.replace('<br />',''))
labels = np.array(df['sentiment'].map({'negative':0,'positive':1}))

In [44]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Example of how to tokenize text:


In [45]:
seq = text[0][:10]
seq

'One of the'

In [46]:
token_ids = tokenizer(seq)['input_ids']
token_ids

[101, 1448, 1104, 1103, 102]

In [47]:
tokenizer.decode(token_ids+[0,0,0])

'[CLS] One of the [SEP] [PAD] [PAD] [PAD]'

Bag of Words Model:

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(
    text, labels, test_size=0.1, random_state=42, stratify=labels
)

# TF-IDF weighted histograms using the top 1000 words
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# MLP Classifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
mlp.fit(X_train_tfidf, y_train)

train_accuracy = mlp.score(X_train_tfidf, y_train)
test_accuracy = mlp.score(X_test_tfidf, y_test)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

Train accuracy: 1.0
Test accuracy: 0.8654


RNN Model:

In [49]:
# pre-tokenized dataset
pretokenized_texts = [tokenizer(t)['input_ids'] for t in text]

# 90/10 train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    pretokenized_texts, labels, test_size=0.1, random_state=42, stratify=labels
)

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


In [52]:
# Custom Dataset class for tokenized text
import random

class IMDBDataset(Dataset):
    def __init__(self, tokenized_texts, labels, max_seq_len=100):
        self.tokenized_texts = tokenized_texts
        self.labels = labels
        self.max_seq_len = max_seq_len
        
    def __len__(self):
        return len(self.tokenized_texts)
    
    def __getitem__(self, idx):
        tokens = self.tokenized_texts[idx]
        # Longer than max_seq_len take random contiguous sub-sequence
        if len(tokens) > self.max_seq_len:
            start_idx = random.randint(0, len(tokens) - self.max_seq_len)
            tokens = tokens[start_idx:start_idx + self.max_seq_len]
        else:
            # Pad the sequence with token index 0 if too short
            tokens = tokens + [0] * (self.max_seq_len - len(tokens))
        tokens = torch.tensor(tokens, dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return tokens, label

In [55]:
# Create dataset and dataloader objects
max_seq_len = 100
batch_size = 32

train_dataset = IMDBDataset(train_texts, train_labels, max_seq_len)
test_dataset = IMDBDataset(test_texts, test_labels, max_seq_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size)

# GRU-based sentiment classifier model
class GRUSentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_size=100, num_layers=3):
        super(GRUSentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded)
        last_hidden = hidden[-1]
        logits = self.fc(last_hidden)
        return logits.squeeze(1)

In [56]:
vocab_size = tokenizer.vocab_size
model = GRUSentimentClassifier(vocab_size=vocab_size, embedding_dim=100, hidden_size=100, num_layers=3)

model = model.to(device)

# Adam optimizer with learning rate 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.BCEWithLogitsLoss()
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * inputs.size(0)
        preds = (torch.sigmoid(logits) > 0.5).float()
        total_correct += (preds == labels).sum().item()
        total_examples += inputs.size(0)
        
    avg_loss = total_loss / total_examples
    train_accuracy = total_correct / total_examples
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}")

# Save the trained model parameters
torch.save(model.state_dict(), "gru_model.pth")

Epoch 1/10 | Loss: 0.6442, Accuracy: 0.6164
Epoch 2/10 | Loss: 0.4953, Accuracy: 0.7598
Epoch 3/10 | Loss: 0.4214, Accuracy: 0.8042
Epoch 4/10 | Loss: 0.3831, Accuracy: 0.8267
Epoch 5/10 | Loss: 0.3559, Accuracy: 0.8405
Epoch 6/10 | Loss: 0.3398, Accuracy: 0.8495
Epoch 7/10 | Loss: 0.3198, Accuracy: 0.8603
Epoch 8/10 | Loss: 0.3061, Accuracy: 0.8677
Epoch 9/10 | Loss: 0.2923, Accuracy: 0.8742
Epoch 10/10 | Loss: 0.2712, Accuracy: 0.8850


In [57]:
vocab_size = tokenizer.vocab_size
model = GRUSentimentClassifier(vocab_size, embedding_dim=100, hidden_size=100, num_layers=3)

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
model = model.to(device)

# Load saved model params
model.load_state_dict(torch.load("gru_model.pth", map_location=device))

criterion = nn.BCEWithLogitsLoss()

# Evaluate on the test set
model.eval()
total_test_loss = 0.0
total_test_correct = 0
total_test_examples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        logits = model(inputs)
        loss = criterion(logits, labels)
        total_test_loss += loss.item() * inputs.size(0)
        preds = (torch.sigmoid(logits) > 0.5).float()
        total_test_correct += (preds == labels).sum().item()
        total_test_examples += inputs.size(0)
        
avg_test_loss = total_test_loss / total_test_examples
test_accuracy = total_test_correct / total_test_examples
print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.3763, Test Accuracy: 0.8356
