In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import os

SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)

In [17]:
data_path = '/home/jsarrato/PersonalProjects/ML_Portfolio/Sentiment_Analysis/Transformer/aclImdb_v1/aclImdb'

test_data = {'text': [], 'label': []}
train_data = {'text': [], 'label': []}

data = {'train': train_data, 'test': test_data}

for split in ['train', 'test']:
    for label in ['pos', 'neg']:
        path = f'{data_path}/{split}/{label}'
        for file in os.listdir(path):
            with open(f'{path}/{file}', 'r', encoding='utf-8') as f:
                text = f.read()
                data[split]['text'].append(text)
                data[split]['label'].append(1 if label == 'pos' else 0)


In [18]:
device = 'cpu'

In [19]:

class IMDBDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Convert text to indices
        text_indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in text.split()[:self.max_len]] # Get token of word, or unknown. Cut excess after max_len
        text_indices = text_indices + [self.vocab['<PAD>']] * (self.max_len - len(text_indices)) # Add padding
        
        return torch.tensor(text_indices, dtype=torch.long), torch.tensor(label, dtype=torch.float)

def build_vocab(texts, max_vocab_size='auto'):
    if max_vocab_size == 'auto':
        max_vocab_size = len(set(word for text in texts for word in text.split())) + 2 # +2 for <PAD> and <UNK>

    print(f"Building vocabulary with max size: {max_vocab_size}")

    counter = Counter()
    for text in texts:
        counter.update(text.split())

    vocab = {'<PAD>': 0, '<UNK>': 1} # Set token values for padding spaces or for unknown words
    for word, _ in counter.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab

In [None]:

test_texts, test_labels = data['test']['text'], data['test']['label']
train_texts, val_texts, train_labels, val_labels = train_test_split(data['train']['text'], data['train']['label'], test_size=0.1, random_state=SEED)

vocab = build_vocab(train_texts)

MAX_LEN = 128
BATCH_SIZE = 64

train_dataset = IMDBDataset(train_texts, train_labels, vocab, MAX_LEN)
val_dataset = IMDBDataset(val_texts, val_labels, vocab, MAX_LEN)
test_dataset = IMDBDataset(test_texts, test_labels, vocab, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


Building vocabulary with max size: 262196


In [None]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, output_dim, dropout, max_len):
        super(TransformerModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_dim,
            dropout=dropout
        )
        self.fc = nn.Linear(embed_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        # text: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(text) + self.positional_encoding[:, :text.size(1), :])
        
        # Transformer expects [seq_len, batch_size, embed_dim]
        embedded = embedded.permute(1, 0, 2)
        transformer_output = self.transformer(embedded, embedded)
        
        pooled = transformer_output.mean(dim=0)
        
        return self.fc(pooled).squeeze(1)

VOCAB_SIZE = len(vocab)
EMBED_DIM = 128
NUM_HEADS = 4
NUM_LAYERS = 2
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

model = TransformerModel(VOCAB_SIZE, EMBED_DIM, NUM_HEADS, NUM_LAYERS, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, MAX_LEN).to(device)





In [None]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc



In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(text)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text, labels = text.to(device), labels.to(device)
            
            predictions = model(text)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 20
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

import tqdm

for epoch in tqdm.tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_loader, criterion)
    
    print(f"Epoch: {epoch+1}")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%")

# Test the model
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%")

  5%|▌         | 1/20 [03:26<1:05:30, 206.84s/it]

Epoch: 1
	Train Loss: 0.639 | Train Acc: 62.11%
	Val Loss: 0.620 | Val Acc: 74.69%


 10%|█         | 2/20 [06:52<1:01:49, 206.10s/it]

Epoch: 2
	Train Loss: 0.515 | Train Acc: 74.70%
	Val Loss: 0.524 | Val Acc: 78.05%


 15%|█▌        | 3/20 [10:18<58:26, 206.29s/it]  

Epoch: 3
	Train Loss: 0.455 | Train Acc: 78.77%
	Val Loss: 0.560 | Val Acc: 79.84%


 20%|██        | 4/20 [13:47<55:12, 207.05s/it]

Epoch: 4
	Train Loss: 0.417 | Train Acc: 81.45%
	Val Loss: 0.563 | Val Acc: 79.88%


 25%|██▌       | 5/20 [17:16<51:55, 207.71s/it]

Epoch: 5
	Train Loss: 0.413 | Train Acc: 81.29%
	Val Loss: 0.533 | Val Acc: 79.14%


 30%|███       | 6/20 [20:45<48:35, 208.28s/it]

Epoch: 6
	Train Loss: 0.422 | Train Acc: 80.41%
	Val Loss: 0.529 | Val Acc: 79.53%


 35%|███▌      | 7/20 [24:11<44:59, 207.63s/it]

Epoch: 7
	Train Loss: 0.423 | Train Acc: 81.04%
	Val Loss: 0.781 | Val Acc: 72.97%


 40%|████      | 8/20 [27:34<41:14, 206.20s/it]

Epoch: 8
	Train Loss: 0.468 | Train Acc: 77.88%
	Val Loss: 0.682 | Val Acc: 73.55%


 45%|████▌     | 9/20 [31:04<38:00, 207.32s/it]

Epoch: 9
	Train Loss: 0.469 | Train Acc: 77.91%
	Val Loss: 0.735 | Val Acc: 73.59%


 50%|█████     | 10/20 [34:27<34:18, 205.88s/it]

Epoch: 10
	Train Loss: 0.474 | Train Acc: 77.49%
	Val Loss: 0.561 | Val Acc: 77.89%


 55%|█████▌    | 11/20 [37:50<30:46, 205.21s/it]

Epoch: 11
	Train Loss: 0.473 | Train Acc: 77.37%
	Val Loss: 0.802 | Val Acc: 73.16%


 60%|██████    | 12/20 [41:17<27:24, 205.52s/it]

Epoch: 12
	Train Loss: 0.505 | Train Acc: 75.45%
	Val Loss: 0.745 | Val Acc: 72.19%


In [None]:
import random

misclassified = []

for idx, test_data in enumerate(test_dataset):
    text, label = test_data
    text = text.unsqueeze(0).to(device)
    prediction = model(text)
    prediction = torch.sigmoid(prediction).item()
    
    if (prediction >= 0.5 and label == 0) or (prediction < 0.5 and label == 1):
        misclassified.append((idx, test_texts[idx], label.item(), prediction))

# Randomly select 10 misclassified examples to print
random.shuffle(misclassified)
for example in misclassified[:10]:
    idx, text, true_label, predicted = example
    print(f"Example {idx}:")
    print(f"Text: {text}")
    print(f"True Label: {true_label}, Predicted: {predicted:.2f}")
    print("-" * 50)

Example 22402:
Text: Adored by fans for his unusually charming creativity and by Hollywood for his softball, user-friendly movie-making techniques, Tim Burton tipped the scales too far in formula's favor with his new upset of a cinematic legend, Sleepy Hollow. Following the quest of Ichabod Crane  played by Johnny Depp, delivering this dreary film's only shining point  to the heart of the mystery surrounding a town's seemingly random and gruesome murders by a fabled headless horseman, the story plays out as if it were purposely trying to be repugnantly predictable. Contrived as a children's bedtime story, humdrum character introduction is laced with intended-upon exciting non-engaging chase scenes which, with undeveloped characters fleeing for their lives, produce about as much fright and thrill as The Nightmare Before Christmas.<br /><br />Toss in an endless bundle of old trees for ambience and a wide-eyed, big-busted blonde love interest (Christina Reechi) and Burton has himself a 