# Ch15: Modeling Sequential Data Using Recurrent Neural Networks

Preprocessing

# Building an RNN model

# LSTM for sentiment analysis task

In [1]:
!nvidia-smi

Wed Jan 28 07:34:35 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import re
from collections import Counter, OrderedDict
from datasets import load_dataset

# ============================================================================
# TOKENIZER
# ============================================================================
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(
        '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()
    )
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

# ============================================================================
# VOCABULARY WRAPPER
# ============================================================================
class VocabWrapper:
    def __init__(self):
        self.stoi = {}
        self.itos = {}
        self.default_index = None
    
    def __call__(self, token):
        if token in self.stoi:
            return self.stoi[token]
        elif self.default_index is not None:
            return self.default_index
        else:
            raise KeyError(f"Token '{token}' not in vocabulary")
    
    def __getitem__(self, token):
        if token in self.stoi:
            return self.stoi[token]
        elif self.default_index is not None:
            return self.default_index
        else:
            raise KeyError(f"Token '{token}' not in vocabulary")
    
    def __len__(self):
        return len(self.stoi)
    
    def insert_token(self, token, idx):
        self.stoi[token] = idx
        self.itos[idx] = token
    
    def set_default_index(self, idx):
        self.default_index = idx

# ============================================================================
# DATASET WRAPPER
# ============================================================================
class IMDBDataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        return (item['label'], item['text'])

# ============================================================================
# COLLATE FUNCTION
# ============================================================================
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        encoded = []
        for token in tokenizer(_text):
            idx = vocab[token]
            encoded.append(idx)
        
        processed_text = torch.tensor(encoded, dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list, dtype=torch.float32)
    padded_text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    return padded_text_list, label_list, torch.tensor(lengths)

# ============================================================================
# LOAD DATA
# ============================================================================
print("Loading datasets...")
train_dataset_hf = load_dataset('stanfordnlp/imdb', split='train')
test_dataset_hf = load_dataset('stanfordnlp/imdb', split='test')

# Split train into train and valid (80/20 split)
split_datasets = train_dataset_hf.train_test_split(test_size=0.2, seed=42)
train_dataset_hf = split_datasets['train']
valid_dataset_hf = split_datasets['test']

# ============================================================================
# BUILD VOCABULARY
# ============================================================================
print("Building vocabulary...")
token_counts = Counter()
for example in train_dataset_hf:
    tokens = tokenizer(example['text'])
    token_counts.update(tokens)

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

# Create vocab with special tokens FIRST
vocab = VocabWrapper()
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)

# Then add all other tokens starting from index 2
for idx, (token, count) in enumerate(sorted_by_freq_tuples, start=2):
    vocab.insert_token(token, idx)

vocab.set_default_index(1)

print(f"Vocab size: {len(vocab)}")

# ============================================================================
# PIPELINES
# ============================================================================
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1. if x == 1 else 0.

# ============================================================================
# CREATE DATASETS
# ============================================================================
train_dataset = IMDBDataset(train_dataset_hf)
valid_dataset = IMDBDataset(valid_dataset_hf)
test_dataset = IMDBDataset(test_dataset_hf)

# ============================================================================
# CREATE DATALOADERS
# ============================================================================
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch, num_workers=0)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, 
                      shuffle=False, collate_fn=collate_batch, num_workers=0)
test_dl = DataLoader(test_dataset, batch_size=batch_size, 
                     shuffle=False, collate_fn=collate_batch, num_workers=0)

print(f"Train samples: {len(train_dataset)}")
print(f"Valid samples: {len(valid_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Test a batch
text_batch, label_batch, length_batch = next(iter(train_dl))
print(f"\nBatch shapes:")
print(f"Text batch: {text_batch.shape}")
print(f"Label batch: {label_batch.shape}")
print(f"Lengths: {length_batch[:5]}")

# ============================================================================
# MODEL
# ============================================================================
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, 
                 n_layers, bidirectional=False, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, dropout=dropout if n_layers > 1 else 0,
                           batch_first=True)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text, lengths):
        embedded = self.dropout(self.embedding(text))
        
        # Pack padded sequence (lengths must be on CPU)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, 
                                               enforce_sorted=False)
        
        # Pass through RNN
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        # Unpack
        output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        
        # Use the last hidden state
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        
        # Pass through FC layer
        out = self.fc(hidden)
        return out

# ============================================================================
# INITIALIZE MODEL
# ============================================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

model = SentimentRNN(
    vocab_size=len(vocab),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    n_layers=2,
    bidirectional=True,
    dropout=0.3
).to(device)

# ============================================================================
# TRAINING SETUP
# ============================================================================
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================
def train(dataloader):
    model.train()
    total_loss = 0
    total_acc = 0
    count = 0
    
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)
        
        optimizer.zero_grad()
        pred = model(text_batch, lengths)
        pred = pred.squeeze()
        
        loss = loss_fn(pred, label_batch)
        acc = binary_accuracy(pred, label_batch)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        total_loss += loss.item()
        total_acc += acc.item()
        count += 1
    
    return total_acc / count, total_loss / count

def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0
    count = 0
    
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device)
            
            pred = model(text_batch, lengths)
            pred = pred.squeeze()
            
            loss = loss_fn(pred, label_batch)
            acc = binary_accuracy(pred, label_batch)
            
            total_loss += loss.item()
            total_acc += acc.item()
            count += 1
    
    return total_acc / count, total_loss / count

# ============================================================================
# TRAINING LOOP
# ============================================================================
num_epochs = 5
torch.manual_seed(1)

print("\nStarting training...")
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch+1} | Train Acc: {acc_train:.4f} | Train Loss: {loss_train:.4f} | Val Acc: {acc_valid:.4f} | Val Loss: {loss_valid:.4f}')

# ============================================================================
# TEST EVALUATION
# ============================================================================
print("\nEvaluating on test set...")
acc_test, loss_test = evaluate(test_dl)
print(f'Test Accuracy: {acc_test:.4f} | Test Loss: {loss_test:.4f}')

  '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')


Loading datasets...
Building vocabulary...
Vocab size: 69299
Train samples: 20000
Valid samples: 5000
Test samples: 25000

Batch shapes:
Text batch: torch.Size([32, 575])
Label batch: torch.Size([32])
Lengths: tensor([172, 394, 400, 160, 540])

Using device: cuda

Starting training...
Epoch 1 | Train Acc: 0.6402 | Train Loss: 0.6220 | Val Acc: 0.6937 | Val Loss: 0.5503
Epoch 2 | Train Acc: 0.7879 | Train Loss: 0.4538 | Val Acc: 0.8193 | Val Loss: 0.4134
Epoch 3 | Train Acc: 0.8524 | Train Loss: 0.3408 | Val Acc: 0.8063 | Val Loss: 0.4267
Epoch 4 | Train Acc: 0.8910 | Train Loss: 0.2646 | Val Acc: 0.8666 | Val Loss: 0.3218
Epoch 5 | Train Acc: 0.9167 | Train Loss: 0.2121 | Val Acc: 0.8885 | Val Loss: 0.2835

Evaluating on test set...
Test Accuracy: 0.8807 | Test Loss: 0.3145
