# Deep Learning: More Recurrent Neural Networks

In [None]:
import torch
import torch.nn as nn
import torchtext
from torchtext.datasets import SST2
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
import matplotlib.pyplot as plt
# Seed PyTorch's global random number generator so repeated runs match.
torch.manual_seed(12)

# Report the library versions this notebook is running against.
print(
    f"Pytorch version: {torch.__version__}",
    f"Torchtext version: {torchtext.__version__}",
    sep="\n",
)


### Note: Python version

This notebook has been updated to work with modern PyTorch and torchtext APIs (compatible with Python 3.11+)

In [None]:
# DEVICE CONFIGURATION
# Preference order: Apple Silicon (MPS) first, then a CUDA GPU, then the CPU
# as the fallback. The checks short-circuit in that order.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Using device:", device)

In [None]:
# Initialize tokenizer
tokenizer = get_tokenizer('basic_english')

# Load SST2 dataset (binary sentiment: positive/negative).
# torchtext's SST2 yields (text, label) tuples for the 'train' and 'dev'
# splits, but its 'test' split is unlabelled: every sample is a 1-tuple that
# holds only the text. The labelled 'dev' split is therefore used as the
# held-out evaluation set. The variable names keep their "test" prefix so the
# rest of the notebook does not have to change.
train_iter, test_iter = SST2(split=('train', 'dev'))

# Convert iterators to lists for easier processing
train_data = list(train_iter)
test_data = list(test_iter)

print(f"Loaded {len(train_data)} training samples and {len(test_data)} test samples")

In [None]:
# Build vocabulary from training data
def yield_tokens(data_iter, tokenize=None):
    """Yield the token list of every sample in ``data_iter``.

    Args:
        data_iter: Iterable of dataset samples. Each sample is a tuple whose
            first element is the raw text, e.g. ``(text, label)`` for a
            labelled split or ``(text,)`` for an unlabelled one.
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to the notebook-level ``tokenizer``.

    Yields:
        The list of tokens for each sample, in input order.
    """
    if tokenize is None:
        tokenize = tokenizer
    for sample in data_iter:
        # The text is the FIRST field of a sample; the label (if any) follows.
        yield tokenize(sample[0])

# Build the token -> index mapping from the training tokens only.
vocab = build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=['<unk>', '<pad>'],
    max_tokens=10000,
)
pad_idx = vocab['<pad>']                  # index used when padding batches
vocab.set_default_index(vocab['<unk>'])   # unseen tokens resolve to <unk>

print(f"Vocabulary size: {len(vocab)}")

NameError: name 'train_data' is not defined

The SST2 dataset is a binary sentiment classification dataset (Stanford Sentiment Treebank).

Each sample consists of:
- A text string (movie review)
- A label: 0 (negative sentiment) or 1 (positive sentiment)

Note: The original SST dataset had 5 classes (fine-grained), but SST2 simplifies it to binary classification.

In [None]:
# Build vocabulary
# This cell used the legacy torchtext Field API (`TEXT` / `LABEL` objects),
# which is not defined anywhere in this notebook. It now builds the same two
# names, `vocab` and `pad_idx`, with `build_vocab_from_iterator`.
MAX_VOCAB_SIZE = 10000

vocab = build_vocab_from_iterator(
    # The raw text is the first field of every training sample.
    (tokenizer(sample[0]) for sample in train_data),
    specials=['<unk>', '<pad>'],
    max_tokens=MAX_VOCAB_SIZE,  # total size, special tokens included
)
vocab.set_default_index(vocab['<unk>'])  # out-of-vocabulary tokens map to <unk>
pad_idx = vocab['<pad>']

NameError: name 'TEXT' is not defined

### Q: What variable type is `vocab`? What does the `build_vocab_from_iterator` function do?
Hint: The vocab object maps strings (tokens) to integers!

### A: 
YOUR ANSWER HERE

In [None]:
# Process the dataset
def process_dataset(dataset, tokenize=None):
    """Split a labelled dataset into parallel lists of token lists and labels.

    Args:
        dataset: Iterable of ``(text, label)`` samples, where ``text`` is a
            raw string and ``label`` is 0 (negative) or 1 (positive).
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list of token lists and
        ``labels`` a list of ints, both in input order.
    """
    if tokenize is None:
        tokenize = tokenizer

    texts = []
    labels = []
    # Samples are (text, label), in that order.
    for text, label in dataset:
        texts.append(tokenize(text))
        # Labels are already 0 (negative) / 1 (positive); no shift is needed.
        labels.append(int(label))
    return texts, labels

# Tokenize both splits; each call returns a (texts, labels) pair.
(train_texts, train_labels), (test_texts, test_labels) = (
    process_dataset(train_data),
    process_dataset(test_data),
)

NameError: name 'train_data' is not defined

### Sanity Check 🧠
Let's check that the decoded dataset makes sense!

In [None]:
# Integer label -> readable name (0 is negative, 1 is positive).
label_map = dict(enumerate(('negative', 'positive')))

# Change i and see what happens!
i = 0

# Unpacking the token list lets print() insert the separating spaces itself.
print("Text:", *test_texts[i])
print("Label:", label_map[test_labels[i]])

NameError: name 'LABEL' is not defined

----

Now let's create custom DataLoaders with a collate function for batching:

In [None]:
BATCH_SIZE = 32

# Custom collate function to handle variable-length sequences
def collate_batch(batch):
    """Turn a list of ``(tokens, label)`` samples into padded batch tensors.

    Args:
        batch: Samples as produced by the dataset, each a pair of a token
            list and a numeric label.

    Returns:
        A ``(texts, labels, lengths)`` tuple of tensors:
        ``texts`` holds the vocabulary indices padded with ``pad_idx`` (built
        with ``pad_sequence``'s default layout, i.e. sequence dimension
        first), ``labels`` is a float tensor and ``lengths`` a long tensor of
        the unpadded sequence lengths.
    """
    samples = list(batch)

    # Look up every token in the vocabulary: one 1-D LongTensor per sample.
    encoded = [
        torch.tensor([vocab[tok] for tok in tokens], dtype=torch.long)
        for tokens, _ in samples
    ]
    targets = [target for _, target in samples]
    sizes = [len(seq) for seq in encoded]

    return (
        pad_sequence(encoded, padding_value=pad_idx),
        torch.tensor(targets, dtype=torch.float),
        torch.tensor(sizes, dtype=torch.long),
    )

# Create datasets
from torch.utils.data import TensorDataset

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each tokenized text with its label."""

    def __init__(self, texts, labels):
        # Both containers are stored as given (no copy is made).
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples, taken from the text container."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]
        return sample_text, sample_label

# Wrap the tokenized splits so that DataLoader can index them.
train_dataset, test_dataset = (
    TextDataset(train_texts, train_labels),
    TextDataset(test_texts, test_labels),
)

# Create DataLoaders: only the training batches are shuffled.
train_iterator = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
test_iterator = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

NameError: name 'BucketIterator' is not defined

Below is a custom classifier:

In [None]:
# Custom LSTM Classifier
class SimpleLSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear head for sequence classification.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output values per sequence.
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        # batch_first=False: the LSTM consumes time-major input.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=False,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, text_lengths):
        """Return one row of ``output_dim`` raw scores per sequence.

        Args:
            text: Padded token indices, time-major ([seq_len, batch_size]),
                matching ``batch_first=False`` below.
            text_lengths: Unpadded length of every sequence in the batch.
        """
        token_vectors = self.embedding(text)  # [seq_len, batch_size, emb_dim]

        # Packing lets the LSTM skip the padded positions. The lengths are
        # moved to the CPU before packing.
        packed = nn.utils.rnn.pack_padded_sequence(
            token_vectors,
            text_lengths.cpu(),
            batch_first=False,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.lstm(packed)

        # final_hidden: [1, batch_size, hidden_dim] -> drop the layer axis
        sequence_state = final_hidden.squeeze(0)
        return self.fc(sequence_state)  # [batch_size, output_dim]

NameError: name 'nn' is not defined

Here's a helper function so we can vary parameters later:

In [None]:
# TRAINING LOOP FUNCTION
def run_training_loop(num_epochs, random_state):
    """Train a fresh ``SimpleLSTM`` on ``train_iterator``.

    Args:
        num_epochs: Number of full passes over the training data.
        random_state: Seed passed to ``torch.manual_seed`` before the model
            is created.

    Returns:
        A ``(train_losses, train_accs, model)`` tuple. ``train_losses`` and
        ``train_accs`` hold one value per epoch: the mean loss per sample and
        the fraction of correctly classified samples (0-1). ``model`` is the
        trained network.
    """
    # set random seed
    torch.manual_seed(random_state)

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1

    # Send model to device
    model = SimpleLSTM(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, pad_idx).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # init output vars
    train_losses = []
    train_accs = []

    # train for num_epochs
    for epoch in range(num_epochs):
        model.train()
        # Per-sample totals: averaging per-batch means instead would give the
        # (usually shorter) final batch the same weight as a full batch.
        total_loss = 0.0
        total_correct = 0
        total_seen = 0

        # Wrap the iterator with tqdm
        for text, labels, text_lengths in tqdm(train_iterator, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Move to device
            text = text.to(device)
            text_lengths = text_lengths.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            predictions = model(text, text_lengths).squeeze(1)

            # Loss (mean over the batch)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions
            preds = torch.round(torch.sigmoid(predictions))
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size  # undo the batch mean
            total_correct += (preds == labels).sum().item()
            total_seen += batch_size

        avg_loss = total_loss / total_seen
        avg_acc = total_correct / total_seen
        train_losses.append(avg_loss)
        train_accs.append(avg_acc)
        print(f"Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}\n")
    return train_losses, train_accs, model

And another helper function to plot the `train_losses` and `train_accs`:

In [None]:
def plot_training_performance(train_losses, train_accs, num_epochs, random_state):
    """Plot the per-epoch training loss and accuracy side by side.

    Args:
        train_losses: Sequence with one loss value per epoch.
        train_accs: Sequence with one accuracy per epoch, given as a
            fraction in [0, 1]; it is shown as a percentage.
        num_epochs: Number of epochs, used in the figure title only.
        random_state: Seed of the run, used in the figure title only.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(8, 4), tight_layout=True)
    fig.suptitle(f"Model Training for State {random_state} over {num_epochs} Epochs")

    loss_ax.plot(train_losses, "-o", label="Train Loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.grid()
    loss_ax.legend()

    # The axis is labelled in percent, so convert the 0-1 fractions first.
    acc_ax.plot([100 * acc for acc in train_accs], "-o", label="Train Accuracy")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy (%)")
    acc_ax.grid()
    acc_ax.legend()
    plt.show()


Let's train our model!

In [None]:
# Tunable settings for this run
NUM_EPOCHS = 15
RANDOM_STATE = 42

# Train a fresh model and keep the per-epoch history for plotting.
train_losses, train_accs, model = run_training_loop(
    num_epochs=NUM_EPOCHS,
    random_state=RANDOM_STATE,
)

NameError: name 'torch' is not defined

In [None]:
# Visualize the loss and accuracy history of the run above.
plot_training_performance(
    train_losses=train_losses,
    train_accs=train_accs,
    num_epochs=NUM_EPOCHS,
    random_state=RANDOM_STATE,
)

NameError: name 'train_losses' is not defined

### Q: Try changing the value of `NUM_EPOCHS`. What happens to the train loss and accuracy plots? What does this tell us about the optimal number of training epochs?

### A: 
YOUR ANSWER HERE

### Q: Try changing the value of `RANDOM_STATE`. What happens to the train loss and accuracy plots? What does this tell us about the impact of the initial guess (i.e. epoch 0)?

### A: 
YOUR ANSWER HERE

----

## Performance on the Test Set

Here's a helper function to run our model on the test data:

In [None]:

# Evaluate on test set
def run_evaluation_test(eval_model=None, data_iterator=None, eval_device=None):
    """Evaluate a model on held-out batches and print its loss and accuracy.

    Args:
        eval_model: Model to evaluate. Defaults to the notebook-level
            ``model``.
        data_iterator: Iterable of ``(text, labels, text_lengths)`` batches.
            Defaults to the notebook-level ``test_iterator``.
        eval_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        None. The mean loss per sample and the accuracy (in percent) are
        printed.
    """
    net = model if eval_model is None else eval_model
    batches = test_iterator if data_iterator is None else data_iterator
    target_device = device if eval_device is None else eval_device

    net.eval()
    # Sum (rather than average) inside each batch so that the totals can be
    # divided by the true number of samples: averaging per-batch means would
    # over-weight a shorter final batch.
    criterion = nn.BCEWithLogitsLoss(reduction="sum")
    total_loss, total_correct, total_seen = 0.0, 0, 0

    with torch.no_grad():
        for text, labels, text_lengths in batches:
            # Move to device
            text = text.to(target_device)
            text_lengths = text_lengths.to(target_device)
            labels = labels.to(target_device)

            predictions = net(text, text_lengths).squeeze(1)
            total_loss += criterion(predictions, labels).item()

            # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions
            preds = torch.round(torch.sigmoid(predictions))
            total_correct += (preds == labels).sum().item()
            total_seen += labels.size(0)

    print(f"Test Loss: {total_loss / total_seen:.4f}")
    print(f"Test Accuracy: {100 * total_correct / total_seen:.2f}%")


Finally, let's check that our model can generalize by processing the test data:

In [None]:
# Evaluate the trained `model` on `test_iterator` and print the test accuracy.
run_evaluation_test()

Test Accuracy: 56.34%


----
## Further Exploration 🔎

### Q: What happens to the **test** accuracy as `NUM_EPOCHS` is changed? What does this tell us about our neural network?
Note: remember to re-train your model before running on the test set!

### A:
YOUR ANSWER HERE

### Q: What happens to the **test** accuracy as `RANDOM_STATE` is changed? What does this tell us about our neural network?
Note: remember to re-train your model before running on the test set!

### A:
YOUR ANSWER HERE

### Q: What happens to your results if you use the full SST dataset (5 classes) instead of SST2 (binary)?
The SST2 dataset simplifies the problem to binary classification (positive/negative). How do you think the model performance would change if we used the original 5-class problem (very negative, negative, neutral, positive, very positive)?

### A: 
YOUR ANSWER HERE