# Assignment 2 - Recurrent Neural Networks



## Programming (Full points: 100)

In this assignment, our goal is to use PyTorch to implement Recurrent Neural Networks (RNN) for sentiment analysis task. Sentiment analysis is to classify sentences (input) into certain sentiments (output labels), which includes positive, negative and neutral.

We will use a benckmark dataset, SST, for this assignment.
* we download the SST dataset from torchtext package, and do some preprocessing to build vocabulary and split the dataset into training/validation/test sets. You don't need to modify the code in this step.


In [2]:
import copy

import torch
from torch import nn
from torch import optim
import torchtext
from torchtext import data
from torchtext import datasets


# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.LabelField()

# Load data splits
train_data, val_data, test_data = datasets.SST.splits(TEXT, LABEL)

# Build dictionary
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

# Hyperparameters
vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
padding_idx = TEXT.vocab.stoi['<pad>']
embedding_dim = 128
hidden_dim = 256

# Build iterators, and move them to the GPU
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size=32,
    device=device)

* define the training and evaluation function in the cell below.
### (25 points)


In [1]:
def train(model, iterator, optimizer, criterion, clip_grad=1.0):
    model.train()
    total_loss = 0.0
    total_correct = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for batch in iterator:
        optimizer.zero_grad()
        text = batch.text.to(device)  # Move text data to the GPU
        label = batch.label.to(device)  # Move label data to the GPU
        
        predictions = model(text)
        loss = criterion(predictions, label)
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
#         torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        
        optimizer.step()
        
        total_loss += loss.item()
        total_correct += (predictions.argmax(1) == label).sum().item()
    
    return total_loss / len(iterator), total_correct / len(iterator.dataset)

# Define a function for evaluation
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        for batch in iterator:
            text = batch.text.to(device)  # Move text data to the GPU
            label = batch.label.to(device)  # Move label data to the GPU
            
            predictions = model(text)
            loss = criterion(predictions, label)
            
            total_loss += loss.item()
            total_correct += (predictions.argmax(1) == label).sum().item()
    
    return total_loss / len(iterator), total_correct / len(iterator.dataset)

* build a RNN model for sentiment analysis in the cell below.
We have provided several hyperparameters we needed for building the model, including vocabulary size (vocab_size), the word embedding dimension (embedding_dim), the hidden layer dimension (hidden_dim), the number of layers (num_layers) and the number of sentence labels (label_size). Please fill in the missing codes, and implement a RNN model.
### (40 points)

In [3]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, padding_idx, num_layers=1, momentum=0.9):
        super(RNNClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.label_size = label_size
        self.num_layers = num_layers

        # Define the layers required for sentiment analysis.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers, batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, self.label_size)
        self.dropout = nn.Dropout(0.5)

        # Set the momentum for the optimizer
        self.momentum = momentum

    def zero_state(self, batch_size):
        # Implement the function, which returns an initial hidden state.
        return torch.zeros(self.num_layers, batch_size, self.hidden_dim)

    def forward(self, text):
        # Implement the forward function of the model.
        h0 = self.zero_state(text.size(0)).to(text.device)
        embedding = self.embedding(text)
        rnn_output, _ = self.rnn(embedding, h0)
        output = self.fc(rnn_output[:, -1, :])
        return output


* train the model and compute the accuracy in the cell below.
### (20 points)

In [9]:
from torch.optim import lr_scheduler

# Define a learning rate scheduler


num_epochs = 10
learning_rate = 0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, label_size, padding_idx,3,None)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)


for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    
    scheduler.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')


# model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch 1/10, Train Loss: 1.1942, Train Acc: 0.3848, Val Loss: 1.1052, Val Acc: 0.3969
Epoch 2/10, Train Loss: 1.1512, Train Acc: 0.3878, Val Loss: 1.2136, Val Acc: 0.3688
Epoch 3/10, Train Loss: 1.1722, Train Acc: 0.3817, Val Loss: 1.0687, Val Acc: 0.3960
Epoch 4/10, Train Loss: 1.1172, Train Acc: 0.3965, Val Loss: 1.1583, Val Acc: 0.3951
Epoch 5/10, Train Loss: 1.1490, Train Acc: 0.3857, Val Loss: 1.1252, Val Acc: 0.4069
Epoch 6/10, Train Loss: 1.1081, Train Acc: 0.3908, Val Loss: 1.0992, Val Acc: 0.3951
Epoch 7/10, Train Loss: 1.1201, Train Acc: 0.3991, Val Loss: 1.1310, Val Acc: 0.3951
Epoch 8/10, Train Loss: 1.1071, Train Acc: 0.3917, Val Loss: 1.1056, Val Acc: 0.4069
Epoch 9/10, Train Loss: 1.1001, Train Acc: 0.3976, Val Loss: 1.0986, Val Acc: 0.3951
Epoch 10/10, Train Loss: 1.0941, Train Acc: 0.4085, Val Loss: 1.1041, Val Acc: 0.3678
Test Loss: 1.0928, Test Acc: 0.3538


* try to train a model with better accuracy in the cell below. For example, you can use different optimizers such as SGD and Adam. You can also compare different hyperparameters and model size.
### (15 points), to obtain FULL point in this problem, the accuracy needs to be higher than 70%

In [15]:
embedding_dim2 = 300
hidden_dim2 = 512  
num_epochs2 = 20
layers = 2
momentum = None

learning_rate_adam = 0.001
learning_rate_sgd = 0.0001
model = RNNClassifier(vocab_size, embedding_dim2, hidden_dim2, label_size, padding_idx,layers,momentum)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_adam = optim.Adam(model.parameters(), lr=learning_rate_adam)
optimizer_sgd = optim.SGD(model.parameters(), lr=learning_rate_sgd)

scheduler_adam = lr_scheduler.StepLR(optimizer_adam, step_size=1, gamma=0.95)
scheduler_sdg = lr_scheduler.StepLR(optimizer_sgd, step_size=1, gamma=0.95)

for epoch in range(num_epochs2):
    train_loss, train_acc = train(model, train_iter, optimizer_adam, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    
    scheduler_adam.step()
    
    print(f'Adam - Epoch {epoch+1}/{num_epochs2}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss (Adam): {test_loss:.4f}, Test Acc (Adam): {test_acc:.4f}')



Adam - Epoch 1/20, Train Loss: 1.0959, Train Acc: 0.3866, Val Loss: 1.0727, Val Acc: 0.3606
Adam - Epoch 2/20, Train Loss: 1.0723, Train Acc: 0.4168, Val Loss: 1.0984, Val Acc: 0.4051
Adam - Epoch 3/20, Train Loss: 1.0688, Train Acc: 0.4080, Val Loss: 1.0834, Val Acc: 0.3778
Adam - Epoch 4/20, Train Loss: 1.0693, Train Acc: 0.4018, Val Loss: 1.0665, Val Acc: 0.4069
Adam - Epoch 5/20, Train Loss: 1.0639, Train Acc: 0.4115, Val Loss: 1.0791, Val Acc: 0.3751
Adam - Epoch 6/20, Train Loss: 1.0683, Train Acc: 0.4038, Val Loss: 1.0862, Val Acc: 0.3969
Adam - Epoch 7/20, Train Loss: 1.0665, Train Acc: 0.4088, Val Loss: 1.0983, Val Acc: 0.3978
Adam - Epoch 8/20, Train Loss: 1.0700, Train Acc: 0.3984, Val Loss: 1.0697, Val Acc: 0.4223
Adam - Epoch 9/20, Train Loss: 1.0596, Train Acc: 0.4149, Val Loss: 1.0721, Val Acc: 0.3896
Adam - Epoch 10/20, Train Loss: 1.0639, Train Acc: 0.4142, Val Loss: 1.0779, Val Acc: 0.3896
Adam - Epoch 11/20, Train Loss: 1.0664, Train Acc: 0.4060, Val Loss: 1.0757, Va

In [16]:
for epoch in range(num_epochs2):
    train_loss, train_acc = train(model, train_iter, optimizer_sgd, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    
    scheduler_sdg.step()
    
    print(f'SGD - Epoch {epoch+1}/{num_epochs2}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
test_loss_sgd, test_acc_sgd = evaluate(model, test_iter, criterion)
print(f'Test Loss (SGD): {test_loss_sgd:.4f}, Test Acc (SGD): {test_acc_sgd:.4f}')

SGD - Epoch 1/20, Train Loss: 1.0530, Train Acc: 0.4106, Val Loss: 1.0590, Val Acc: 0.4114
SGD - Epoch 2/20, Train Loss: 1.0471, Train Acc: 0.4143, Val Loss: 1.0597, Val Acc: 0.4114
SGD - Epoch 3/20, Train Loss: 1.0473, Train Acc: 0.4205, Val Loss: 1.0596, Val Acc: 0.4033
SGD - Epoch 4/20, Train Loss: 1.0474, Train Acc: 0.4205, Val Loss: 1.0598, Val Acc: 0.4033
SGD - Epoch 5/20, Train Loss: 1.0472, Train Acc: 0.4199, Val Loss: 1.0598, Val Acc: 0.4033
SGD - Epoch 6/20, Train Loss: 1.0471, Train Acc: 0.4225, Val Loss: 1.0598, Val Acc: 0.4033
SGD - Epoch 7/20, Train Loss: 1.0471, Train Acc: 0.4225, Val Loss: 1.0601, Val Acc: 0.4033
SGD - Epoch 8/20, Train Loss: 1.0473, Train Acc: 0.4225, Val Loss: 1.0603, Val Acc: 0.4033
SGD - Epoch 9/20, Train Loss: 1.0473, Train Acc: 0.4225, Val Loss: 1.0600, Val Acc: 0.4033
SGD - Epoch 10/20, Train Loss: 1.0472, Train Acc: 0.4225, Val Loss: 1.0604, Val Acc: 0.4033
SGD - Epoch 11/20, Train Loss: 1.0473, Train Acc: 0.4225, Val Loss: 1.0602, Val Acc: 0.40

In [13]:
# Define hyperparameter ranges
import itertools


embedding_dim_range = [128, 256, 512, 1024]
hidden_dim_range = [128,256, 512, 1024]
num_layers_range = [1, 2, 3,4]
learning_rate_range = [0.0001, 0.001, 0.01]
momentum_range = [0.8, 0.85, 0.9, None]
optimizer_list = ['SGD', 'Adam', 'RMSprop']

# Create a list of hyperparameter combinations
hyperparameters_grid = list(itertools.product(
    embedding_dim_range, hidden_dim_range, num_layers_range,
    learning_rate_range, momentum_range, optimizer_list))

# Convert the list of combinations into a list of dictionaries
hyperparameters_grid = [
    {
        'embedding_dim2': emb_dim,
        'hidden_dim2': hid_dim,
        'num_layers': num_layers,
        'optimizer': optimizer,
        'learning_rate': lr,
        'momentum': momentum
    }
    for emb_dim, hid_dim, num_layers, lr, momentum, optimizer in hyperparameters_grid
]
print(hyperparameters_grid)
print(len(hyperparameters_grid))

[{'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.8}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.0001, 'momentum': 0.8}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'RMSprop', 'learning_rate': 0.0001, 'momentum': 0.8}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.85}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.0001, 'momentum': 0.85}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'RMSprop', 'learning_rate': 0.0001, 'momentum': 0.85}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.9}, {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'Adam', 'learning_rate': 0.0001, 'momentum':

In [12]:
import os


def save_checkpoint(model, optimizer, iteration, checkpoint_path):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'iteration': iteration,
    }
    torch.save(checkpoint, checkpoint_path)
    
# Define a function to load the latest checkpoint if available
def load_latest_checkpoint(model, optimizer, checkpoint_dir):
    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith('.pt')]
    if not checkpoint_files:
        return model, optimizer, 0
    
    # Sort checkpoint files by name (which includes epoch and iteration)
    checkpoint_files.sort()
    latest_checkpoint = checkpoint_files[-1]
    
    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
    checkpoint = torch.load(checkpoint_path)
    
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    iteration = checkpoint['iteration']
    
    return model, optimizer, iteration

# Your existing code continues here

# Define the directory where checkpoints are saved
checkpoint_dir = 'checkpoints/'

model, optimizer, start_iteration = load_latest_checkpoint(model, optimizer, checkpoint_dir)


In [15]:
import itertools
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader


best_hyperparameters = None
best_val_accuracy = 0.0
best_epochs_to_accuracy = None  # Track the epoch range for best validation accuracy
desired_accuracy = 0.7  # Set the desired accuracy threshold
num_epochs2 = 100  # Change the number of epochs as needed
iteration = 0


# Loop through hyperparameters
for hyperparameters in hyperparameters_grid:
    # Extract hyperparameters
    embedding_dim2 = hyperparameters['embedding_dim2']
    hidden_dim2 = hyperparameters['hidden_dim2']
    num_layers = hyperparameters['num_layers']
    optimizer_type = hyperparameters['optimizer']
    learning_rate = hyperparameters['learning_rate']

    # Create your model and optimizer based on hyperparameters
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = RNNClassifier(vocab_size, embedding_dim2, hidden_dim2, label_size, padding_idx, num_layers=num_layers).to(device)
    criterion = nn.CrossEntropyLoss()
    
    if optimizer_type == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    
    

    # Training and evaluation loop
    epochs_to_accuracy = None
    for epoch in range(num_epochs2):
        iteration += 1
        train_loss, train_acc = train(model, train_iter, optimizer, criterion)
        # Save the model's state after every iteration
        # checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_{hyperparameters}_{optimizer_type}_epoch_{epoch + 1}_iteration_{iteration}.pt')
        # save_checkpoint(model, optimizer, iteration, checkpoint_path)
        
        val_loss, val_acc = evaluate(model, val_iter, criterion)

        print(f'Hyperparameters: {hyperparameters}')
        print(f'Optimizer: {optimizer_type}, Epoch {epoch + 1}/{num_epochs2}, '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        if val_acc >= desired_accuracy:
            print(f'Reached the desired accuracy of {desired_accuracy * 100}% on the validation set at epoch {epoch + 1}.')
            epochs_to_accuracy = epoch + 1
            if val_acc > best_val_accuracy:
                best_val_accuracy = val_acc
                best_hyperparameters = hyperparameters
                best_epochs_to_accuracy = epochs_to_accuracy
            break
        

# After the loop, you can evaluate on the test set using the best hyperparameters
if best_hyperparameters:
    print(f'Best hyperparameters: {best_hyperparameters}')
    print(f'Best validation accuracy: {best_val_accuracy:.4f}')

    # Create a model with the best hyperparameters
    best_model = RNNClassifier(vocab_size, best_hyperparameters['embedding_dim2'], best_hyperparameters['hidden_dim2'], label_size, padding_idx, num_layers=best_hyperparameters['num_layers']).to(device)

    # Choose the optimizer based on the best hyperparameters
    if best_hyperparameters['optimizer'] == 'SGD':
        best_optimizer = optim.SGD(best_model.parameters(), lr=best_hyperparameters['learning_rate'])
    elif best_hyperparameters['optimizer'] == 'Adam':
        best_optimizer = optim.Adam(best_model.parameters(), lr=best_hyperparameters['learning_rate'])

    # Train the best model on the entire training dataset
    for epoch in range(num_epochs2):
        train_loss, train_acc = train(best_model, train_iter, best_optimizer, criterion)

    # Evaluate the best model on the test set
    test_loss, test_acc = evaluate(best_model, test_iter, criterion)
    print(f'Test Loss ({best_hyperparameters["optimizer"]}): {test_loss:.4f}, Test Acc ({best_hyperparameters["optimizer"]}): {test_acc:.4f}')
else:
    print("No hyperparameter combination achieved the desired accuracy.")
if best_val_accuracy == 0.0:
    print("No hyperparameter combination reached the desired accuracy.")
    print(f"Best found hyperparameters: {best_hyperparameters}")


Hyperparameters: {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.8}
Optimizer: SGD, Epoch 1/100, Train Loss: 1.1096, Train Acc: 0.2010, Val Loss: 1.1241, Val Acc: 0.2925
Hyperparameters: {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.8}
Optimizer: SGD, Epoch 2/100, Train Loss: 1.1061, Train Acc: 0.3668, Val Loss: 1.1197, Val Acc: 0.3206
Hyperparameters: {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.8}
Optimizer: SGD, Epoch 3/100, Train Loss: 1.1024, Train Acc: 0.3849, Val Loss: 1.1157, Val Acc: 0.3270
Hyperparameters: {'embedding_dim2': 128, 'hidden_dim2': 128, 'num_layers': 1, 'optimizer': 'SGD', 'learning_rate': 0.0001, 'momentum': 0.8}
Optimizer: SGD, Epoch 4/100, Train Loss: 1.0996, Train Acc: 0.3823, Val Loss: 1.1122, Val Acc: 0.3406
Hyperparameters: {'embedding_dim

In [None]:
print(best_hyperparameters)

In [None]:
class LSTMWithBatchNorm(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, num_layers, dropout, padding_idx):
        super(LSTMWithBatchNorm, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.label_size = label_size
        self.num_layers = num_layers

        # Define the layers required for sentiment analysis.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers, batch_first=True, dropout=dropout)
        self.layernorm = nn.LayerNorm(self.hidden_dim)  # Add BatchNorm layer
        self.fc = nn.Linear(self.hidden_dim, self.label_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # Implement the forward function of the model.
        embedded = self.embedding(text)
        lstm_output, _ = self.lstm(embedded)
        # Apply BatchNorm
        normalized_output = self.layernorm(lstm_output)
        output = self.fc(normalized_output[:, -1, :])
        return output



In [None]:
import matplotlib.pyplot as plt

# Train and evaluate your model
num_epochs = 20
learning_rate = 0.001

vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
EMBEDDING_DIM = 512
HIDDEN_DIM = 512
NUM_LAYERS = 2  # Make sure this is an integer
DROPOUT = 0.5  # Specify as a float value
PADDING_IDX = TEXT.vocab.stoi['<pad>']

optimizer = optim.Adam(model.parameters(),lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
criterion = nn.CrossEntropyLoss()

# Create the model instance
model = LSTMWithBatchNorm(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, label_size, NUM_LAYERS, DROPOUT, PADDING_IDX).to(device)

# Training
CLIP = 1.0  # Gradient clipping threshold

# Lists to store training and validation loss
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    
    # Append the loss values to the lists
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    scheduler.step()
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

# Plot the loss curve
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs+1), train_losses, label='Training Loss')
plt.plot(range(1, num_epochs+1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss Over Time')
plt.show()

# Evaluation
model.eval()
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
import torch.nn as nn

class GRUWithLayerNorm(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, num_layers, dropout, padding_idx):
        super(GRUWithLayerNorm, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.label_size = label_size
        self.num_layers = num_layers

        # Define the layers required for sentiment analysis.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers, batch_first=True, dropout=dropout)
        self.layernorm = nn.LayerNorm(self.hidden_dim)  # Add LayerNorm layer
        self.fc = nn.Linear(self.hidden_dim, self.label_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # Implement the forward function of the model.
        embedded = self.embedding(text)
        gru_output, _ = self.gru(embedded)
        # Apply LayerNorm
        normalized_output = self.layernorm(gru_output)
        output = self.fc(normalized_output[:, -1, :])
        return output


In [8]:


# Train and evaluate your model
num_epochs = 20
learning_rate = 0.0001

vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
EMBEDDING_DIM = 1024
HIDDEN_DIM = 1024
NUM_LAYERS = 4  # Make sure this is an integer
DROPOUT = 0.3  # Specify as a float value
PADDING_IDX = TEXT.vocab.stoi['<pad>']
optimizer = optim.Adam(model.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Create the model instance
model = GRUWithLayerNorm(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, label_size, NUM_LAYERS, DROPOUT, PADDING_IDX).to(device)

# Training
CLIP = 1.0  # Gradient clipping threshold

# Lists to store training and validation loss
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    val_loss, val_acc = evaluate(model, val_iter, criterion)
    
    # Append the loss values to the lists
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tVal Loss: {val_loss:.3f} | Val Acc: {val_acc*100:.2f}%')

# Plot the loss curve
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs+1), train_losses, label='Training Loss')
plt.plot(range(1, num_epochs+1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss Over Time')
plt.show()

# Evaluation
model.eval()
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Epoch: 01
	Train Loss: 1.126 | Train Acc: 39.09%
	Val Loss: 1.131 | Val Acc: 36.97%
Epoch: 02
	Train Loss: 1.127 | Train Acc: 38.75%
	Val Loss: 1.131 | Val Acc: 36.97%
Epoch: 03
	Train Loss: 1.127 | Train Acc: 38.85%
	Val Loss: 1.131 | Val Acc: 36.97%
Epoch: 04
	Train Loss: 1.127 | Train Acc: 38.89%
	Val Loss: 1.131 | Val Acc: 36.97%


KeyboardInterrupt: 