<a href="https://colab.research.google.com/github/harshavardhinisri/CS-6320-NLP/blob/main/nlp_a2_q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import json
import string
from tqdm import tqdm
import pickle
import zipfile
import os

In [None]:
import os

zip_file_path = 'Data_Embedding.zip'
extracted_dir_path = 'Data_Embedding'

# Unpack the dataset archive into its own directory. The directory is created
# up front (and tolerated if it already exists) so the cell can be re-run.
os.makedirs(extracted_dir_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_dir_path)

In [None]:
# Baseline classifier: one tanh RNN layer reads the input sequence, its hidden
# states are summed over dim 0, and a linear layer followed by log-softmax
# turns that summary into log-probabilities for 5 classes.
class RNN(nn.Module):
    """Single-layer tanh RNN with sum pooling and a 5-way log-softmax head.

    Args:
        input_dim: Size of each input feature vector.
        h: Size of the RNN hidden state.
    """

    def __init__(self, input_dim, h):
        super().__init__()
        # nn.RNN is built without batch_first, so dim 0 of its input and
        # output is the time axis.
        self.rnn = nn.RNN(input_dim, h, nonlinearity='tanh')
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the negative log-likelihood of ``gold_label``.

        ``predicted_vector`` must hold log-probabilities (the output of
        ``forward``), because ``nn.NLLLoss`` does not apply a softmax itself.
        """
        nll = self.loss(predicted_vector, gold_label)
        return nll

    def forward(self, inputs):
        """Map a ``(seq_len, batch, input_dim)`` tensor to ``(batch, 5)`` log-probabilities."""
        hidden_states, _last_hidden = self.rnn(inputs)
        # Collapse the time axis so sequences of any length give one vector.
        pooled = hidden_states.sum(dim=0)
        logits = self.W(pooled)
        return self.softmax(logits)


In [None]:
def load_data(file_path):
    """Load labelled reviews from a JSON file.

    The file must contain a JSON list of objects with a ``"text"`` string and
    a numeric ``"stars"`` rating.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list of ``(tokens, label)`` pairs where ``tokens`` is the text split
        on whitespace and ``label`` is ``stars - 1`` as an int.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # uses the platform default and can fail on non-ASCII review text.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)
    return [(elt["text"].split(), int(elt["stars"] - 1)) for elt in data]

def preprocess_data(data, word_embedding):
    """Turn tokenised reviews into embedding tensors.

    Punctuation is stripped, tokens are lower-cased for the embedding lookup,
    and unknown tokens fall back to the ``'unk'`` vector.

    Args:
        data: Iterable of ``(words, label)`` pairs, as returned by ``load_data``.
        word_embedding: Mapping from lower-case token to embedding vector; it
            must contain the key ``'unk'``.

    Returns:
        A list of ``(vectors, label)`` pairs where ``vectors`` has shape
        ``(num_tokens, 1, embedding_dim)``. A review with no tokens left after
        punctuation stripping is represented by a single ``'unk'`` vector.
    """
    # Build the punctuation-stripping table once rather than per example.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    unk_vector = word_embedding['unk']
    processed_data = []
    for words, label in data:
        tokens = " ".join(words).translate(strip_punctuation).split()
        vectors = [word_embedding.get(token.lower(), unk_vector) for token in tokens]
        if not vectors:
            # An empty sequence cannot be reshaped with view(0, 1, -1) and
            # cannot be fed to an RNN; keep the example as one unknown token.
            vectors = [unk_vector]
        # Stack into one ndarray first: torch.tensor() on a list of ndarrays
        # is very slow and emits a UserWarning. The cast to torch's default
        # float dtype matches the dtype nn modules use for their parameters.
        stacked = np.asarray(vectors)
        tensor = torch.tensor(stacked, dtype=torch.get_default_dtype())
        processed_data.append((tensor.view(len(vectors), 1, -1), label))
    return processed_data


In [None]:
def train_model(model, train_data, optimizer, minibatch_size):
    """Run one training epoch and return the training accuracy.

    Examples are fed to the model one at a time; their losses are averaged
    over each minibatch before a single backward pass and optimizer step.

    Args:
        model: Module whose ``forward`` returns log-probabilities and which
            provides ``compute_Loss(predicted_vector, gold_label)``.
        train_data: List of ``(input_tensor, gold_label)`` pairs. It is
            shuffled in place.
        optimizer: Optimizer bound to ``model``'s parameters.
        minibatch_size: Number of examples per optimizer step.

    Returns:
        Fraction of examples classified correctly during the epoch, or 0 when
        ``train_data`` is empty.
    """
    model.train()
    total_correct, total = 0, 0
    random.shuffle(train_data)
    # Stepping by minibatch_size also visits the trailing partial batch, so no
    # examples are silently dropped when the data size is not a multiple of it.
    for start in tqdm(range(0, len(train_data), minibatch_size)):
        batch = train_data[start:start + minibatch_size]
        optimizer.zero_grad()
        minibatch_loss = None
        for input_vector, gold_label in batch:
            output = model(input_vector)
            predicted_label = torch.argmax(output)
            total_correct += int(predicted_label == gold_label)
            total += 1
            example_loss = model.compute_Loss(output.view(1, -1), torch.tensor([gold_label]))
            minibatch_loss = example_loss if minibatch_loss is None else minibatch_loss + example_loss
        # Average over the examples actually in this batch so a short final
        # batch is not under-weighted.
        minibatch_loss = minibatch_loss / len(batch)
        minibatch_loss.backward()
        optimizer.step()
    return total_correct / total if total > 0 else 0

def evaluate_model(model, data, minibatch_size):
    """Return the model's classification accuracy on ``data``.

    Args:
        model: Module whose ``forward`` returns one score per class.
        data: List of ``(input_tensor, gold_label)`` pairs.
        minibatch_size: Number of examples per progress-bar tick. Examples are
            still evaluated one at a time.

    Returns:
        Fraction of examples whose arg-max prediction equals the gold label,
        or 0 when ``data`` is empty.
    """
    model.eval()
    total_correct, total = 0, 0
    # No gradients are needed for evaluation; skipping graph construction
    # saves time and memory without changing the predictions.
    with torch.no_grad():
        # The stepped range includes the trailing partial batch, so every
        # example is scored.
        for start in tqdm(range(0, len(data), minibatch_size)):
            for input_vector, gold_label in data[start:start + minibatch_size]:
                output = model(input_vector)
                predicted_label = torch.argmax(output)
                total_correct += int(predicted_label == gold_label)
                total += 1
    return total_correct / total if total > 0 else 0


In [None]:
# Baseline experiment: set the hyperparameters, load the three data splits,
# convert the text to embedding tensors with the pre-trained word embedding,
# train the RNN with Adam for several epochs (reporting validation accuracy
# after each one) and finally report test accuracy.

hidden_dim = 128
epochs = 10
train_data_file = 'Data_Embedding/training.json'
val_data_file = 'Data_Embedding/validation.json'
test_data_file = 'Data_Embedding/test.json'
minibatch_size = 16

# Load and preprocess data.
# The embedding file is opened in a `with` block so the handle is closed as
# soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('Data_Embedding/word_embedding.pkl', 'rb') as embedding_file:
    word_embedding = pickle.load(embedding_file)
train_data = preprocess_data(load_data(train_data_file), word_embedding)
val_data = preprocess_data(load_data(val_data_file), word_embedding)
test_data = preprocess_data(load_data(test_data_file), word_embedding)

# Initialize model and optimizer
# 50 is the model's input size; presumably the width of the vectors in
# word_embedding.pkl -- confirm against the data.
model = RNN(50, hidden_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training and Validation
for epoch in range(epochs):
    train_accuracy = train_model(model, train_data, optimizer, minibatch_size)
    val_accuracy = evaluate_model(model, val_data, minibatch_size)
    print(f"Epoch {epoch + 1}: Train Accuracy: {train_accuracy:.2f}, Validation Accuracy: {val_accuracy:.2f}")

# Testing
test_accuracy = evaluate_model(model, test_data, minibatch_size)
print(f"Test Accuracy: {test_accuracy:.2f}")


  vectors = torch.tensor(vectors).view(len(vectors), 1, -1)
100%|██████████| 1000/1000 [03:15<00:00,  5.11it/s]
100%|██████████| 50/50 [00:03<00:00, 15.80it/s]


Epoch 1: Train Accuracy: 0.31, Validation Accuracy: 0.36


100%|██████████| 1000/1000 [03:16<00:00,  5.08it/s]
100%|██████████| 50/50 [00:03<00:00, 16.40it/s]


Epoch 2: Train Accuracy: 0.28, Validation Accuracy: 0.30


100%|██████████| 1000/1000 [03:16<00:00,  5.08it/s]
100%|██████████| 50/50 [00:03<00:00, 16.40it/s]


Epoch 3: Train Accuracy: 0.29, Validation Accuracy: 0.28


100%|██████████| 1000/1000 [03:17<00:00,  5.06it/s]
100%|██████████| 50/50 [00:03<00:00, 16.57it/s]


Epoch 4: Train Accuracy: 0.31, Validation Accuracy: 0.34


100%|██████████| 1000/1000 [03:16<00:00,  5.09it/s]
100%|██████████| 50/50 [00:03<00:00, 16.64it/s]


Epoch 5: Train Accuracy: 0.30, Validation Accuracy: 0.43


100%|██████████| 1000/1000 [03:17<00:00,  5.07it/s]
100%|██████████| 50/50 [00:03<00:00, 16.32it/s]


Epoch 6: Train Accuracy: 0.32, Validation Accuracy: 0.34


100%|██████████| 1000/1000 [03:16<00:00,  5.08it/s]
100%|██████████| 50/50 [00:03<00:00, 16.39it/s]


Epoch 7: Train Accuracy: 0.31, Validation Accuracy: 0.43


100%|██████████| 1000/1000 [03:16<00:00,  5.10it/s]
100%|██████████| 50/50 [00:03<00:00, 16.52it/s]


Epoch 8: Train Accuracy: 0.32, Validation Accuracy: 0.30


100%|██████████| 1000/1000 [03:16<00:00,  5.08it/s]
100%|██████████| 50/50 [00:03<00:00, 16.52it/s]


Epoch 9: Train Accuracy: 0.34, Validation Accuracy: 0.38


100%|██████████| 1000/1000 [03:20<00:00,  4.99it/s]
100%|██████████| 50/50 [00:03<00:00, 16.25it/s]


Epoch 10: Train Accuracy: 0.32, Validation Accuracy: 0.35


100%|██████████| 50/50 [00:02<00:00, 20.61it/s]

Test Accuracy: 0.18





In [None]:
# Regularised variant of the single-layer RNN classifier: the summed hidden
# states are layer-normalised and passed through dropout before the linear
# output layer.
class RNN(nn.Module):
    """Single-layer tanh RNN with layer normalisation and dropout.

    Args:
        input_dim: Size of each input feature vector.
        h: Size of the RNN hidden state.
        dropout_rate: Probability of zeroing each element of the pooled
            representation during training.
    """

    def __init__(self, input_dim, h, dropout_rate=0.5):
        super().__init__()
        # nn.RNN's own `dropout` argument only acts between stacked layers.
        # With one layer it is ignored (PyTorch warns about it), so dropout is
        # applied explicitly through nn.Dropout instead.
        self.rnn = nn.RNN(input_dim, h, nonlinearity='tanh')
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(h)  # Using layer normalization
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the negative log-likelihood of ``gold_label`` given log-probabilities."""
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Map a ``(seq_len, batch, input_dim)`` tensor to ``(batch, 5)`` log-probabilities."""
        rnn_out, _ = self.rnn(inputs)
        # Sum over the time axis (dim 0, since batch_first is not set).
        output_representation = torch.sum(rnn_out, dim=0)
        output_representation = self.layer_norm(output_representation)  # Apply layer normalization
        # Active only in training mode; a no-op after model.eval().
        output_representation = self.dropout(output_representation)
        predicted_vector = self.softmax(self.W(output_representation))
        return predicted_vector


In [None]:
def load_data(file_path):
    """Read a JSON list of reviews and return ``(tokens, label)`` pairs.

    Args:
        file_path: Path to a JSON file holding a list of objects with a
            ``"text"`` string and a numeric ``"stars"`` rating.

    Returns:
        A list of ``(tokens, label)`` pairs: the text split on whitespace and
        ``stars - 1`` converted to int.
    """
    # Decode explicitly as UTF-8 (the JSON standard encoding) instead of
    # relying on the platform default, which can reject non-ASCII text.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)
    return [(elt["text"].split(), int(elt["stars"] - 1)) for elt in data]

def preprocess_data(data, word_embedding):
    """Convert tokenised reviews into per-token embedding tensors.

    Punctuation is removed, tokens are lower-cased for lookup, and tokens
    missing from ``word_embedding`` use its ``'unk'`` vector.

    Args:
        data: Iterable of ``(words, label)`` pairs.
        word_embedding: Mapping from lower-case token to embedding vector; it
            must contain the key ``'unk'``.

    Returns:
        A list of ``(vectors, label)`` pairs where ``vectors`` has shape
        ``(num_tokens, 1, embedding_dim)``. Reviews left with no tokens are
        represented by one ``'unk'`` vector.
    """
    # The translation table is loop-invariant, so build it once.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    unk_vector = word_embedding['unk']
    processed_data = []
    for words, label in data:
        tokens = " ".join(words).translate(strip_punctuation).split()
        vectors = [word_embedding.get(token.lower(), unk_vector) for token in tokens]
        if not vectors:
            # view(0, 1, -1) is ambiguous for an empty tensor and raises, so
            # keep the example as a single unknown token.
            vectors = [unk_vector]
        # Going through one stacked ndarray avoids the slow list-of-ndarrays
        # path in torch.tensor() and the UserWarning it emits. The result is
        # cast to torch's default float dtype, matching nn module parameters.
        stacked = np.asarray(vectors)
        tensor = torch.tensor(stacked, dtype=torch.get_default_dtype())
        processed_data.append((tensor.view(len(vectors), 1, -1), label))
    return processed_data


In [None]:
def train_model(model, train_data, optimizer, minibatch_size, clip_value=5):
    """Run one training epoch with gradient clipping; return training accuracy.

    Examples are fed to the model one at a time; their losses are averaged
    over each minibatch before one backward pass, a gradient-norm clip and an
    optimizer step.

    Args:
        model: Module whose ``forward`` returns log-probabilities and which
            provides ``compute_Loss(predicted_vector, gold_label)``.
        train_data: List of ``(input_tensor, gold_label)`` pairs. It is
            shuffled in place.
        optimizer: Optimizer bound to ``model``'s parameters.
        minibatch_size: Number of examples per optimizer step.
        clip_value: Maximum total gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Fraction of examples classified correctly during the epoch, or 0 when
        ``train_data`` is empty.
    """
    model.train()
    total_correct, total = 0, 0
    random.shuffle(train_data)
    # The stepped range yields ceil(len / minibatch_size) batches, so the
    # trailing partial batch is trained on as well.
    for start in tqdm(range(0, len(train_data), minibatch_size)):
        batch = train_data[start:start + minibatch_size]
        optimizer.zero_grad()
        minibatch_loss = None
        for input_vector, gold_label in batch:
            output = model(input_vector)
            predicted_label = torch.argmax(output)
            total_correct += int(predicted_label == gold_label)
            total += 1
            example_loss = model.compute_Loss(output.view(1, -1), torch.tensor([gold_label]))
            minibatch_loss = example_loss if minibatch_loss is None else minibatch_loss + example_loss
        # Divide by the real batch length: using the nominal minibatch_size
        # would shrink the gradient of a short final batch.
        minibatch_loss = minibatch_loss / len(batch)
        minibatch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()
    return total_correct / total if total > 0 else 0

def evaluate_model(model, data, minibatch_size):
    """Return the model's classification accuracy on ``data``.

    Args:
        model: Module whose ``forward`` returns one score per class.
        data: List of ``(input_tensor, gold_label)`` pairs.
        minibatch_size: Number of examples per progress-bar tick. Examples are
            still evaluated one at a time.

    Returns:
        Fraction of examples whose arg-max prediction equals the gold label,
        or 0 when ``data`` is empty.
    """
    model.eval()
    total_correct, total = 0, 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every example and leaves the predictions unchanged.
    with torch.no_grad():
        for start in tqdm(range(0, len(data), minibatch_size)):
            for input_vector, gold_label in data[start:start + minibatch_size]:
                output = model(input_vector)
                predicted_label = torch.argmax(output)
                total_correct += int(predicted_label == gold_label)
                total += 1
    return total_correct / total if total > 0 else 0


In [None]:
# Second experiment: the model is built with dropout_rate=0.5, and the
# optimizer is Adam with an additional weight decay parameter
# (weight_decay=1e-4).

hidden_dim = 128
epochs = 10
dropout_rate = 0.5
learning_rate = 0.01
weight_decay = 1e-4
minibatch_size = 16

# Load data
# The embedding file is opened in a `with` block so the handle is closed as
# soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('Data_Embedding/word_embedding.pkl', 'rb') as embedding_file:
    word_embedding = pickle.load(embedding_file)
train_data = preprocess_data(load_data('Data_Embedding/training.json'), word_embedding)
val_data = preprocess_data(load_data('Data_Embedding/validation.json'), word_embedding)
test_data = preprocess_data(load_data('Data_Embedding/test.json'), word_embedding)

# Initialize model and optimizer
model = RNN(50, hidden_dim, dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Training and validation
for epoch in range(epochs):
    train_accuracy = train_model(model, train_data, optimizer, minibatch_size)
    val_accuracy = evaluate_model(model, val_data, minibatch_size)
    print(f"Epoch {epoch + 1}: Train Acc: {train_accuracy:.2f}, Val Acc: {val_accuracy:.2f}")

# Testing
test_accuracy = evaluate_model(model, test_data, minibatch_size)
print(f"Test Accuracy: {test_accuracy:.2f}")

100%|██████████| 1000/1000 [04:43<00:00,  3.53it/s]
100%|██████████| 50/50 [00:03<00:00, 14.14it/s]


Epoch 1: Train Acc: 0.28, Val Acc: 0.43


100%|██████████| 1000/1000 [04:40<00:00,  3.56it/s]
100%|██████████| 50/50 [00:03<00:00, 14.14it/s]


Epoch 2: Train Acc: 0.37, Val Acc: 0.38


100%|██████████| 1000/1000 [04:39<00:00,  3.57it/s]
100%|██████████| 50/50 [00:03<00:00, 13.79it/s]


Epoch 3: Train Acc: 0.37, Val Acc: 0.45


100%|██████████| 1000/1000 [04:42<00:00,  3.54it/s]
100%|██████████| 50/50 [00:03<00:00, 13.63it/s]


Epoch 4: Train Acc: 0.40, Val Acc: 0.45


100%|██████████| 1000/1000 [04:38<00:00,  3.60it/s]
100%|██████████| 50/50 [00:03<00:00, 13.15it/s]


Epoch 5: Train Acc: 0.40, Val Acc: 0.46


100%|██████████| 1000/1000 [04:37<00:00,  3.60it/s]
100%|██████████| 50/50 [00:03<00:00, 14.13it/s]


Epoch 6: Train Acc: 0.41, Val Acc: 0.45


100%|██████████| 1000/1000 [04:33<00:00,  3.66it/s]
100%|██████████| 50/50 [00:03<00:00, 14.07it/s]


Epoch 7: Train Acc: 0.40, Val Acc: 0.42


100%|██████████| 1000/1000 [04:36<00:00,  3.62it/s]
100%|██████████| 50/50 [00:03<00:00, 13.98it/s]


Epoch 8: Train Acc: 0.41, Val Acc: 0.40


100%|██████████| 1000/1000 [04:38<00:00,  3.60it/s]
100%|██████████| 50/50 [00:04<00:00, 12.37it/s]


Epoch 9: Train Acc: 0.41, Val Acc: 0.53


100%|██████████| 1000/1000 [04:35<00:00,  3.63it/s]
100%|██████████| 50/50 [00:03<00:00, 13.83it/s]


Epoch 10: Train Acc: 0.41, Val Acc: 0.50


100%|██████████| 50/50 [00:02<00:00, 17.82it/s]

Test Accuracy: 0.34





In [None]:
# Stacked variant: several RNN layers (num_layers=2 by default) with dropout
# applied between the layers, followed by the same sum-over-time pooling and
# 5-way log-softmax head as the single-layer model.
class ImprovedRNN(nn.Module):
    """Multi-layer tanh RNN classifier with inter-layer dropout.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Size of the hidden state of every RNN layer.
        num_layers: Number of stacked RNN layers.
        dropout_rate: Dropout probability applied between RNN layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers=2, dropout_rate=0.5):
        super().__init__()
        # The inputs are laid out (seq_len, batch, input_dim) and forward()
        # pools over dim 0, so the RNN must stay sequence-first. With
        # batch_first=True every token was treated as its own length-1
        # sequence and no recurrence took place.
        # nn.RNN only applies dropout between layers; pass 0 for a single
        # layer to avoid PyTorch's "dropout expects num_layers > 1" warning.
        self.rnn = nn.RNN(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the negative log-likelihood of ``gold_label`` given log-probabilities."""
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Map a ``(seq_len, batch, input_dim)`` tensor to ``(batch, 5)`` log-probabilities."""
        rnn_out, _ = self.rnn(inputs)
        # Sum the top layer's hidden states over the time axis (dim 0).
        output_representation = torch.sum(rnn_out, dim=0)
        predicted_vector = self.softmax(self.fc(output_representation))
        return predicted_vector

In [None]:
# Set hyperparameters and file paths
hidden_dim = 128
epochs = 15  # Increased epochs
dropout_rate = 0.5
learning_rate = 0.005  # Adjusted learning rate
weight_decay = 1e-4
minibatch_size = 32  # Adjusted minibatch size

# Load and preprocess data
# The embedding file is opened in a `with` block so the handle is closed as
# soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('Data_Embedding/word_embedding.pkl', 'rb') as embedding_file:
    word_embedding = pickle.load(embedding_file)
train_data = preprocess_data(load_data('Data_Embedding/training.json'), word_embedding)
val_data = preprocess_data(load_data('Data_Embedding/validation.json'), word_embedding)
test_data = preprocess_data(load_data('Data_Embedding/test.json'), word_embedding)

# Initialize improved model and optimizer
model = ImprovedRNN(50, hidden_dim, num_layers=2, dropout_rate=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Training and Validation
for epoch in range(epochs):
    train_accuracy = train_model(model, train_data, optimizer, minibatch_size)
    val_accuracy = evaluate_model(model, val_data, minibatch_size)
    print(f"Epoch {epoch + 1}: Train Acc: {train_accuracy:.2f}, Val Acc: {val_accuracy:.2f}")

# Testing
test_accuracy = evaluate_model(model, test_data, minibatch_size)
print(f"Test Accuracy: {test_accuracy:.2f}")

100%|██████████| 500/500 [00:34<00:00, 14.53it/s]
100%|██████████| 25/25 [00:00<00:00, 33.90it/s]


Epoch 1: Train Acc: 0.35, Val Acc: 0.46


100%|██████████| 500/500 [00:34<00:00, 14.58it/s]
100%|██████████| 25/25 [00:00<00:00, 33.23it/s]


Epoch 2: Train Acc: 0.39, Val Acc: 0.46


100%|██████████| 500/500 [00:34<00:00, 14.58it/s]
100%|██████████| 25/25 [00:00<00:00, 33.97it/s]


Epoch 3: Train Acc: 0.40, Val Acc: 0.43


100%|██████████| 500/500 [00:35<00:00, 14.13it/s]
100%|██████████| 25/25 [00:00<00:00, 33.29it/s]


Epoch 4: Train Acc: 0.39, Val Acc: 0.31


100%|██████████| 500/500 [00:34<00:00, 14.29it/s]
100%|██████████| 25/25 [00:00<00:00, 33.57it/s]


Epoch 5: Train Acc: 0.40, Val Acc: 0.37


100%|██████████| 500/500 [00:34<00:00, 14.29it/s]
100%|██████████| 25/25 [00:00<00:00, 34.34it/s]


Epoch 6: Train Acc: 0.40, Val Acc: 0.44


100%|██████████| 500/500 [00:34<00:00, 14.36it/s]
100%|██████████| 25/25 [00:00<00:00, 33.82it/s]


Epoch 7: Train Acc: 0.40, Val Acc: 0.47


100%|██████████| 500/500 [00:34<00:00, 14.40it/s]
100%|██████████| 25/25 [00:00<00:00, 32.86it/s]


Epoch 8: Train Acc: 0.39, Val Acc: 0.50


100%|██████████| 500/500 [00:34<00:00, 14.42it/s]
100%|██████████| 25/25 [00:00<00:00, 32.43it/s]


Epoch 9: Train Acc: 0.40, Val Acc: 0.41


100%|██████████| 500/500 [00:34<00:00, 14.39it/s]
100%|██████████| 25/25 [00:00<00:00, 34.37it/s]


Epoch 10: Train Acc: 0.41, Val Acc: 0.43


100%|██████████| 500/500 [00:34<00:00, 14.29it/s]
100%|██████████| 25/25 [00:00<00:00, 33.67it/s]


Epoch 11: Train Acc: 0.41, Val Acc: 0.45


100%|██████████| 500/500 [00:34<00:00, 14.39it/s]
100%|██████████| 25/25 [00:00<00:00, 33.27it/s]


Epoch 12: Train Acc: 0.40, Val Acc: 0.42


100%|██████████| 500/500 [00:34<00:00, 14.39it/s]
100%|██████████| 25/25 [00:00<00:00, 33.60it/s]


Epoch 13: Train Acc: 0.42, Val Acc: 0.48


100%|██████████| 500/500 [00:34<00:00, 14.38it/s]
100%|██████████| 25/25 [00:00<00:00, 33.68it/s]


Epoch 14: Train Acc: 0.41, Val Acc: 0.36


100%|██████████| 500/500 [00:34<00:00, 14.49it/s]
100%|██████████| 25/25 [00:00<00:00, 34.15it/s]


Epoch 15: Train Acc: 0.42, Val Acc: 0.47


100%|██████████| 25/25 [00:00<00:00, 36.69it/s]

Test Accuracy: 0.43



