In [5]:
!pip install datasets
!pip install gensim #word vector model (contains word2vec)
!pip install nltk   #natural language toolkit (tokenizer)
!pip install kagglehub



# Import Relevant Libraries and Datasets


In [19]:
# import datasets (given)
from datasets import load_dataset
dataset = load_dataset ("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

# import libraries
import nltk # Natural Language ToolKit
import numpy as np # numpy
from nltk.corpus import stopwords # Pre-compiled Stopwords list
import re # Regular Expression Matching Operations
from gensim.models import Word2Vec  # Using Gensim library for the Word2Vec model
from multiprocessing import cpu_count # Number of CPU cores (model parameter for processing efficiency)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Part 0. Dataset Preparation

## Text Cleaning (Tokenisation Included)

In [9]:
# Custom Tokenizer Function using RegEx-based pattern
def custom_tokenizer(data):

  # pattern removes all punctuation and retains the abbreviations (additional preprocessing is required to remove the period marking the end of the sentence)
  pattern = r'''(?x)          # set flag to allow verbose regexps
          (?:[A-Za-z]\.)+        # abbreviations(both upper and lower case, like "e.g.", "U.S.A.")
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \[\][.,;"'?():_-]    # these are separate tokens; includes ], [
          | \.                   # periods are separate tokens
      '''

  sentences = []
  for i in range(len(data)):
    text = re.sub(r'\s*([,.":])\s*', r'\1', data[i]['text']) #removes additional whitespace within sequences
    print(nltk.regexp_tokenize(text, pattern))
    sentences.append(nltk.regexp_tokenize(text, pattern))

  return sentences

# #Train Set
# train_sentences = custom_tokenizer(train_dataset)

In [10]:
# Additional Preprocessing Function to remove periods and stopwords to minimise inaccuracy in Word2Vec
import nltk
nltk.download('stopwords')

def preprocesssing(data):
  #Additional preproceesing step to remove 'end-of-sentence' period
  sentences = custom_tokenizer(data)
  for sentence in sentences:
    for token in sentence:
      if token == '.':
        sentence.remove(token)
  print('Original Sentences:')
  print(sentences)
  print()

  # Preprocessing to remove stop words using NLTK library
  processed_sentences = []
  for sentence in sentences:
    new_sentence = []
    for token in sentence:
      if token.lower() not in stopwords.words('english'):
        new_sentence.append(token)
    processed_sentences.append(new_sentence)

  print('Sentences after removing stopwords:')
  print(processed_sentences)
  return processed_sentences

[nltk_data] Downloading package stopwords to /home/jdgoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Train Set
train_processed_sentences = preprocesssing(train_dataset)
validation_processed_sentences = preprocesssing(validation_dataset)
test_processed_sentences = preprocesssing(test_dataset)

['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']
['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer', 'director', 'peter', 'jackson', 's', 'expanded', 'vision', 'of', 'j.r.r.', 'tolkien', 's', 'middle-earth', '.']
['effective', 'but', 'too-tepid', 'biopic']
['if', 'you', 'sometimes', 'like', 'to', 'go', 'to', 'the', 'movies', 'to', 'have', 'fun', 'wasabi', 'is', 'a', 'good', 'place', 'to', 'start', '.']
['emerges', 'as', 'something', 'rare', 'an', 'issue', 'movie', 'that', 's', 'so', 'honest', 'and', 'keenly', 'observed', 'that', 'it', 'doesn', 't', 'feel', 'like', 'one', '.']
['the', 'film', 'provides', 'some'

# Part 1. Preparing Word Embeddings

## Model Building (Gensim - Word2Vec)

In [12]:
import gensim
import kagglehub
import os
import numpy as np

# Download the latest versionpreprocesssing
model_path = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")

# Find the actual .bin file in the directory
bin_file_path = os.path.join(model_path, "GoogleNews-vectors-negative300.bin")  # Adjust filename if needed

# Load the pretrained model
pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(bin_file_path, binary=True)
embedding_dim = pretrained_model.vector_size

## Qn 1a. Vocabulary Size

In [13]:
unique_strings = set()
for sentence in train_processed_sentences:
    unique_strings.update(token for token in sentence)

print(len(unique_strings))

17659


## Qn 1b. OOV words

In [14]:
oov_words = [word for word in unique_strings if word not in pretrained_model]
num_oov_words = len(oov_words)

print(f"Number of OOV Words: {num_oov_words}")

Number of OOV Words: 3339


# TO CHANGE TO INCLUDE PADDING AND STUFF

In [15]:
word_to_idx = {"<pad>": 0, "<unk>": 1}  # 0 for padding, 1 for OOV
word_to_idx.update({word: idx + 2 for idx, word in enumerate(unique_strings)})  # Start indexing from 2 for all other words

print("Total words in vocabulary (including OOV and padding):", len(word_to_idx))

Total words in vocabulary (including OOV and padding): 17661


In [16]:
# function to get the longest sentence in our dataset

def get_max_len(*datasets):
    max_len = 0
    for dataset in datasets:
        for sentence in dataset:
            max_len = max(max_len, len(sentence))
    return max_len

# Calculate the max_len using all three datasets
max_len = get_max_len(train_processed_sentences, validation_processed_sentences, test_processed_sentences)
print("Maximum sentence length across all datasets:", max_len)

Maximum sentence length across all datasets: 39


In [20]:
# function to pading our sentence for batch processing in RNN

def preprocess_sentences(sentences, word_to_idx, max_len):
    tokenized_sentences = []

    for sentence in sentences:
        indices = [word_to_idx.get(word, 1) for word in sentence]
        
        # Pad the sentence with 0s (padding index) or truncate to max_len
        if len(indices) < max_len:
            indices = indices + [0] * (max_len - len(indices))
        else:
            indices = indices[:max_len]

        tokenized_sentences.append(indices)
    
    # Convert to tensor for model input
    return torch.tensor(tokenized_sentences, dtype=torch.long)



In [21]:
train_padded = preprocess_sentences(train_processed_sentences, word_to_idx, max_len)
val_padded = preprocess_sentences(validation_processed_sentences, word_to_idx, max_len)
test_padded = preprocess_sentences(test_processed_sentences, word_to_idx, max_len)

print("Train set shape:", train_padded.shape)
print("Validation set shape:", val_padded.shape)
print("Test set shape:", test_padded.shape)

Train set shape: torch.Size([8530, 39])
Validation set shape: torch.Size([1066, 39])
Test set shape: torch.Size([1066, 39])


In [24]:
embedding_matrix = np.zeros((len(word_to_idx), embedding_dim))  # len(word_to_idx) includes padding and OOV

for word, idx in word_to_idx.items():
    if word in pretrained_model.key_to_index:
        embedding_matrix[idx] = pretrained_model[word]
    elif word == "<OOV>":
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))  # Random vector for OOV words
    else:
        embedding_matrix[idx] = np.zeros(embedding_dim)  # Padding remains a zero vector

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (17661, 300)


## Qn 1c. Stategies to mitigate OOV limitations

A common approach to handle out-of-vocabulary (OOV) words in embeddings like Word2Vec or GloVe is to use a fallback strategy for these words.

**Strategy 1: Random Initalization of OOV Words**

Assign a random embedding vector to each OOV word, allowing the model to learn some representation for these words during training.

In [10]:
import numpy as np

# Initialize a random vector for OOV words, based on the embedding dimension
embedding_dim = 300  # Use the dimension of your pretrained embeddings

def get_embedding(word, embedding_dict):
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        # Generate a random vector for OOV words
        return np.random.normal(scale=0.6, size=(embedding_dim,))

**Strategy 2: Average Embedding of OOV Words**

Assign the average embedding of all known words in the vocabulary to OOV words. This can help provide a generic but meaningful representation.

In [None]:
# Compute average embedding for known words
all_known_embeddings = np.array(list(embedding_dict.values()))
average_embedding = np.mean(all_known_embeddings, axis=0)

def get_embedding(word, embedding_dict):
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        # Use the average embedding vector for OOV words
        return average_embedding

**Stategy 3: Root-based Approximation of OOV Words**

For OOV words that share roots with in-vocabulary words, approximate the embedding by averaging embeddings of similar words. This can work for cases where OOV words are derived forms of known words.

In [None]:
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

def get_root_embedding(word, embedding_dict):
    synsets = wordnet.synsets(word)
    if not synsets:
        return np.random.normal(scale=0.6, size=(embedding_dim,))  # Use random if no root found

    # Find root words
    root_words = {lemma.name().lower() for synset in synsets for lemma in synset.lemmas()}
    root_embeddings = [embedding_dict[root] for root in root_words if root in embedding_dict]

    if root_embeddings:
        return np.mean(root_embeddings, axis=0)  # Average root embeddings if found
    else:
        return np.random.normal(scale=0.6, size=(embedding_dim,))  # Random fallback for OOV

# Example usage:
def get_embedding(word, embedding_dict):
    if word in embedding_dict:
        return embedding_dict[word]
    else:
        return get_root_embedding(word, embedding_dict)

[nltk_data] Downloading package wordnet to /root/nltk_data...


An example of how to create an embedding matrix for the model based on the chosen strategy

In [None]:
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))  # +1 for padding index 0
word_to_idx = {word: i+1 for i, word in enumerate(vocab)}  # Start indices at 1

for word, idx in word_to_idx.items():
    embedding_matrix[idx] = get_embedding(word, embedding_dic)

NameError: name 'vocab' is not defined

**How do the strategies help migitate OOV limitation?**

---
**Random Initalization:**
A random vector will be assigned to each OOV word, but performance can vary since it doesn't uses any information from known words

**Average Embedding:**
A generic vector representation will be assigned based on the average vector of known words, generally a stable strategy if OOV words are limited or are related to general vocabulary

**Root-Based Approximation:**
Searches for similar semantic-related root words to estimate OOV word's embedding, could be better than randoming and averaging, but highly depend on accuracy of identifying root words







We will have to assess the performance in terms of accuracy for all three strategies, which will be done in **Part 3b**.

A **hybrid apporach** is also considered if Root-based Approximation performs well. We can use Root-based Approximation as the main strategy, and consider using randoming or averaging in the event **no roots are found**.

**Question 2**

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
from torch.utils.data import Dataset as TorchDataset  # Import for PyTorch Dataset

In [30]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")
if cuda_available:
  print(f"Current CUDA Device: {torch.cuda.current_device()}")
  print(f"CUDA Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

Num GPUs Available:  1
CUDA Available: True
Current CUDA Device: 0
CUDA Device Name: NVIDIA GeForce RTX 3050 Laptop GPU


In [24]:
# embedding_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)

# embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)  # freeze=True keeps embeddings fixed

In [27]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torch.nn.utils.rnn import pad_sequence

# class SentimentDataset(Dataset):
#     def __init__(self, data, labels, word_to_idx, max_len=50):
#         self.data = data
#         self.labels = labels
#         self.word_to_idx = word_to_idx
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sentence = self.data[idx]
#         label = self.labels[idx]

#         # Convert words to indices and pad to max_len
#         indices = [self.word_to_idx.get(word, 0) for word in sentence]
#         if len(indices) < self.max_len:
#             indices = indices + [0] * (self.max_len - len(indices))  # Pad with zeros
#         else:
#             indices = indices[:self.max_len]  # Truncate to max_len

#         return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# # Example usage
# train_dataset = SentimentDataset(train_processed_sentences, train_labels, word_to_idx, max_len=50)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [55]:
class SentimentRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim=1, num_layers=1, bidirectional = False, dropout=0.5):
        super(SentimentRNN, self).__init__()
        # self.hidden_dim = hidden_dim
        # self.n_layers = n_layers
        
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        embedding_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)

        self.mode = 'final'        
        self.rnn = nn.RNN(embedding_matrix.shape[1], 
                          hidden_size = hidden_dim, 
                          num_layers = num_layers,
                          bidirectional = bidirectional, 
                          # dropout=dropout,
                          batch_first = True, 
                          nonlinearity = "tanh")
        
        # Fully connected layer 
        rnn_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(rnn_output_dim, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Embedding lookup
        embedded = self.embedding(x)                   # Shape: (batch_size, seq_len, embedding_dim)
        
        # RNN forward pass
        rnn_out, hidden = self.rnn(embedded)                # Shape: (batch_size, seq_len, hidden_dim)
        
        if self.mode == 'mean':
            # Mean pooling over the sequence
            sentence_representation = rnn_out.mean(dim=1)               # Shape: (batch_size, hidden_dim)
        elif self.mode == 'max':
            # Max pooling over the sequence
            sentence_representation, _ = rnn_out.max(dim=1)             # Shape: (batch_size, hidden_dim)
        else: 
            # Default to final hidden state
            if self.bidirectional:
                # Concatenate last hidden states from both directions
                sentence_representation = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)  # Shape: (batch_size, hidden_dim * 2)
            else:
                # Use the last hidden state from the last layer
                sentence_representation = hidden[-1,:,:]             

        
        # Apply dropout and pass through the fully connected layer for classification
        sentence_representation = self.dropout(sentence_representation)
        output = self.fc(sentence_representation)
        
        # Apply sigmoid for binary classification
        return torch.sigmoid(output).squeeze(1)  # Output shape: (batch_size)
        

In [57]:
# Updated train function with early stopping patience
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, patience=3):
    model.to(device)
    best_val_acc = 0
    epochs_no_improve = 0  # Counter for epochs without improvement

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

        # Calculate validation accuracy
        val_acc = evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader.dataset):.4f}, Val Accuracy: {val_acc:.4f}")

        # Check if validation accuracy has improved
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_no_improve = 0  # Reset counter if there’s improvement
            torch.save(model.state_dict(), "best_model.pt")
        else:
            epochs_no_improve += 1
            print(f"No improvement in validation accuracy for {epochs_no_improve} epochs.")

        # Early stopping check
        if epochs_no_improve >= patience:
            print("Early stopping triggered. Stopping training.")
            break

    print("Training complete.")
    


In [60]:
from sklearn.metrics import accuracy_score

# Evaluation function to calculate accuracy
def evaluate(model, data_loader, device):
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []
    
    with torch.no_grad():  # No need to compute gradients during evaluation
        for batch in data_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            # Get model predictions
            outputs = model(inputs)
            preds = (outputs >= 0.5).int()  # Apply threshold for binary classification (0.5)

            # Collect all predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy

In [51]:
# Extract labels from each dataset
train_labels = torch.tensor([example['label'] for example in train_dataset], dtype=torch.long)
val_labels = torch.tensor([example['label'] for example in validation_dataset], dtype=torch.long)
test_labels = torch.tensor([example['label'] for example in test_dataset], dtype=torch.long)

# Ensure you have `train_padded`, `val_padded`, and `test_padded` from preprocessing steps

# Create TensorDatasets for DataLoader
train_tensor_dataset = TensorDataset(train_padded, train_labels)
val_tensor_dataset = TensorDataset(val_padded, val_labels)
test_tensor_dataset = TensorDataset(test_padded, test_labels)

In [62]:
import torch.optim as optim
from torch.utils.data import DataLoader
import itertools

# Define device based on availability of GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

param_grid = {
    'epochs': [10, 20],
    'learning_rate': [0.001, 0.01],
    'optimizer': ['Adam', 'SGD'],
    'batch_size': [16, 32]
}

# Function to get the optimizer
def get_optimizer(optimizer_name, model, learning_rate):
    if optimizer_name == 'Adam':
        return optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_name == 'SGD':
        return optim.SGD(model.parameters(), lr=learning_rate)
    # Add more optimizers if needed

# Track best configuration
best_val_acc = 0
best_config = {}

# Grid search over all parameter combinations
for config in itertools.product(*param_grid.values()):
    # Unpack configuration
    num_epochs, learning_rate, optimizer_name, batch_size = config
    
    print(f"Training with config - Epochs: {num_epochs}, LR: {learning_rate}, Optimizer: {optimizer_name}, Batch size: {batch_size}")
    
    # Initialize model
    model = SentimentRNN(embedding_matrix, hidden_dim=128, output_dim=1, num_layers=1, bidirectional=False, dropout=0.5)
    model.to(device)

    # Set up criterion and optimizer
    criterion = nn.BCELoss()
    optimizer = get_optimizer(optimizer_name, model, learning_rate)

    # Prepare data loaders
    # Create DataLoaders
    train_loader = DataLoader(train_tensor_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_tensor_dataset, batch_size=batch_size)


    # Train the model
    train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, patience=5)

    # Evaluate on validation set
    model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping
    val_acc = evaluate(model, val_loader, device)

    print(f"Validation Accuracy for config {config}: {val_acc:.4f}")

    # Update best configuration if current one is better
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_config = {
            'epochs': num_epochs,
            'learning_rate': learning_rate,
            'optimizer': optimizer_name,
            'batch_size': batch_size
        }

print("Best configuration:", best_config)
print("Best validation accuracy:", best_val_acc)

Training with config - Epochs: 10, LR: 0.001, Optimizer: Adam, Batch size: 16
Epoch 1/10, Loss: 0.6946, Val Accuracy: 0.5009
Epoch 2/10, Loss: 0.6940, Val Accuracy: 0.4906
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.7107, Val Accuracy: 0.4831
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.7005, Val Accuracy: 0.5169
Epoch 5/10, Loss: 0.6980, Val Accuracy: 0.4803
No improvement in validation accuracy for 1 epochs.
Epoch 6/10, Loss: 0.6975, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 7/10, Loss: 0.6979, Val Accuracy: 0.5178
Epoch 8/10, Loss: 0.6972, Val Accuracy: 0.5159
No improvement in validation accuracy for 1 epochs.
Epoch 9/10, Loss: 0.6967, Val Accuracy: 0.5188
Epoch 10/10, Loss: 0.6961, Val Accuracy: 0.5206
Training complete.
Validation Accuracy for config (10, 0.001, 'Adam', 16): 0.5206
Training with config - Epochs: 10, LR: 0.001, Optimizer: Adam, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.6939, Val Accuracy: 0.5000
Epoch 2/10, Loss: 0.6937, Val Accuracy: 0.4991
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.6978, Val Accuracy: 0.4719
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.7067, Val Accuracy: 0.5178
Epoch 5/10, Loss: 0.6999, Val Accuracy: 0.5084
No improvement in validation accuracy for 1 epochs.
Epoch 6/10, Loss: 0.6994, Val Accuracy: 0.5122
No improvement in validation accuracy for 2 epochs.
Epoch 7/10, Loss: 0.6969, Val Accuracy: 0.4916
No improvement in validation accuracy for 3 epochs.
Epoch 8/10, Loss: 0.6960, Val Accuracy: 0.5197
Epoch 9/10, Loss: 0.6968, Val Accuracy: 0.4850
No improvement in validation accuracy for 1 epochs.
Epoch 10/10, Loss: 0.6961, Val Accuracy: 0.5103
No improvement in validation accuracy for 2 epochs.
Training complete.
Validation Accuracy for config (10, 0.001, 'Adam', 32): 0.5197
Training with config - Epochs: 10, LR: 0.001, Optimizer: SGD, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.6936, Val Accuracy: 0.5009
Epoch 2/10, Loss: 0.6932, Val Accuracy: 0.5009
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.6935, Val Accuracy: 0.5009
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.6937, Val Accuracy: 0.5009
No improvement in validation accuracy for 3 epochs.
Epoch 5/10, Loss: 0.6932, Val Accuracy: 0.5009
No improvement in validation accuracy for 4 epochs.
Epoch 6/10, Loss: 0.6935, Val Accuracy: 0.5009
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.001, 'SGD', 16): 0.5009
Training with config - Epochs: 10, LR: 0.001, Optimizer: SGD, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.6945, Val Accuracy: 0.5000
Epoch 2/10, Loss: 0.6941, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 5/10, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/10, Loss: 0.6932, Val Accuracy: 0.4991
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.001, 'SGD', 32): 0.5000
Training with config - Epochs: 10, LR: 0.01, Optimizer: Adam, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.7454, Val Accuracy: 0.5113
Epoch 2/10, Loss: 0.7543, Val Accuracy: 0.5338
Epoch 3/10, Loss: 0.7486, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 4/10, Loss: 0.7453, Val Accuracy: 0.4972
No improvement in validation accuracy for 2 epochs.
Epoch 5/10, Loss: 0.7586, Val Accuracy: 0.5310
No improvement in validation accuracy for 3 epochs.
Epoch 6/10, Loss: 0.7546, Val Accuracy: 0.4878
No improvement in validation accuracy for 4 epochs.
Epoch 7/10, Loss: 0.7454, Val Accuracy: 0.4962
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.01, 'Adam', 16): 0.5338
Training with config - Epochs: 10, LR: 0.01, Optimizer: Adam, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.7336, Val Accuracy: 0.5000
Epoch 2/10, Loss: 0.7273, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.7258, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.7326, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 5/10, Loss: 0.7288, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/10, Loss: 0.7285, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.01, 'Adam', 32): 0.5000
Training with config - Epochs: 10, LR: 0.01, Optimizer: SGD, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.6937, Val Accuracy: 0.5000
Epoch 2/10, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/10, Loss: 0.6933, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/10, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 5/10, Loss: 0.6930, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/10, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.01, 'SGD', 16): 0.5000
Training with config - Epochs: 10, LR: 0.01, Optimizer: SGD, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/10, Loss: 0.6935, Val Accuracy: 0.5000
Epoch 2/10, Loss: 0.6935, Val Accuracy: 0.5009
Epoch 3/10, Loss: 0.6931, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 4/10, Loss: 0.6934, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 5/10, Loss: 0.6934, Val Accuracy: 0.5009
No improvement in validation accuracy for 3 epochs.
Epoch 6/10, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 7/10, Loss: 0.6932, Val Accuracy: 0.5009
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (10, 0.01, 'SGD', 32): 0.5009
Training with config - Epochs: 20, LR: 0.001, Optimizer: Adam, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6941, Val Accuracy: 0.4991
Epoch 2/20, Loss: 0.7068, Val Accuracy: 0.5000
Epoch 3/20, Loss: 0.7009, Val Accuracy: 0.5056
Epoch 4/20, Loss: 0.6979, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 5/20, Loss: 0.6985, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 6/20, Loss: 0.6997, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 7/20, Loss: 0.6965, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 8/20, Loss: 0.6977, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.001, 'Adam', 16): 0.5056
Training with config - Epochs: 20, LR: 0.001, Optimizer: Adam, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6949, Val Accuracy: 0.5056
Epoch 2/20, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/20, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/20, Loss: 0.7003, Val Accuracy: 0.4803
No improvement in validation accuracy for 3 epochs.
Epoch 5/20, Loss: 0.7051, Val Accuracy: 0.4962
No improvement in validation accuracy for 4 epochs.
Epoch 6/20, Loss: 0.6989, Val Accuracy: 0.5038
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.001, 'Adam', 32): 0.5056
Training with config - Epochs: 20, LR: 0.001, Optimizer: SGD, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6936, Val Accuracy: 0.5000
Epoch 2/20, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/20, Loss: 0.6940, Val Accuracy: 0.5009
Epoch 4/20, Loss: 0.6938, Val Accuracy: 0.5009
No improvement in validation accuracy for 1 epochs.
Epoch 5/20, Loss: 0.6939, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 6/20, Loss: 0.6936, Val Accuracy: 0.5009
No improvement in validation accuracy for 3 epochs.
Epoch 7/20, Loss: 0.6934, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 8/20, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.001, 'SGD', 16): 0.5009
Training with config - Epochs: 20, LR: 0.001, Optimizer: SGD, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6937, Val Accuracy: 0.5000
Epoch 2/20, Loss: 0.6932, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/20, Loss: 0.6938, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/20, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 5/20, Loss: 0.6935, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/20, Loss: 0.6933, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.001, 'SGD', 32): 0.5000
Training with config - Epochs: 20, LR: 0.01, Optimizer: Adam, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.7427, Val Accuracy: 0.5328
Epoch 2/20, Loss: 0.7446, Val Accuracy: 0.5056
No improvement in validation accuracy for 1 epochs.
Epoch 3/20, Loss: 0.7570, Val Accuracy: 0.4700
No improvement in validation accuracy for 2 epochs.
Epoch 4/20, Loss: 0.7473, Val Accuracy: 0.4700
No improvement in validation accuracy for 3 epochs.
Epoch 5/20, Loss: 0.7514, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/20, Loss: 0.7478, Val Accuracy: 0.4700
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.01, 'Adam', 16): 0.5328
Training with config - Epochs: 20, LR: 0.01, Optimizer: Adam, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.7350, Val Accuracy: 0.5113
Epoch 2/20, Loss: 0.7249, Val Accuracy: 0.5394
Epoch 3/20, Loss: 0.7251, Val Accuracy: 0.5113
No improvement in validation accuracy for 1 epochs.
Epoch 4/20, Loss: 0.7264, Val Accuracy: 0.5009
No improvement in validation accuracy for 2 epochs.
Epoch 5/20, Loss: 0.7217, Val Accuracy: 0.4897
No improvement in validation accuracy for 3 epochs.
Epoch 6/20, Loss: 0.7255, Val Accuracy: 0.4897
No improvement in validation accuracy for 4 epochs.
Epoch 7/20, Loss: 0.7293, Val Accuracy: 0.5291
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.01, 'Adam', 32): 0.5394
Training with config - Epochs: 20, LR: 0.01, Optimizer: SGD, Batch size: 16


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6938, Val Accuracy: 0.5000
Epoch 2/20, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 3/20, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 4/20, Loss: 0.6934, Val Accuracy: 0.5000
No improvement in validation accuracy for 3 epochs.
Epoch 5/20, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 6/20, Loss: 0.6937, Val Accuracy: 0.5000
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.01, 'SGD', 16): 0.5000
Training with config - Epochs: 20, LR: 0.01, Optimizer: SGD, Batch size: 32


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


Epoch 1/20, Loss: 0.6935, Val Accuracy: 0.4991
Epoch 2/20, Loss: 0.6939, Val Accuracy: 0.5000
Epoch 3/20, Loss: 0.6934, Val Accuracy: 0.5000
No improvement in validation accuracy for 1 epochs.
Epoch 4/20, Loss: 0.6936, Val Accuracy: 0.5000
No improvement in validation accuracy for 2 epochs.
Epoch 5/20, Loss: 0.6939, Val Accuracy: 0.4991
No improvement in validation accuracy for 3 epochs.
Epoch 6/20, Loss: 0.6935, Val Accuracy: 0.5000
No improvement in validation accuracy for 4 epochs.
Epoch 7/20, Loss: 0.6930, Val Accuracy: 0.4991
No improvement in validation accuracy for 5 epochs.
Early stopping triggered. Stopping training.
Training complete.
Validation Accuracy for config (20, 0.01, 'SGD', 32): 0.5000
Best configuration: {'epochs': 20, 'learning_rate': 0.01, 'optimizer': 'Adam', 'batch_size': 32}
Best validation accuracy: 0.5393996247654784


  model.load_state_dict(torch.load("best_model.pt"))  # Load the best model from early stopping


In [17]:
# # import torch
# # import numpy as np
# # from datasets import Dataset
# # from torch.utils.data import Dataset as TorchDataset  # Import for PyTorch Dataset

# class SentimentDataset(TorchDataset):  # Inherit from PyTorch Dataset
#     def __init__(self, data, word_embeddings):
#         print(f"Initializing SentimentDataset")
#         print(f"Data type: {type(data)}")  # Print the type of data
#         print(f"Word embeddings type: {type(word_embeddings)}")
#         self.data = data  # Store the dataset
#         self.word_embeddings = word_embeddings

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):

#         try:
#             if isinstance(self.data, list):  # Handle list (train_dataset)
#                 text = self.data[idx]
#                 label = 0  # Assign a default label or load from another source
#             elif isinstance(self.data, tuple):  # Handle tuple
#                 text, label = self.data[idx]
#             elif isinstance(self.data, Dataset):  # Check for Hugging Face Dataset
#                 text = self.data[idx]['text']
#                 label = self.data[idx]['label']
#             else:
#                 raise TypeError("Data object doesn't seem to be a supported type (list, tuple, or Hugging Face Dataset).")

#         except Exception as e:
#             print(f"Error accessing data[idx]: {str(e)}")
#             raise

#         # Get embeddings for words in the text
#         words = [self.word_embeddings.get_vector(word) for word in text.split() if word in self.word_embeddings.index_to_key]

#         # Handle case when no words are found
#         if len(words) == 0:
#             try:
#                 first_word = self.word_embeddings.index_to_key[0]
#                 words = [self.word_embeddings.get_vector(first_word)]  # Use the first word's embedding
#             except IndexError:
#                 raise ValueError("Word embeddings are empty.")

#         # Pad sequence to fixed length
#         seq_len = 50
#         words = words[:seq_len]  # Trim longer sequences

#         # Convert the NumPy arrays in words to PyTorch tensors
#         words = [torch.tensor(w, dtype=torch.float32) for w in words]

#         # Pad shorter sequences with zeros (after converting `words[0]` to tensor)
#         padding = [torch.zeros_like(words[0])] * (seq_len - len(words))
#         words += padding

#         # Convert the list of tensors into a single tensor
#         words_tensor = torch.stack(words)

#         return words_tensor, torch.tensor(label)

Model-Based on (Train Set)

In [22]:
# #word_embeddings = w2v_model.wv.vectors
# print(validation_dataset)  # Check if this contains 'text' and 'label'
# print(train_dataset)
# print(test_dataset)


# print("Train dataset type:", type(train_dataset))
# print("Validation dataset type:", type(validation_dataset))
# print("Test dataset type:", type(test_dataset))



# validation_dataset = SentimentDataset(validation_dataset, embedding_matrix)


# test_dataset = SentimentDataset(test_dataset, embedding_matrix)

# train_dataset = SentimentDataset(train_dataset, embedding_matrix)  # Assuming train_dataset is a list

# print(type(embedding_matrix))

<__main__.SentimentDataset object at 0x7f0a2eba8250>
<__main__.SentimentDataset object at 0x7f0a2ebb0c50>
<__main__.SentimentDataset object at 0x7f0a2ebb2950>
Train dataset type: <class '__main__.SentimentDataset'>
Validation dataset type: <class '__main__.SentimentDataset'>
Test dataset type: <class '__main__.SentimentDataset'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'numpy.ndarray'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'numpy.ndarray'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
pretrained_model

<gensim.models.keyedvectors.KeyedVectors at 0x7ff3648c1480>

Model based on Pre-Trained (KeyedVectors)

In [21]:
# #word_embeddings = pretrained_model.wv.vectors
# print(validation_dataset)  # Check if this contains 'text' and 'label'
# print(train_dataset)
# print(test_dataset)


# print("Train dataset type:", type(train_dataset))
# print("Validation dataset type:", type(validation_dataset))
# print("Test dataset type:", type(test_dataset))



# validation_dataset = SentimentDataset(validation_dataset, pretrained_model)


# test_dataset = SentimentDataset(test_dataset, pretrained_model)

# train_dataset = SentimentDataset(train_dataset, pretrained_model)  # Assuming train_dataset is a list

# print(type(pretrained_model))

# ## If you run into a datatype error for SentimentDataset, re-run the original
# ## train,test,val datasets to display [text, label] before running the ML models]


<__main__.SentimentDataset object at 0x7f0a2f17e010>
<__main__.SentimentDataset object at 0x7f0a2ebb3090>
<__main__.SentimentDataset object at 0x7f0a27d71610>
Train dataset type: <class '__main__.SentimentDataset'>
Validation dataset type: <class '__main__.SentimentDataset'>
Test dataset type: <class '__main__.SentimentDataset'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'gensim.models.keyedvectors.KeyedVectors'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'gensim.models.keyedvectors.KeyedVectors'>
Initializing SentimentDataset
Data type: <class '__main__.SentimentDataset'>
Word embeddings type: <class 'gensim.models.keyedvectors.KeyedVectors'>
<class 'gensim.models.keyedvectors.KeyedVectors'>


In [None]:
#Define the diff hyperparameters
hyperparams = {
    'epochs': [20,30,40],
    'learning_rate': [0.001, 0.01, 0.1],
    'optimizer': ['Adam', 'SGD', 'RMSprop'],
    'batch_size': [16, 32, 64]
}


In [23]:
#Define the diff hyperparameters
hyperparams = {
    'epochs': [20,30,40],
    'learning_rate': [0.001, 0.01, 0.1],
    'optimizer': ['Adam', 'SGD', 'RMSprop'],
    'batch_size': [16, 32, 64]
}

#Define RNN Model
class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2):
        super(RNN, self).__init__()
        self.rnn = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]  # Last hidden state
        out = self.dropout(out)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        return out

# Initialize model
input_dim = 300  # Word embedding dimension
hidden_dim = 128  # RNN hidden state dimension
output_dim = 2  # Number of sentiment classes (positive, negative)
model = RNN(input_dim, hidden_dim, output_dim)

In [None]:
# Define training function
def train_model(model, device, train_loader, val_loader, optimizer, epochs, criterion, patience):
    model.train()
    best_accuracy = 0
    patience_counter = 0
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for batch in train_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate model on validation set
        model.eval()
        accuracy = evaluate_model(model, device, val_loader)
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Val Acc: {accuracy:.4f}')

        # Early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

# Define evaluation function
def evaluate_model(model, device, val_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
    accuracy = correct / len(val_loader.dataset)
    return accuracy

In [None]:


import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")
if cuda_available:
  print(f"Current CUDA Device: {torch.cuda.current_device()}")
  print(f"CUDA Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

Num GPUs Available:  1
CUDA Available: True
Current CUDA Device: 0
CUDA Device Name: Tesla T4


In [None]:
# Fix epochs, learning rate, and batch size
fixed_epochs = 40
fixed_learning_rate = 0.01
fixed_batch_size = 64

best_accuracy = 0
best_optimizer = None
optimizer_accuracies = {}

for optimizer_name in hyperparams['optimizer']:
    # Initialize model, optimizer, and loss function
    model = RNN(input_dim=300, hidden_dim=128, output_dim=2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=fixed_learning_rate)
    elif optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=fixed_learning_rate)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=fixed_learning_rate)

    criterion = nn.CrossEntropyLoss()

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=fixed_batch_size, shuffle=True)
    val_loader = DataLoader(validation_dataset, batch_size=fixed_batch_size, shuffle=False)

    # Train model
    train_model(model, device, train_loader, val_loader, optimizer, fixed_epochs, criterion,patience=20)

    # Evaluate model
    accuracy = evaluate_model(model, device, val_loader)
    print(f'Optimizer: {optimizer_name}, Val Acc: {accuracy:.4f}')

    # Store accuracy for current optimizer
    optimizer_accuracies[optimizer_name] = accuracy

    # Update best optimizer
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_optimizer = optimizer_name


print("\nOptimizer Accuracies:")
for optimizer, accuracy in optimizer_accuracies.items():
    print(f"{optimizer}: {accuracy:.4f}")

print(f'Best Optimizer: {best_optimizer}, Best Val Acc: {best_accuracy:.4f}')

KeyboardInterrupt: 

In [None]:
# Fix optimizer, learning rate, and batch size
fixed_optimizer = 'RMSprop'
fixed_learning_rate = 0.01
fixed_batch_size = 64

best_accuracy = 0
best_epochs = None

for epochs in hyperparams['epochs']:
    # Initialize model, optimizer, and loss function
    model = RNN(input_dim=300, hidden_dim=128, output_dim=2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = optim.RMSprop(model.parameters(), lr=fixed_learning_rate)

    criterion = nn.CrossEntropyLoss()

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=fixed_batch_size, shuffle=True)
    train_features, train_labels = next(iter(train_loader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    val_loader = DataLoader(validation_dataset, batch_size=fixed_batch_size, shuffle=False)

    # Train model
    train_model(model, device, train_loader,val_loader, optimizer, epochs, criterion, patience=20)

    # Evaluate model
    accuracy = evaluate_model(model, device, val_loader)
    print(f'Epochs: {epochs}, Val Acc: {accuracy:.4f}')

    # Update best epochs
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_epochs = epochs


print(f"Best Epochs: {best_epochs}, Best Val Acc: {best_accuracy:.4f}")

Feature batch shape: torch.Size([64, 50, 300])
Labels batch shape: torch.Size([64])
Epoch 1, Loss: 0.7473514249075704, Val Acc: 0.5366
Epoch 2, Loss: 0.7070046262954598, Val Acc: 0.5000


Best Epoches: 20

In [None]:
# Fix optimizer, learning rate, and epochs
fixed_optimizer = 'RMSprop'
fixed_learning_rate = 0.01
fixed_epochs = 40

best_accuracy = 0
best_batch_size = None

for batch_size in hyperparams['batch_size']:
    # Initialize model, optimizer, and loss function
    model = RNN(input_dim=300, hidden_dim=128, output_dim=2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = optim.RMSprop(model.parameters(), lr=fixed_learning_rate)

    criterion = nn.CrossEntropyLoss()

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    train_features, train_labels = next(iter(train_loader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    val_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    # Train model
    train_model(model, device, train_loader, val_loader,optimizer, fixed_epochs, criterion, patience=20)

    # Evaluate model
    accuracy = evaluate_model(model, device, val_loader)
    print(f'Batch Size: {batch_size}, Val Acc: {accuracy:.4f}')

    # Update best batch size
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_batch_size = batch_size


print(f"Best Batch Size: {best_batch_size}, Best Val Acc: {best_accuracy:.4f}")

Feature batch shape: torch.Size([16, 50, 300])
Labels batch shape: torch.Size([16])
Epoch 1, Loss: 0.7683348911866713, Val Acc: 0.4831
Epoch 2, Loss: 0.736051562350341, Val Acc: 0.5000
Epoch 3, Loss: 0.7301544120695707, Val Acc: 0.5000
Epoch 4, Loss: 0.7333034318261379, Val Acc: 0.5000
Epoch 5, Loss: 0.7282603086157238, Val Acc: 0.5000
Epoch 6, Loss: 0.7218908927637093, Val Acc: 0.5000
Epoch 7, Loss: 0.7236052714706807, Val Acc: 0.5000
Epoch 8, Loss: 0.7284331042668346, Val Acc: 0.5000
Epoch 9, Loss: 0.7208578912506389, Val Acc: 0.4991
Epoch 10, Loss: 0.7118065963300426, Val Acc: 0.4972
Epoch 11, Loss: 0.7811902278148279, Val Acc: 0.5000
Epoch 12, Loss: 0.7163849436165242, Val Acc: 0.5000
Epoch 13, Loss: 0.7069365295131554, Val Acc: 0.5009
Epoch 14, Loss: 0.7074920204694798, Val Acc: 0.5122
Epoch 15, Loss: 0.7003809580865424, Val Acc: 0.5075
Epoch 16, Loss: 0.6948485549134709, Val Acc: 0.5169
Epoch 17, Loss: 0.6944006104371074, Val Acc: 0.5216
Epoch 18, Loss: 0.6906778774457925, Val Ac

Best Batch Size: 32

In [None]:
# Fix optimizer, batch size, and epochs
fixed_optimizer = 'RMSprop'
fixed_batch_size = 32
fixed_epochs = 40

best_accuracy = 0
best_learning_rate = None

for learning_rate in hyperparams['learning_rate']:
    # Initialize model, optimizer, and loss function
    model = RNN(input_dim=300, hidden_dim=128, output_dim=2)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=fixed_batch_size, shuffle=True)
    train_features, train_labels = next(iter(train_loader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    val_loader = DataLoader(validation_dataset, batch_size=fixed_batch_size, shuffle=False)

    # Train model
    train_model(model, device, train_loader, val_loader,optimizer, fixed_epochs, criterion, patience=20)

    # Evaluate model
    accuracy = evaluate_model(model, device, val_loader)
    print(f'Learning Rate: {learning_rate}, Val Acc: {accuracy:.4f}')

    # Update best learning rate
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_learning_rate = learning_rate


print(f"Best Learning Rate: {best_learning_rate}, Best Val Acc: {best_accuracy:.4f}")


Feature batch shape: torch.Size([32, 50, 300])
Labels batch shape: torch.Size([32])
Epoch 1, Loss: 0.6937899254681019, Val Acc: 0.5000
Epoch 2, Loss: 0.6931960120629729, Val Acc: 0.5000
Epoch 3, Loss: 0.6935247464126415, Val Acc: 0.5000
Epoch 4, Loss: 0.6933568789717857, Val Acc: 0.5000
Epoch 5, Loss: 0.6933773666731874, Val Acc: 0.5000
Epoch 6, Loss: 0.6933871239758609, Val Acc: 0.5000
Epoch 7, Loss: 0.6934353702523736, Val Acc: 0.5000
Epoch 8, Loss: 0.6933269351162714, Val Acc: 0.5000
Epoch 9, Loss: 0.6933422059602059, Val Acc: 0.5000
Epoch 10, Loss: 0.6933058250262942, Val Acc: 0.5000
Epoch 11, Loss: 0.6933663962932115, Val Acc: 0.5000
Epoch 12, Loss: 0.6932519658674462, Val Acc: 0.5000
Epoch 13, Loss: 0.6931617177381051, Val Acc: 0.5000
Epoch 14, Loss: 0.6932853578628225, Val Acc: 0.5000
Epoch 15, Loss: 0.6932989914765518, Val Acc: 0.5000
Epoch 16, Loss: 0.6933346149626742, Val Acc: 0.5000
Epoch 17, Loss: 0.6933027746972074, Val Acc: 0.5000
Epoch 18, Loss: 0.6932961601443058, Val A

2A) Optimal Hypermeters: Learning Rate: 0.01, Epoches: 20, Optimiser: RMSprop, Batch Size: 32

2B) Reporting the accuracy score on the test set, as well as the accuracy score on the validation
set for each epoch during training.

In [None]:
def test_model(model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
    accuracy = correct / len(val_loader.dataset)
    return accuracy

In [None]:
#train model with all optimal hyperparameters
# Define optimal hyperparameters
optimal_learning_rate = 0.01
optimal_epochs = 20
optimal_optimizer = 'RMSprop'
optimal_batch_size = 32

# Initialize model, optimizer, and loss function
model = RNN(input_dim=300, hidden_dim=128, output_dim=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = optim.RMSprop(model.parameters(), lr=optimal_learning_rate)
criterion = nn.CrossEntropyLoss()

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=optimal_batch_size, shuffle=True)
val_loader = DataLoader(validation_dataset, batch_size=optimal_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=optimal_batch_size, shuffle=False)

# Train model
train_model(model, device, train_loader, val_loader, optimizer, optimal_epochs, criterion, patience=20)

test_accuracy = test_model(model, device, test_loader)
print(f'Best Model - Test Acc: {test_accuracy:.4f}')

Epoch 1, Loss: 0.7751642617393523, Val Acc: 0.5056
Epoch 2, Loss: 0.7231779770458235, Val Acc: 0.5094
Epoch 3, Loss: 0.7065492098251086, Val Acc: 0.5066
Epoch 4, Loss: 0.7017141169823065, Val Acc: 0.5000
Epoch 5, Loss: 0.6986718182260178, Val Acc: 0.5000
Epoch 6, Loss: 0.6983468316021036, Val Acc: 0.5150
Epoch 7, Loss: 0.6950231728035859, Val Acc: 0.5000
Epoch 8, Loss: 0.6962356299496768, Val Acc: 0.5000
Epoch 9, Loss: 0.6951887817418531, Val Acc: 0.5047
Epoch 10, Loss: 0.6939468144924007, Val Acc: 0.5000
Epoch 11, Loss: 0.6936169291256965, Val Acc: 0.5084
Epoch 12, Loss: 0.6939254508036353, Val Acc: 0.5094
Epoch 13, Loss: 0.6931770716267132, Val Acc: 0.5178
Epoch 14, Loss: 0.6939771653114634, Val Acc: 0.5028
Epoch 15, Loss: 0.693986246871591, Val Acc: 0.5310
Epoch 16, Loss: 0.6921319204769777, Val Acc: 0.5000
Epoch 17, Loss: 0.6915673347894619, Val Acc: 0.5103
Epoch 18, Loss: 0.6901113265909059, Val Acc: 0.5159
Epoch 19, Loss: 0.6897147422872679, Val Acc: 0.5094
Epoch 20, Loss: 0.6878

2C) We have already implemented one method to derive the final sentence representation by taking the Last Hidden State of the RNN

In [None]:
# implemeted in the forward method of the RNN class
"""def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]  # Last hidden state
        out = self.dropout(out)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        return out

We can implement mean pooling or max pooling
Mean Pooling

*   Computes the element-wise mean of all hidden states.
*   Averages the information from all time steps.
*   Useful when all time steps are equally important.

Max Pooling
*   Computes the element-wise maximum of all hidden states.
*   Selects the most prominent features from all time steps.
*   Useful when certain time steps have distinctive features.











In [None]:
class RNN_MeanPooling(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2):
        super(RNN_MeanPooling, self).__init__()
        self.rnn = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)

        # Mean Pooling
        out = torch.mean(out, dim=1)

        out = self.dropout(out)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        return out

In [None]:
class RNN_MaxPooling(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2):
        super(RNN_MaxPooling, self).__init__()
        self.rnn = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)

        # Max Pooling
        out, _ = torch.max(out, dim=1)

        out = self.dropout(out)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        return out

In [None]:
# Mean Pooling
model_mean = RNN_MeanPooling(input_dim=300, hidden_dim=128, output_dim=2)

# Max Pooling
model_max = RNN_MaxPooling(input_dim=300, hidden_dim=128, output_dim=2)

# Train model
train_model(model_mean, device, train_loader, val_loader, optimizer, optimal_epochs, criterion, patience=20)

test_accuracy = evaluate_model(model_mean, device, test_loader)
print(f'Best Model - Test Acc: {test_accuracy:.4f}')

Epoch 1, Loss: 0.6932403925206331, Val Acc: 0.5000
Epoch 2, Loss: 0.6932291002309278, Val Acc: 0.5000
Epoch 3, Loss: 0.6931162248836474, Val Acc: 0.5000
Epoch 4, Loss: 0.6932642093758458, Val Acc: 0.5000
Epoch 5, Loss: 0.6931248764866746, Val Acc: 0.5000
Epoch 6, Loss: 0.6931259116429961, Val Acc: 0.5000
Epoch 7, Loss: 0.6931836857331379, Val Acc: 0.5000
Epoch 8, Loss: 0.6931090966592567, Val Acc: 0.5000
Epoch 9, Loss: 0.6931254464142331, Val Acc: 0.5000
Epoch 10, Loss: 0.6931482127096769, Val Acc: 0.5000
Epoch 11, Loss: 0.6931380497828852, Val Acc: 0.5000
Epoch 12, Loss: 0.6931109401617157, Val Acc: 0.5000
Epoch 13, Loss: 0.6931445386525843, Val Acc: 0.5000
Epoch 14, Loss: 0.6931101306993863, Val Acc: 0.5000
Epoch 15, Loss: 0.6931157355451405, Val Acc: 0.5000
Epoch 16, Loss: 0.6932231332925375, Val Acc: 0.5000
Epoch 17, Loss: 0.6931149604615201, Val Acc: 0.5000
Epoch 18, Loss: 0.6931283440482751, Val Acc: 0.5000
Epoch 19, Loss: 0.6931440848089783, Val Acc: 0.5000
Epoch 20, Loss: 0.693

In [None]:
train_model(model_max, device, train_loader, val_loader, optimizer, optimal_epochs, criterion, patience=20)
test_accuracy = evaluate_model(model_max, device, test_loader)
print(f'Best Model - Test Acc: {test_accuracy:.4f}')

Epoch 1, Loss: 0.693199175350675, Val Acc: 0.5009
Epoch 2, Loss: 0.6931255493271217, Val Acc: 0.5009
Epoch 3, Loss: 0.6931251798676195, Val Acc: 0.5009
Epoch 4, Loss: 0.6930800129411819, Val Acc: 0.5009
Epoch 5, Loss: 0.6931752743345968, Val Acc: 0.5009
Epoch 6, Loss: 0.6930518549926272, Val Acc: 0.5009
Epoch 7, Loss: 0.6931906907299484, Val Acc: 0.5009
Epoch 8, Loss: 0.6931225521287668, Val Acc: 0.5009
Epoch 9, Loss: 0.6931742277931184, Val Acc: 0.5009
Epoch 10, Loss: 0.6931020101804412, Val Acc: 0.5009
Epoch 11, Loss: 0.6931737895761982, Val Acc: 0.5009
Epoch 12, Loss: 0.6930217680413179, Val Acc: 0.5009
Epoch 13, Loss: 0.6931728376877888, Val Acc: 0.5009
Epoch 14, Loss: 0.693045874213458, Val Acc: 0.5009
Epoch 15, Loss: 0.6931474775857247, Val Acc: 0.5009
Epoch 16, Loss: 0.6932124805807621, Val Acc: 0.5009
Epoch 17, Loss: 0.6931823434008195, Val Acc: 0.5009
Epoch 18, Loss: 0.6930404345640976, Val Acc: 0.5009
Epoch 19, Loss: 0.693078400043959, Val Acc: 0.5009
Epoch 20, Loss: 0.693135

3a)

In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentRNN, self).__init__()

        # Define a trainable embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Define the RNN layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)

        # Define the output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Embeddings will update during training
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

# Define the model, optimizer, and loss function
model = SentimentRNN(vocab_size=10000, embedding_dim=100, hidden_dim=256, output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Initialize the tokenizer (using BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the custom dataset (as done previously)
class SentimentDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']  # Get the text for the review
        label = self.dataset[idx]['label']  # Get the sentiment label (0 or 1)

        # Tokenize the input text using BERT tokenizer
        inputs = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')

        # We squeeze the tensor to remove the batch dimension, as we only have a single example
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': torch.tensor(label)}

# Create datasets for PyTorch
max_len = 128  # Maximum length for tokenized sequences
train_data = SentimentDataset(train_dataset, tokenizer, max_len)
validation_data = SentimentDataset(validation_dataset, tokenizer, max_len)
test_data = SentimentDataset(test_dataset, tokenizer, max_len)

# Create DataLoader objects for batching during training
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
validation_loader = DataLoader(validation_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Initialize the model (BERT for sequence classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification (positive/negative)

# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set up optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)  # You can adjust learning rate here
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        # Move data to the device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate accuracy
        _, predictions = torch.max(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

    # Average loss and accuracy for the epoch
    avg_loss = epoch_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_loss:.4f}, Training Accuracy: {accuracy * 100:.2f}%")

    # Validate the model after each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            _, predictions = torch.max(logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    validation_accuracy = correct_predictions / total_predictions
    print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")

# After training, evaluate on the test set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        _, predictions = torch.max(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Training Loss: 0.3992, Training Accuracy: 81.85%
Validation Accuracy: 84.33%
Epoch 2/3
Training Loss: 0.2191, Training Accuracy: 91.96%
Validation Accuracy: 85.46%
Epoch 3/3
Training Loss: 0.1110, Training Accuracy: 96.57%
Validation Accuracy: 83.11%
Test Accuracy: 83.49%


3b)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet
import nltk
import gensim

nltk.download('wordnet')

def get_embedding_random():
    return np.random.normal(scale=0.6, size=(embedding_dim,))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def get_embedding_average():
    all_known_embeddings = np.array([pretrained_model[word] for word in pretrained_model.key_to_index])
    return np.mean(all_known_embeddings, axis=0)

In [None]:
def get_embedding_root(word):
    synsets = wordnet.synsets(word)
    root_words = {lemma.name().lower() for synset in synsets for lemma in synset.lemmas()}
    root_embeddings = [pretrained_model[root] for root in root_words if root in pretrained_model.key_to_index]

    if root_embeddings:
        return np.mean(root_embeddings, axis=0)
    else:
        return get_embedding_random()

In [None]:
# Identify unique words and OOV words
unique_strings = set(word for sentence in train_processed_sentences for word in sentence)
oov_words = [word for word in unique_strings if word not in pretrained_model]

# Create word-to-index dictionary
word_to_idx = {word: idx + 1 for idx, word in enumerate(unique_strings)}  # +1 for padding index 0

# Initialize embedding matrices for each strategy
embedding_matrices = {
    "random": np.zeros((len(word_to_idx) + 1, embedding_dim)),
    "average": np.zeros((len(word_to_idx) + 1, embedding_dim)),
    "root": np.zeros((len(word_to_idx) + 1, embedding_dim)),
}

In [None]:
# Populate embedding matrices
for word, idx in word_to_idx.items():
    if word in pretrained_model.key_to_index:
        embedding_matrices["random"][idx] = pretrained_model[word]
        embedding_matrices["average"][idx] = pretrained_model[word]
        embedding_matrices["root"][idx] = pretrained_model[word]
    else:
        embedding_matrices["random"][idx] = get_embedding_random()
        embedding_matrices["average"][idx] = get_embedding_average()
        embedding_matrices["root"][idx] = get_embedding_root(word)

In [None]:
# Helper to get sentence embeddings
def get_sentence_embedding(sentence, embedding_matrix):
    embeddings = [embedding_matrix[word_to_idx[word]] for word in sentence if word in word_to_idx]
    return np.mean(embeddings, axis=0)  # Average embedding for the sentence

In [None]:
# Extract text and labels from the dataset
train_texts = [item['text'] for item in train_dataset]
train_labels = [item['label'] for item in train_dataset]

validation_texts = [item['text'] for item in validation_dataset]
validation_labels = [item['label'] for item in validation_dataset]

test_texts = [item['text'] for item in test_dataset]
test_labels = [item['label'] for item in test_dataset]

# Process the labels similarly for training/testing split
labels = train_labels  # use this in the main code where `labels` was required


In [None]:
from sklearn.model_selection import train_test_split

# Evaluate each strategy
results = {}
sentence_embeddings_dict = {strategy: [] for strategy in embedding_matrices}  # Store embeddings by strategy

for strategy, embedding_matrix in embedding_matrices.items():
    sentence_embeddings = np.array([get_sentence_embedding(sentence, embedding_matrix) for sentence in train_processed_sentences])
    sentence_embeddings_dict[strategy] = sentence_embeddings

    # Split into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(
        sentence_embeddings, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Train and evaluate model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    results[strategy] = accuracy
    print(f"Accuracy with {strategy} OOV strategy: {accuracy:.4f}")

# Print final comparison
print("Final comparison of OOV handling strategies:", results)


Accuracy with random OOV strategy: 0.7421
Accuracy with average OOV strategy: 0.7679
Accuracy with root OOV strategy: 0.7368
Final comparison of OOV handling strategies: {'random': 0.7420867526377491, 'average': 0.7678780773739742, 'root': 0.7368112543962485}


3c) biLSTM & GRU (GRU Model used above ^)


biLSTM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Double the hidden dimension for bidirectional
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(2 * self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(2 * self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :]  # Concatenate the final forward and backward hidden states
        out = self.dropout(out)
        out = self.fc(out)
        out = F.softmax(out, dim=1)
        return out




In [None]:
import torch
import nltk
import re
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Custom Tokenizer Function using RegEx-based pattern
def custom_tokenizer(data):
    # pattern removes all punctuation and retains the abbreviations (additional preprocessing is required to remove the period marking the end of the sentence)
    pattern = r'''(?x)          # set flag to allow verbose regexps
          (?:[A-Za-z]\.)+        # abbreviations(both upper and lower case, like "e.g.", "U.S.A.")
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \[\][.,;"'?():_-]    # these are separate tokens; includes ], [
          | \.                   # periods are separate tokens
      '''
    sentences = []
    for i in range(len(data)):
        text = re.sub(r'\s*([,.":])\s*', r'\1', data[i]['text'])  # removes additional whitespace within sequences
        sentences.append(nltk.regexp_tokenize(text, pattern))  # Tokenize
    return sentences


# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, word_to_idx, max_length=100):
        self.data = data
        self.tokenizer = tokenizer
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['label']

        # Tokenize the text using the provided tokenizer function
        tokens = custom_tokenizer([self.data[idx]])  # Pass the dictionary as a list (i.e., single item)
        tokens = tokens[0]  # Extract the list of tokens (first element in the list)

        # Convert tokens to indices using word_to_idx, with 0 as the OOV token
        token_ids = [self.word_to_idx.get(token, 0) for token in tokens]

        # Pad or truncate the token_ids to max_length
        if len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]
        else:
            token_ids += [0] * (self.max_length - len(token_ids))

        return torch.tensor(token_ids), torch.tensor(label)


# Example word_to_idx
word_to_idx = {'the': 1, 'is': 2, 'a': 3, 'positive': 4, 'negative': 5, 'review': 6, 'this': 7}  # Example word-to-index mapping
vocab_size = len(word_to_idx) + 1  # +1 for padding
embedding_dim = 300  # Example embedding dimension
hidden_dim = 128  # Hidden dimension
output_dim = 2  # Binary classification (positive/negative)
num_layers = 2  # Number of layers in LSTM


# Define the BiLSTM Model
class BiLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers, vocab_size, padding_idx):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        print(f"Embedded shape: {embedded.shape}")  # Check if embeddings are correct
        lstm_out, _ = self.lstm(embedded)
        print(f"LSTM output shape: {lstm_out.shape}")
        out = lstm_out[:, -1, :]
        print(f"Final output shape: {out.shape}")
        return self.fc(out)


# Define the BiGRU Model (used later for comparison)
class BiGRUClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers, vocab_size, padding_idx):
        super(BiGRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers,
                          bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        gru_out, _ = self.gru(embedded)
        out = gru_out[:, -1, :]
        return self.fc(out)


# Sample data
train_dataset = [{'text': 'This is a positive review.', 'label': 1}, {'text': 'This is a negative review.', 'label': 0}]
validation_dataset = [{'text': 'This is another positive review.', 'label': 1}, {'text': 'This is another negative review.', 'label': 0}]
test_dataset = [{'text': 'This is a test sentence.', 'label': 0}]

# Check label distribution
labels = [item['label'] for item in train_dataset]
print(f"Label Distribution: {labels.count(0)} negative, {labels.count(1)} positive")

# Create DataLoader
train_data = SentimentDataset(train_dataset, custom_tokenizer, word_to_idx, max_length=100)
validation_data = SentimentDataset(validation_dataset, custom_tokenizer, word_to_idx, max_length=100)
test_data = SentimentDataset(test_dataset, custom_tokenizer, word_to_idx, max_length=100)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)  # Reduced batch size
val_loader = DataLoader(validation_data, batch_size=16)  # Reduced batch size
test_loader = DataLoader(test_data, batch_size=16)  # Reduced batch size


# Training Function
def train_model(model, train_loader, val_loader, num_epochs=5, learning_rate=0.0001):  # Reduced learning rate
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0
        for text, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(text)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
        evaluate_model(model, val_loader)  # Evaluate on validation set

# Evaluation Function
def evaluate_model(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for text, labels in data_loader:
            outputs = model(text)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy


# Train and Evaluate biLSTM Model
bi_lstm_model = BiLSTMClassifier(embedding_dim, hidden_dim, output_dim, num_layers, vocab_size, padding_idx=0)
print("\nTraining biLSTM Model:")
train_model(bi_lstm_model, train_loader, val_loader)

# Evaluate biLSTM on Test Set
print("\nTesting biLSTM Model on Test Set:")
evaluate_model(bi_lstm_model, test_loader)

# Train and Evaluate biGRU Model
bi_gru_model = BiGRUClassifier(embedding_dim, hidden_dim, output_dim, num_layers, vocab_size, padding_idx=0)
print("\nTraining biGRU Model:")
train_model(bi_gru_model, train_loader, val_loader)

# Evaluate biGRU on Test Set
print("\nTesting biGRU Model on Test Set:")
evaluate_model(bi_gru_model, test_loader)


Label Distribution: 1 negative, 1 positive

Training biLSTM Model:
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Epoch 1/5, Loss: 0.6931
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Validation Accuracy: 0.5000
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Epoch 2/5, Loss: 0.6931
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Validation Accuracy: 0.5000
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Epoch 3/5, Loss: 0.6931
Embedded shape: torch.Size([2, 100, 300])
LSTM output shape: torch.Size([2, 100, 256])
Final output shape: torch.Size([2, 256])
Validation Accuracy: 0.5000
Embedded 

1.0

3d)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
import random

# Initialize the tokenizer (using BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the custom dataset (as done previously)
class SentimentDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len, augment=False):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment  # Whether to apply data augmentation

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']  # Get the text for the review
        label = self.dataset[idx]['label']  # Get the sentiment label (0 or 1)

        if self.augment:
            text = self.augment_text(text)  # Augment the text

        # Tokenize the input text using BERT tokenizer
        inputs = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')

        # We squeeze the tensor to remove the batch dimension, as we only have a single example
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': torch.tensor(label)}

    def augment_text(self, text):
        """Simple text augmentation: random token shuffling."""
        words = text.split()
        random.shuffle(words)  # Shuffle the words in the sentence
        return ' '.join(words)

# Create datasets for PyTorch
max_len = 128  # Maximum length for tokenized sequences
train_data = SentimentDataset(train_dataset, tokenizer, max_len, augment=True)
validation_data = SentimentDataset(validation_dataset, tokenizer, max_len)
test_data = SentimentDataset(test_dataset, tokenizer, max_len)

# Create DataLoader objects for batching during training
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
validation_loader = DataLoader(validation_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Initialize the model (BERT for sequence classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification (positive/negative)

# Set device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set up optimizer and loss function with L2 regularization (weight decay)
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # L2 regularization (weight decay)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        # Move data to the device (GPU or CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Calculate accuracy
        _, predictions = torch.max(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

    # Average loss and accuracy for the epoch
    avg_loss = epoch_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_loss:.4f}, Training Accuracy: {accuracy * 100:.2f}%")

    # Validate the model after each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            _, predictions = torch.max(logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    validation_accuracy = correct_predictions / total_predictions
    print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")

# After training, evaluate on the test set
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        _, predictions = torch.max(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Training Loss: 0.5336, Training Accuracy: 72.68%
Validation Accuracy: 82.36%
Epoch 2/5
Training Loss: 0.3900, Training Accuracy: 82.11%
Validation Accuracy: 82.93%
Epoch 3/5
Training Loss: 0.2964, Training Accuracy: 87.23%
Validation Accuracy: 83.21%
Epoch 4/5
Training Loss: 0.2052, Training Accuracy: 91.62%
Validation Accuracy: 79.64%
Epoch 5/5
Training Loss: 0.1199, Training Accuracy: 95.43%
Validation Accuracy: 80.11%
Test Accuracy: 81.14%


In [None]:
print(validation_dataset)  # Check if this contains 'text' and 'label'
print(train_dataset)
print(test_dataset)


print("Train dataset type:", type(train_dataset))
print("Validation dataset type:", type(validation_dataset))
print("Test dataset type:", type(test_dataset))

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
Train dataset type: <class 'datasets.arrow_dataset.Dataset'>
Validation dataset type: <class 'datasets.arrow_dataset.Dataset'>
Test dataset type: <class 'datasets.arrow_dataset.Dataset'>


3e)

3f)