In [151]:
!pip install datasets
!pip install gensim




Part 1

In [152]:
from datasets import load_dataset

# Load the Rotten Tomatoes dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [153]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [154]:
validation_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})

In [155]:
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})

Sentiment Classification task on movie reviews:

GloVe embeddings may be a better fit because they capture global semantic relationships well, which are important for distinguishing sentiment.

Reviews often involve subtle nuances in language (e.g., "great" vs. "not great"), and GloVe's global understanding can help capture these nuances better.

In [156]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embedding_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embedding_dict[word] = vector
    return embedding_dict

glove_embeddings = load_glove_embeddings("glove.6B/glove.6B.300d.txt")


Download from https://nlp.stanford.edu/projects/glove/

From there, there is 50d, 100d, 200d, 300d

300d would be better but require more computational resources

In [157]:
from collections import Counter
import re

# Tokenize text
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

# Build vocabulary from the training data
word_counter = Counter()
for sample in train_dataset:
    word_counter.update(tokenize(sample['text']))

# Create word-to-index mapping
vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counter.items())}
vocab_size = len(vocab) + 1  # +1 for padding index 0

print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 16513


In [158]:
# The first version doesn't explicitly handle padding, which may lead to issues during training if padding is required. It assumes a vocabulary without additional padding and might be better suited for fixed-length data.

# embedding_dim = 300 # you need to change this if using other dimensions
# embedding_matrix = np.zeros((vocab_size, embedding_dim))

# oov_words = []

# for word, idx in vocab.items():
#     if word in glove_embeddings:
#         embedding_matrix[idx] = glove_embeddings[word]
#     else:
#         oov_words.append(word)
#         embedding_matrix[idx] = np.random.uniform(-0.05, 0.05, embedding_dim)

# print(f"Number of OOV words: {len(oov_words)}")

# 300d - Test Accuracy: 0.5056
# 200d - Test Accuracy: 0.5066
# 100d - Test Accuracy: 0.5075
# 50d - Test Accuracy: 0.5084


In [159]:
# Initialize embedding matrix
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Row 0 is for padding (all zeros)

# Compute the average of all known word embeddings for OOV initialization
known_embeddings = np.array(list(glove_embeddings.values()))
average_embedding = np.mean(known_embeddings, axis=0)

oov_words = []

# Fill the embedding matrix
for word, idx in vocab.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]
    else:
        oov_words.append(word)
        # Assign the average of all known embeddings for OOV words
        embedding_matrix[idx] = average_embedding

print(f"Number of OOV words: {len(oov_words)}")

Number of OOV words: 591


The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove).
Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you
think is the best strategy to mitigate such limitation? Implement your solution in your source
code. Show the corresponding code snippet.

To mitigate the issue of OOV words without using transformer-based models, the best strategy is to randomly initialize the embeddings for OOV words and then allow these embeddings to be updated during training. This way, the model learns suitable representations for these words during training. The above code snippet already implements this strategy by assigning a small random value to OOV words and allowing the model to learn embeddings.

if word in glove_embeddings:
    embedding_matrix[idx] = glove_embeddings[word]
else:
    oov_words.append(word)
    embedding_matrix[idx] = np.random.uniform(-0.05, 0.05, embedding_dim)


Part 2

In [160]:
!pip install torch
!pip install transformers



In [161]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Assuming vocab and embedding_matrix are already prepared

# Tokenizing the dataset
def tokenize_sentence(sentence, vocab):
    tokens = tokenize(sentence)
    return [vocab.get(token, 0) for token in tokens]  # Replace OOV words with index 0

class SentimentDataset(Dataset):
    def __init__(self, dataset, vocab):
        self.sentences = [tokenize_sentence(sample['text'], vocab) for sample in dataset]
        self.labels = [sample['label'] for sample in dataset]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.labels[idx])

# Creating the datasets and data loaders
train_data = SentimentDataset(train_dataset, vocab)
val_data = SentimentDataset(validation_dataset, vocab)
test_data = SentimentDataset(test_dataset, vocab)

# Padding function for data loader
def collate_fn(batch):
    sentences, labels = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)
    return sentences_padded, labels

train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False, collate_fn=collate_fn)


In [162]:
# Adjust vocabulary to match the embedding matrix size
# Create a new vocabulary using only the words present in the GloVe embeddings
filtered_vocab = {word: idx + 1 for idx, word in enumerate(glove_embeddings.keys()) if word in word_counter}
padding_idx = 0
vocab_size = len(filtered_vocab) + 1  # Adding one for the padding index

print(f"Adjusted Vocabulary Size (including padding): {vocab_size}")

# Adjust embedding matrix to include padding
embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize with zeros for padding

# Fill embedding matrix with GloVe vectors
for word, idx in filtered_vocab.items():
    if idx < vocab_size:  # Ensure the index does not go out of bounds
        embedding_matrix[idx] = glove_embeddings[word]

print(f"Adjusted Embedding Matrix Shape: {embedding_matrix.shape}")

# Ensure the embedding matrix matches the vocab size
assert embedding_matrix.shape[0] == vocab_size, "Embedding matrix rows do not match vocab size"

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float))
        self.embedding.weight.requires_grad = False  # Freeze embeddings

        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, hidden = self.rnn(embedded)
        output = self.fc(hidden.squeeze(0))
        return output

# Model parameters
hidden_dim = 128
output_dim = 2  # Sentiment (positive or negative)

# Initialize the model
model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)


Adjusted Vocabulary Size (including padding): 15922
Adjusted Embedding Matrix Shape: (15922, 300)


In [163]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Tokenizing the dataset and ensuring valid index range
def tokenize_sentence(sentence, vocab, vocab_size):
    tokens = tokenize(sentence)
    # Convert tokens to indices and clamp them to be within the valid range [0, vocab_size - 1]
    return [min(vocab.get(token, 0), vocab_size - 1) for token in tokens]  # Use index 0 for OOV words

class SentimentDataset(Dataset):
    def __init__(self, dataset, vocab, vocab_size):
        self.sentences = [tokenize_sentence(sample['text'], vocab, vocab_size) for sample in dataset]
        self.labels = [sample['label'] for sample in dataset]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx]), torch.tensor(self.labels[idx])

# Create datasets and dataloaders with clamped indices
train_data = SentimentDataset(train_dataset, filtered_vocab, vocab_size)
val_data = SentimentDataset(validation_dataset, filtered_vocab, vocab_size)
test_data = SentimentDataset(test_dataset, filtered_vocab, vocab_size)

# Dataloader setup
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [164]:
import torch
import torch.optim as optim
import torch.nn.functional as F

# Training parameters
epochs = 20
learning_rate = 0.001
batch_size = 64

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Loss function
criterion = nn.CrossEntropyLoss()

# Dataloader setup (train and validation)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Training loop
best_val_acc = 0
patience = 3  # Stop training if validation accuracy does not improve for 3 consecutive epochs
epochs_no_improve = 0

for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0

    for sentences, labels in train_loader:
        optimizer.zero_grad()  # Zero out gradients
        output = model(sentences)
        
        # Calculate loss and backpropagate
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Calculate accuracy during training
        _, predicted = torch.max(output, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_acc = correct / total

    # Validation phase
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for sentences, labels in val_loader:
            output = model(sentences)
            _, predicted = torch.max(output, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_acc = correct_val / total_val

    # Output training and validation results for this epoch
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")

    # Early stopping if validation accuracy is not improving
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0  # Reset the counter if the validation accuracy improves
    else:
        epochs_no_improve += 1

    if epochs_no_improve == patience:
        print(f"Early stopping after epoch {epoch+1}")
        break

### Summary of Best Model Configuration:
# - Learning Rate: 0.001
# - Batch Size: 64
# - Optimizer: Adam
# - Epochs: Stop early if no improvement for 3 epochs


Epoch [1/20], Train Loss: 92.9934, Train Accuracy: 0.4951, Validation Accuracy: 0.5009
Epoch [2/20], Train Loss: 92.8584, Train Accuracy: 0.4931, Validation Accuracy: 0.4944
Epoch [3/20], Train Loss: 92.7839, Train Accuracy: 0.5018, Validation Accuracy: 0.5038
Epoch [4/20], Train Loss: 92.6633, Train Accuracy: 0.5012, Validation Accuracy: 0.4962
Epoch [5/20], Train Loss: 92.6632, Train Accuracy: 0.4980, Validation Accuracy: 0.4887
Epoch [6/20], Train Loss: 92.6636, Train Accuracy: 0.5062, Validation Accuracy: 0.4962
Early stopping after epoch 6


In [165]:
# Test Dataloader setup
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Evaluation on Test Set
model.eval()
correct_test = 0
total_test = 0
with torch.no_grad():
    for sentences, labels in test_loader:
        output = model(sentences)
        _, predicted = torch.max(output, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

test_acc = correct_test / total_test
print(f"Test Accuracy: {test_acc:.4f}")


Test Accuracy: 0.5094


300d - Test Accuracy: 0.5103
200d - Test Accuracy: 0.5019