# Install dependencies


In [260]:
%pip install torch gensim datasets nltk

Note: you may need to restart the kernel to use updated packages.


# Import dependencies


In [261]:
import nltk

nltk.download("all")

import torch

import numpy as np
import torch.nn as nn
import torch.optim as optim


from collections import Counter
from datasets import load_dataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from gensim.downloader import load as load_word2vec

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/harry.tran/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/harry.tran/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/harry.tran/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/harry.tran/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/harry.tran/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is alread

# Part 0. Dataset Preparation


In [262]:
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

## Simple statatistics about dataset


In [263]:
print(f"Size of training set: {train_dataset.num_rows} sentences")
print(f"Size of validation set: {validation_dataset.num_rows} sentences")
print(f"Size of test set: {test_dataset.num_rows} sentences")

Size of training set: 8530 sentences
Size of validation set: 1066 sentences
Size of test set: 1066 sentences


In [264]:
print(f"Sample sentence from train dataset: {test_dataset[0]['text']}")
print(f"Label: {'Positive' if test_dataset[0]['label'] == 1 else 'Negative'}")

Sample sentence from train dataset: lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
Label: Positive


# Part 1. Preparing Word Embeddings


## Question 1. Word Embedding


#### `(a)` What is the size of the vocabulary formed from your training data?


In [265]:
# Tokenize each sentence
tokenized_texts = [word_tokenize(text["text"].lower()) for text in train_dataset]
print(f"Sample tokens from a sentence {tokenized_texts[0]}")

# Flatten all tokens in each token
all_tokens = [token for sentence in tokenized_texts for token in sentence]

# Create a Counter object to count the frequency of each token
vocab = {"<PAD>": 0, "<UNK>": 1}
for token in all_tokens:
    if token not in vocab:
        vocab[token] = len(vocab)

vocab_size = len(vocab)

print(f"Number of tokens in all sentences: {len(all_tokens)}")
print(f"Size of the vocabulary including PADDING and UNKNOWN tokens: {vocab_size}")
print(f"Size of the vocabulary: {vocab_size - 2}")

Sample tokens from a sentence ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'s", 'new', '``', 'conan', '``', 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.']
Number of tokens in all sentences: 183968
Size of the vocabulary including PADDING and UNKNOWN tokens: 18031
Size of the vocabulary: 18029


`<PAD>` token was introduced for easier processing at later steps. Because RNN requires all sentences having the same length

`<UNK>` token was introduced for handling unknown word in `Word2vec` vocabulary. All unkown words will be assigned as a `<UNK>` token


By using punkt tokenizer from NLTK, size of the vocabulary formed from training data is `18029`


#### `(b)` We use OOV (out-of-vocabulary) to refer to those words appeared in the training data but not in the Word2vec (or Glove) dictionary. How many OOV words exist in your training data?

#### `(c)` The existence of the OOV words is one of the well-known limitations of Word2vec (or Glove). Without using any transformer-based language models (e.g., BERT, GPT, T5), what do you think is the best strategy to mitigate such limitation? Implement your solution in your source code. Show the corresponding code snippet.


In [266]:
# word2vec = load_word2vec('word2vec-google-news-300')
# word2vec.save("word2vec.kv")

word2vec = KeyedVectors.load("./word2vec.kv")
embedding_dim = word2vec.vector_size
print(f"Embedding dimension: {embedding_dim}")

ValueError: Incorrect model/corpus name

In [252]:
mean = np.mean(word2vec.vectors, axis=0)
std = np.std(word2vec.vectors, axis=0)

In [253]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))
oov_count = 0

# Step 3: Populate the embedding matrix
for word, index in vocab.items():
    if word in word2vec:
        embedding_matrix[index] = word2vec[word]
    else:
        # Handle OOV words with random initialization
        oov_count += 1
        embedding_matrix[index] = np.random.normal(
            loc=mean, scale=std, size=(embedding_dim,)
        )

print(f"Sample embeddings of {word}: {embedding_matrix[index]}")

Sample embeddings of portent: [ 2.30468750e-01 -1.02539062e-02  1.08886719e-01 -2.01171875e-01
  5.32226562e-02 -2.91748047e-02  4.68750000e-02  3.47900391e-03
  1.88476562e-01  3.94531250e-01 -8.36181641e-03 -2.08984375e-01
 -6.39648438e-02 -6.39648438e-02  6.62231445e-03  3.30078125e-01
  1.00585938e-01  3.10546875e-01  1.71875000e-01 -4.78515625e-01
  2.46047974e-04  9.81445312e-02  2.28515625e-01 -2.81250000e-01
  6.98242188e-02 -6.29882812e-02  1.91406250e-01  1.34765625e-01
  4.32128906e-02  5.03906250e-01 -2.09960938e-02 -1.01074219e-01
  2.69775391e-02  8.83789062e-02 -1.78710938e-01  6.15234375e-02
  7.37304688e-02 -4.37011719e-02  3.51562500e-02  5.29785156e-02
  4.92187500e-01  7.91015625e-02  9.66796875e-02  6.80541992e-03
 -1.01562500e-01  1.56250000e-01 -1.59179688e-01  2.04101562e-01
  2.02148438e-01 -1.04980469e-01  1.04980469e-01  1.71875000e-01
 -1.09863281e-01  1.14257812e-01 -1.37695312e-01  1.69921875e-01
 -8.01086426e-04 -1.74804688e-01  2.71484375e-01 -8.42285156

In [254]:
print(f"Number of out-of-vocabulary words: {oov_count - 2}")

Number of out-of-vocabulary words: 3612


`(b)` There are 3612 out-of-vocabulary words in our data


In [255]:
print(f"Sample embeddings of an unkown word: {embedding_matrix[vocab.get('<UNK>')]}")

Sample embeddings of an unkown word: [ 8.30000981e-02  1.67480450e-02  4.64828163e-02  1.32721663e-01
  5.77022881e-03 -6.03187460e-02 -8.20359701e-02 -2.07644250e-01
  1.86239784e-02 -1.02173295e-01 -7.89212196e-03  4.66939772e-02
  2.37347617e-02  6.61435875e-02  1.04954436e-01  1.05205032e-01
  2.18846705e-01 -1.55617495e-02 -1.81992767e-02  5.77860669e-02
 -4.10592191e-02  3.22300012e-02  2.67630885e-02  3.59988489e-02
 -1.92145504e-01  1.28826277e-02 -1.29102486e-01  3.41687176e-02
  1.51969729e-02 -1.20016093e-01 -3.95793783e-02 -6.95113998e-02
 -1.95638195e-01 -1.59089887e-01  4.65387919e-04 -1.08833362e-02
  3.75744926e-02 -5.63716259e-02  7.78926620e-03 -4.13194679e-02
 -5.24402554e-03 -5.85758149e-02 -5.85411117e-02  2.47275089e-03
  6.87178706e-02  7.57839932e-02 -2.19895578e-01 -1.11943442e-01
  1.30330233e-01  8.48513692e-02 -4.38459941e-01  1.95377902e-01
  9.16618093e-02  8.01268090e-02  2.55622291e-02 -1.15092788e-01
  2.07613048e-02  9.25101986e-02  1.20242261e-01 -1.4

`(c)` To address out-of-vocabulary (OOV) words in Word2Vec, we assigned them embeddings generated from random samples of a normal distribution based on the mean and standard deviation of all existing word vectors. This approach offers several advantages:

- Consistency in Input: Randomly generated embeddings ensure a consistent shape and dimensionality for OOV words, facilitating seamless integration into the model.
- Statistical Alignment: By matching the mean and standard deviation of known embeddings, we align the OOV representations with the learned vocabulary, reducing the potential instability introduced by random noise.


# Part 2. Model Training & Evaluation - RNN


In [256]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        token_indices = self.__tokenizer(text, self.max_len)
        return torch.tensor(token_indices, dtype=torch.long), torch.tensor(
            label, dtype=torch.long
        )

    def __tokenizer(self, sentence, max_len):
        tokens = word_tokenize(sentence.lower())
        token_indices = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
        token_indices = token_indices[:max_len]
        padded_tokens = token_indices + [vocab["<PAD>"]] * (
            max_len - len(token_indices)
        )
        return padded_tokens

In [257]:
class RNNModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True,  # Freeze the embeddings
        )
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)  # To prevent overfitting

    def forward(self, x):
        embedded = self.embedding(x)  # [batch_size, seq_length, embedding_dim]
        rnn_output, hidden = self.rnn(embedded)
        # We use the hidden state of the last RNN unit as the sentence representation
        last_hidden = hidden.squeeze(0)
        output = self.dropout(last_hidden)
        output = self.fc(output)
        return output

In [258]:
max_length = max(len(item) for item in tokenized_texts)
train_data = TextDataset(
    [item["text"] for item in train_dataset],
    [item["label"] for item in train_dataset],
    max_length,
)

valid_data = TextDataset(
    [item["text"] for item in validation_dataset],
    [item["label"] for item in validation_dataset],
    max_length,
)

test_data = TextDataset(
    [item["text"] for item in test_dataset],
    [item["label"] for item in test_dataset],
    max_length,
)

batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

hidden_dim = 256
output_dim = 2

model = RNNModel(embedding_matrix, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RNNModel(
  (embedding): Embedding(18031, 300)
  (rnn): RNN(300, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [259]:
def train_epoch(model, data_loader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct_predictions = 0
    
    for texts, labels in data_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(texts)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        correct_predictions += (outputs.argmax(1) == labels).sum().item()
    
    return total_loss / len(data_loader), correct_predictions / len(data_loader.dataset)

def evaluate_epoch(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = texts.to(device)
            labels = labels.to(device)
            
            outputs = model(texts)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            correct_predictions += (outputs.argmax(1) == labels).sum().item()
    
    return total_loss / len(data_loader), correct_predictions / len(data_loader.dataset)

# Training the model with early stopping
num_epochs = 50
best_val_accuracy = 0
early_stopping_count = 0

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = evaluate_epoch(model, val_loader, criterion)
    
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    
    # Early stopping if validation accuracy does not improve for 3 epochs
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        torch.save(model.state_dict(), 'best_rnn_model.pth')
        early_stopping_count = 0
    else:
        early_stopping_count += 1
        if early_stopping_count >= 5:
            print("Early stopping as validation accuracy is not improving")
            break

# Evaluate on test set
model.load_state_dict(torch.load('best_rnn_model.pth'))
model.to(device)

test_loss, test_acc = evaluate_epoch(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch 1/50
Train Loss: 0.7147, Train Acc: 0.5094
Val Loss: 0.7028, Val Acc: 0.5066
Epoch 2/50
Train Loss: 0.7173, Train Acc: 0.5062
Val Loss: 0.7138, Val Acc: 0.4700
Epoch 3/50
Train Loss: 0.7126, Train Acc: 0.5072
Val Loss: 0.6948, Val Acc: 0.5328
Epoch 4/50
Train Loss: 0.7130, Train Acc: 0.5047
Val Loss: 0.7144, Val Acc: 0.5056
Epoch 5/50
Train Loss: 0.7053, Train Acc: 0.5137
Val Loss: 0.6945, Val Acc: 0.5066
Epoch 6/50
Train Loss: 0.7061, Train Acc: 0.5083
Val Loss: 0.7024, Val Acc: 0.5403
Epoch 7/50
Train Loss: 0.7056, Train Acc: 0.5117
Val Loss: 0.6887, Val Acc: 0.5403
Epoch 8/50
Train Loss: 0.7079, Train Acc: 0.5067
Val Loss: 0.6885, Val Acc: 0.5347
Epoch 9/50
Train Loss: 0.7054, Train Acc: 0.5117
Val Loss: 0.6962, Val Acc: 0.5356
Epoch 10/50
Train Loss: 0.7056, Train Acc: 0.5108
Val Loss: 0.7062, Val Acc: 0.4944
Epoch 11/50
Train Loss: 0.7010, Train Acc: 0.5151
Val Loss: 0.6929, Val Acc: 0.5272
Early stopping as validation accuracy is not improving


  model.load_state_dict(torch.load('best_rnn_model.pth'))


Test Loss: 0.7058, Test Acc: 0.5281
