In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np
import torch
import csv
import os
import re

In [2]:
# One seed for every random number generator used in this notebook
# (PyTorch CPU, PyTorch CUDA and NumPy).
SEED = 256

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device.")

Using cpu device.


## 2.1 Word embedding

1. Read txt files and tokenize them to obtain train/validation/test lists of words.

In [4]:
# Tokenizer shared by the helpers in this cell: it is the default `tokenizer`
# argument of both tokenize() and yield_tokens().
TOKENIZER = get_tokenizer("basic_english")


def read_txt_files(datapath):
    """Read all lines of every ``.txt`` file located directly in ``datapath``.

    Files are visited in sorted name order so the returned list does not depend
    on the order in which the file system happens to list the directory.

    Args:
        datapath: Directory to scan, with or without a trailing path separator.

    Returns:
        A list with the lines of all ``.txt`` files, line endings included.
    """
    txt_names = sorted(name for name in os.listdir(datapath) if name.endswith(".txt"))

    lines = []
    for name in txt_names:
        # os.path.join works whether or not `datapath` ends with a separator;
        # the explicit encoding avoids depending on the platform default.
        with open(os.path.join(datapath, name), encoding="utf-8") as f:
            lines.extend(f.readlines())
    return lines


def tokenize(lines, tokenizer=TOKENIZER):
    """Tokenize every line and return all tokens as one flat list.

    Args:
        lines: Iterable of text lines.
        tokenizer: Callable mapping a line to an iterable of tokens.

    Returns:
        The tokens of all lines, concatenated in input order.
    """
    return [token for line in lines for token in tokenizer(line)]


def yield_tokens(lines, tokenizer=TOKENIZER):
    """Yield the tokens of each line after stripping unwanted words.

    Before tokenizing, every word containing a digit and every word containing
    an uppercase letter is replaced by a space, then runs of whitespace are
    collapsed to a single space.

    Args:
        lines: Iterable of raw text lines.
        tokenizer: Callable mapping a cleaned line to its tokens.

    Yields:
        The tokenizer output for each cleaned line, one item per input line.
    """
    # Raw strings keep the backslashes literal; "\w" and "\s" in a normal string
    # are invalid escape sequences. Compiling once hoists the work out of the loop.
    words_with_digits = re.compile(r"\w*[0-9]+\w*")  # words containing numbers
    words_with_capitals = re.compile(r"\w*[A-Z]+\w*")  # words with capital letters (names)
    whitespace_runs = re.compile(r"\s+")  # sequences of whitespace

    for line in lines:
        # The order matters: remove words first, then tidy the spaces left behind
        line = words_with_digits.sub(" ", line)
        line = words_with_capitals.sub(" ", line)
        line = whitespace_runs.sub(" ", line)
        yield tokenizer(line)

In [5]:
GENERATED_PATH = "./generated/"  # Path where generated data files are stored

# Cache file of the tokenized word list of each split
words_cache = {split: GENERATED_PATH + f"words_{split}.pt" for split in ("train", "val", "test")}

# Use the cache only when all three files are present; checking the training file
# alone would fail on torch.load if another split's file were missing.
if all(os.path.isfile(path) for path in words_cache.values()):
    # Load preprocessed training, validation, and test word lists from .pt files
    words_train = torch.load(words_cache["train"])
    words_val = torch.load(words_cache["val"])
    words_test = torch.load(words_cache["test"])
else:
    # If preprocessed data does not exist, read text files
    lines_books_train = read_txt_files("data/data_train/")
    lines_books_val = read_txt_files("data/data_val/")
    lines_books_test = read_txt_files("data/data_test/")

    # Tokenize the lines from train, validation, and test datasets
    words_train = tokenize(lines_books_train)
    words_val = tokenize(lines_books_val)
    words_test = tokenize(lines_books_test)

    # Save the tokenized word lists to .pt files (creating the folder on a fresh checkout)
    os.makedirs(GENERATED_PATH, exist_ok=True)
    torch.save(words_train, words_cache["train"])
    torch.save(words_val, words_cache["val"])
    torch.save(words_test, words_cache["test"])

2. Define a vocabulary based on the training dataset. To keep the vocabulary from growing too large, one option is to keep only the words that appear at least 100 times in the training dataset. Report the total number of words in the training dataset, the number of distinct words in the training dataset, and the size of the defined vocabulary. Comment on your results.

In [6]:
MIN_FREQ = 100  # Minimum number of occurrences for a token to enter the vocabulary


def create_vocabulary(lines, min_freq=MIN_FREQ):
    """Build the vocabulary from raw text lines.

    Args:
        lines: Iterable of raw text lines; they are cleaned and tokenized by
            yield_tokens().
        min_freq: Minimum frequency passed to build_vocab_from_iterator.

    Returns:
        The vocabulary, with "<unk>" as a special token and as the default
        index for unknown words, and with the token "i" guaranteed present.
    """
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # yield_tokens() drops every word containing an uppercase letter, so the
    # pronoun "I" never reaches the frequency count; add its token explicitly.
    # append_token raises if the token is already there, hence the guard.
    if "i" not in vocab:
        vocab.append_token("i")
    # Unknown words map to the "<unk>" index instead of raising
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [7]:
VOCAB_FILENAME = "vocabulary.pt"

# Check if the vocabulary file already exists in the generated path
if os.path.isfile(GENERATED_PATH + VOCAB_FILENAME):
    # Load the vocabulary from a file if it already exists
    vocab = torch.load(GENERATED_PATH + VOCAB_FILENAME)
else:
    # Read the raw training lines here instead of relying on `lines_books_train`:
    # that variable only exists when the word lists were rebuilt in this session,
    # so a cached word list combined with a missing vocabulary raised NameError.
    vocab = create_vocabulary(read_txt_files("data/data_train/"), min_freq=MIN_FREQ)
    # Save the newly created vocabulary to a file
    os.makedirs(GENERATED_PATH, exist_ok=True)
    torch.save(vocab, GENERATED_PATH + VOCAB_FILENAME)

VOCAB_SIZE = len(vocab)

In [8]:
# Total token count of each split (length of the flat word lists)
print(f"Total number of words in the training dataset: {len(words_train):,}")
print(f"Total number of words in the validation dataset: {len(words_val):,}")
print(f"Total number of words in the test dataset: {len(words_test):,}", end="\n\n")

# Number of distinct tokens of each split, before any frequency filtering
print(f"Number of distinct words in the training dataset: {len(set(words_train)):,}")
print(f"Number of distinct words in the validation dataset: {len(set(words_val)):,}")
print(f"Number of distinct words in the test dataset: {len(set(words_test)):,}", end="\n\n")

# Size of the vocabulary object (its length, stored in VOCAB_SIZE)
print(f"Size of the defined vocabulary: {VOCAB_SIZE:,}")

Total number of words in the training dataset: 2,684,706
Total number of words in the validation dataset: 49,526
Total number of words in the test dataset: 124,152

Number of distinct words in the training dataset: 52,105
Number of distinct words in the validation dataset: 5,778
Number of distinct words in the test dataset: 9,585

Size of the defined vocabulary: 1,880


In [9]:
def count_occurrences(words, vocab):
    """Count how often each vocabulary index occurs in ``words``.

    Args:
        words: Iterable of tokens.
        vocab: Mapping-like vocabulary supporting ``vocab[token]`` and ``len(vocab)``.

    Returns:
        An int32 tensor of length ``len(vocab)`` whose entry ``k`` is the number
        of tokens in ``words`` that ``vocab`` maps to index ``k``.
    """
    indices = torch.tensor([vocab[w] for w in words], dtype=torch.long)
    # One vectorized count replaces an in-place tensor update per token
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

In [10]:
# Pair every vocabulary token with its number of occurrences in the training words
vocab_tokens = vocab.lookup_tokens(range(len(vocab)))
train_occurrences = count_occurrences(words_train, vocab).numpy()
word_counts_df = pd.DataFrame({"Word": vocab_tokens, "Occurrences": train_occurrences})

# Most frequent tokens first, ranked from 1 instead of 0
sorted_word_counts = word_counts_df.sort_values(by="Occurrences", ascending=False)
sorted_word_counts = sorted_word_counts.reset_index(drop=True)
sorted_word_counts.index = sorted_word_counts.index + 1

In [11]:
# Display the frequency-ranked table (rank 1 = most frequent token)
sorted_word_counts

Unnamed: 0,Word,Occurrences
1,<unk>,433907
2,",",182537
3,the,151278
4,.,123727
5,and,82289
...,...,...
1876,pistol,100
1877,slipped,100
1878,station-master,100
1879,wounds,100


In [12]:
CONTEXT_SIZE = 3  # Number of words considered before and after the target word

# Tokens that are never used as prediction targets (they can still appear in contexts)
PUNCTUATION_TARGETS = frozenset([',', '.', '(', ')', '?', '!'])


def create_context_target_dataset(text, vocab, context_size=CONTEXT_SIZE, max_occurrences=None):
    """Build (context, target) samples from a token sequence.

    For every position that has ``context_size`` tokens on both sides, the
    target is the token at that position and the context is the
    ``2 * context_size`` surrounding tokens, all converted to vocabulary
    indices. Positions whose token is punctuation or is not a vocabulary entry
    are skipped as targets.

    Args:
        text: Indexable sequence of tokens.
        vocab: Vocabulary supporting ``get_stoi()`` and ``vocab[token]``.
        context_size: Number of tokens taken on each side of the target.
        max_occurrences: If given, maximum number of samples kept per target
            token within this call; ``None`` keeps every sample.

    Returns:
        A TensorDataset of a ``(num_samples, 2 * context_size)`` int64 context
        tensor and a ``(num_samples,)`` int64 target tensor. It is empty when
        no position qualifies.
    """
    # get_stoi() builds a fresh dict on every call, so fetch it once
    stoi = vocab.get_stoi()
    # Per-call tally: keeping it local stops the cap reached while building one
    # split from suppressing the same words in the next split.
    target_counts = {}
    contexts = []
    targets = []

    # Loop over the words list with enough space to form the context window
    for i in range(context_size, len(text) - context_size):
        target_word = text[i]

        # Skip if the target word is punctuation
        if target_word in PUNCTUATION_TARGETS:
            continue

        # Convert the word to its vocabulary index, skip if not in vocabulary
        target_idx = stoi.get(target_word)
        if target_idx is None:
            continue

        # Limit occurrences of each word
        if max_occurrences is not None:
            seen = target_counts.get(target_word, 0)
            if seen >= max_occurrences:
                continue
            target_counts[target_word] = seen + 1

        # context_size words before and context_size words after the target;
        # vocab[...] maps out-of-vocabulary context words to the default index
        contexts.append([vocab[text[j]] for j in range(i - context_size, i + context_size + 1) if j != i])
        targets.append(target_idx)

    if contexts:
        context_tensor = torch.tensor(contexts, dtype=torch.long)
    else:
        # Nothing qualified: return a well-formed empty dataset rather than failing
        context_tensor = torch.empty((0, 2 * context_size), dtype=torch.long)

    return TensorDataset(context_tensor, torch.tensor(targets, dtype=torch.long))

In [13]:
def load_dataset(words, vocab, filename, generated_path=GENERATED_PATH, max_occurrences=10000):
    """Load a cached context/target dataset, building and caching it if missing.

    Args:
        words: Token sequence of the split.
        vocab: Vocabulary used to index the tokens.
        filename: Name of the cache file inside ``generated_path``.
        generated_path: Directory holding the cache file.
        max_occurrences: Per-target cap forwarded to
            create_context_target_dataset when the dataset has to be built.

    Returns:
        The dataset read from the cache file, or the newly built dataset.
    """
    full_path = os.path.join(generated_path, filename)
    if os.path.isfile(full_path):
        return torch.load(full_path)

    # create_context_target_dataset returns a single dataset object; unpacking
    # it into two names would fail.
    dataset = create_context_target_dataset(words, vocab, max_occurrences=max_occurrences)
    if generated_path:
        os.makedirs(generated_path, exist_ok=True)
    torch.save(dataset, full_path)
    return dataset


# Load (or build and cache) the context/target dataset of each split,
# in train / validation / test order.
data_train, data_val, data_test = [
    load_dataset(split_words, vocab, cache_name)
    for split_words, cache_name in (
        (words_train, "data_train.pt"),
        (words_val, "data_val.pt"),
        (words_test, "data_test.pt"),
    )
]

In [14]:
# Number of (context, target) samples in each split
print(f"Training dataset size: {len(data_train):,}")
print(f"Validation dataset size: {len(data_val):,}")
print(f"Test dataset size: {len(data_test):,}")

Training dataset size: 1,320,520
Validation dataset size: 36,216
Test dataset size: 83,645


In [15]:
batch_size = 128  # Samples per mini-batch for all three loaders

# Only the training data is shuffled. Accuracy does not depend on sample order,
# so the evaluation loaders keep a fixed order and draw no random numbers.
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(data_val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(data_test, batch_size=batch_size, shuffle=False)

3. Define a continuous bag of words model architecture based on this vocabulary that contains an embedding layer. To drastically reduce the computational cost, the dimension of the embedding `emb_dim` can be very low such as 16, 12, or even 10. Of course, in a real setting, a larger space would be used. You are not allowed to use `nn.LazyLinear` in this project.

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [16]:
class CBOW(nn.Module):
    """Feed-forward network predicting a target word from its context words.

    The context word indices are embedded, the embeddings are concatenated
    (not averaged) into one vector per sample, and two linear layers with a
    ReLU in between map that vector to log-probabilities over the vocabulary.

    Args:
        vocab_size: Number of vocabulary entries (embedding rows and output classes).
        emb_dim: Dimension of each word embedding.
        context_size: Number of context words on each side of the target, so
            the model expects ``2 * context_size`` indices per sample.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, vocab_size, emb_dim, context_size, hidden_dim=128):
        super().__init__()
        # Embedding layer: maps each word index to a dense vector representation
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # Linear layer 1: maps the concatenated context embeddings to the hidden layer
        self.lin1 = nn.Linear(context_size * 2 * emb_dim, hidden_dim)
        # Activation function: applies ReLU to introduce non-linearity
        self.relu = nn.ReLU()
        # Linear layer 2: projects from the hidden layer to one score per vocabulary entry
        self.lin2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(batch, vocab_size)``.

        Args:
            inputs: Integer tensor of shape ``(batch, 2 * context_size)``.
        """
        # (batch, 2 * context_size, emb_dim)
        embeds = self.embedding(inputs)
        # Concatenate the context embeddings: (batch, 2 * context_size * emb_dim)
        embeds = embeds.flatten(start_dim=1)
        out = self.lin1(embeds)
        out = self.relu(out)
        out = self.lin2(out)
        # Log-probabilities, the input expected by nn.NLLLoss
        return F.log_softmax(out, dim=1)

In [17]:
def train_model(model, train_loader, optimizer, loss_fn, epochs=20):
    """Train ``model`` and return the average training loss of each epoch.

    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any notebook-level variable.

    Args:
        model: Module to train; put in training mode at the start of each epoch.
        train_loader: Sized iterable of ``(contexts, targets)`` tensor batches.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one average batch loss per epoch.

    Raises:
        ValueError: If ``train_loader`` contains no batch.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader contains no batches")

    # Device of the model's parameters (None for a parameter-free model)
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    training_losses = []

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0
        for contexts, targets in train_loader:
            if model_device is not None:
                contexts, targets = contexts.to(model_device), targets.to(model_device)
            optimizer.zero_grad()
            outputs = model(contexts)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        avg_train_loss = train_loss / num_batches
        training_losses.append(avg_train_loss)

        timestamp = datetime.now().strftime('%H:%M:%S.%f')
        print(f"{timestamp} | Epoch {epoch} | Training Loss: {avg_train_loss:.5f}")

    return training_losses

In [18]:
def calculate_weights(data, vocab_size=VOCAB_SIZE):
    """Compute inverse-frequency class weights from the targets of a dataset.

    The weight of class ``c`` is ``total / (count_c * vocab_size)``, divided by
    the largest weight so that the maximum is 1. Classes that never occur as a
    target get weight 0.

    Args:
        data: Dataset of ``(context, target)`` pairs. When it exposes a
            ``tensors`` attribute (as TensorDataset does), the second tensor is
            read directly as the targets instead of iterating sample by sample.
        vocab_size: Number of classes.

    Returns:
        A float tensor of length ``vocab_size`` on the notebook's ``device``.

    Raises:
        IndexError: If a target index is ``>= vocab_size``.
    """
    if hasattr(data, "tensors"):
        targets = data.tensors[1].reshape(-1).to(torch.long).cpu()
    else:
        targets = torch.tensor([int(target) for _, target in data], dtype=torch.long)

    # Count occurrences of each target in the dataset
    counts = torch.bincount(targets, minlength=vocab_size)
    if counts.numel() > vocab_size:
        raise IndexError(f"target index {counts.numel() - 1} is out of range for vocab_size {vocab_size}")

    # Total number of samples
    total_samples = counts.sum()

    # Weights for the classes that occur; the others keep weight 0, which also
    # avoids dividing by zero. The `vocab_size` argument is used here, not the
    # global constant.
    seen = counts > 0
    weights = torch.zeros(vocab_size, dtype=torch.get_default_dtype())
    weights[seen] = total_samples / (counts[seen] * vocab_size)

    # Normalize weights so that their maximum is 1 (skipped when all are zero,
    # which would otherwise turn every weight into NaN)
    if weights.numel() > 0 and weights.max() > 0:
        weights = weights / weights.max()

    return weights.to(device)

In [19]:
# Per-class loss weights derived from the target frequencies of the training set
class_weights = calculate_weights(data_train)

In [20]:
def save_losses(train_losses, model_name, folder="losses"):
    """Write the training losses to ``<folder>/<model_name>_losses.csv``.

    Args:
        train_losses: Sequence of loss values, one per epoch.
        model_name: Prefix of the CSV file name.
        folder: Destination directory; created if it does not exist.
    """
    if folder:
        # DataFrame.to_csv does not create missing directories
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, f"{model_name}_losses.csv")
    df = pd.DataFrame({
        "train_loss": train_losses
    })
    df.to_csv(filepath, index=False)
    print(f"Losses saved to {filepath}")

In [21]:
def load_losses(model_name, folder="losses"):
    """Read back the losses stored in ``<folder>/<model_name>_losses.csv``.

    Args:
        model_name: Prefix of the CSV file name.
        folder: Directory containing the CSV file.

    Returns:
        The values of the ``train_loss`` column as a list.
    """
    filepath = os.path.join(folder, f"{model_name}_losses.csv")
    loss_frame = pd.read_csv(filepath)
    print(f"Losses loaded from {filepath}")
    train_loss_column = loss_frame['train_loss']
    return train_loss_column.tolist()

In [22]:
def save_model(model, model_name, folder="models"):
    """Save the model's state dict to ``<folder>/<model_name>.pth``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        model_name: File name without extension.
        folder: Destination directory; created if it does not exist.
    """
    if folder:
        # torch.save does not create missing directories
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, f"{model_name}.pth")
    torch.save(model.state_dict(), filepath)
    print(f"Model saved to {filepath}")

In [23]:
def load_model(model, model_name, folder="models"):
    """Load a state dict from ``<folder>/<model_name>.pth`` into ``model``.

    Args:
        model: Module with the same architecture as the saved one.
        model_name: File name without extension.
        folder: Directory containing the checkpoint.

    Returns:
        The same ``model``, with the loaded weights, moved to ``device``.
    """
    filepath = os.path.join(folder, f"{model_name}.pth")
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine also loads on a CPU-only one.
    model.load_state_dict(torch.load(filepath, map_location=device))
    model.to(device)
    print(f"Model loaded from {filepath}")
    return model

In [24]:
# Instantiate the model on the selected device; the context window matches the
# one used to build the datasets (CONTEXT_SIZE).
CBOW_model = CBOW(vocab_size=VOCAB_SIZE, emb_dim=64, context_size=CONTEXT_SIZE).to(device)

# NLLLoss pairs with the log-probabilities returned by CBOW.forward; the class
# weights come from calculate_weights on the training set.
optimizer = optim.Adam(CBOW_model.parameters(), lr=1e-4)
loss_fn = nn.NLLLoss(weight=class_weights)

# Train and keep the per-epoch average training loss
CBOW_train_losses = train_model(CBOW_model, train_loader, optimizer, loss_fn, epochs=50)

14:25:41.361325 | Epoch 1 | Training Loss: 7.03392
14:26:58.820247 | Epoch 2 | Training Loss: 6.27797
14:28:19.277783 | Epoch 3 | Training Loss: 5.98253
14:29:27.111803 | Epoch 4 | Training Loss: 5.79299
14:30:33.973495 | Epoch 5 | Training Loss: 5.65539
14:31:38.016560 | Epoch 6 | Training Loss: 5.54449
14:32:44.728740 | Epoch 7 | Training Loss: 5.45386
14:33:50.810275 | Epoch 8 | Training Loss: 5.37625
14:34:57.625661 | Epoch 9 | Training Loss: 5.30669
14:36:02.966691 | Epoch 10 | Training Loss: 5.24684
14:37:07.295056 | Epoch 11 | Training Loss: 5.19368
14:38:36.559131 | Epoch 12 | Training Loss: 5.14531
14:39:43.908241 | Epoch 13 | Training Loss: 5.10108
14:40:48.488931 | Epoch 14 | Training Loss: 5.06019
14:41:58.292487 | Epoch 15 | Training Loss: 5.02435
14:43:02.292321 | Epoch 16 | Training Loss: 4.98939
14:44:08.581666 | Epoch 17 | Training Loss: 4.95772
14:45:12.611048 | Epoch 18 | Training Loss: 4.92882
14:46:17.881096 | Epoch 19 | Training Loss: 4.90085
14:47:22.721523 | Epo

In [25]:
# Persist the trained weights and the loss history under the same model name
save_model(CBOW_model, "cbow_model")
save_losses(CBOW_train_losses, model_name="cbow_model")

Model saved to models/cbow_model.pth
Losses saved to losses/cbow_model_losses.csv


In [None]:
# Rebuild the architecture with the same hyperparameters used for training
# (CONTEXT_SIZE rather than a hard-coded 3, so the two cannot drift apart),
# then load the saved weights into it.
CBOW_model_loaded = CBOW(vocab_size=VOCAB_SIZE, emb_dim=64, context_size=CONTEXT_SIZE)
CBOW_model_loaded = load_model(CBOW_model_loaded, "cbow_model")

# Reload the per-epoch training losses saved alongside the model
CBOW_train_losses_loaded = load_losses("cbow_model")

In [None]:
# Rebind the working names to the objects reloaded from disk, so the cells
# below that reference CBOW_model use the reloaded model.
CBOW_model = CBOW_model_loaded
CBOW_train_losses = CBOW_train_losses_loaded

4. Train several models, select the best one, and evaluate its performance. Note that the performance here is potentially extremely low, but the real objective is not to train a good predictor, only to have a good representation of the semantics of each word in the vocabulary.

In [26]:
def accuracy(model, data_loader):
    """Return the fraction of samples whose arg-max prediction equals the target.

    The model is switched to evaluation mode and is left in that mode. Batches
    are moved to the device that holds the model's parameters.

    Args:
        model: Module mapping a batch of contexts to per-class scores.
        data_loader: Iterable of ``(contexts, targets)`` tensor batches.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when the loader is empty.
    """
    model.eval()
    # Device of the model's parameters (None for a parameter-free model)
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    correct = 0
    total = 0
    with torch.no_grad():
        for contexts, targets in data_loader:
            if model_device is not None:
                contexts, targets = contexts.to(model_device), targets.to(model_device)
            predicted = model(contexts).argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    # An empty loader has no sample to score; avoid dividing by zero
    if total == 0:
        return 0.0
    return correct / total

In [27]:
def model_selection(models, model_names, val_loader, metric=None):
    """Return the model with the highest validation score, and its name.

    Args:
        models: Iterable of candidate models.
        model_names: Iterable of display names, in the same order as ``models``.
        val_loader: Validation data passed to the scoring function.
        metric: Optional callable ``metric(model, val_loader)`` returning a
            number; defaults to the notebook's ``accuracy`` function.

    Returns:
        ``(best_model, best_model_name)``. On ties the first candidate wins.
        ``(None, "")`` is returned only when ``models`` is empty.
    """
    score_fn = accuracy if metric is None else metric

    best_model = None
    best_model_name = ""
    # Start below any possible score so that a candidate is selected even when
    # every model scores exactly 0.
    best_acc = float("-inf")
    for model, model_name in zip(models, model_names):
        val_acc = score_fn(model, val_loader)
        print(f"{model_name} | Validation accuracy {val_acc:.2%}")
        if val_acc > best_acc:
            best_model = model
            best_model_name = model_name
            best_acc = val_acc
    return best_model, best_model_name

In [28]:
# Candidate models and their display names, listed in matching order
models = [CBOW_model]
model_names = ["CBOW"]

# Keep the candidate that scores best on the validation loader
best_model, best_model_name = model_selection(models, model_names, val_loader)

CBOW | Validation accuracy 8.27%


In [29]:
# Evaluate the best model on the test set
test_acc = accuracy(best_model, test_loader)
print(f"The best model is {best_model_name} | Test accuracy {test_acc:.2%}")

The best model is CBOW | Test accuracy 7.83%


5. Compute the cosine similarity matrix of the vocabulary based on the trained embedding. For some words of your choice (e.g. *me*, *white*, *man*, *have*, *be*, *child*, *yes*, *what* etc.), report the 10 most similar words. Comment on your results.

In [30]:
# Embedding matrix of the selected model, one row per vocabulary index
best_model_embeddings = best_model.embedding.weight.detach().cpu().numpy()

mat_size = len(best_model_embeddings)

# Cosine similarity of every pair of rows: scale each row to unit length, then
# a single matrix product gives all pairwise dot products at once.
row_norms = np.linalg.norm(best_model_embeddings.astype(np.float64), axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0  # an all-zero row gets similarity 0 instead of NaN
unit_embeddings = best_model_embeddings.astype(np.float64) / row_norms
cos_sim_mat = unit_embeddings @ unit_embeddings.T

In [31]:
# Probe words whose nearest neighbours in embedding space are reported below
test_words = ["me", "white", "man", "have", "be", "child", "yes", "what"]

# get_stoi() builds the whole token -> index dict on each call, so build it once.
# A probe word missing from the vocabulary still raises KeyError, as before.
token_to_index = vocab.get_stoi()
test_words_idx = [token_to_index[word] for word in test_words]

In [32]:
top_n = 10  # Number of neighbours reported per probe word
similar_words = {}

# get_itos() builds the full index -> token list on each call, so build it once
index_to_token = vocab.get_itos()

for word, idx in zip(test_words, test_words_idx):
    # get cosine similarities for the current word against all other words
    similarities = cos_sim_mat[idx]

    # Vocabulary indices from most to least similar
    ranked_idxs = np.argsort(similarities)[::-1]

    # Drop the probe word by its index instead of assuming it ranks first, which
    # floating-point rounding does not guarantee.
    most_similar_idxs = [i for i in ranked_idxs if i != idx][:top_n]

    # map indices back to words
    similar_words[word] = [index_to_token[i] for i in most_similar_idxs]

In [33]:
# Report, for each probe word, its neighbours from most to least similar
for word, similar in similar_words.items():
    print(f"Words most similar to '{word}': {similar}")

Words most similar to 'me': ['us', 'him', 'yourself', 'you', 'thee', 'sir', 'them', 'importance', 'back', 'state']
Words most similar to 'white': ['blue', 'red', 'green', 'black', 'mouth', 'yellow', 'anxiety', 'dead', 'some', 'sick']
Words most similar to 'man': ['woman', 'cat', 'soldier', 'gentleman', 'officer', 'captain', 'fellow', 'pistol', 'elder', 'men']
Words most similar to 'have': ['reality', 'need', 've', 'die', 'had', 'subject', 'tell', 'method', 'hast', 'having']
Words most similar to 'be': ['been', 're', 'speak', 'discovered', 'station-master', 'grown', 'is', 'remain', 'thoroughly', 'roses']
Words most similar to 'child': ['girl', 'm', 'cry', 'silently', 'expression', 'words', 'friend', 'sake', 'counsel', 'race']
Words most similar to 'yes': ['depths', 'nor', 'absolute', 'recognized', '!', 'considered', 'makes', 'll', 'becomes', 'greatest']
Words most similar to 'what': ['whatever', 'empty', 'how', 'every', 'nothing', 'something', 'hollow', 'understand', 'horror', 'conversa

6. Visualize the embedding space on <https://projector.tensorflow.org>. To do so, upload the vocabulary and their corresponding values in the embedding space as two tsv files. Try to find and select clusters. Report both plots (you can use screenshots) and their corresponding selections for some meaningful clusters. Comment on your results.

In [34]:
# Make sure the output folder exists before writing into it
os.makedirs("tensorflow_projector", exist_ok=True)

# One embedding vector per line, tab-separated. Files handed to csv.writer are
# opened with newline="" (as the csv module requires) and an explicit encoding.
with open("tensorflow_projector/vectors.tsv", "w", newline="", encoding="utf-8") as vectors_file:
    tsv_writer = csv.writer(vectors_file, delimiter="\t")
    for weight in best_model_embeddings:
        tsv_writer.writerow(weight)

# One vocabulary token per line, in the same (index) order as the vectors
with open("tensorflow_projector/metadata.tsv", "w", newline="", encoding="utf-8") as metadata_file:
    tsv_writer = csv.writer(metadata_file, delimiter="\t")
    vocab_dict = vocab.get_itos()
    for word in vocab_dict:
        tsv_writer.writerow([word])