In [44]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import csv
import os
import re

In [45]:
# One seed shared by every RNG below, so a "Restart & Run All" reproduces the
# same shuffling and weight initialisation.
SEED = 256

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)  # explicit CUDA seeding, kept from the original cell
np.random.seed(SEED)

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device} device.")

Using cpu device.


## 2.1 Word embedding

1. Read txt files and tokenize them to obtain train/validation/test lists of words.

In [4]:
# Shared torchtext tokenizer ("basic_english" preset); used as the default
# `tokenizer` argument of the helper functions defined below.
TOKENIZER = get_tokenizer("basic_english")


def read_txt_files(datapath, encoding=None):
    """Read every ``.txt`` file directly inside ``datapath`` and return all lines.

    Args:
        datapath: Directory to scan (with or without a trailing separator).
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, i.e. the behaviour of the original helper.

    Returns:
        A flat list of lines (newline characters preserved), concatenated
        file after file in sorted filename order.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # (and everything cached from it) reproducible across machines.
    file_names = sorted(name for name in os.listdir(datapath) if name.endswith(".txt"))

    lines = []
    for name in file_names:
        # os.path.join works whether or not datapath ends with a separator.
        with open(os.path.join(datapath, name), encoding=encoding) as f:
            lines.extend(f.readlines())
    return lines


def tokenize(lines, tokenizer=TOKENIZER):
    """Tokenize each line and return all tokens as one flat list, in input order."""
    return [token for line in lines for token in tokenizer(line)]


def yield_tokens(lines, tokenizer=TOKENIZER):
    """Yield the token list of each line after stripping digits and capitalised words.

    Args:
        lines: Iterable of raw text lines.
        tokenizer: Callable turning a string into a list of tokens.

    Yields:
        One list of tokens per input line.
    """
    # Raw strings: "\w" and "\s" are not valid Python string escapes, so the
    # non-raw originals trigger invalid-escape warnings on recent Pythons.
    # Compiling once also avoids re-parsing the patterns for every line.
    no_digits = re.compile(r"\w*[0-9]+\w*")  # words containing digits
    no_names = re.compile(r"\w*[A-Z]+\w*")  # words containing a capital letter (names)
    no_spaces = re.compile(r"\s+")  # runs of whitespace

    for line in lines:
        line = no_digits.sub(" ", line)
        line = no_names.sub(" ", line)
        line = no_spaces.sub(" ", line)
        # Tokenize the cleaned line
        yield tokenizer(line)

In [5]:
GENERATED_PATH = "./generated/"  # Path where generated data files are stored

# All three token caches must be present: checking only the training file
# would make the loads below fail on a partially populated cache.
_word_cache_files = [GENERATED_PATH + name for name in ("words_train.pt", "words_val.pt", "words_test.pt")]

if all(os.path.isfile(path) for path in _word_cache_files):
    # Load preprocessed training, validation, and test word lists from .pt files
    words_train = torch.load(GENERATED_PATH + "words_train.pt")
    words_val = torch.load(GENERATED_PATH + "words_val.pt")
    words_test = torch.load(GENERATED_PATH + "words_test.pt")
else:
    # If preprocessed data does not exist, read text files
    lines_books_train = read_txt_files("data/data_train/")
    lines_books_val = read_txt_files("data/data_val/")
    lines_books_test = read_txt_files("data/data_test/")

    # Tokenize the lines from train, validation, and test datasets
    words_train = tokenize(lines_books_train)
    words_val = tokenize(lines_books_val)
    words_test = tokenize(lines_books_test)

    # torch.save does not create missing directories
    os.makedirs(GENERATED_PATH, exist_ok=True)

    # Save the tokenized word lists to .pt files
    torch.save(words_train, GENERATED_PATH + "words_train.pt")
    torch.save(words_val, GENERATED_PATH + "words_val.pt")
    torch.save(words_test, GENERATED_PATH + "words_test.pt")

2. Define a vocabulary based on the training dataset. To keep the vocabulary from growing too large, one option is to keep only words that appear at least 100 times in the training dataset. Report the total number of words in the training dataset, the number of distinct words in the training dataset, and the size of the defined vocabulary. Comment on your results.

In [6]:
MIN_FREQ = 100  # Minimum number of occurrences for a token to enter the vocabulary


def create_vocabulary(lines, min_freq=MIN_FREQ):
    """Build a torchtext vocabulary from raw text lines.

    Args:
        lines: Iterable of raw text lines (cleaned and tokenized by ``yield_tokens``).
        min_freq: Tokens seen fewer than ``min_freq`` times are left out.

    Returns:
        The vocabulary, with ``<unk>`` as special token and as the default
        index returned for out-of-vocabulary words.
    """
    # Building vocabulary from an iterator of tokenized lines, filtering out infrequent tokens
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # yield_tokens drops every word containing an uppercase letter, so the
    # pronoun "I" is not counted; add its lowercased form by hand. The guard
    # matters because append_token raises if the token is already present.
    if "i" not in vocab:
        vocab.append_token("i")
    # Setting default index for unknown words
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [7]:
VOCAB_FILENAME = "vocabulary.pt"

# Check if the vocabulary file already exists in the generated path
if os.path.isfile(GENERATED_PATH + VOCAB_FILENAME):
    # Load the vocabulary from a file if it already exists
    vocab = torch.load(GENERATED_PATH + VOCAB_FILENAME)
else:
    # Read the raw training lines here instead of relying on `lines_books_train`:
    # that variable only exists when the token cache was rebuilt in this same
    # session, so a fresh kernel with cached tokens would hit a NameError.
    vocab = create_vocabulary(read_txt_files("data/data_train/"), min_freq=MIN_FREQ)
    # torch.save does not create missing directories
    os.makedirs(GENERATED_PATH, exist_ok=True)
    # Save the newly created vocabulary to a file
    torch.save(vocab, GENERATED_PATH + VOCAB_FILENAME)

VOCAB_SIZE = len(vocab)

In [8]:
# Compute every statistic first, then report them in the same order as before.
total_words = {"train": len(words_train), "val": len(words_val), "test": len(words_test)}
distinct_words = {"train": len(set(words_train)), "val": len(set(words_val)), "test": len(set(words_test))}

print(f"Total number of words in the training dataset: {total_words['train']:,}")
print(f"Total number of words in the validation dataset: {total_words['val']:,}")
print(f"Total number of words in the test dataset: {total_words['test']:,}", end="\n\n")

print(f"Number of distinct words in the training dataset: {distinct_words['train']:,}")
print(f"Number of distinct words in the validation dataset: {distinct_words['val']:,}")
print(f"Number of distinct words in the test dataset: {distinct_words['test']:,}", end="\n\n")

print(f"Size of the defined vocabulary: {VOCAB_SIZE:,}")

Total number of words in the training dataset: 2,684,706
Total number of words in the validation dataset: 49,526
Total number of words in the test dataset: 124,152

Number of distinct words in the training dataset: 52,105
Number of distinct words in the validation dataset: 5,778
Number of distinct words in the test dataset: 9,585

Size of the defined vocabulary: 1,880


In [9]:
def count_occurrences(words, vocab):
    """Count how often each vocabulary index occurs in ``words``.

    Args:
        words: Iterable of tokens.
        vocab: Mapping supporting ``vocab[token] -> int`` and ``len(vocab)``.

    Returns:
        An int32 tensor of length ``len(vocab)`` whose entry ``k`` is the
        number of tokens that map to index ``k``.
    """
    # One vectorised bincount replaces millions of single-element in-place
    # tensor updates, which are very slow from Python.
    indices = torch.tensor([vocab[w] for w in words], dtype=torch.long)
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

In [10]:
# One row per vocabulary entry: the token and how often it occurs in the training words.
vocab_tokens = vocab.lookup_tokens(range(len(vocab)))
train_occurrences = count_occurrences(words_train, vocab).numpy()
word_counts_df = pd.DataFrame({"Word": vocab_tokens, "Occurrences": train_occurrences})

# Most frequent first, with a 1-based rank as index.
sorted_word_counts = word_counts_df.sort_values(by="Occurrences", ascending=False).reset_index(drop=True)
sorted_word_counts.index += 1

In [11]:
# Show the ranked frequency table (index = rank, most frequent token first).
sorted_word_counts

Unnamed: 0,Word,Occurrences
1,<unk>,433907
2,",",182537
3,the,151278
4,.,123727
5,and,82289
...,...,...
1876,pistol,100
1877,slipped,100
1878,station-master,100
1879,wounds,100


In [12]:
def get_word_type_label(word):
    """Return a coarse type label: 0 for '<unk>', 1 for punctuation, 2 for any other word."""
    punctuation = (',', '.', '(', ')', '?', '!')
    if word == '<unk>':  # Unknown word
        return 0
    # Punctuation marks get 1; every remaining vocabulary word gets 2
    return 1 if word in punctuation else 2


# Map each vocabulary word to its appropriate type label
# Dict keyed by vocabulary index; the value is the label returned by
# get_word_type_label for that token (0 = <unk>, 1 = punctuation, 2 = other word).
MAP_TARGET = {vocab[w]: get_word_type_label(w) for w in vocab.lookup_tokens(range(VOCAB_SIZE))}
CONTEXT_SIZE = 3  # Number of words considered before and after the target word


def create_context_target_dataset(text, vocab, context_size=CONTEXT_SIZE, map_target=MAP_TARGET):
    """Build a (context, target) dataset by sliding a window over ``text``.

    Args:
        text: Sequence of tokens.
        vocab: Mapping supporting ``vocab[token] -> int``.
        context_size: Number of words taken on each side of the centre word.
        map_target: Dict from the centre word's vocabulary index to its
            target label. Pass ``None`` to use the vocabulary index itself
            as target.

    Returns:
        A ``TensorDataset`` of contexts with shape
        ``[n_samples, 2 * context_size]`` and targets with shape
        ``[n_samples]``, where ``n_samples = max(len(text) - 2 * context_size, 0)``.

    NOTE(review): with the default ``MAP_TARGET`` the target is a coarse type
    label, not the centre word. A CBOW model meant to predict the centre word
    would need ``map_target=None`` -- confirm which one is intended.
    """
    # Look every token up once instead of 2 * context_size + 1 times.
    indices = [vocab[word] for word in text]

    contexts = []
    targets = []
    for i in range(context_size, len(indices) - context_size):
        # context_size words before and context_size words after position i
        contexts.append(indices[i - context_size:i] + indices[i + 1:i + context_size + 1])
        center = indices[i]
        targets.append(center if map_target is None else map_target[center])

    # Build one tensor from the nested list rather than stacking one small
    # tensor per sample; the reshape keeps a well-formed (0, 2 * context_size)
    # shape when the text is too short to produce any sample.
    contexts_tensor = torch.tensor(contexts, dtype=torch.long).reshape(len(targets), 2 * context_size)
    targets_tensor = torch.tensor(targets) if targets else torch.empty(0, dtype=torch.long)
    return TensorDataset(contexts_tensor, targets_tensor)

In [13]:
def load_dataset(words, vocab, filename, generated_path=GENERATED_PATH):
    """Load a cached context/target dataset, building and caching it if absent.

    Args:
        words: Token list handed to ``create_context_target_dataset`` on a cache miss.
        vocab: Vocabulary handed to ``create_context_target_dataset`` on a cache miss.
        filename: Name of the cache file inside ``generated_path``.
        generated_path: Directory holding cached datasets.

    Returns:
        The dataset read from the cache file, or the freshly built one.

    Note:
        An existing cache file is returned as is; delete it after changing
        the vocabulary, the context size or the target mapping.
    """
    full_path = os.path.join(generated_path, filename)
    if os.path.isfile(full_path):
        return torch.load(full_path)

    dataset = create_context_target_dataset(words, vocab)
    # torch.save does not create missing directories
    os.makedirs(os.path.dirname(full_path) or ".", exist_ok=True)
    torch.save(dataset, full_path)
    return dataset


data_train = load_dataset(words_train, vocab, "data_train.pt")
data_val = load_dataset(words_val, vocab, "data_val.pt")
data_test = load_dataset(words_test, vocab, "data_test.pt")

In [14]:
# Number of (context, target) samples in each split.
train_size, val_size, test_size = (len(split) for split in (data_train, data_val, data_test))

print(f"Training dataset size: {train_size:,}")
print(f"Validation dataset size: {val_size:,}")
print(f"Test dataset size: {test_size:,}")

Training dataset size: 2,684,700
Validation dataset size: 49,520
Test dataset size: 124,146


In [15]:
batch_size = 128

# Only the training loader is shuffled. Reshuffling evaluation data does not
# affect aggregate metrics and only makes evaluation order non-deterministic.
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(data_val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(data_test, batch_size=batch_size, shuffle=False)

3. Define a continuous bag of words model architecture based on this vocabulary that contains an embedding layer. To drastically reduce the computational cost, the dimension of the embedding `emb_dim` can be very low such as 16, 12, or even 10. Of course, in a real setting, a larger space would be used. You are not allowed to use `nn.LazyLinear` in this project.

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [16]:
class CBOW(nn.Module):
    """Continuous bag-of-words classifier built on a trainable embedding.

    The context word embeddings are summed, passed through one hidden ReLU
    layer and projected to ``vocab_size`` log-probabilities.

    Args:
        vocab_size: Number of rows of the embedding table and number of outputs.
        emb_dim: Dimension of each embedding vector.
        hidden_dim: Width of the hidden layer (128 by default, the value that
            was previously hard-coded).
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim=128):
        super().__init__()
        # Attribute names are kept unchanged: they are the state_dict keys of
        # saved checkpoints.
        # Embedding layer: maps each word index to a dense vector representation
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # Linear layer 1: transforms the summed embeddings to the hidden layer
        self.linear1 = nn.Linear(emb_dim, hidden_dim)
        # Activation function 1: applies ReLU to introduce non-linearity
        self.activation_function1 = nn.ReLU()
        # Linear layer 2: projects from the hidden layer to the output layer of size vocab_size
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # Activation function 2: applies log softmax to compute log probabilities
        self.activation_function2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of shape ``[batch_size, vocab_size]``.

        ``inputs`` holds word indices of shape ``[batch_size, n_context]``;
        any number of context words works because their embeddings are summed.
        """
        embeds = self.embedding(inputs)  # [batch_size, n_context, emb_dim]
        embeds = embeds.sum(dim=1)  # [batch_size, emb_dim]
        out = self.linear1(embeds)  # [batch_size, hidden_dim]
        out = self.activation_function1(out)
        out = self.linear2(out)  # [batch_size, vocab_size]
        # Applies log softmax to get log probabilities
        out = self.activation_function2(out)
        return out

In [17]:
def train_model(model, train_loader, optimizer, loss_fn, epochs=20):
    """Train ``model`` for ``epochs`` epochs and return the mean loss of each epoch.

    Args:
        model: Module to optimise; batches are moved to the device of its parameters.
        train_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        epochs: Number of passes over ``train_loader``.

    Returns:
        List with one average training loss (float) per epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    # Follow the model's own device rather than a notebook-level global, so
    # the function also works when the model lives somewhere else.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    n_batches = len(train_loader)
    if n_batches == 0:
        # The per-epoch average below would otherwise divide by zero.
        raise ValueError("train_loader is empty: nothing to train on")

    training_losses = []

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0.0
        for contexts, targets in train_loader:
            contexts, targets = contexts.to(model_device), targets.to(model_device)
            optimizer.zero_grad()
            outputs = model(contexts)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        avg_train_loss = train_loss / n_batches
        training_losses.append(avg_train_loss)

        timestamp = datetime.now().strftime('%H:%M:%S.%f')
        print(f"{timestamp} | Epoch {epoch} | Training Loss: {avg_train_loss:.5f}")

    return training_losses

In [18]:
def save_losses(train_losses, model_name, folder="losses"):
    """Write the per-epoch training losses to ``<folder>/<model_name>_losses.csv``.

    Args:
        train_losses: Sequence of loss values, one per epoch.
        model_name: Prefix of the CSV file name.
        folder: Destination directory; created if it does not exist.
    """
    filepath = os.path.join(folder, f"{model_name}_losses.csv")
    # to_csv does not create missing directories
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    df = pd.DataFrame({
        "train_loss": train_losses
    })
    df.to_csv(filepath, index=False)
    print(f"Losses saved to {filepath}")

In [19]:
def load_losses(model_name, folder="losses"):
    """Read ``<folder>/<model_name>_losses.csv`` and return its ``train_loss`` column as a list."""
    filepath = os.path.join(folder, f"{model_name}_losses.csv")
    train_loss_column = pd.read_csv(filepath)['train_loss']
    print(f"Losses loaded from {filepath}")
    return train_loss_column.tolist()

In [20]:
def save_model(model, model_name, folder="models"):
    """Save ``model.state_dict()`` to ``<folder>/<model_name>.pth``.

    Args:
        model: Module whose parameters are saved.
        model_name: File name without extension.
        folder: Destination directory; created if it does not exist.
    """
    filepath = os.path.join(folder, f"{model_name}.pth")
    # torch.save does not create missing directories
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
    torch.save(model.state_dict(), filepath)
    print(f"Model saved to {filepath}")

In [21]:
def load_model(model, model_name, folder="models"):
    """Load ``<folder>/<model_name>.pth`` into ``model`` and move it to ``device``.

    Args:
        model: Module with the same architecture as the saved one.
        model_name: File name without extension.
        folder: Directory containing the checkpoint.

    Returns:
        The same ``model`` instance, with loaded weights, on the notebook's ``device``.
    """
    filepath = os.path.join(folder, f"{model_name}.pth")
    # map_location lets a checkpoint saved from a GPU be restored on a
    # CPU-only machine; without it torch.load tries to rebuild the tensors on
    # the device they were saved from.
    model.load_state_dict(torch.load(filepath, map_location=device))
    model.to(device)
    print(f"Model loaded from {filepath}")
    return model

In [22]:
# Instantiate the model on the selected device with a 64-dimensional embedding.
CBOW_model = CBOW(vocab_size=VOCAB_SIZE, emb_dim=64).to(device)

# NLLLoss pairs with the LogSoftmax output layer of CBOW.
# NOTE(review): the targets in train_loader come from MAP_TARGET (labels 0/1/2),
# while the model emits VOCAB_SIZE log-probabilities -- confirm this is intended.
optimizer = optim.Adam(CBOW_model.parameters(), lr=0.001)
loss_fn = nn.NLLLoss()

# Full training run; per the logged timestamps each epoch took a few minutes on CPU.
CBOW_train_losses = train_model(CBOW_model, train_loader, optimizer, loss_fn, epochs=10)

22:23:29.460322 | Epoch 1 | Training Loss: 0.72974
22:26:15.371306 | Epoch 2 | Training Loss: 0.70739
22:28:55.706064 | Epoch 3 | Training Loss: 0.70234
22:31:24.830596 | Epoch 4 | Training Loss: 0.69890
22:34:01.106903 | Epoch 5 | Training Loss: 0.69613
22:36:35.083245 | Epoch 6 | Training Loss: 0.69380
22:39:01.562883 | Epoch 7 | Training Loss: 0.69174
22:41:25.126826 | Epoch 8 | Training Loss: 0.68997
22:43:54.359591 | Epoch 9 | Training Loss: 0.68826
22:46:14.071547 | Epoch 10 | Training Loss: 0.68688


In [23]:
# Persist the trained weights and the loss history so later sessions can skip training.
save_model(CBOW_model, "cbow_model")
save_losses(CBOW_train_losses, model_name="cbow_model")

Model saved to models/cbow_model.pth
Losses saved to losses/cbow_model_losses.csv


In [24]:
# Rebuild the architecture before restoring the weights; emb_dim has to match the
# value used at training time so that load_state_dict finds tensors of the same shape.
CBOW_model_loaded = CBOW(vocab_size=VOCAB_SIZE, emb_dim=64)
CBOW_model_loaded = load_model(CBOW_model_loaded, "cbow_model")

# Per-epoch training losses written by save_losses.
CBOW_train_losses_loaded = load_losses("cbow_model")

Model loaded from models/cbow_model.pth
Losses loaded from losses/cbow_model_losses.csv


In [25]:
# From here on, the notebook works with the model and losses restored from disk.
CBOW_model = CBOW_model_loaded
CBOW_train_losses = CBOW_train_losses_loaded

5. Compute the cosine similarity matrix of the vocabulary based on the trained embedding. For some words of your choice (e.g. *me*, *white*, *man*, *have*, *be*, *child*, *yes*, *what* etc.), report the 10 most similar words. Comment on your results.

In [46]:
# Embedding table as a NumPy array: one row per vocabulary index.
best_model_embeddings = CBOW_model.embedding.weight.detach().cpu().numpy()

mat_size = len(best_model_embeddings)

# Scale every row to unit length once; a single matrix product then yields all
# pairwise cosine similarities. This replaces a mat_size x mat_size Python loop
# that recomputed both norms for every pair. float64 matches the dtype of the
# matrix the loop used to fill.
_embeddings_f64 = best_model_embeddings.astype(np.float64)
_unit_embeddings = _embeddings_f64 / np.linalg.norm(_embeddings_f64, axis=1, keepdims=True)
cos_sim_mat = _unit_embeddings @ _unit_embeddings.T  # shape: (mat_size, mat_size)

In [47]:
test_words = ["me", "white", "man", "have", "be", "child", "yes", "what"]

# Fetch the token -> index dict once instead of once per word. A word missing
# from the vocabulary still raises KeyError, as before.
_stoi = vocab.get_stoi()
test_words_idx = [_stoi[word] for word in test_words]

In [48]:
top_n = 10
similar_words = {}

# Fetch the index -> token list once instead of once per looked-up index.
_itos = vocab.get_itos()

for word, idx in zip(test_words, test_words_idx):
    # get cosine similarities for the current word against all other words
    similarities = cos_sim_mat[idx]

    # Rank all indices from most to least similar, then drop the query word by
    # index. The old slice assumed the query was always the single largest
    # entry, which ties or rounding in the self-similarity can break.
    ranked_idxs = np.argsort(similarities)[::-1]
    most_similar_idxs = ranked_idxs[ranked_idxs != idx][:top_n]

    # map indices back to words
    similar_words[word] = [_itos[i] for i in most_similar_idxs]

In [49]:
# Report the neighbours found for each query word, in insertion order.
for word in similar_words:
    similar = similar_words[word]
    print(f"Words most similar to '{word}': {similar}")

Words most similar to 'me': ['us', 'slipped', 'nature', 'turn', 'driving', 'yourself', 'nobody', 'play', 'weapons', 'skin']
Words most similar to 'white': ['some', 'itself', 'sufficient', 'feet', 'brave', 's', 'brain', 'skald', 'terrified', 'themselves']
Words most similar to 'man': ['woman', 'spirit', 'suffer', 'cat', 'pistol', 'kitchen', 'thing', 'wants', 'judge', 'castle']
Words most similar to 'have': ['would', 'shall', 'hast', 'wound', 'land', 'need', 'doing', 'merely', 'presented', 'ought']
Words most similar to 'be': ['speak', 'required', 'been', 'peasants', 'spread', 'hung', 'drawn', 'separated', 'talked', 'serve']
Words most similar to 'child': ['distance', 'candle', 'touch', 'window', 'distinguished', 'happen', 'officer', 'expect', 'inn', 'sofa']
Words most similar to 'yes': ['however', 'inquired', 'shouted', 'replied', 'sir', 'gentlemen', 'exclaimed', 'said', 'replies', 'repeated']
Words most similar to 'what': ['how', 'firm', 'done', 'whether', 'glad', 'books', 'anxiety', '

6. Visualize the embedding space on <https://projector.tensorflow.org>. To do so, upload the vocabulary and their corresponding values in the embedding space as two tsv files. Try to find and select clusters. Report both plots (you can use screenshots) and their corresponding selections for some meaningful clusters. Comment on your results.

In [50]:
# open() does not create missing directories
os.makedirs("tensorflow_projector", exist_ok=True)

# newline="" is what the csv module asks for when writing (it prevents doubled
# line endings on Windows); the explicit encoding keeps the files identical
# across platforms.
with open("tensorflow_projector/vectors.tsv", "w", newline="", encoding="utf-8") as vectors_file:
    tsv_writer = csv.writer(vectors_file, delimiter="\t")
    # One row per vocabulary index, one column per embedding dimension
    for weight in best_model_embeddings:
        tsv_writer.writerow(weight)


with open("tensorflow_projector/metadata.tsv", "w", newline="", encoding="utf-8") as metadata_file:
    tsv_writer = csv.writer(metadata_file, delimiter="\t")
    # Despite its name this is the index -> token list, so the rows line up
    # with the vectors written above.
    vocab_dict = vocab.get_itos()
    for word in vocab_dict:
        tsv_writer.writerow([word])