Lucas Henneçon

In [1]:
%%capture
!pip install transformers datasets

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from torch.utils.data import DataLoader
from tabulate import tabulate
from datasets import load_dataset

from tqdm.notebook import tqdm
from transformers import BertTokenizer

In [3]:
# Load the "train" split of the `scikit-learn/imdb` dataset and print its
# summary (feature names and number of rows).
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

IMDB Dataset.csv:   0%|          | 0.00/66.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


In [4]:
# Load the pretrained `bert-base-uncased` tokenizer (lower-casing requested explicitly).
# It is used below to turn reviews into token ids and to obtain the vocabulary size.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:
def preprocessing_fn(x, tokenizer):
    """Tokenize one example's review and attach a binary sentiment label.

    Args:
        x: Mapping holding a ``"review"`` text and a ``"sentiment"`` string.
        tokenizer: Callable invoked on the review text; its result must expose
            the token ids under the ``"input_ids"`` key.

    Returns:
        The same ``x`` object, updated in place with:
        ``"review_ids"`` - the token ids (no special tokens, no padding,
        truncated to at most 256 tokens), and
        ``"label"`` - 0 when the sentiment is ``"negative"``, 1 otherwise.
    """
    tokenizer_options = dict(
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )
    encoding = tokenizer(x["review"], **tokenizer_options)
    x["review_ids"] = encoding["input_ids"]

    # Anything that is not exactly "negative" is treated as the positive class.
    if x["sentiment"] == "negative":
        x["label"] = 0
    else:
        x["label"] = 1
    return x

In [6]:
n_samples = 5000  # number of reviews kept for this experiment

# Shuffle with a fixed seed, keep the first `n_samples` reviews, then tokenize
# each of them with `preprocessing_fn`.
dataset = (
    dataset.shuffle(seed=42)
    .select(range(n_samples))
    .map(lambda x: preprocessing_fn(x, tokenizer))
)

# Drop every column except the token ids and the label.
columns_to_keep = ("review_ids", "label")
dataset = dataset.remove_columns(
    [name for name in dataset.column_names if name not in columns_to_keep]
)

# Hold out 20% of the examples for validation (seeded split).
train_valid_split = dataset.train_test_split(test_size=0.2, seed=42)
document_train_set, document_valid_set = (
    train_valid_split["train"],
    train_valid_split["test"],
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
# Print the first 5 rows of the training set
for i in range(5):
    print(document_train_set[i])

# Show the Python types of a single token id and of a label.
print(type(document_train_set[0]["review_ids"][0]))
print(type(document_train_set[0]["label"]))

{'review_ids': [1996, 24385, 2003, 1037, 2428, 2204, 10874, 1012, 1998, 3492, 14888, 1012, 1045, 2228, 2057, 2064, 2035, 5993, 2008, 1996, 9592, 1997, 2770, 2046, 1037, 18224, 24385, 2024, 2172, 7046, 2084, 2770, 2046, 9219, 1010, 6144, 2030, 14106, 1012, 2008, 1005, 1055, 3599, 2339, 2023, 3185, 2003, 2061, 12459, 1012, 2076, 2023, 2143, 1010, 2017, 1005, 2222, 2763, 2228, 2055, 2115, 2219, 24385, 1037, 2261, 2335, 1012, 3251, 2002, 1005, 1055, 5214, 1997, 2725, 2107, 2477, 1012, 1012, 1012, 2017, 2488, 11839, 2010, 2564, 2987, 1005, 1056, 21910, 2006, 2032, 1012, 2008, 1005, 1055, 2054, 1996, 2466, 2003, 2035, 2055, 1012, 1037, 9768, 24385, 1999, 2474, 20057, 2043, 2002, 4858, 2041, 2010, 2564, 2003, 16789, 2006, 2032, 2007, 1996, 4770, 1011, 2879, 1012, 1006, 2008, 2442, 2022, 1996, 4602, 9518, 1999, 1996, 2088, 1010, 2011, 1996, 2126, 1012, 4770, 24916, 2467, 2202, 5056, 1997, 1996, 2160, 23744, 2043, 1996, 3129, 2003, 2012, 2010, 2147, 1007, 2013, 2059, 2006, 2256, 24385, 1010, 28

In [8]:
# Pick the compute device by priority: CUDA first, then Apple MPS, else CPU.
# The conditions are evaluated lazily, so the MPS check only runs without CUDA.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using device: {device}")

Using device: cpu


In [9]:
def extract_words_context(review_ids, R):
    """Pair every token that has a full window around it with its context.

    Args:
        review_ids: Sequence of token ids for one document.
        R: Context radius, i.e. number of tokens taken on each side.

    Returns:
        A tuple ``(w, Cplus)`` of equally long lists. ``w[k]`` is a center
        token and ``Cplus[k]`` is the concatenation of the ``R`` tokens before
        it and the ``R`` tokens after it. Both lists are empty when the
        document is shorter than ``2 * R + 1`` tokens.
    """
    n_tokens = len(review_ids)

    # Too short to fit even one complete window: nothing to extract.
    if n_tokens < 2 * R + 1:
        return [], []

    w, Cplus = [], []
    # Only positions with R neighbours on both sides qualify as centers.
    for center in range(R, n_tokens - R):
        w.append(review_ids[center])
        Cplus.append(
            review_ids[center - R : center] + review_ids[center + 1 : center + R + 1]
        )
    return w, Cplus

In [10]:
# Test the function on the first training document
first_document = document_train_set[0]
print(first_document)
w, Cplus = extract_words_context(first_document["review_ids"], 2)
print(w[0], Cplus[0])

# Every context must have the same size, so the min and max printed should match.
context_sizes = [len(context) for context in Cplus]
print(min(context_sizes), max(context_sizes))

{'review_ids': [1996, 24385, 2003, 1037, 2428, 2204, 10874, 1012, 1998, 3492, 14888, 1012, 1045, 2228, 2057, 2064, 2035, 5993, 2008, 1996, 9592, 1997, 2770, 2046, 1037, 18224, 24385, 2024, 2172, 7046, 2084, 2770, 2046, 9219, 1010, 6144, 2030, 14106, 1012, 2008, 1005, 1055, 3599, 2339, 2023, 3185, 2003, 2061, 12459, 1012, 2076, 2023, 2143, 1010, 2017, 1005, 2222, 2763, 2228, 2055, 2115, 2219, 24385, 1037, 2261, 2335, 1012, 3251, 2002, 1005, 1055, 5214, 1997, 2725, 2107, 2477, 1012, 1012, 1012, 2017, 2488, 11839, 2010, 2564, 2987, 1005, 1056, 21910, 2006, 2032, 1012, 2008, 1005, 1055, 2054, 1996, 2466, 2003, 2035, 2055, 1012, 1037, 9768, 24385, 1999, 2474, 20057, 2043, 2002, 4858, 2041, 2010, 2564, 2003, 16789, 2006, 2032, 2007, 1996, 4770, 1011, 2879, 1012, 1006, 2008, 2442, 2022, 1996, 4602, 9518, 1999, 1996, 2088, 1010, 2011, 1996, 2126, 1012, 4770, 24916, 2467, 2202, 5056, 1997, 1996, 2160, 23744, 2043, 1996, 3129, 2003, 2012, 2010, 2147, 1007, 2013, 2059, 2006, 2256, 24385, 1010, 28

In [11]:
def flatten_dataset_to_list(dataset, R):
    """Collect the (word, context) pairs of every document into two flat lists.

    Args:
        dataset: Iterable of documents, each exposing its token ids under
            the ``"review_ids"`` key.
        R: Context radius forwarded to ``extract_words_context``.

    Returns:
        A tuple ``(W, CPLUS)``: all center tokens and, at the same positions,
        their contexts, concatenated in document order.
    """
    W, CPLUS = [], []
    for review_ids in (doc["review_ids"] for doc in dataset):
        words, contexts = extract_words_context(review_ids, R)
        W += words
        CPLUS += contexts
    return W, CPLUS

In [12]:
# Define a value for K,R:
# R is the context radius given to `extract_words_context` (R tokens on each side,
# so each positive context holds 2 * R ids).
# K scales the number of negative samples drawn in `collate_fn` (2 * K * R per word).
R = 6
K = 2

In [13]:
# Flatten the training and validation documents into parallel lists of
# center words and their positive contexts, using radius R.
flattened_train_W, flattened_train_Cplus = flatten_dataset_to_list(
    document_train_set, R
)
flattened_val_W, flattened_val_Cplus = flatten_dataset_to_list(document_valid_set, R)
# Quick look at the first (word, context) pair of the training data.
print(flattened_train_W[0], flattened_train_Cplus[0])

10874 [1996, 24385, 2003, 1037, 2428, 2204, 1012, 1998, 3492, 14888, 1012, 1045]


In [14]:
from torch.utils.data import Dataset


class Word2vecdataset(Dataset):
    """Dataset of (word id, positive context ids) pairs.

    The two lists given at construction are stored as-is (no copy) and are
    indexed in parallel: item ``idx`` is ``(flat_W[idx], flat_Cplus[idx])``.
    """

    def __init__(self, flat_W: list, flat_Cplus: list):
        """Keep references to the flat word list and the flat context list."""
        self.W, self.Cplus = flat_W, flat_Cplus

    def __len__(self):
        """Return the number of center words."""
        return len(self.W)

    def __getitem__(self, idx: int):
        """Return the ``(word, context)`` pair stored at position ``idx``."""
        return self.W[idx], self.Cplus[idx]

In [15]:
# Wrap the flattened training / validation lists in `Word2vecdataset` objects.
train_set = Word2vecdataset(flattened_train_W, flattened_train_Cplus)
valid_set = Word2vecdataset(flattened_val_W, flattened_val_Cplus)

In [16]:
# Test: the first item should match the first flattened (word, context) pair printed earlier.
word, context = train_set[0]
print(word, context)

10874 [1996, 24385, 2003, 1037, 2428, 2204, 1012, 1998, 3492, 14888, 1012, 1045]


In [17]:
import random


def collate_fn(batch, K, R):
    """Turn a list of (word, context) pairs into tensors and add negative samples.

    For every item, ``2 * K * R`` negative ids are drawn with ``random.sample``
    from ``range(tokenizer.vocab_size)`` (so distinct ids within one item; the
    draw does not exclude the center word or its positive context).

    Args:
        batch: Iterable of ``(word, context)`` pairs.
        K: Multiplier for the number of negative samples.
        R: Context radius (each positive context is expected to hold 2 * R ids).

    Returns:
        Dict of ``torch.long`` tensors placed on the notebook-level ``device``:
        ``"word_id"``, ``"positive_context_ids"`` and ``"negative_context_ids"``.
    """
    vocab_size = tokenizer.vocab_size
    n_neg_samples = 2 * K * R
    pairs = list(batch)

    words = [word for word, _ in pairs]
    positives = [context for _, context in pairs]
    # One independent draw of negative ids per item, in batch order.
    negatives = [random.sample(range(vocab_size), n_neg_samples) for _ in pairs]

    def as_long_tensor(values):
        return torch.tensor(values, dtype=torch.long).to(device)

    return {
        "word_id": as_long_tensor(words),
        "positive_context_ids": as_long_tensor(positives),
        "negative_context_ids": as_long_tensor(negatives),
    }

In [18]:
batch_size = 32


def _collate_with_negatives(batch):
    """Collate a batch using the notebook-level K and R (looked up at call time)."""
    return collate_fn(batch, K, R)


# Training batches are reshuffled at every pass; validation order stays fixed.
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_collate_with_negatives,
)
val_loader = DataLoader(
    valid_set,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=_collate_with_negatives,
)

In [19]:
num_iterations = 3
print(f"R = {R}", f"K = {K}")
# Pairing the loader with a bounded range stops after `num_iterations` batches.
for i, batch in zip(range(num_iterations), train_loader):
    word_ids = batch["word_id"]
    positive_contexts = batch["positive_context_ids"]
    negative_contexts = batch["negative_context_ids"]

    print(f"Shape of word_ids: {word_ids.shape}")
    print(f"Shape of positive_context_ids: {positive_contexts.shape}")
    print(f"Shape of negative_context_ids: {negative_contexts.shape}")

R = 6 K = 2
Shape of word_ids: torch.Size([32])
Shape of positive_context_ids: torch.Size([32, 12])
Shape of negative_context_ids: torch.Size([32, 24])
Shape of word_ids: torch.Size([32])
Shape of positive_context_ids: torch.Size([32, 12])
Shape of negative_context_ids: torch.Size([32, 24])
Shape of word_ids: torch.Size([32])
Shape of positive_context_ids: torch.Size([32, 12])
Shape of negative_context_ids: torch.Size([32, 24])


# Model

In [20]:
class Word2Vec(nn.Module):
    """Two-table embedding model scoring (word, context) pairs by dot product.

    ``Ew`` embeds center words and ``Ec`` embeds context words (both positive
    and negative ones). The forward pass returns raw logits together with the
    matching 0/1 labels, ready for a binary cross-entropy-with-logits loss.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the word (``Ew``) and context (``Ec``) embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Size of every embedding vector.
        """
        super().__init__()
        self.Ew = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.Ec = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, w, cplus, cminus):
        """Score every context id against its center word.

        Args:
            w: Long tensor of center word ids, shape ``(batch,)``.
            cplus: Long tensor of positive context ids, shape ``(batch, n_pos)``.
            cminus: Long tensor of negative context ids, shape ``(batch, n_neg)``.

        Returns:
            A tuple ``(logits, labels)``, both of shape
            ``(batch, n_neg + n_pos)``. Columns are shuffled with one random
            permutation shared by all rows; ``labels`` is 1.0 where the column
            comes from ``cplus`` and 0.0 where it comes from ``cminus``.
        """
        W = self.Ew(w)
        CPLUS = self.Ec(cplus)
        CMINUS = self.Ec(cminus)

        # Dot product of each context embedding with its center word embedding:
        # (batch, n, dim) x (batch, dim, 1) -> (batch, n).
        center = W.unsqueeze(2)
        logits_plus = torch.bmm(CPLUS, center).squeeze(2)
        logits_minus = torch.bmm(CMINUS, center).squeeze(2)

        # Negatives first, then positives; labels follow the same layout.
        logits = torch.cat([logits_minus, logits_plus], dim=1)
        labels = torch.cat(
            [torch.zeros_like(logits_minus), torch.ones_like(logits_plus)], dim=1
        )

        # Build the permutation on the same device as the logits so the column
        # shuffle does not index a device tensor with a CPU index tensor.
        permutation = torch.randperm(logits.size(1), device=logits.device)
        shuffled_logits = logits[:, permutation]
        shuffled_labels = labels[:, permutation]

        return shuffled_logits, shuffled_labels

In [21]:
# Test to see the tensors similarity and labels
vocab_size = tokenizer.vocab_size
embedding_dim = 10
model = Word2Vec(vocab_size, embedding_dim).to(device)

num_iterations = 1

# Pairing the loader with a bounded range stops after `num_iterations` batches.
for i, batch in zip(range(num_iterations), train_loader):
    word_ids, positive_contexts, negative_contexts = (
        batch[key].to(device)
        for key in ("word_id", "positive_context_ids", "negative_context_ids")
    )
    shuffled_similarity, shuffled_labels = model(
        word_ids, positive_contexts, negative_contexts
    )
    # Show the first row only: logits and the labels aligned with them.
    print("Shuffled similarity", shuffled_similarity[0])
    print("Shuffled labels", shuffled_labels[0])

Shuffled similarity tensor([-5.1833, -2.1783,  4.5083,  2.3522,  1.1669,  0.2165,  7.9974,  2.1353,
        -5.0426,  3.0781, -8.2455, -6.5568, -9.5623, -0.5392,  2.9878,  3.2247,
         7.4860, -4.6788, -4.3246,  1.9820,  0.0298,  1.3031,  5.9460, -4.7181,
        -1.3950,  7.2008, -5.1156, -1.3221, 11.4176,  2.7218, -2.7624, -0.9249,
         3.6488,  7.0019,  0.1776,  4.6549], grad_fn=<SelectBackward0>)
Shuffled labels tensor([0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1.,
        1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.])


In [22]:
def train_model(model, B, E, train_set, learning_rate=0.01):
    """Train ``model`` in place with Adam and a binary cross-entropy loss.

    Batches are built with ``collate_fn`` using the notebook-level ``K`` and
    ``R``. The mean loss of every 100 batches and the mean loss of each epoch
    are printed.

    Args:
        model: Module called as ``model(w, Cplus, Cminus)`` and returning
            ``(logits, labels)``.
        B: Batch size.
        E: Number of epochs.
        train_set: Dataset of (word, context) pairs.
        learning_rate: Adam learning rate.
    """
    train_loader = DataLoader(
        train_set,
        batch_size=B,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, K, R),
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    batch_keys = ("word_id", "positive_context_ids", "negative_context_ids")

    for epoch in range(E):
        model.train()
        # `running_loss` covers the current 100-batch window, `epoch_loss` the epoch.
        running_loss = epoch_loss = 0.0

        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            w, Cplus, Cminus = (batch[key].to(device) for key in batch_keys)

            # Forward pass, loss, backward pass, parameter update.
            logits, labels = model(w, Cplus, Cminus)
            loss = F.binary_cross_entropy_with_logits(logits, labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            epoch_loss += batch_loss

            # Report and reset the window after every 100th batch.
            if (i + 1) % 100 == 0:
                print(
                    f"[Epoch {epoch + 1}, Batch {i + 1}] Loss: {running_loss / 100:.4f}"
                )
                running_loss = 0.0

        print(
            f"Epoch {epoch + 1}/{E} completed. Loss: {epoch_loss / len(train_loader):.4f}"
        )

    print("Training completed.")

In [23]:
# Hyperparameters for the actual training run.
batch_size = 512
epochs = 1
embedding_dim = 128
# Fresh model with 128-dimensional embeddings, moved to the selected device.
mymodel = Word2Vec(vocab_size, embedding_dim).to(device)

# Train on the flattened (word, context) training pairs.
train_model(mymodel, B=batch_size, E=epochs, train_set=train_set)

[Epoch 1, Batch 100] Loss: 3.5122
[Epoch 1, Batch 200] Loss: 2.1799
[Epoch 1, Batch 300] Loss: 1.4376
[Epoch 1, Batch 400] Loss: 1.0825
[Epoch 1, Batch 500] Loss: 0.9005
[Epoch 1, Batch 600] Loss: 0.7652
[Epoch 1, Batch 700] Loss: 0.6768
[Epoch 1, Batch 800] Loss: 0.6126
[Epoch 1, Batch 900] Loss: 0.5665
[Epoch 1, Batch 1000] Loss: 0.5321
[Epoch 1, Batch 1100] Loss: 0.5000
[Epoch 1, Batch 1200] Loss: 0.4781
[Epoch 1, Batch 1300] Loss: 0.4522
[Epoch 1, Batch 1400] Loss: 0.4377
[Epoch 1, Batch 1500] Loss: 0.4189
Epoch 1/1 completed. Loss: 0.9630
Training completed.


In [24]:
def evaluate_model(model, batch_size):
    """Print the mean loss and the classification accuracy on ``valid_set``.

    Batches are built with ``collate_fn`` using the notebook-level ``K`` and
    ``R``, so fresh negative samples are drawn at every call. A logit >= 0 is
    counted as a "positive context" prediction.

    Args:
        model: Module called as ``model(w, cplus, cminus)`` and returning
            ``(logits, labels)``.
        batch_size: Number of (word, context) pairs per validation batch.
    """
    model.eval()

    val_loader = DataLoader(
        valid_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, K, R),
    )
    batch_keys = ("word_id", "positive_context_ids", "negative_context_ids")

    val_loss = 0.0
    correct_predictions = total_samples = 0

    with torch.no_grad():
        for batch in val_loader:
            w, cplus, cminus = (batch[key].to(device) for key in batch_keys)

            # The model returns its logits and the labels aligned with them.
            logits, labels = model(w, cplus, cminus)

            val_loss += F.binary_cross_entropy_with_logits(logits, labels).item()

            # Threshold the raw logits at 0 to get hard 0/1 predictions.
            predictions = (logits >= 0).float()
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.numel()

    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct_predictions / total_samples

    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

In [25]:
# Report validation loss and positive/negative classification accuracy of the trained model.
evaluate_model(mymodel, 512)

Validation Loss: 0.4267, Accuracy: 0.8912


In [26]:
def predict_middle_word(model, batch_size):
    """Print how often the center word is recovered from its context.

    For every validation item, the positive context embeddings (``model.Ec``)
    are averaged and compared, by cosine similarity, with every word embedding
    (``model.Ew``). The prediction is the most similar word id, and the printed
    value is the fraction of items where it equals the true center word.

    Batches are built with ``collate_fn`` using the notebook-level ``K`` and
    ``R``; the negative samples it draws are not used here.

    Args:
        model: Trained model exposing the ``Ew`` and ``Ec`` embedding tables.
        batch_size: Number of (word, context) pairs per validation batch.
    """
    model.eval()
    Ec = model.Ec.to(device)
    correct_predictions = 0.0
    total_samples = 0.0

    val_loader = DataLoader(
        dataset=valid_set,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=lambda batch: collate_fn(batch, K, R),
    )

    with torch.no_grad():
        # The word embedding matrix does not change between batches, so its
        # rows are L2-normalized once here instead of on every iteration.
        word_embeddings = F.normalize(model.Ew.weight.to(device), p=2, dim=1)

        for batch in val_loader:
            w = batch["word_id"].to(device)
            cplus = batch["positive_context_ids"].to(device)

            # Mean context embedding per item, normalized so the product below
            # compares directions (cosine similarity) rather than magnitudes.
            mean_cplus = F.normalize(torch.mean(Ec(cplus), dim=1), p=2, dim=1)

            # (vocab, dim) @ (dim, batch) -> one similarity column per item.
            all_similarities = word_embeddings @ mean_cplus.T
            most_similar_words = torch.argmax(all_similarities, dim=0)

            correct_predictions += (most_similar_words == w).sum().item()
            total_samples += len(w)

    accuracy = correct_predictions / total_samples
    print(accuracy)

In [None]:
# Print the accuracy of the model at recovering the center word from its context
# (validation set, batches of 256):
predict_middle_word(mymodel, 256)

0.022758916971619934


In [None]:
# Compare it to a random prediction:
# probability of hitting the right id when guessing uniformly over the tokenizer vocabulary.
1 / tokenizer.vocab_size

In [None]:
def save_model(model, d, R, K, B, E, directory="."):
    """Save the word and context embedding matrices of ``model`` to disk.

    The checkpoint is a dict with two CPU tensors, ``"Ew"`` and ``"Ec"``,
    written with ``torch.save``. The file name encodes the hyperparameters.
    ``directory`` is created first when it does not exist yet.

    Args:
        model: Model exposing ``Ew.weight`` and ``Ec.weight``.
        d: Embedding dimension (only used in the file name).
        R: Context radius (only used in the file name).
        K: Negative sampling ratio (only used in the file name).
        B: Batch size (only used in the file name).
        E: Number of epochs (only used in the file name).
        directory: Destination folder; defaults to the current directory.
    """
    import os  # local import so this cell does not depend on the import cell

    # NOTE(review): the angle brackets are literal characters in the file name.
    # They look like leftover template placeholders and are not valid in
    # Windows file names - confirm before changing, as any loader must match.
    file_name = f"model_dim-<{d}>_radius-<{R}>_ratio-<{K}>-batch-<{B}>-epoch-<{E}>.ckpt"
    file_path = f"{directory}/{file_name}"

    # torch.save does not create missing parent folders.
    if directory:
        os.makedirs(directory, exist_ok=True)

    embeddings = {
        "Ew": model.Ew.weight.detach().cpu(),  # Word embeddings
        "Ec": model.Ec.weight.detach().cpu(),  # Context embeddings
    }

    torch.save(embeddings, file_path)
    print(f"Model embeddings saved to {file_path}")

In [None]:
# Persist the trained embeddings in the current directory; the file name records
# the hyperparameters of this run.
save_model(mymodel, embedding_dim, R, K, batch_size, epochs)