## Skip-gram vs CBOW – Word Embeddings from Scratch

In this assignment, students will:

- Implement Skip-gram and CBOW models from scratch using PyTorch.
- Train these models on a real-world text corpus.
- Visualize and compare the learned word embeddings using t-SNE and UMAP.
- Interpret the semantic structure of the embedding space

In [16]:
import re
import collections
import random
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
from sklearn.manifold import TSNE
import umap
from datasets import load_dataset

### Cargar datos

In [8]:
# cargar datos desde Hugging Face
ds = load_dataset("afmck/text8")

# extraer texto
text = ds['train'][0]['text']

# Confirmar contenido
print("Longitud del texto:", len(text))
print("Primeros 500 caracteres:\n", text[:500])

Longitud del texto: 90000000
Primeros 500 caracteres:
  anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philoso


### Preprocessing

In [10]:
text = text.lower()
text = re.sub(r'[^a-z0-9\s]', ' ', text)
tokens = text.split()

freq = collections.Counter(tokens)
tokens = [w for w in tokens if freq[w] >= 5]

most_common = [w for w, _ in freq.most_common(50000)]
vocab = set(most_common)
tokens = [w for w in tokens if w in vocab]

word2idx = {w: i for i, w in enumerate(most_common)}
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(word2idx)
print("Vocab size:", vocab_size)

Vocab size: 50000


### Generar pares de entrenamiento (CBOW y Skip-gram)

In [12]:
def generate_pairs(tokens, word2idx, window_min=2, window_max=5, mode='skipgram'):
    pairs = []
    T = len(tokens)
    for i in tqdm(range(T), desc=f'generating {mode} pairs'):
        if tokens[i] not in word2idx:
            continue
        center = word2idx[tokens[i]]
        window = random.randint(window_min, window_max)
        start = max(0, i - window)
        end = min(T, i + window + 1)
        context = []
        for j in range(start, end):
            if j == i: continue
            if tokens[j] not in word2idx: continue
            context.append(word2idx[tokens[j]])
        if len(context) == 0:
            continue
        if mode == 'skipgram':
            # emit (center, each_context) as separate samples
            for ctx in context:
                pairs.append((center, ctx))
        else:  # cbow
            pairs.append((context, center))
    return pairs

# Generar ambos tipos 
skip_pairs = generate_pairs(tokens, word2idx, mode='skipgram')
cbow_pairs = generate_pairs(tokens, word2idx, mode='cbow')

print("Skip pairs:", len(skip_pairs))
print("CBOW pairs:", len(cbow_pairs))


generating skipgram pairs: 100%|██████████| 14928870/14928870 [00:18<00:00, 786091.31it/s]
generating cbow pairs: 100%|██████████| 14928870/14928870 [01:54<00:00, 130533.31it/s]

Skip pairs: 104505627
CBOW pairs: 14928870





### Dataset + DataLoader en PyTorch

In [None]:
class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        center, context = self.pairs[idx]
        return torch.long_tensor(center), torch.long_tensor(context)

class CBOWDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        context_list, target = self.pairs[idx]
        return torch.tensor(context_list, dtype=torch.long), torch.tensor(target, dtype=torch.long)

# collate for CBOW: pad variable-length contexts and return mask or simply average in model
def cbow_collate(batch):
    contexts, targets = zip(*batch)
    # contexts is list of 1d tensors with variable length
    lengths = [len(c) for c in contexts]
    max_len = max(lengths)
    padded = torch.zeros(len(contexts), max_len, dtype=torch.long)
    mask = torch.zeros(len(contexts), max_len, dtype=torch.float)
    for i, c in enumerate(contexts):
        padded[i, :len(c)] = c
        mask[i, :len(c)] = 1.0
    targets = torch.stack(targets)
    return padded, mask, targets



# Example DataLoader
skip_ds = SkipGramDataset(skip_pairs)
skip_loader = DataLoader(skip_ds, batch_size=1024, shuffle=True)

cbow_ds = CBOWDataset(cbow_pairs)
cbow_loader = DataLoader(cbow_ds, batch_size=1024, shuffle=True, collate_fn=cbow_collate)


### Model: clase base + CBOW y Skip-gram

In [18]:
class Word2VecBaseModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=100):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_linear = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, *args, **kwargs):
        raise NotImplementedError

class SkipGramModel(Word2VecBaseModel):
    def forward(self, centers):
        emb = self.in_embed(centers)       
        logits = self.out_linear(emb)      
        return logits

class CBOWModel(Word2VecBaseModel):
    def forward(self, contexts_padded, mask):
        emb = self.in_embed(contexts_padded)   
        mask = mask.unsqueeze(-1)              
        summed = (emb * mask).sum(dim=1)       
        lens = mask.sum(dim=1).clamp(min=1)    
        avg = summed / lens
        logits = self.out_linear(avg)          
        return logits

### Entrenamiento — loop único (ej: Skip-gram). Repite para CBOW cambiando loader y modelo

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, dataloader, epochs=5, lr=1e-3, is_cbow=False):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    losses = []
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        it = tqdm(dataloader, desc=f'Epoch {epoch}/{epochs}')
        for batch in it:
            optimizer.zero_grad()
            if is_cbow:
                contexts_padded, mask, targets = batch
                contexts_padded = contexts_padded.to(device).long()   # indices -> long
                mask = mask.to(device)
                targets = targets.to(device).long()                   # ensure long
                logits = model(contexts_padded, mask)
            else:
                centers, contexts = batch  # contexts are single indices for skipgram
                centers = centers.to(device).long()                  # indices -> long
                targets = contexts.to(device).long()                 # ensure long
                logits = model(centers)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * targets.size(0)
            it.set_postfix(loss=loss.item())
        avg = total_loss / len(dataloader.dataset)
        print(f"Epoch {epoch} avg loss: {avg:.4f}")
        losses.append(avg)
    # plot loss
    plt.plot(losses, marker='o')
    plt.title('Training loss')
    plt.xlabel('Epoch')
    plt.ylabel('Avg loss')
    plt.show()
    return model

# Entrenar SkipGram (ejemplo)
skip_model = SkipGramModel(vocab_size, 100)
skip_model = train_model(skip_model, skip_loader, epochs=5, lr=1e-3, is_cbow=False)

# Entrenar CBOW
cbow_model = CBOWModel(vocab_size, 100)
cbow_model = train_model(cbow_model, cbow_loader, epochs=5, lr=1e-3, is_cbow=True)

Epoch 1/5:   0%|          | 0/102057 [00:02<?, ?it/s]


AttributeError: module 'torch' has no attribute 'long_tensor'