# PRÁCTICA 2: word2vec / skip-gram
### Miembros: Raquel Almeida Quesada y Jorge Morales Llerandi


#### Cargar y tokenizar el corpus  

In [None]:
import re
from collections import Counter

# Read the whole corpus into memory, lower-cased so tokens are case-insensitive.
with open("resources/dataset_word2vec.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()

# A token is a maximal run of lower-case letters (including the accented
# vowels, ü and ñ) delimited by word boundaries; everything else is dropped.
token_pattern = re.compile(r'\b[a-záéíóúüñ]+\b')
tokens = token_pattern.findall(text)
print("Número total de tokens:", len(tokens))

# Bare expression: shown as the cell output when run in a notebook.
tokens


#### Crear vocabulario y los pares (centro, contexto)

In [None]:
from itertools import chain

# Context radius: number of tokens on each side of the centre word that are
# paired with it.
window_size = 2

# Sort the vocabulary so that word indices are reproducible: the iteration
# order of a set of strings can change from one interpreter run to the next.
vocab = sorted(set(tokens))
word_to_ix = {w: i for i, w in enumerate(vocab)}
ix_to_word = dict(enumerate(vocab))

# Build every (centre, context) pair: each token is paired with the tokens at
# most `window_size` positions away from it, clipped at the corpus boundaries.
pairs = []
n_tokens = len(tokens)
for i, center in enumerate(tokens):
    lo = max(0, i - window_size)
    hi = min(n_tokens, i + window_size + 1)
    pairs.extend((center, tokens[j]) for j in range(lo, hi) if j != i)

print("Ejemplo de par:", pairs[:10])
            

#### Crear el dataset para PyTorch

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class Word2VecDataset(Dataset):
    """Dataset of (center, context) word pairs served as index tensors.

    Args:
        pairs: indexable collection of 2-element (center, context) items.
        word_to_ix: mapping from each word to its integer index.
    """

    def __init__(self, pairs, word_to_ix):
        self.word_to_ix = word_to_ix
        self.pairs = pairs

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two scalar tensors of word indices."""
        center_ix, context_ix = (self.word_to_ix[word] for word in self.pairs[idx])
        return torch.tensor(center_ix), torch.tensor(context_ix)
    
# Wrap the word pairs in a dataset and serve them in shuffled mini-batches of 64.
dataset = Word2VecDataset(pairs=pairs, word_to_ix=word_to_ix)
dataLoader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

#### Definir el modelo Skip-Gram

In [None]:
import torch.nn as nn

class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection.

    Args:
        vocab_size: number of rows in the embedding table, and number of
            scores produced per input index.
        embedding_size: dimensionality of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.output = nn.Linear(embedding_size, vocab_size)

    @property
    def embeddings(self):
        """Read-only alias of ``embedding``.

        Code in this file accesses the table as ``model.embeddings``; the
        alias makes that spelling work without renaming the registered
        submodule, so ``state_dict`` keys stay ``embedding.*``.
        """
        return self.embedding

    def forward(self, center_words):
        """Return one score per vocabulary entry for each index in ``center_words``."""
        return self.output(self.embedding(center_words))
    
# Size of each word vector; the model gets one embedding row per vocabulary word.
embedding_dim = 50
model = SkipGramModel(vocab_size=len(vocab), embedding_size=embedding_dim)

#### Entrenamiento

In [None]:
import torch.optim as optim

# The loop below needs a loss and an optimizer, which were never defined.
# The model outputs one raw score per vocabulary word and the target is the
# index of the context word, so cross-entropy over the scores is used.
criterion = nn.CrossEntropyLoss()
# Learning rate is a starting value; tune it for the corpus at hand.
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(10):
    total_loss = 0
    correct = 0
    total = 0

    for center, context in dataLoader:
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        output = model(center)
        loss = criterion(output, context)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy: how often the highest-scoring word is the actual context word.
        preds = torch.argmax(output, dim=1)
        correct += (preds == context).sum().item()
        total += context.size(0)

    # Mean of the per-batch losses, and percentage of correct predictions.
    avg_loss = total_loss / len(dataLoader)
    accuracy = 100 * correct / total
    print(f"Época {epoch+1} | Pérdida media: {avg_loss:.4f} | Precisión: {accuracy:.2f}%")


#### Explorar los embeddings (vecinos más cercanos)

In [None]:
import torch.nn.functional as F

def get_embedding(word):
    """Return the row of the model's embedding table that belongs to ``word``.

    Raises:
        KeyError: if ``word`` is not a key of ``word_to_ix``.
    """
    idx = word_to_ix[word]
    # The model stores its table as `embedding` (singular).
    return model.embedding.weight[idx]

def nearest(word, top_n=5):
    """Return up to ``top_n`` words whose embeddings are most cosine-similar to ``word``.

    The query word itself is never part of the result.

    Raises:
        KeyError: if ``word`` is not a key of ``word_to_ix``.
    """
    query_ix = word_to_ix[word]
    word_emb = get_embedding(word)
    # The model stores its table as `embedding` (singular).
    sims = F.cosine_similarity(word_emb.unsqueeze(0), model.embedding.weight)
    # Ask for one extra hit to make room for the query word, but never more
    # than the table holds (topk raises when k exceeds the number of rows).
    k = min(top_n + 1, sims.numel())
    ranked = torch.topk(sims, k).indices.tolist()
    # Drop the query by index instead of assuming it is ranked first.
    return [ix_to_word[i] for i in ranked if i != query_ix][:top_n]

# Show the five closest neighbours of "parís" in embedding space.
print("Vecinos de 'parís':", nearest("parís", top_n=5))


#### Pruebas de analogías

In [None]:
def analogy(w1, w2, w3, top_n=1):
    """Answer "w1 is to w2 as w3 is to ?" using embedding arithmetic.

    Ranks the vocabulary by cosine similarity to ``emb(w2) - emb(w1) + emb(w3)``
    and returns up to ``top_n`` words, never including ``w1``, ``w2`` or ``w3``.

    Raises:
        KeyError: if any of the three words is not a key of ``word_to_ix``.
    """
    emb = get_embedding(w2) - get_embedding(w1) + get_embedding(w3)
    # The model stores its table as `embedding` (singular).
    sims = F.cosine_similarity(emb.unsqueeze(0), model.embedding.weight)
    excluded = {w1, w2, w3}
    # Fetch enough extra hits to survive filtering out the input words, but
    # never more than the table holds (topk raises when k exceeds the rows).
    k = min(top_n + len(excluded), sims.numel())
    best = torch.topk(sims, k).indices.tolist()
    candidates = (ix_to_word[i] for i in best)
    return [w for w in candidates if w not in excluded][:top_n]

# Capital-to-country analogy: parís -> francia, so madrid -> ?
print("parís : francia :: madrid : ?", analogy(w1="parís", w2="francia", w3="madrid"))
