In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import wikipedia
from collections import Counter

# Load Wikipedia data

In [5]:
def get_wikipedia_text(page_title):
    return wikipedia.page(page_title).content

text = get_wikipedia_text("Artificial Intelligence")

text[:100]

'Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particul'

# Tokenize text

In [6]:
def tokenize_text(text):
    text = text.lower().replace("\n", " ").split()
    return text

tokens = tokenize_text(text)

In [7]:
tokens[:10]

['artificial',
 'intelligence',
 '(ai),',
 'in',
 'its',
 'broadest',
 'sense,',
 'is',
 'intelligence',
 'exhibited']

# Create vocabulary

In [8]:
vocab = Counter(tokens)
vocab_size = 5000
most_common = vocab.most_common(vocab_size)
word_to_idx = {word: i for i, (word, _) in enumerate(most_common)}
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Generate training data

In [16]:
def generate_training_data(tokens, window_size=2):
    data = []
    for i, word in enumerate(tokens):
        if word in word_to_idx:
            context = [tokens[j] for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)) if j != i and tokens[j] in word_to_idx]
            if len(context) == 2 * window_size:  # Asegurar contexto completo
                data.append(([word_to_idx[ctx] for ctx in context], word_to_idx[word]))
    return data

training_data = generate_training_data(tokens)


In [17]:
training_data

[([27, 24, 4, 70], 1347),
 ([24, 1347, 70, 1348], 4),
 ([1347, 4, 1348, 1349], 70),
 ([4, 70, 1349, 8], 1348),
 ([70, 1348, 8, 24], 1349),
 ([1348, 1349, 24, 769], 8),
 ([1349, 8, 769, 12], 24),
 ([8, 24, 12, 1350], 769),
 ([24, 769, 1350, 298], 12),
 ([769, 12, 298, 68], 1350),
 ([12, 1350, 68, 770], 298),
 ([1350, 298, 770, 15], 68),
 ([298, 68, 15, 8], 770),
 ([68, 770, 8, 5], 15),
 ([770, 15, 5, 92], 8),
 ([15, 8, 92, 2], 5),
 ([8, 5, 2, 51], 92),
 ([5, 92, 51, 4], 2),
 ([92, 2, 4, 68], 51),
 ([2, 51, 68, 252], 4),
 ([51, 4, 252, 6], 68),
 ([4, 68, 6, 771], 252),
 ([68, 252, 771, 1], 6),
 ([252, 6, 1, 509], 771),
 ([6, 771, 509, 102], 1),
 ([771, 1, 102, 1], 509),
 ([1, 509, 1, 156], 102),
 ([509, 102, 156, 6], 1),
 ([102, 1, 6, 772], 156),
 ([1, 156, 772, 127], 6),
 ([156, 6, 127, 3], 772),
 ([6, 772, 3, 510], 127),
 ([772, 127, 510, 29], 3),
 ([127, 3, 29, 773], 510),
 ([3, 510, 773, 1], 29),
 ([510, 29, 1, 43], 773),
 ([29, 773, 43, 28], 1),
 ([773, 1, 28, 1], 43),
 ([1, 43, 1, 

# Generate the NN

In [19]:
class Word2VecCBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecCBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear_layer = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = x.mean(dim=0, keepdim=True)
        x = self.linear_layer(x)
        return x

In [20]:
embedding_dim = 100
model = Word2VecCBOW(vocab_size, embedding_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)


def train(model, data, epochs=10):
    for epoch in range(epochs):
        total_loss = 0
        for context, target in data:
            context_tensor = torch.tensor(context, dtype=torch.long)
            target_tensor = torch.tensor([target], dtype=torch.long)
            
            optimizer.zero_grad()
            output = model(context_tensor)
            loss = criterion(output, target_tensor)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

train(model, training_data[:10000])


Epoch 1, Loss: 86958.2021
Epoch 2, Loss: 46801.0799
Epoch 3, Loss: 27634.4713
Epoch 4, Loss: 17479.6352
Epoch 5, Loss: 14728.0092
Epoch 6, Loss: 14305.1192
Epoch 7, Loss: 14412.7092
Epoch 8, Loss: 14130.4724
Epoch 9, Loss: 13737.4084
Epoch 10, Loss: 14128.5080


# Generate embeddings


In [21]:
word = "intelligence"
if word in word_to_idx:
    word_embedding = model.embeddings(torch.tensor(word_to_idx[word])).detach().numpy()
    print(f"Embedding para '{word}': {word_embedding}")


Embedding para 'intelligence': [ 1.66257834e+00 -1.79885834e-01 -2.33604097e+00  1.17206061e+00
 -1.23621392e+00 -1.79409730e+00  4.05841589e-01 -5.10636687e-01
 -1.81689894e+00  6.83415011e-02 -1.29402494e+00 -4.67476797e+00
 -5.02744973e-01 -6.98089361e-01  2.72539347e-01 -1.31165564e+00
  1.40020549e+00  8.11261058e-01  2.06933570e+00 -8.65141213e-01
  8.12190950e-01 -1.31560326e+00 -2.36804292e-01  3.48418951e-01
  1.31253934e+00  1.93441558e+00 -7.50973403e-01  5.55907845e-01
 -1.98047578e+00  5.86264534e-03 -1.11938335e-01  3.71036112e-01
  7.21017838e-01 -1.60618305e+00  4.72870409e-01  1.12088752e+00
 -2.86125016e+00 -1.59413815e+00 -6.50377989e-01 -2.07033610e+00
 -5.06077588e-01  1.38210952e-01  1.09794033e+00  9.59819317e-01
 -1.14853942e+00 -5.31218827e-01 -1.45121908e+00 -5.89357555e-01
 -3.85101414e+00  8.02872360e-01  1.65412283e+00  2.59400904e-01
  1.08029521e+00 -5.39861739e-01  2.36301243e-01 -9.73361313e-01
 -3.39813352e+00 -6.56648993e-01 -1.60537446e+00 -2.3922833