In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import DataLoader, Dataset

# Example corpus (replace with your own dataset)
corpus = [
    "i love natural language processing",
    "continuous bag of words model",
    # Add more sentences from your dataset
]

# Tokenize the corpus and build a vocabulary
words = ' '.join(corpus).split()
word_counts = Counter(words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Define hyperparameters
embedding_dim = 100
context_window = 2
batch_size = 64
learning_rate = 0.001
epochs = 100

# Create training data in context-target pairs
data = []
for sentence in corpus:
    words = sentence.split()
    for target_idx in range(len(words)):
        context = [
            words[i] for i in range(target_idx - context_window, target_idx)
            if i != target_idx and 0 <= i < len(words)
        ]
        target = words[target_idx]
        data.append((context, target))

# Define a custom dataset and dataloader
class CBOWDataset(Dataset):
    def __init__(self, data, word_to_idx, context_window):
        self.data = data
        self.word_to_idx = word_to_idx
        self.context_window = context_window

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        context, target = self.data[index]
        context = [self.word_to_idx[word] for word in context]
        target = self.word_to_idx[target]
        return context, target

def collate_fn(data):
    # Padding sequences to the length of the longest context sequence
    max_len = max(len(context) for context, _ in data)
    padded_contexts = []
    targets = []

    for context, target in data:
        padding = [0] * (max_len - len(context))
        padded_context = context + padding
        padded_contexts.append(padded_context)
        targets.append(target)

    return torch.tensor(padded_contexts), torch.tensor(targets)

dataset = CBOWDataset(data, word_to_idx, context_window)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Define the CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        context = self.embeddings(context).sum(dim=1)
        output = self.linear(context)
        return output

# Initialize the model, loss function, and optimizer
model = CBOW(len(vocab), embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for context, target in dataloader:
        context = context.to(torch.long)
        target = target.to(torch.long)

        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}')

# Save the learned word embeddings
word_embeddings = model.embeddings.weight.data
print(word_embeddings)
# You can now use the word embeddings for various NLP tasks


Epoch 1/100, Loss: 2.9385616779327393
Epoch 2/100, Loss: 2.9112191200256348
Epoch 3/100, Loss: 2.8844497203826904
Epoch 4/100, Loss: 2.85823655128479
Epoch 5/100, Loss: 2.832562208175659
Epoch 6/100, Loss: 2.807410717010498
Epoch 7/100, Loss: 2.782766103744507
Epoch 8/100, Loss: 2.7586147785186768
Epoch 9/100, Loss: 2.7349419593811035
Epoch 10/100, Loss: 2.7117342948913574
Epoch 11/100, Loss: 2.688978910446167
Epoch 12/100, Loss: 2.666663408279419
Epoch 13/100, Loss: 2.644775867462158
Epoch 14/100, Loss: 2.623304843902588
Epoch 15/100, Loss: 2.6022391319274902
Epoch 16/100, Loss: 2.5815672874450684
Epoch 17/100, Loss: 2.561278820037842
Epoch 18/100, Loss: 2.5413639545440674
Epoch 19/100, Loss: 2.5218119621276855
Epoch 20/100, Loss: 2.5026135444641113
Epoch 21/100, Loss: 2.4837594032287598
Epoch 22/100, Loss: 2.4652390480041504
Epoch 23/100, Loss: 2.447044849395752
Epoch 24/100, Loss: 2.4291672706604004
Epoch 25/100, Loss: 2.411597728729248
Epoch 26/100, Loss: 2.394327163696289
Epoch 27