In [2]:
# This code implements a Continuous Bag of Words (CBOW) model using PyTorch to predict a target word based on its context words. 
# It reads a text dataset, preprocesses it, defines the model, and trains it to learn word embeddings and predict words efficiently.

# Import required libraries
import torch  # Library for machine learning and deep learning
import torch.nn as nn  # Module for building neural networks
from torch.utils.data import Dataset, DataLoader  # Tools for handling datasets and creating data loaders

# Define hyperparameters for the model
CONTEXT_SIZE = 2  # Number of words on each side of the target word (context window size)
EMBEDDING_DIM = 100  # Size of word embeddings (vector representation of words)
BATCH_SIZE = 32  # Number of samples per batch for training
EPOCHS = 5  # Number of times to go through the entire dataset during training
LEARNING_RATE = 0.01  # Learning rate for the optimizer (how big the steps are for updating model weights)

# Load the dataset from a text file
with open('news_corpus.txt', 'r') as f:  # Open the file in read mode
    raw_text = f.read().splitlines()  # Read all lines and split them into a list of strings

# Create a vocabulary from the dataset
vocab = set(raw_text)  # Get unique words from the dataset
vocab_size = len(vocab)  # Count the number of unique words
word_to_ix = {word: i for i, word in enumerate(vocab)}  # Map each word to a unique index
ix_to_word = {i: word for word, i in word_to_ix.items()}  # Map each index back to its word

# Define a dataset class for CBOW
class CBOWDataset(Dataset):  # Inherit from PyTorch's Dataset class
    def __init__(self, text, context_size):  # Initialize the dataset
        self.data = []  # List to hold (context, target) pairs
        for i in range(context_size, len(text) - context_size):  # Iterate over text with room for context
            context = (  # Collect context words (words before and after the target)
                text[i - context_size:i] + text[i + 1:i + 1 + context_size]
            )
            target = text[i]  # The target word is the word in the middle
            self.data.append((context, target))  # Append the context and target pair to the data list

    def __len__(self):  # Return the total number of samples in the dataset
        return len(self.data)

    def __getitem__(self, idx):  # Get a single sample (context, target) at the given index
        context, target = self.data[idx]  # Extract the context and target
        context_ids = torch.tensor([word_to_ix[word] for word in context], dtype=torch.long)  # Convert context words to indices
        target_id = torch.tensor(word_to_ix[target], dtype=torch.long)  # Convert target word to index
        return context_ids, target_id  # Return the context and target as tensors

# Instantiate the dataset and dataloader
dataset = CBOWDataset(raw_text, CONTEXT_SIZE)  # Create the dataset
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)  # Create the data loader to handle batching and shuffling

# Define the CBOW model
class CBOW(nn.Module):  # Inherit from PyTorch's nn.Module
    def __init__(self, vocab_size, embedding_dim):  # Initialize the model
        super(CBOW, self).__init__()  # Call the parent class initializer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)  # Layer to learn word embeddings
        self.linear1 = nn.Linear(embedding_dim, 128)  # First linear layer to process embeddings
        self.relu = nn.ReLU()  # Activation function to introduce non-linearity
        self.linear2 = nn.Linear(128, vocab_size)  # Second linear layer to predict word probabilities

    def forward(self, inputs):  # Define the forward pass of the model
        embeds = self.embeddings(inputs).mean(dim=1)  # Get embeddings for inputs and average them (context representation)
        out = self.linear1(embeds)  # Pass through the first linear layer
        out = self.relu(out)  # Apply the ReLU activation
        out = self.linear2(out)  # Pass through the second linear layer
        return nn.functional.log_softmax(out, dim=1)  # Apply softmax and return log probabilities

# Initialize the model, loss function, and optimizer
model = CBOW(vocab_size, EMBEDDING_DIM)  # Create an instance of the CBOW model
loss_function = nn.NLLLoss()  # Negative Log-Likelihood Loss for classification
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)  # Stochastic Gradient Descent optimizer

# Training loop
for epoch in range(EPOCHS):  # Loop over the number of epochs
    total_loss = 0  # Initialize total loss for this epoch
    for context, target in data_loader:  # Loop over batches of data
        optimizer.zero_grad()  # Reset gradients before backpropagation
        output = model(context)  # Forward pass: get predictions
        loss = loss_function(output, target)  # Compute the loss
        loss.backward()  # Backpropagate the gradients
        optimizer.step()  # Update the model weights
        total_loss += loss.item()  # Accumulate the loss
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")  # Print epoch loss

# Test the model with a sample context
context = raw_text[:4]  # Use the first 4 words as a sample context
context_vector = torch.tensor([word_to_ix[word] for word in context], dtype=torch.long).unsqueeze(0)  # Convert to tensor
prediction = model(context_vector)  # Get model predictions
predicted_word = ix_to_word[torch.argmax(prediction).item()]  # Find the word with the highest probability
print(f"Context: {context}")  # Print the input context
print(f"Predicted word: {predicted_word}")  # Print the predicted word

Epoch 1, Loss: 10388.5395
Epoch 2, Loss: 10384.4899
Epoch 3, Loss: 10380.4990
Epoch 4, Loss: 10376.6682
Epoch 5, Loss: 10372.8964
Context: ['Current events of September\xa03,\xa01995\xa0(1995-09-03) (Sunday) :', 'eBay is founded.', 'Current events of September\xa06,\xa01995\xa0(1995-09-06) (Wednesday) :', 'NATO air strikes against Bosnian Serb forces continue, after repeated attempts at a solution to the Bosnian War fail.']
Predicted word: Syrian civil war:
