In [3]:
# Importing required libraries
import os
import random
import re
import zipfile
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

#### Data loading (from local zip) and preprocessing


In [6]:
class PreprocessText8:
    """Load a zipped text corpus, tokenize it and map words to integer ids.

    Intended call order: ``load_dataset`` -> ``preprocess_text`` ->
    ``build_vocab`` -> ``text_to_indices``. Index 0 is reserved for the
    ``"<UNK>"`` token; every word kept in the vocabulary gets an index >= 1.
    """

    def __init__(self, min_count=5):
        """
        Args:
            min_count: A word must occur at least this many times to receive
                its own vocabulary entry.
        """
        self.min_count = min_count  # Minimum word frequency for vocabulary inclusion
        self.vocab = None  # word -> index mapping, filled in by build_vocab
        self.word_counts = None  # word -> frequency (kept words only), filled in by build_vocab

    def load_dataset(self, filepath):
        """Return the UTF-8 decoded content of the first member of a zip archive.

        Args:
            filepath: Anything ``zipfile.ZipFile`` accepts (path or file object).

        Raises:
            ValueError: If the archive contains no members.
        """
        with zipfile.ZipFile(filepath) as archive:
            members = archive.namelist()
            if not members:
                # Fail with a descriptive message instead of a bare IndexError.
                raise ValueError(f"Zip archive {filepath!r} contains no files")
            return archive.read(members[0]).decode("utf-8")

    def preprocess_text(self, text):
        """Lowercase ``text`` and split it into alphabetic tokens.

        A token is a run of the letters a-z that sits between regex word
        boundaries (``\\b``); punctuation and whitespace act as separators.

        Returns:
            list[str]: The tokens in order of appearance.
        """
        return re.findall(r"\b[a-z]+\b", text.lower())

    def build_vocab(self, tokens):
        """Build the word -> index mapping from a token sequence.

        Words seen fewer than ``min_count`` times are left out. Kept words are
        numbered from 1 in order of first appearance, and ``"<UNK>"`` is mapped
        to 0. The results are also stored on ``self.vocab`` and
        ``self.word_counts``.

        Returns:
            tuple[dict, dict]: ``(vocab, word_counts)``.
        """
        # Count word frequencies and keep only the sufficiently frequent words
        word_counts = {
            word: count
            for word, count in Counter(tokens).items()
            if count >= self.min_count
        }

        # Create word to index mapping (vocabulary); 0 is reserved for <UNK>
        vocab = {word: index for index, word in enumerate(word_counts, start=1)}
        vocab["<UNK>"] = 0  # Unknown words get a default index of 0

        self.vocab = vocab
        self.word_counts = word_counts
        return vocab, word_counts

    def text_to_indices(self, tokens):
        """Translate tokens into vocabulary indices.

        Tokens missing from the vocabulary are encoded with the ``"<UNK>"``
        index.

        Raises:
            RuntimeError: If ``build_vocab`` has not been called yet.
        """
        if self.vocab is None:
            raise RuntimeError(
                "Vocabulary has not been built; call build_vocab() first"
            )
        # Resolve the fallback index once instead of once per token.
        unk_index = self.vocab["<UNK>"]
        lookup = self.vocab.get
        return [lookup(word, unk_index) for word in tokens]

In [10]:
# Instantiate preprocessing class and execute loading/preprocessing.
# With min_count=5, words seen fewer than 5 times get no vocabulary entry and
# are later encoded with the <UNK> index (0).
preprocessor = PreprocessText8(min_count=5)

# Step 1: Load dataset from the local file (first member of the zip archive)
text8_data = preprocessor.load_dataset(
    "data/text8-1mb.zip"
)  # Path is relative to the working directory; adjust if the data lives elsewhere

# Step 2: Preprocess the dataset (lowercase + tokenization)
tokens = preprocessor.preprocess_text(text8_data)

# Step 3: Build the vocabulary (word -> index) and the filtered word counts
vocab, word_counts = preprocessor.build_vocab(tokens)

# Step 4: Convert the text into indices based on the vocabulary
text_indices = preprocessor.text_to_indices(tokens)

# Check results; the vocabulary size printed here includes the <UNK> entry
print("First 10 words (as tokens):", tokens[:10])
print("Vocabulary size:", len(vocab))
print("First 10 indices of text:", text_indices[:10])

First 10 words (as tokens): ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
Vocabulary size: 4126
First 10 indices of text: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


#### Generating training data

In the skip-gram model, each word (the target word) is used to predict the surrounding words (context words) within a window. For example, with a window size of 2 (up to two words on each side of the target), the sentence ["I", "love", "machine", "learning"] yields the training pairs:

("I", "love"), ("I", "machine")
("love", "I"), ("love", "machine"), ("love", "learning")
("machine", "I"), ("machine", "love"), ("machine", "learning")
("learning", "love"), ("learning", "machine")


In [11]:
class SkipGramDataGenerator:
    """Produces (target, context) index pairs for skip-gram training."""

    def __init__(self, vocab, window_size=2):
        """
        Args:
            vocab: Word -> index mapping; stored on the instance but not
                consulted when generating pairs.
            window_size: Number of positions to look at on each side of the
                target position.
        """
        self.vocab = vocab
        self.window_size = window_size  # Context window size

    def generate_training_pairs(self, text_indices):
        """
        Generates (input, context) pairs using the skip-gram model.

        For every position ``i`` the target ``text_indices[i]`` is paired with
        each element at positions ``i - window_size .. i + window_size``
        (clipped to the sequence bounds), excluding ``i`` itself. Left-hand
        neighbours come before right-hand neighbours.

        Args:
            text_indices: Any sequence supporting ``len()``, iteration and
                integer indexing (list, tuple, range, 1-D array, ...).

        Returns:
            list[tuple]: The ``(target, context)`` pairs in corpus order.
        """
        total = len(text_indices)
        pairs = []
        for position, target_word in enumerate(text_indices):
            # Define the context window range, clipped to the sequence
            start = max(position - self.window_size, 0)
            end = min(position + self.window_size + 1, total)

            # Index the neighbours directly rather than concatenating two
            # slices: `+` only concatenates for list/tuple, and it also
            # allocates two temporary sequences per token.
            for neighbour in range(start, end):
                if neighbour != position:
                    pairs.append((target_word, text_indices[neighbour]))

        return pairs

In [12]:
# Instantiate SkipGramDataGenerator
window_size = 2  # Positions considered on each side of the target; adjust as needed
data_generator = SkipGramDataGenerator(vocab, window_size)

# Generate (target, context) training pairs from the index-encoded corpus
training_pairs = data_generator.generate_training_pairs(text_indices)

# Check the first 5 pairs and the overall number of pairs
print("First 5 training pairs (input, context):", training_pairs[:5])
print("Total training pairs generated:", len(training_pairs))

First 5 training pairs (input, context): [(1, 2), (1, 3), (2, 1), (2, 3), (2, 4)]
Total training pairs generated: 702390


#### 3. Building skip-gram model

Architecture Overview:

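- Input: The index of the target word (equivalent to a one-hot vector of size equal to the vocabulary size).

- Hidden Layer: Produces the embedding vector (size embedding_dim).

- Output: One score (logit) per vocabulary word; a softmax over these scores, applied inside the cross-entropy loss, gives the predicted context-word distribution.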


In [14]:
"""
Embedding Layer: Converts input words (as indices) to dense vectors of size embedding_dim. These vectors represent the word embeddings that will be learned.
Linear Layer: Maps the embedding vector to a vector of size vocab_size. These are unnormalized scores (logits) for each word in the vocabulary being a context word; the loss function turns them into probabilities.
Forward Pass: The forward method defines how the input passes through the layers.
"""


class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection.

    Attributes:
        embeddings: ``nn.Embedding(vocab_size, embedding_dim)`` holding the
            word vectors being learned.
        linear: ``nn.Linear(embedding_dim, vocab_size)`` producing one score
            per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Hidden layer: one trainable vector per vocabulary index
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Output layer: projects an embedding back onto the vocabulary
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word):
        """Return the output-layer scores for a tensor of word indices."""
        return self.linear(self.embeddings(input_word))

In [15]:
# Parameters
vocab_size = len(vocab)  # Number of vocabulary entries, including <UNK>
embedding_dim = 100  # You can adjust this dimensionality

# Initialize the model
# NOTE: model, loss_function and optimizer are created again in the training
# section before the training loop runs, so the objects built in this cell
# only serve the forward-pass demonstration below.
model = SkipGramModel(vocab_size, embedding_dim)

# Loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Example of how to run a forward pass with a batch of input words
sample_input = torch.tensor([1, 2, 3])  # Sample batch of word indices (input words)
output = model(sample_input)  # One row per input index, vocab_size scores per row

print("Sample output (logits):", output)

Sample output (logits): tensor([[ 0.2970,  0.3989, -0.0523,  ..., -0.8852, -0.0619,  0.2866],
        [ 0.2224, -0.2155, -0.0630,  ..., -0.1476,  0.9516,  0.6456],
        [-0.4301,  0.3467, -0.8593,  ..., -0.4915,  0.3792,  1.1154]],
       grad_fn=<AddmmBackward0>)


In [18]:
# Number of scores produced per input word; should equal the vocabulary size
output.shape[1]

4126

#### 4. Training the model

- Input preparation: Convert the training pairs (input word, context word) into tensors.
- Forward pass: For each input word, predict the probability distribution over the context words.
- Loss calculation: Use cross-entropy loss to measure how far the predicted probabilities are from the true context word.
- Backpropagation: Compute gradients and update model weights using an optimizer.
- Repeat for several epochs: Go through the entire dataset multiple times to improve the model.


In [19]:
class SkipGramDataset(Dataset):
    """Dataset view over a sequence of (input word, context word) index pairs."""

    def __init__(self, training_pairs):
        # Kept as-is; items are converted to tensors lazily in __getitem__
        self.training_pairs = training_pairs

    def __len__(self):
        """Number of training pairs."""
        return len(self.training_pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a tuple of two tensors."""
        target, context = self.training_pairs[idx]
        return tuple(torch.tensor(value) for value in (target, context))

In [22]:
# Create DataLoader for batching
batch_size = 64  # Number of (input, context) pairs per optimisation step
training_dataset = SkipGramDataset(training_pairs)
# shuffle=True re-orders the pairs on every pass over the loader
train_loader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)

In [23]:
# Training parameters
num_epochs = 5  # Full passes over train_loader; adjust as needed
embedding_dim = 100  # Same value as the embedding_dim assigned in the model-building section
learning_rate = 0.01  # Step size handed to the SGD optimizer

In [24]:
# Seed the random number generators before any weights are created so that a
# fresh "Restart & Run All" reproduces the same initial weights. PyTorch's
# default RNG is also the source DataLoader draws its shuffle seed from when
# no generator is passed, so batch order becomes repeatable as well.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize the model, loss function, and optimizer
model = SkipGramModel(vocab_size, embedding_dim)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

In [25]:
# Training loop
model.train()  # Make sure the model is in training mode
for epoch in range(num_epochs):
    total_loss = 0.0  # Sum of the per-batch losses of the current epoch
    for input_words, context_words in train_loader:
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: Get predictions (one score per vocabulary word)
        output = model(input_words)

        # Calculate loss against the true context word indices
        loss = loss_function(output, context_words)

        # Backward pass: Compute gradients and update weights
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss at the end of every epoch so that the
    # progress of training is visible, not just the value of the last epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

In [26]:
# Print epoch loss
# NOTE: this statement is not part of the training loop. It uses the values
# `epoch` and `total_loss` hold once the loop has finished, so it reports the
# mean per-batch loss of the final epoch only.
print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

Epoch 5/5, Loss: 6.127859456273038


#### Evaluation


In [27]:
# 1. Extract the embeddings from the trained model

import torch.nn.functional as F


# Extract embeddings from the model
def get_word_embedding(word, model, vocab):
    """Look up the embedding of ``word`` in ``model.embeddings``.

    Words missing from ``vocab`` fall back to the ``"<UNK>"`` index. The
    lookup is done with a one-element index tensor, so the returned NumPy
    array keeps a leading dimension of size 1.
    """
    fallback_index = vocab["<UNK>"]
    token_index = vocab.get(word, fallback_index)
    index_tensor = torch.tensor([token_index])
    return model.embeddings(index_tensor).detach().numpy()


# Example: Get the embedding for the word "anarchism"
word = "anarchism"
embedding = get_word_embedding(word, model, vocab)
print(f"Embedding for '{word}': {embedding}")

Embedding for 'anarchism': [[-0.7744571  -1.3963333   1.6797013  -0.36338466  0.35906884 -0.56441194
   1.636218   -0.7358431  -1.329026   -0.35654166  0.7031084   0.5561964
  -0.24942185  0.9599037  -0.20940821 -1.0948676  -1.4011997   1.8756366
   0.00571415 -0.8789985   0.77912277  0.16245946 -1.2401024   0.84306574
  -0.8503677   1.2378135  -0.01179308  0.35991243 -1.3328391  -1.548498
  -0.9834696  -0.75525534 -0.5796744   0.15449347  0.42098314  1.4635736
  -0.4123435   0.6639243   1.0369222   0.4815263  -1.5264597  -1.7223165
  -0.60674995  1.6102108   0.48401964  0.05120361 -0.79321295 -1.6948316
   1.1633942  -0.10054369 -0.118899   -0.05108259  0.5949204  -0.91717595
   1.3532596  -0.2873556   1.0850874   0.86395913 -0.44619206 -0.78221434
   0.43233    -0.4406806   0.6004827   0.79305243 -0.88724846  0.72463626
   0.09062869 -2.2093837  -0.42560497  1.7481977  -0.21781035  0.34679073
   0.9230143  -1.7262104  -0.47373107  0.13895707 -0.24337332 -0.2147037
  -0.4275165   1.49

In [29]:
# 2. Compute the cosine similarity between the word embeddings


def get_word_embedding(word, model, vocab):
    """Return the embedding of ``word`` as a squeezed NumPy array.

    Words missing from ``vocab`` fall back to the ``"<UNK>"`` index. The
    size-1 batch dimension introduced by the lookup is removed with
    ``squeeze()`` before the conversion to NumPy.
    """
    fallback_index = vocab["<UNK>"]
    token_index = vocab.get(word, fallback_index)
    looked_up = model.embeddings(torch.tensor([token_index]))
    flat = looked_up.squeeze()
    return flat.detach().numpy()


# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two vectors.

    Both inputs are flattened to 1-D first, so arrays of shape ``(1, d)`` and
    ``(d,)`` can be mixed. Array-likes such as lists are accepted.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like) with the same number of elements.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero norm (the cosine is undefined there; returning 0.0
        avoids a NaN from 0/0 that would poison any later sorting).
    """
    # Ensure both vectors are 1D arrays
    vec1 = np.asarray(vec1).ravel()
    vec2 = np.asarray(vec2).ravel()

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product


# Find the most similar words to a given word
def find_similar_words(word, model, vocab, top_n=5):
    """Rank the vocabulary by cosine similarity to ``word``.

    All embeddings are fetched with a single batched lookup and compared to
    the query with vectorised NumPy operations, instead of one embedding
    lookup per vocabulary word.

    Args:
        word: Query word; a word missing from ``vocab`` uses the ``"<UNK>"``
            embedding.
        model: Object exposing an ``embeddings`` layer that maps an index
            tensor to embedding vectors.
        vocab: Word -> index mapping containing the key ``"<UNK>"``.
        top_n: Maximum number of results to return.

    Returns:
        list[tuple]: Up to ``top_n`` ``(word, similarity)`` tuples, most
        similar first. Every vocabulary entry is ranked, so the query word
        itself (similarity ~1) is normally the first result. Entries whose
        embedding has zero norm get similarity 0 rather than NaN.
    """
    # Query vector first, so that a vocab lacking "<UNK>" still fails fast.
    query_index = vocab.get(word, vocab["<UNK>"])
    query = model.embeddings(torch.tensor([query_index])).detach().numpy().ravel()

    # One batched lookup for the whole vocabulary, in vocab iteration order.
    words = list(vocab)
    indices = torch.tensor([vocab[candidate] for candidate in words])
    matrix = model.embeddings(indices).detach().numpy()

    dots = matrix @ query
    norm_products = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Divide only where the denominator is non-zero; other slots stay 0.
    similarities = np.divide(
        dots, norm_products, out=np.zeros_like(dots), where=norm_products > 0
    )

    # Sort by similarity and return the top_n words. sorted() is stable, so
    # equally similar words keep their vocabulary order.
    ranked = sorted(zip(words, similarities), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

Words most similar to 'anarchism': [('anarchism', np.float32(1.0000001)), ('tournaments', np.float32(0.33770555)), ('private', np.float32(0.30305058)), ('metals', np.float32(0.30206954)), ('honor', np.float32(0.30183873))]


In [30]:
# Example: Find words most similar to "anarchism"
similar_words = find_similar_words("anarchism", model, vocab)
print(f"Words most similar to 'anarchism': {similar_words}")

Words most similar to 'anarchism': [('anarchism', np.float32(1.0000001)), ('tournaments', np.float32(0.33770555)), ('private', np.float32(0.30305058)), ('metals', np.float32(0.30206954)), ('honor', np.float32(0.30183873))]


In [None]:
# Save the model
model_save_path = "models/saved/skipgram_word2vec_model.pth"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# Save the model's state dictionary (parameters only, not the class itself)
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

In [31]:
# next steps
# train on whole dataset
# evaluate results
# fine tune on titles