In [6]:
# Importing required libraries
import os
import random
import re
import zipfile
from collections import Counter
from io import BytesIO

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

#### 1. Data loading (from local zip) and preprocessing


In [20]:
class PreprocessText8:
    """Reads text from zipped Parquet files, tokenizes it and maps words to indices."""

    def __init__(self, min_count=5, batch_size=1000):
        """
        Create a preprocessor with an empty vocabulary.
        Args:
        - min_count: Lowest frequency a word needs in order to enter the vocabulary.
        - batch_size: Number of rows joined into each text batch by load_dataset().
        """
        self.min_count = min_count
        self.batch_size = batch_size
        # Both attributes stay None until build_vocab() has been called.
        self.vocab = None
        self.word_counts = None

    def load_dataset(self, zip_filepath, parquet_filename, text_column):
        """
        Yield the text of a zipped Parquet file as space-joined batches of rows.

        The whole Parquet member is read into memory first; only the joining of
        rows into strings happens batch by batch.
        Args:
        - zip_filepath: Path to the ZIP archive.
        - parquet_filename: Name of the Parquet member inside the archive.
        - text_column: Column holding the text; values are cast to str.
        Yields:
        - One string per batch: up to `batch_size` rows joined by single spaces.
        """
        with zipfile.ZipFile(zip_filepath, "r") as archive, archive.open(
            parquet_filename
        ) as member:
            # Wrap the member's bytes in BytesIO before handing them to pandas.
            frame = pd.read_parquet(BytesIO(member.read()), engine="pyarrow")
            rows = frame[text_column].astype(str).values

            for offset in range(0, len(rows), self.batch_size):
                chunk = rows[offset : offset + self.batch_size]
                yield " ".join(chunk)

    def preprocess_text(self, text):
        """
        Lower-case the text and extract runs of ASCII letters as tokens.
        Args:
        - text: Raw text.
        Returns:
        - List of tokens matching the pattern \\b[a-z]+\\b.
        """
        return re.findall(r"\b[a-z]+\b", text.lower())

    def build_vocab(self, tokens):
        """
        Count tokens, drop the rare ones and assign an index to every remaining word.

        The result is also stored on the instance (`self.vocab`, `self.word_counts`),
        replacing whatever was there before.
        Args:
        - tokens: Tokenized words (anything `collections.Counter` accepts).
        Returns:
        - vocab: Word-to-index mapping; kept words start at 1, "<UNK>" is 0.
        - word_counts: Frequencies of the kept words.
        """
        frequent = {}
        for word, count in Counter(tokens).items():
            if count >= self.min_count:
                frequent[word] = count

        # Indices follow first-occurrence order; 0 is reserved for unknown words.
        vocab = {}
        for position, word in enumerate(frequent, start=1):
            vocab[word] = position
        vocab["<UNK>"] = 0

        self.vocab, self.word_counts = vocab, frequent
        return vocab, frequent

    def text_to_indices(self, tokens):
        """
        Translate tokens into vocabulary indices.
        Args:
        - tokens: List of tokens.
        Returns:
        - List of indices; tokens missing from the vocabulary map to "<UNK>".
        """
        indices = []
        for token in tokens:
            indices.append(self.vocab.get(token, self.vocab["<UNK>"]))
        return indices

In [None]:
class SkipGramDataGenerator:
    """Produces (target, context) index pairs for skip-gram training."""

    def __init__(self, vocab, window_size=2):
        """
        Args:
        - vocab: Word-to-index mapping; stored but not used for pair generation.
        - window_size: Number of positions considered on each side of the target.
        """
        self.vocab = vocab
        self.window_size = window_size  # Context window size

    def generate_training_pairs(self, text_indices_batch):
        """
        Generates (input, context) pairs for a batch using the skip-gram model.

        Context elements are read by position rather than by concatenating two
        slices, so any sequence supporting len() and integer indexing works
        (list, tuple, range, NumPy array), and no temporary list is built per
        token. For list input the result is the same as before.
        Args:
        - text_indices_batch: Indexable sequence of word indices (batch of text).
        Returns:
        - pairs: List of (input_word, context_word) pairs, ordered by target
          position and then by context position.
        """
        n_tokens = len(text_indices_batch)
        pairs = []
        for i in range(n_tokens):
            target_word = text_indices_batch[i]

            # Clamp the window to the sequence boundaries
            start = max(i - self.window_size, 0)
            end = min(i + self.window_size + 1, n_tokens)

            # Pair the target with every position in the window except itself
            for j in range(start, end):
                if j != i:
                    pairs.append((target_word, text_indices_batch[j]))

        return pairs

In [29]:
# Modified process_dataset to handle large datasets in batches
def process_dataset_in_batches(
    zip_filepath, parquet_filename, text_column, preprocessor
):
    """
    Stream a zipped Parquet dataset as tokenized batches.

    Each raw text batch produced by `preprocessor.load_dataset` is passed through
    `preprocessor.preprocess_text` and yielded immediately.
    Args:
    - zip_filepath: Path to the ZIP file containing the Parquet file.
    - parquet_filename: Name of the Parquet file inside the ZIP archive.
    - text_column: The column in the Parquet file containing the text data.
    - preprocessor: Object providing `load_dataset` and `preprocess_text`
      (a PreprocessText8 instance in this notebook).

    Yields:
    - The token list for each batch, in dataset order.
    """
    raw_batches = preprocessor.load_dataset(zip_filepath, parquet_filename, text_column)
    yield from (preprocessor.preprocess_text(raw_text) for raw_text in raw_batches)


# Step 1: Process and build vocabulary incrementally
def build_vocab_in_batches(preprocessor, zip_filepath, parquet_filename, text_column):
    """
    Build the vocabulary from every batch of the dataset.

    Token counts are accumulated across all batches and the vocabulary is built
    once from the totals. Calling `preprocessor.build_vocab` per batch would
    replace the vocabulary each time, leaving only the last batch's words and
    applying `min_count` to per-batch counts instead of corpus-wide counts.
    Args:
    - preprocessor: PreprocessText8 instance; its `vocab` and `word_counts`
      attributes are set as a side effect of `build_vocab`.
    - zip_filepath: Path to the ZIP file containing the Parquet file.
    - parquet_filename: Name of the Parquet file inside the ZIP archive.
    - text_column: The column in the Parquet file containing the text data.
    Returns:
    - (vocab, word_counts) as returned by `preprocessor.build_vocab`. For a
      dataset with no batches this is a vocabulary holding only "<UNK>".
    """
    total_counts = Counter()
    token_batches = process_dataset_in_batches(
        zip_filepath, parquet_filename, text_column, preprocessor
    )

    # Only the running counts are kept in memory, never the full token stream
    for token_batch in token_batches:
        total_counts.update(token_batch)

    # build_vocab passes its argument to Counter(), which copies the counts of an
    # existing Counter unchanged and keeps first-occurrence order.
    return preprocessor.build_vocab(total_counts)


# Step 2: Convert text batches to indices in batches
def convert_to_indices_in_batches(
    preprocessor, zip_filepath, parquet_filename, text_column
):
    """
    Convert every token batch of a dataset into vocabulary indices.
    Args:
    - preprocessor: PreprocessText8 instance whose vocabulary is already built.
    - zip_filepath: Path to the ZIP file containing the Parquet file.
    - parquet_filename: Name of the Parquet file inside the ZIP archive.
    - text_column: The column in the Parquet file containing the text data.
    Returns:
    - A list (fully materialized, not a generator) with one list of indices per batch.
    """
    token_batches = process_dataset_in_batches(
        zip_filepath, parquet_filename, text_column, preprocessor
    )
    return [preprocessor.text_to_indices(tokens) for tokens in token_batches]


# Step 3: Generate training pairs in batches
def generate_training_pairs_in_batches(
    data_generator, data_generator_instance, preprocessor, vocab
):
    """
    Collect the skip-gram pairs of every index batch into one flat list.
    Args:
    - data_generator: Object exposing `generate_training_pairs(batch)`
      (a SkipGramDataGenerator in this notebook).
    - data_generator_instance: Iterable of index batches.
    - preprocessor: Accepted but not used by this function.
    - vocab: Accepted but not used by this function.
    Returns:
    - List of all pairs, in batch order. Batches are handled independently, so
      no pair spans two batches.
    """
    return [
        pair
        for index_batch in data_generator_instance
        for pair in data_generator.generate_training_pairs(index_batch)
    ]

#### 2. Generating training data

In the skip-gram model, each word (the target word) is used to predict the surrounding words (context words) within a window. For example, with a window size of 1 and the sentence ["I", "love", "machine", "learning"], the training pairs for the target words "love" and "machine" would be:

("love", "I"), ("love", "machine")
("machine", "love"), ("machine", "learning")


In [47]:
# Instantiate the preprocessor and choose the skip-gram context window
preprocessor = PreprocessText8(min_count=5, batch_size=1000)
window_size = 2

# Step 1: Build the vocabulary from the training split only
vocab, word_counts = build_vocab_in_batches(
    preprocessor, "data/train.zip", "train-00000-of-00001.parquet", "text"
)

# Create the pair generator after the vocabulary exists so that it holds the
# real word-to-index mapping instead of None
data_generator = SkipGramDataGenerator(vocab=vocab, window_size=window_size)

# Step 2: Convert the training, test, and validation datasets into word indices in batches
# (each result is a list of index lists, one per batch; words outside the
# training vocabulary map to "<UNK>")
train_indices_generator = convert_to_indices_in_batches(
    preprocessor, "data/train.zip", "train-00000-of-00001.parquet", "text"
)
test_indices_generator = convert_to_indices_in_batches(
    preprocessor, "data/test.zip", "test-00000-of-00001.parquet", "text"
)
validate_indices_generator = convert_to_indices_in_batches(
    preprocessor, "data/validation.zip", "validation-00000-of-00001.parquet", "text"
)

# Step 3: Generate (target, context) pairs for SkipGram in batches
training_pairs = generate_training_pairs_in_batches(
    data_generator, train_indices_generator, preprocessor, vocab
)
test_pairs = generate_training_pairs_in_batches(
    data_generator, test_indices_generator, preprocessor, vocab
)
validation_pairs = generate_training_pairs_in_batches(
    data_generator, validate_indices_generator, preprocessor, vocab
)

In [49]:
# Sanity check: report how many (target, context) pairs the training split produced
print(f"Total training pairs generated: {len(training_pairs)}")

Total training pairs generated: 61206990


#### 3. Building skip-gram model

Architecture Overview:

- Input: A word index, which is equivalent to a one-hot vector of size equal to the vocabulary size.

- Hidden Layer: Produces the embedding vector (size embedding_dim).

- Output: Logits over the vocabulary; the softmax that turns them into context-word probabilities is applied inside the cross-entropy loss.


In [50]:
"""
Embedding Layer: Converts input words (as indices) to vectors of size embedding_dim. These vectors represent the word embeddings that will be learned.
Linear Layer: Maps the embedding vector to a vector of size vocab_size. This represents the probabilities of each word in the vocabulary being a context word.
Forward Pass: The forward method defines how input passes through the layer
"""


class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection to vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim):
        """
        vocab_size: Number of rows in the embedding table and size of the output layer.
        embedding_dim: Length of each word vector.
        """
        super().__init__()
        # Embedding table: one embedding_dim-sized vector per vocabulary index
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Output layer: one score per vocabulary word
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_words):
        """
        Compute context-word logits for a batch of input words.
        input_words: Tensor of shape [batch_size] containing word indices.
        Returns a tensor of shape [batch_size, vocab_size] with raw, un-normalized scores.
        """
        word_vectors = self.embeddings(input_words)  # [batch_size, embedding_dim]
        return self.linear(word_vectors)  # [batch_size, vocab_size]

In [65]:
# Smoke test: build a model and run one forward pass to confirm the logits shape.
# NOTE: `embedding_dim`, `model`, `optimizer` and `loss_function` are all assigned
# again in later cells before training, so the values set here only serve this check.

# Parameters
vocab_size = len(vocab)
embedding_dim = 100

# Initialize the model
model = SkipGramModel(vocab_size, embedding_dim)

# Keep the model on the CPU; no accelerator is used in this notebook
device = torch.device("cpu")
model.to(device)

# Loss function and optimizer
loss_function = (
    nn.CrossEntropyLoss()
)  # Softmax + CrossEntropy for multi-class classification
optimizer = optim.SGD(model.parameters(), lr=0.01)  # Stochastic Gradient Descent

# Example of running a forward pass with a batch of input words
# (indices 1, 2 and 3 are arbitrary vocabulary entries)
sample_input = torch.tensor([1, 2, 3], dtype=torch.long, device=device)

# Forward pass: Get the logits for the input words
output = model(sample_input)

# Print out the sample output (logits) and its shape for clarity
print("Sample output (logits):", output)
print("Logits shape:", output.shape)  # Should be [batch_size, vocab_size]

Sample output (logits): tensor([[ 0.1486,  0.2180, -0.3476,  ..., -0.1374, -0.0847,  0.9195],
        [-0.5377, -0.0513,  0.1219,  ..., -0.2479,  0.0938, -0.1595],
        [-0.7623,  0.4742, -0.1601,  ...,  0.6987,  0.8690,  0.5691]],
       grad_fn=<AddmmBackward0>)
Logits shape: torch.Size([3, 67428])


#### 4. Training the model

- Input preparation: Convert the training pairs (input word, context word) into tensors.
- Forward pass: For each input word, predict the probability distribution over the context words.
- Loss calculation: Use cross-entropy loss to measure how far the predicted probabilities are from the true context word.
- Backpropagation: Compute gradients and update model weights using an optimizer.
- Repeat for several epochs: Go through the entire dataset multiple times to improve the model.


In [66]:
import multiprocessing

# Print the CPU count reported by the system; presumably a reference point for
# the DataLoader `num_workers` values chosen below
print(multiprocessing.cpu_count())

16


In [52]:
from torch.utils.data import DataLoader

In [53]:
class SkipGramDataset(Dataset):
    """Exposes a sequence of (input_word, context_word) index pairs as a torch Dataset."""

    def __init__(self, training_pairs):
        # Indexable collection of 2-element pairs; kept by reference, not copied
        self.training_pairs = training_pairs

    def __len__(self):
        """Number of pairs available."""
        return len(self.training_pairs)

    def __getitem__(self, idx):
        """Return pair `idx` as two 0-d tensors: (input word, context word)."""
        center, context = self.training_pairs[idx]
        center_tensor = torch.tensor(center)
        context_tensor = torch.tensor(context)
        return center_tensor, context_tensor

In [67]:
# Training parameters
# NOTE: embedding_dim is changed here from the 100 used in the earlier smoke-test cell
num_epochs = 2
embedding_dim = 50
learning_rate = 0.1
batch_size = 64

# Re-create the model with the training hyper-parameters; this replaces the
# smoke-test model. The loss function and optimizer are created in the next cell.
model = SkipGramModel(vocab_size, embedding_dim)
device = torch.device("cpu")
model.to(device)

SkipGramModel(
  (embeddings): Embedding(67428, 50)
  (linear): Linear(in_features=50, out_features=67428, bias=True)
)

In [68]:
# Bind a fresh SGD optimizer to the parameters of the current `model`; this must
# run after the model is re-created, otherwise the optimizer would update the old one
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# Cross-entropy over the vocabulary logits (softmax is applied inside the loss)
loss_function = nn.CrossEntropyLoss()

In [69]:
# DataLoader for training and validation (assuming you have training_pairs and validation_pairs)
# Training data: wrap the (target, context) pairs and reshuffle them every epoch
train_dataset = SkipGramDataset(training_pairs)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=6
)

In [70]:
# Validation data: same wrapper, but kept in a fixed order (no shuffling)
validate_dataset = SkipGramDataset(validation_pairs)
validate_loader = DataLoader(
    validate_dataset, batch_size=batch_size, shuffle=False, num_workers=6
)

In [71]:
# Function to evaluate the model on the validation set
def evaluate_model(model, validate_loader, loss_function, device):
    """
    Compute the mean per-batch loss of `model` over a validation loader.

    The model is switched to evaluation mode and left in that mode on return.
    Every batch carries the same weight in the average, whatever its size.
    Args:
    - model: Module mapping a batch of input word indices to logits.
    - validate_loader: Iterable of (input_words, context_words) tensor batches; must support len().
    - loss_function: Callable taking (logits, targets) and returning a scalar tensor.
    - device: Device the batches are moved to before the forward pass.
    Returns:
    - Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, targets in validate_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_loss = loss_function(model(inputs), targets)
            running_loss += batch_loss.item()

    n_batches = len(validate_loader)
    return running_loss / n_batches

In [72]:
# Training loop with validation
# Train for `num_epochs` passes over `train_loader`, reporting the mean per-batch
# training loss and the validation loss after every epoch.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    # evaluate_model() leaves the model in eval mode, so switch back every epoch
    model.train()  # Set the model to training mode
    total_loss = 0

    for input_words, context_words in train_loader:
        input_words, context_words = input_words.to(device), context_words.to(device)

        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: logits of shape [batch_size, vocab_size]
        output = model(input_words)

        # Cross-entropy between the logits and the true context word indices
        loss = loss_function(output, context_words)

        # Backward pass: Compute gradients and update weights
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float, so no autograd graph is retained
        total_loss += loss.item()

    # Average training loss for this epoch (mean over batches, not over samples)
    avg_training_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_training_loss}")

    # Evaluate on the validation dataset after each epoch
    avg_validation_loss = evaluate_model(model, validate_loader, loss_function, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_validation_loss}")

Epoch 1/2


#### Evaluation


In [27]:
def cosine_similarity(word1, word2, model, vocab):
    """
    Cosine similarity between the learned embeddings of two words.

    Words that are not in `vocab` use the "<UNK>" embedding.
    Returns a Python float.
    """
    # Look up each word's index and fetch its embedding as a [1, embedding_dim] tensor
    first_vec, second_vec = (
        model.embeddings(torch.tensor([vocab.get(word, vocab["<UNK>"])]))
        for word in (word1, word2)
    )
    return F.cosine_similarity(first_vec, second_vec).item()

Embedding for 'anarchism': [[-0.7744571  -1.3963333   1.6797013  -0.36338466  0.35906884 -0.56441194
   1.636218   -0.7358431  -1.329026   -0.35654166  0.7031084   0.5561964
  -0.24942185  0.9599037  -0.20940821 -1.0948676  -1.4011997   1.8756366
   0.00571415 -0.8789985   0.77912277  0.16245946 -1.2401024   0.84306574
  -0.8503677   1.2378135  -0.01179308  0.35991243 -1.3328391  -1.548498
  -0.9834696  -0.75525534 -0.5796744   0.15449347  0.42098314  1.4635736
  -0.4123435   0.6639243   1.0369222   0.4815263  -1.5264597  -1.7223165
  -0.60674995  1.6102108   0.48401964  0.05120361 -0.79321295 -1.6948316
   1.1633942  -0.10054369 -0.118899   -0.05108259  0.5949204  -0.91717595
   1.3532596  -0.2873556   1.0850874   0.86395913 -0.44619206 -0.78221434
   0.43233    -0.4406806   0.6004827   0.79305243 -0.88724846  0.72463626
   0.09062869 -2.2093837  -0.42560497  1.7481977  -0.21781035  0.34679073
   0.9230143  -1.7262104  -0.47373107  0.13895707 -0.24337332 -0.2147037
  -0.4275165   1.49

In [None]:
# Example: Compare similarity between "king" and "queen"
# NOTE: this relies on the word-based cosine_similarity(word1, word2, model, vocab).
# A later cell redefines cosine_similarity with a (vec1, vec2) signature, so this
# cell only works if it runs before that redefinition.
similarity = cosine_similarity("king", "queen", model, vocab)
print(f"Cosine similarity between 'king' and 'queen': {similarity}")

In [None]:
# word analogy test
def word_analogy(word1, word2, word3, model, vocab):
    """
    Solves the analogy: word1 is to word2 as word3 is to ?

    The answer is the vocabulary word whose embedding has the highest cosine
    similarity to (word2 - word1 + word3), e.g. king - man + woman for
    ("man", "king", "woman"). The three input words are excluded from the
    candidates, since the query vector usually stays closest to one of them.
    Args:
    - word1, word2, word3: Words of the analogy; words missing from `vocab`
      use the "<UNK>" embedding.
    - model: Model exposing an `embeddings` layer with a [vocab_size, dim] `weight`.
    - vocab: Word-to-index mapping containing "<UNK>".
    Returns:
    - The best matching word, or None if no word in `vocab` maps to the best index.
    """
    unk_index = vocab["<UNK>"]
    query_ids = [vocab.get(word, unk_index) for word in (word1, word2, word3)]

    # Inference only: skip autograd bookkeeping. Indexing the weight matrix
    # directly keeps every tensor on the model's own device.
    with torch.no_grad():
        all_embeddings = model.embeddings.weight  # [vocab_size, dim]
        embed1, embed2, embed3 = (all_embeddings[idx] for idx in query_ids)

        # "word1 is to word2 as word3 is to ?"  ->  word2 - word1 + word3
        analogy_vector = embed2 - embed1 + embed3

        # Similarity of the query to every vocabulary embedding: [vocab_size]
        similarities = F.cosine_similarity(
            analogy_vector.unsqueeze(0), all_embeddings, dim=1
        )

        # Remove the input words from the candidates
        similarities[query_ids] = float("-inf")
        top_match = int(torch.argmax(similarities))

    # Reverse lookup: index -> word
    for word, idx in vocab.items():
        if idx == top_match:
            return word
    return None

In [None]:
# Example: "man" is to "king" as "woman" is to ?
# Arguments are passed in analogy order: (word1, word2, word3) = ("man", "king", "woman")
result = word_analogy("man", "king", "woman", model, vocab)
print(f"'man' is to 'king' as 'woman' is to: {result}")

In [29]:
# 2. Compute the cosine similarity between the word embeddings


def get_word_embedding(word, model, vocab):
    """
    Return the learned embedding of `word` as a NumPy array.

    Words missing from `vocab` use the "<UNK>" embedding. The [1, dim] lookup
    result is squeezed and detached from autograd before conversion.
    """
    index = vocab.get(word, vocab["<UNK>"])
    lookup = torch.tensor([index])
    vector = model.embeddings(lookup).squeeze()
    return vector.detach().numpy()


# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity of two vectors given as arrays.

    Both inputs are flattened to 1-D first. No guard against zero-length
    vectors is applied: the norm product is used as the divisor as-is.
    NOTE: this definition replaces the earlier word-based
    cosine_similarity(word1, word2, model, vocab) once this cell has run.
    """
    flat_a = vec1.flatten()
    flat_b = vec2.flatten()
    norm_product = np.linalg.norm(flat_a) * np.linalg.norm(flat_b)
    return np.dot(flat_a, flat_b) / norm_product


# Find the most similar words to a given word
def find_similar_words(word, model, vocab, top_n=5):
    """
    Find the words whose embeddings are most similar to the embedding of `word`.

    All cosine similarities come from a single matrix-vector product over the
    embedding table instead of one embedding lookup per vocabulary word.
    Args:
    - word: Query word; a word missing from `vocab` uses the "<UNK>" embedding.
    - model: Model exposing an `embeddings` layer with a [vocab_size, dim] `weight`.
    - vocab: Word-to-index mapping containing "<UNK>".
    - top_n: Maximum number of results.
    Returns:
    - List of (word, similarity) tuples sorted by decreasing similarity. The
      query word itself is included and normally ranks first.
    """
    # Copy the embedding table to a NumPy array once: [vocab_size, dim]
    embedding_matrix = model.embeddings.weight.detach().cpu().numpy()
    query_vector = embedding_matrix[vocab.get(word, vocab["<UNK>"])]

    # Cosine similarity of the query against every row at once
    row_norms = np.linalg.norm(embedding_matrix, axis=1)
    query_norm = np.linalg.norm(query_vector)
    scores = (embedding_matrix @ query_vector) / (row_norms * query_norm)

    similarities = {other_word: scores[idx] for other_word, idx in vocab.items()}

    # Sort by similarity and return the top_n words
    similar_words = sorted(
        similarities.items(), key=lambda item: item[1], reverse=True
    )[:top_n]
    return similar_words

Words most similar to 'anarchism': [('anarchism', np.float32(1.0000001)), ('tournaments', np.float32(0.33770555)), ('private', np.float32(0.30305058)), ('metals', np.float32(0.30206954)), ('honor', np.float32(0.30183873))]


In [2]:
# Example: Find words most similar to "king"
similar_words = find_similar_words("king", model, vocab)
print(f"Words most similar to 'king': {similar_words}")

In [None]:
# Save the model
model_save_path = "models/saved/skipgram_word2vec_model.pth"

# torch.save does not create missing parent directories; create the target
# folder first so a finished training run is not lost to a failed save
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# Save the model's state dictionary (weights only). Loading it back requires
# building a SkipGramModel with the same vocab_size and embedding_dim first.
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

In [31]:
# next steps
# train on whole dataset
# evaluate results
# fine tune on titles