In [6]:
from collections import Counter
from io import BytesIO
import random
import re
import zipfile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

#### 1. Data loading (from local zip) and preprocessing


In [62]:
class PreprocessText8:
    def __init__(self, min_count=5, batch_size=1000, subsample_threshold=1e-5):
        """
        Initializes the PreprocessText8 class.
        Args:
        - min_count: Minimum word frequency for vocabulary inclusion.
        - batch_size: Number of rows to process in each batch.
        - subsample_threshold: Threshold for subsampling frequent words.
        """
        self.min_count = min_count  # Minimum word frequency for vocabulary inclusion
        self.batch_size = batch_size  # Batch size for processing data
        self.subsample_threshold = subsample_threshold  # Threshold for subsampling frequent words
        self.vocab = None  # To store word-to-index mapping
        self.word_counts = None  # To store word frequencies

    def load_dataset(self, zip_filepath, parquet_filename, text_column):
        """
        Generator to load the dataset from a zipped Parquet file in batches.
        Args:
        - zip_filepath: Path to the ZIP file containing Parquet files.
        - parquet_filename: The specific Parquet file within the ZIP archive.
        - text_column: The column in the Parquet file containing the text data.
        """
        with zipfile.ZipFile(zip_filepath, "r") as z:
            with z.open(parquet_filename) as f:
                # Load the Parquet file into a pandas DataFrame in memory
                df = pd.read_parquet(BytesIO(f.read()), engine="pyarrow")

                # Process the data in batches
                df_iterator = df[text_column].astype(str).values
                for i in range(0, len(df_iterator), self.batch_size):
                    yield " ".join(df_iterator[i : i + self.batch_size])

    def preprocess_text(self, text):
        """
        Tokenize text by removing punctuation and splitting by spaces.
        Args:
        - text: Raw text data.
        Returns:
        - tokens: List of tokens.
        """
        text = text.lower()
        tokens = re.findall(r"\b[a-z]+\b", text)
        return tokens

    def build_vocab(self, tokens):
        """
        Build the vocabulary by counting word frequencies and filtering rare words.
        Args:
        - tokens: List of tokenized words.
        Returns:
        - vocab: Word-to-index mapping.
        - word_counts: Word frequencies.
        """
        word_counts = Counter(tokens)
        # Filter out words below the minimum count
        word_counts = {
            word: count
            for word, count in word_counts.items()
            if count >= self.min_count
        }

        # Create word-to-index mapping (vocabulary)
        vocab = {word: i for i, (word, _) in enumerate(word_counts.items(), start=1)}
        vocab["<UNK>"] = 0  # Unknown words get a default index of 0

        self.vocab = vocab
        self.word_counts = word_counts
        return vocab, word_counts

    def subsample_frequent_words(self, tokens):
        """
        Subsample frequent words based on the subsample_threshold to reduce their frequency.
        Args:
        - tokens: List of tokenized words.
        Returns:
        - subsampled_tokens: List of tokens after subsampling.
        """
        total_count = sum(self.word_counts.values())

        # Calculate the subsampling probability for each word
        subsample_probs = {
            word: 1 - np.sqrt(self.subsample_threshold / (count / total_count))
            for word, count in self.word_counts.items()
        }

        # Subsample the tokens based on their probability
        subsampled_tokens = [
            word for word in tokens if word not in self.word_counts or np.random.rand() > subsample_probs[word]
        ]

        return subsampled_tokens

    def filter_and_subsample(self, tokens):
        """
        Combines filtering of rare words and subsampling of frequent words.
        Args:
        - tokens: List of tokenized words.
        Returns:
        - processed_tokens: List of tokens after filtering and subsampling.
        """
        if self.vocab is None:
            raise ValueError("Vocabulary is not set. Please build the vocabulary before filtering.")
    
        # Replace rare words with <UNK>
        filtered_tokens = [word if word in self.vocab else "<UNK>" for word in tokens]
    
        # Subsample frequent words
        subsampled_tokens = self.subsample_frequent_words(filtered_tokens)
    
        return subsampled_tokens


    def text_to_indices(self, tokens):
        """
        Convert tokenized words to their corresponding indices from the vocabulary.
        Args:
        - tokens: List of tokens.
        Returns:
        - indices: List of indices corresponding to tokens.
        """
        indices = [self.vocab.get(word, self.vocab["<UNK>"]) for word in tokens]
        return indices

In [64]:
class SkipGramDataGenerator:
    def __init__(self, vocab, window_size=2):
        self.vocab = vocab
        self.window_size = window_size  # Context window size

    def generate_training_pairs(self, text_indices_batch):
        """
        Generates (input, context) pairs for a batch using the skip-gram model.
        Args:
        - text_indices_batch: List of word indices (batch of text).
        Returns:
        - pairs: List of (input_word, context_word) pairs.
        """
        pairs = []
        for i, target_word in enumerate(text_indices_batch):
            # Define the context window range
            start = max(i - self.window_size, 0)
            end = min(i + self.window_size + 1, len(text_indices_batch))

            # For each word in the window (except the target word), generate a pair
            for context_word in (
                text_indices_batch[start:i] + text_indices_batch[i + 1 : end]
            ):
                pairs.append((target_word, context_word))

        return pairs

In [72]:
# Modified process_dataset to handle large datasets in batches without filtering
def process_dataset_in_batches_no_filter(zip_filepath, parquet_filename, text_column, preprocessor):
    """
    Generalized function to load, preprocess, and process a dataset in batches without filtering.
    Args:
    - zip_filepath: Path to the ZIP file containing the Parquet file.
    - parquet_filename: Name of the Parquet file inside the ZIP archive.
    - text_column: The column in the Parquet file containing the text data.
    - preprocessor: PreprocessText8 class instance to handle preprocessing.

    Yields:
    - tokens: List of tokenized words for each batch.
    """
    data_generator = preprocessor.load_dataset(zip_filepath, parquet_filename, text_column)
    for batch in data_generator:
        tokens = preprocessor.preprocess_text(batch)
        yield tokens  # Yield the tokens batch by batch for processing


# Process and build vocabulary incrementally without filtering
def build_vocab_in_batches(preprocessor, zip_filepath, parquet_filename, text_column):
    vocab, word_counts = None, None
    data_generator = process_dataset_in_batches_no_filter(zip_filepath, parquet_filename, text_column, preprocessor)

    # Accumulate token batches to build the final vocabulary
    all_tokens = []
    for token_batch in data_generator:
        all_tokens.extend(token_batch)

    # Build the vocabulary using the accumulated tokens
    vocab, word_counts = preprocessor.build_vocab(all_tokens)

    # Set the processed vocabulary and word counts in the preprocessor
    preprocessor.vocab = vocab
    preprocessor.word_counts = word_counts

    return vocab, word_counts

# Filter and subsample tokens in batches
def filter_and_subsample_in_batches(preprocessor, zip_filepath, parquet_filename, text_column):
    filtered_data_generator = []
    data_generator = process_dataset_in_batches_no_filter(zip_filepath, parquet_filename, text_column, preprocessor)

    # Process each batch using the vocabulary set in the preprocessor
    for token_batch in data_generator:
        processed_tokens = preprocessor.filter_and_subsample(token_batch)
        filtered_data_generator.append(processed_tokens)

    return filtered_data_generator

In [73]:
# Instantiate PreprocessText8 class and SkipGramDataGenerator
# preprocessor = PreprocessText8(min_count=5, batch_size=1000)
preprocessor = PreprocessText8(min_count=5, batch_size=10, subsample_threshold=1e-5)
window_size = 2
data_generator = SkipGramDataGenerator(vocab=None, window_size=window_size)

# Build vocabulary incrementally in batches
vocab, word_counts = build_vocab_in_batches(
    preprocessor, "data/train.zip", "train-00000-of-00001.parquet", "text"
)

# Filter and subsample the data after the vocabulary is built
train_filtered_generator = filter_and_subsample_in_batches(
    preprocessor, "data/train.zip", "train-00000-of-00001.parquet", "text")

test_filtered_generator = filter_and_subsample_in_batches(
    preprocessor, "data/test.zip", "test-00000-of-00001.parquet", "text")

validate_filtered_generator = filter_and_subsample_in_batches(
    preprocessor, "data/validation.zip", "validation-00000-of-00001.parquet", "text")

In [76]:
# Convert filtered token batches to indices
def convert_filtered_to_indices_in_batches(filtered_generator, preprocessor):
    """
    Converts filtered token batches to word indices in batches.
    Args:
    - filtered_generator: Generator of filtered token batches.
    - preprocessor: Instance of PreprocessText8 to convert tokens to indices.
    Returns:
    - indices_generator: List of batches with word indices.
    """
    indices_generator = []
    for token_batch in filtered_generator:
        indices_batch = preprocessor.text_to_indices(token_batch)
        indices_generator.append(indices_batch)

    return indices_generator

# Generate training pairs in batches from index batches
def generate_training_pairs_from_indices(data_generator, indices_batches):
    """
    Generates (input, context) pairs for the Skip-gram model.
    Args:
    - data_generator: Instance of SkipGramDataGenerator.
    - indices_batches: List of index batches to generate pairs from.
    Returns:
    - training_pairs: List of all (input_word, context_word) pairs.
    """
    training_pairs = []

    for text_indices_batch in indices_batches:
        batch_pairs = data_generator.generate_training_pairs(text_indices_batch)
        training_pairs.extend(batch_pairs)

    return training_pairs


#### Generating training data

In the skip-gram model, for each word (target word), you try to predict the surrounding words (context words) within a window. For example, if the window size is 2, for the sentence ["Please", "mind", "the", "gap"], the training pairs would be:

("mind", "Please"), ("mind", "the")
("the", "mind"), ("the", "gap")


In [77]:
# Convert filtered and subsampled data to word indices in batches
train_indices_generator = convert_filtered_to_indices_in_batches(
    train_filtered_generator, preprocessor
)
test_indices_generator = convert_filtered_to_indices_in_batches(
    test_filtered_generator, preprocessor
)
validate_indices_generator = convert_filtered_to_indices_in_batches(
    validate_filtered_generator, preprocessor
)

# Generate training pairs for SkipGram in batches
training_pairs = generate_training_pairs_from_indices(
    data_generator, train_indices_generator
)
test_pairs = generate_training_pairs_from_indices(
    data_generator, test_indices_generator
)
validation_pairs = generate_training_pairs_from_indices(
    data_generator, validate_indices_generator
)

In [78]:
print(f"Total training pairs generated: {len(training_pairs)}")

Total training pairs generated: 17829410


#### 3. Building skip-gram model

Architecture Overview:

- Input: One-hot vector of size equal to the vocabulary size.

- Hidden Layer: Produces the embedding vector (size embedding_dim).

- Output: Softmax over the vocabulary size to predict the context word.


In [12]:
"""
Embedding Layer: Converts input words (as indices) to vectors of size embedding_dim. These vectors represent the word embeddings that will be learned.
Linear Layer: Maps the embedding vector to a vector of size vocab_size. This represents the probabilities of each word in the vocabulary being a context word.
Forward Pass: The forward method defines how input passes through the layer
"""


class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        # Embedding layer (hidden layer) with vocab_size and embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Linear layer to project the embedding to vocabulary size
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_words):
        """
        Forward pass for batched input words.
        input_words: Tensor of shape [batch_size] containing word indices.
        """
        # Convert input words to their corresponding embeddings
        embed = self.embeddings(input_words) 

        # Pass embeddings through the linear layer to get vocabulary logits
        output = self.linear(embed)  

        return output

In [13]:
'''
Sense check the model architecture
'''

# Parameters
vocab_size = len(vocab)
embedding_dim = 100

# Initialize the model
model = SkipGramModel(vocab_size, embedding_dim)

# Ensure the model is on CPU
device = torch.device("cpu")
model.to(device)

# Loss function and optimizer
loss_function = (
    nn.CrossEntropyLoss()
)  # Softmax + CrossEntropy for multi-class classification
optimizer = optim.SGD(model.parameters(), lr=0.01)  
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Example of running a forward pass with a batch of input words
sample_input = torch.tensor([1, 2, 3], dtype=torch.long, device=device)

# Forward pass: Get the logits for the input words
output = model(sample_input)

# Print out the sample output (logits) and its shape for clarity
print("Sample output (logits):", output)
print("Logits shape:", output.shape)  # Should be [batch_size, vocab_size]

Sample output (logits): tensor([[ 0.0104, -0.8299, -0.3614,  ...,  0.3564,  0.1144,  0.0151],
        [ 1.7373, -0.5475, -0.0808,  ...,  0.2547,  0.1420,  1.0908],
        [-0.3742,  0.4437, -0.1803,  ...,  0.0894, -0.5240,  0.4222]],
       grad_fn=<AddmmBackward0>)
Logits shape: torch.Size([3, 67428])


#### 4. Training the model

- Input preparation: Convert the training pairs (input word, context word) into tensors.
- Forward pass: For each input word, predict the probability distribution over the context words.
- Loss calculation: Use cross-entropy loss to measure how far the predicted probabilities are from the true context word.
- Backpropagation: Compute gradients and update model weights using an optimizer.
- Repeat for several epochs: Go through the entire dataset multiple times to improve the model.


In [66]:
import multiprocessing

print(multiprocessing.cpu_count())

16


In [41]:
torch.cuda.is_available()

True

In [42]:
class SkipGramDataset(Dataset):
    def __init__(self, training_pairs):
        self.training_pairs = training_pairs

    def __len__(self):
        return len(self.training_pairs)

    def __getitem__(self, idx):
        input_word, context_word = self.training_pairs[idx]
        return torch.tensor(input_word), torch.tensor(context_word)

In [43]:
# Training parameters
num_epochs = 7
embedding_dim = 300
learning_rate = 0.05
batch_size = 1024

# Initialize the model, loss function, and optimizer
model = SkipGramModel(vocab_size, embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Using device: cuda


SkipGramModel(
  (embeddings): Embedding(67428, 300)
  (linear): Linear(in_features=300, out_features=67428, bias=True)
)

In [45]:
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

loss_function = nn.CrossEntropyLoss()

In [46]:
# DataLoader for training and validation (assuming you have training_pairs and validation_pairs)
train_dataset = SkipGramDataset(training_pairs)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=6
)

In [47]:
validate_dataset = SkipGramDataset(validation_pairs)
validate_loader = DataLoader(
    validate_dataset, batch_size=batch_size, shuffle=False, num_workers=6
)

In [48]:
# Function to evaluate the model on the validation set
def evaluate_model(model, validate_loader, loss_function, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # Disable gradient computation
        for input_words, context_words in validate_loader:
            input_words, context_words = input_words.to(device), context_words.to(
                device
            )
            output = model(input_words)
            loss = loss_function(output, context_words)
            total_loss += loss.item()

    return total_loss / len(validate_loader)  # Return the average validation loss

In [49]:
# Training loop with validation
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()  # Set the model to training mode
    total_loss = 0

    for input_words, context_words in train_loader:
        input_words, context_words = input_words.to(device), context_words.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: Get predictions
        output = model(input_words)

        # Calculate loss
        loss = loss_function(output, context_words)

        # Backward pass: Compute gradients and update weights
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Average training loss for this epoch
    avg_training_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_training_loss}")

    # Evaluate on the validation dataset after each epoch
    avg_validation_loss = evaluate_model(model, validate_loader, loss_function, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_validation_loss}")

Epoch 1/7
Epoch 1/7, Training Loss: 23.29752178652279
Epoch 1/7, Validation Loss: 22.971514416517948
Epoch 2/7
Epoch 2/7, Training Loss: 23.71898649623283
Epoch 2/7, Validation Loss: 22.69840625617327
Epoch 3/7
Epoch 3/7, Training Loss: 23.69685013558713
Epoch 3/7, Validation Loss: 22.806201033946184
Epoch 4/7


KeyboardInterrupt: 

In [50]:
# Save the model
model_save_path = "models/saved/skipgram_word2vec_2.pth"

# Save the model's state dictionary
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to models/saved/skipgram_word2vec_2.pth


#### Evaluation


In [51]:
def cosine_similarity(word1, word2, model, vocab):
    # Get the device where the model's embeddings are located
    device = next(model.parameters()).device

    # Get the word indices
    idx1 = vocab.get(word1, vocab["<UNK>"])
    idx2 = vocab.get(word2, vocab["<UNK>"])

    # Move the indices to the same device as the model
    embedding1 = model.embeddings(torch.tensor([idx1], device=device))
    embedding2 = model.embeddings(torch.tensor([idx2], device=device))

    # Compute cosine similarity
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

In [52]:
# Example: Compare similarity between "king" and "queen"
similarity = cosine_similarity("king", "queen", model, vocab)
print(f"Cosine similarity between 'king' and 'queen': {similarity}")

Cosine similarity between 'king' and 'queen': 0.04516332969069481


In [39]:
def word_analogy(word1, word2, word3, model, vocab):
    """
    Solves the analogy: word1 is to word2 as word3 is to ?
    """
    # Get the device where the model's embeddings are located
    device = next(model.parameters()).device

    # Get word indices
    idx1 = vocab.get(word1, vocab["<UNK>"])
    idx2 = vocab.get(word2, vocab["<UNK>"])
    idx3 = vocab.get(word3, vocab["<UNK>"])

    # Move the indices to the same device as the model
    embed1 = model.embeddings(torch.tensor([idx1], device=device))
    embed2 = model.embeddings(torch.tensor([idx2], device=device))
    embed3 = model.embeddings(torch.tensor([idx3], device=device))

    # Solve analogy: word1 - word2 + word3
    analogy_vector = embed1 - embed2 + embed3

    # Find the word closest to analogy_vector
    all_embeddings = model.embeddings.weight.to(device)  # Ensure embeddings are on the same device
    similarities = F.cosine_similarity(analogy_vector, all_embeddings.unsqueeze(0), dim=2)

    # Get the most similar word (excluding input words)
    top_match = torch.argmax(similarities, dim=1).item()
    for word, idx in vocab.items():
        if idx == top_match:
            return word

    # If no match is found, return None
    return None

In [54]:
result = word_analogy("paris", "france", "berlin", model, vocab)
print(f"'paris' is to 'france' as 'berlin' is to: {result}")

result = word_analogy("doctor", "hospital", "teacher", model, vocab)
print(f"'doctor' is to 'hospital' as 'teacher' is to: {result}")

result = word_analogy("car", "road", "boat", model, vocab)
print(f"'car' is to 'road' as 'boat' is to: {result}")

result = word_analogy("coffee", "morning", "wine", model, vocab)
print(f"'coffee' is to 'morning' as 'wine' is to: {result}")

result = word_analogy("pencil", "paper", "brush", model, vocab)
print(f"'pencil' is to 'paper' as 'brush' is to: {result}")

result = word_analogy("cat", "kitten", "dog", model, vocab)
print(f"'cat' is to 'kitten' as 'dog' is to: {result}")

result = word_analogy("moon", "night", "sun", model, vocab)
print(f"'moon' is to 'night' as 'sun' is to: {result}")

result = word_analogy("water", "drink", "bread", model, vocab)
print(f"'water' is to 'drink' as 'bread' is to: {result}")

result = word_analogy("student", "school", "worker", model, vocab)
print(f"'student' is to 'school' as 'worker' is to: {result}")

result = word_analogy("novel", "author", "painting", model, vocab)
print(f"'novel' is to 'author' as 'painting' is to: {result}")


'paris' is to 'france' as 'berlin' is to: paris
'doctor' is to 'hospital' as 'teacher' is to: teacher
'car' is to 'road' as 'boat' is to: car
'coffee' is to 'morning' as 'wine' is to: coffee
'pencil' is to 'paper' as 'brush' is to: pencil
'cat' is to 'kitten' as 'dog' is to: modern
'moon' is to 'night' as 'sun' is to: moon
'water' is to 'drink' as 'bread' is to: bread
'student' is to 'school' as 'worker' is to: worker
'novel' is to 'author' as 'painting' is to: painting


In [29]:
# 2. Compute the cosine similarity between the word embeddings


def get_word_embedding(word, model, vocab):
    # Get the index of the word
    idx = vocab.get(word, vocab["<UNK>"])
    # Extract the embedding for the word and flatten it to 1D
    embedding = model.embeddings(torch.tensor([idx])).squeeze()
    return embedding.detach().numpy()


# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    # Ensure both vectors are 1D arrays
    vec1 = vec1.flatten()
    vec2 = vec2.flatten()
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


# Find the most similar words to a given word
def find_similar_words(word, model, vocab, top_n=5):
    word_embedding = get_word_embedding(word, model, vocab)
    similarities = {}

    for other_word in vocab:
        other_embedding = get_word_embedding(other_word, model, vocab)
        similarity = cosine_similarity(word_embedding, other_embedding)
        similarities[other_word] = similarity

    # Sort by similarity and return the top_n words
    similar_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[
        :top_n
    ]
    return similar_words

Words most similar to 'anarchism': [('anarchism', np.float32(1.0000001)), ('tournaments', np.float32(0.33770555)), ('private', np.float32(0.30305058)), ('metals', np.float32(0.30206954)), ('honor', np.float32(0.30183873))]


In [2]:
# Example: Find words most similar to "anarchism"
similar_words = find_similar_words("king", model, vocab)
print(f"Words most similar to 'king': {similar_words}")

In [31]:
# next steps
# train on whole dataset
# evaluate results
# fine tune on titles