# Development notes

Actions:
* Right now word to vec will use all data to train its embeddings. This should probably be validated by a train, test, validate split.
* Why do I use Adam for my skip gram

# Step 0a - Configuration

In [154]:
# Configuration
should_use_small_data = True
corpus_filename = "./datasets/ms_marco_corpus.txt"
weights_output_path = "skipgram_weights"

sentencepiece_model_prefix = "ms_marco"
sentencepiece_model_file_name = sentencepiece_model_prefix + ".model"

# Hyper parameters
sentence_piece_vocab_size = 10000

# Word2Vec hyper parameters
embedding_feature_dimension = 128
window_size =2
learning_rate = 0.01
skipgram_epochs = 20
batch_size = 32

# Two tower hyper parameters
hidden_size = 128  # Example hidden size for RNN
output_size = 64  # Example output size
vocab_size = 10000  # Example vocabulary size, adjust as needed

# Step 0 - Load dependencies

In [155]:
import os
import sentencepiece as spm
from datasets import list_datasets
from datasets import load_dataset
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from tqdm import tqdm

# Step 1 - Load the datasets

In [156]:
datasets_list = list_datasets()
ms_df_dict = load_dataset("ms_marco", "v1.1")
ms_train_df = ms_df_dict['train']
ms_validation_df = ms_df_dict['validation']
ms_test_df = ms_df_dict['test']

# Convert to Pandas
ms_train_df = pd.DataFrame(ms_train_df)
ms_validation_df = pd.DataFrame(ms_validation_df)
ms_test_df = pd.DataFrame(ms_test_df)

print(ms_train_df.head())

if should_use_small_data: 
    # Setup a smaller dataset for debugging purposes
    print("Running with smaller datasets enabled")
    ms_train_df = ms_train_df.iloc[0:1000]
    ms_validation_df = ms_validation_df.iloc[0:600]
    ms_test_df = ms_test_df[0:200]

                                             answers  \
0  [Results-Based Accountability is a disciplined...   
1                                              [Yes]   
2                                    [20-25 minutes]   
3                       [$11 to $22 per square foot]   
4                      [Due to symptoms in the body]   

                                            passages  \
0  {'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...   
1  {'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...   
2  {'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...   
3  {'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...   
4  {'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...   

                                               query  query_id   query_type  \
0                                        what is rba     19699  description   
1                       was ronald reagan a democrat     19700  description   
2  how long do you need for sydney and surroundin...     19701      numeric   
3         

# Step X - Sentence piece preparation

First we create the corpus from the combination from the combination of the the queries and the documents (both positive and negative)

In [157]:
query_string = "query"
passage_string = "passages"
passage_text_string = "passage_text"

# Gather all of queries into one list
all_queries = list(ms_train_df[query_string]) + list(ms_validation_df[query_string]) + list(ms_test_df[query_string])

assert (len(all_queries) == len(ms_train_df) + len(ms_validation_df) + len(ms_test_df))

# Helper function for reading out the data from a given container.
def read_passage_texts_from_data(data_frame):
    passages = data_frame[passage_string]
    texts = [passage[passage_text_string] for passage in passages]
    return texts

train_texts = read_passage_texts_from_data(ms_train_df)
test_texts = read_passage_texts_from_data(ms_test_df)
validation_texts = read_passage_texts_from_data(ms_validation_df)

# Now lets read out all of documents for the corpus
def read_all_documents(data_frame):
    return [entry for passage in data_frame[passage_string] for entry in passage[passage_text_string]]

all_documents = read_all_documents(ms_train_df) + read_all_documents(ms_test_df) + read_all_documents(ms_validation_df)

corpus = all_queries + all_documents

Write the corpus to a file for preview

In [158]:
with open(corpus_filename, "w") as corpus_file:
    for sentence in corpus:
        corpus_file.write(sentence + os.linesep)

Train and generate the sentence piece model using the corpus text

In [124]:
spm.SentencePieceTrainer.train(input=corpus_filename, model_prefix=sentencepiece_model_prefix, vocab_size=sentence_piece_vocab_size)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./datasets/ms_marco_corpus.txt
  input_format: 
  model_prefix: ms_marco
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential

Step X - Generate tokens for the queries and the documents using sentence piece model

In [132]:
sp_model = spm.SentencePieceProcessor()
sp_model.load(sentencepiece_model_file_name)

tokenized_queries = [sp_model.encode_as_pieces(query) for query in all_queries]

id_queries = [[sp_model.piece_to_id(token) for token in query_tokens] for query_tokens in tokenized_queries]

print(tokenized_queries)
print(id_queries)

[[95, 11, 559, 1255], [52, 559, 6812, 192, 53, 108, 120, 8, 6693], [125, 166, 119, 26, 213, 14, 286, 41, 36, 3182, 9, 2641, 592], [369, 10, 1400, 3258, 12, 6186], [1242, 4111, 5013, 12, 97], [189, 19, 4, 760, 480, 12, 4, 312], [84, 10, 179, 8, 1726], [95, 183, 8, 3513, 4270, 213, 10, 7139, 4, 442], [426, 414, 2577, 22, 524, 7, 31, 1779, 1628, 1328, 319], [95, 11, 8, 6744], [12, 694, 149, 2754, 130, 19, 647, 34, 9, 1358, 399, 130, 19, 647, 34], [3826, 16, 4, 77, 7, 4, 3556, 135, 4400, 4, 1334, 12, 4, 2916], [125, 166, 1279, 595, 1616, 12, 4, 1410, 1282], [83, 84, 7, 3535, 75, 469, 729], [11, 13, 3508, 5874, 291, 815, 99, 187, 7, 3290, 18, 4117, 43], [1836, 1737, 6, 84], [95, 1137, 7, 1901, 11, 8, 1089, 6470, 6, 243], [135, 3790, 22, 4, 1607, 32, 11, 95, 32, 11], [95, 11, 8177, 2551], [125, 166, 11, 3586, 120, 3822, 3819], [95, 11, 8, 5994, 365], [125, 166, 183, 32, 221, 10, 3447, 8, 2337, 2938], [95, 11, 4, 824, 1280, 7, 918, 561, 1130], [201, 1000, 12, 251, 1177], [95, 52, 414, 315], [

# Step X - Skipgram Word2Vec model implementation

Helper class to prepare the input data to the Skipgram model

In [126]:
def generate_skip_gram_pairs(tokenized_sentences, window_size=2):
    pairs = []
    for sentence in tokenized_sentences:
        for center_word_pos in range(len(sentence)):
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                center_word_index = sentence[center_word_pos]
                context_word_index = sentence[context_word_pos]
                pairs.append((center_word_index, context_word_index))
    return pairs

class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, index):
        center, context = self.pairs[index]
        return torch.tensor(center, dtype=torch.long), torch.tensor(context, dtype=torch.long)
    
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dimension):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dimension)
        self.linear = nn.Linear(embedding_dimension, vocab_size)

    def forward(self, target_word):
        embedded = self.embeddings(target_word)
        out = self.linear(embedded)
        log_probabilities = nn.functional.log_softmax(out, dim = 1)
        return log_probabilities


# Step X - Skipgram embeddings trainings

In [141]:
skipgram_pairs = generate_skip_gram_pairs(id_queries, window_size)
skipgram_dataset = SkipGramDataset(skipgram_pairs)
skipgram_dataloader = DataLoader(skipgram_dataset, batch_size=batch_size, shuffle=True)

skipgram_model = SkipGramModel(sentence_piece_vocab_size, embedding_feature_dimension)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(skipgram_model.parameters(), lr = learning_rate)

if not os.path.exists(weights_output_path):
    os.makedirs(weights_output_path)

for epoch in range(skipgram_epochs):
    total_loss = 0
    for center_word, context_word in tqdm(skipgram_dataloader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        log_probs = skipgram_model(center_word)
        loss = loss_function(log_probs, context_word)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}: Total Loss = {total_loss}")

    torch.save(skipgram_model.state_dict(), os.path.join(weights_output_path, f"skip_gram_model_weights_epoch_{epoch+1}.pth"))
    

Epoch 1: 100%|██████████| 1347/1347 [00:10<00:00, 123.84it/s]


Epoch 1: Total Loss = 10479.125319004059


Epoch 2: 100%|██████████| 1347/1347 [00:09<00:00, 137.82it/s]


Epoch 2: Total Loss = 8744.99948644638


Epoch 3: 100%|██████████| 1347/1347 [00:10<00:00, 133.87it/s]


Epoch 3: Total Loss = 8112.299795150757


Epoch 4: 100%|██████████| 1347/1347 [00:10<00:00, 123.03it/s]


Epoch 4: Total Loss = 7778.576106548309


Epoch 5: 100%|██████████| 1347/1347 [00:10<00:00, 132.40it/s]


Epoch 5: Total Loss = 7560.9476437568665


Epoch 6: 100%|██████████| 1347/1347 [00:10<00:00, 132.74it/s]


Epoch 6: Total Loss = 7424.860005140305


Epoch 7: 100%|██████████| 1347/1347 [00:10<00:00, 132.59it/s]


Epoch 7: Total Loss = 7307.0662450790405


Epoch 8: 100%|██████████| 1347/1347 [00:10<00:00, 132.20it/s]


Epoch 8: Total Loss = 7253.30398440361


Epoch 9: 100%|██████████| 1347/1347 [00:09<00:00, 137.63it/s]


Epoch 9: Total Loss = 7199.340917825699


Epoch 10: 100%|██████████| 1347/1347 [00:10<00:00, 134.23it/s]


Epoch 10: Total Loss = 7146.031084775925


Epoch 11: 100%|██████████| 1347/1347 [00:11<00:00, 117.96it/s]


Epoch 11: Total Loss = 7095.650991201401


Epoch 12: 100%|██████████| 1347/1347 [00:10<00:00, 128.54it/s]


Epoch 12: Total Loss = 7061.385607242584


Epoch 13: 100%|██████████| 1347/1347 [00:10<00:00, 122.88it/s]


Epoch 13: Total Loss = 7019.932780265808


Epoch 14: 100%|██████████| 1347/1347 [00:10<00:00, 124.30it/s]


Epoch 14: Total Loss = 6989.381695985794


Epoch 15: 100%|██████████| 1347/1347 [00:11<00:00, 121.87it/s]


Epoch 15: Total Loss = 6945.926291465759


Epoch 16: 100%|██████████| 1347/1347 [00:10<00:00, 127.01it/s]


Epoch 16: Total Loss = 6935.01266503334


Epoch 17: 100%|██████████| 1347/1347 [00:10<00:00, 123.86it/s]


Epoch 17: Total Loss = 6903.023324012756


Epoch 18: 100%|██████████| 1347/1347 [00:10<00:00, 123.75it/s]


Epoch 18: Total Loss = 6880.301221370697


Epoch 19: 100%|██████████| 1347/1347 [00:10<00:00, 124.68it/s]


Epoch 19: Total Loss = 6859.23535990715


Epoch 20: 100%|██████████| 1347/1347 [00:10<00:00, 128.10it/s]

Epoch 20: Total Loss = 6843.622785568237





# Step X - Prepare the queries using the embeddings for use with the query encoding RNN

First we start off with reading in the embeddings in our model:

In [159]:
skipgram_embeddings_file_path = os.path.join(weights_output_path, f"skip_gram_model_weights_epoch_10.pth")

state_dict = torch.load(skipgram_embeddings_file_path)
embedding_weights_tensor = state_dict['embeddings.weight']

skipgram_embeddings = nn.Embedding(sentence_piece_vocab_size, embedding_feature_dimension)
skipgram_embeddings.weight.data.copy_(embedding_weights_tensor)
skipgram_embeddings.weight.requires_grad = False

Let's create a helper function that helps us genreate tensors of padded embeddings matching the input tokens

In [164]:
def genreate_padded_embeddings_from_token_id_lists(query_ids):
    query_embeddings_list = []
    for ids in query_ids:
        query_tensor = torch.tensor(ids, dtype=torch.long)
        with torch.no_grad():
            query_embeddings = skipgram_embeddings(query_tensor)
        query_embeddings_list.append(query_embeddings) 

    return pad_sequence(query_embeddings_list, batch_first=True, padding_value=0)

# Use pad_sequence to pad embeddings
# pad_sequence automatically pads to the longest sequence in the batch
# Set batch_first=True to have output tensor of shape (batch_size, seq_len, feature_number)
padded_embeddings_tensor = genreate_padded_embeddings_from_token_id_lists(query_ids=id_queries)

print(padded_embeddings_tensor.shape)  # (batch_size, max_seq_len, embedding_dimension)

torch.Size([1800, 25, 128])


Next, using this helper function, we create the inputs for train, test and validate groups

In [169]:
# First we read out the queries and encode them to tokens

def generate_tokenized_queries_from_dataset(dataset):
    entries = dataset[query_string]
    tokenized_list = [sp_model.encode_as_pieces(entry) for entry in entries]
    id_list = [[sp_model.piece_to_id(token) for token in entry_tokens] for entry_tokens in tokenized_list]
    return id_list

train_query_token_ids = generate_tokenized_queries_from_dataset(ms_train_df[query_string])
print(train_query_token_ids)

[[95, 11, 559, 1255], [52, 559, 6812, 192, 53, 108, 120, 8, 6693], [125, 166, 119, 26, 213, 14, 286, 41, 36, 3182, 9, 2641, 592], [369, 10, 1400, 3258, 12, 6186], [1242, 4111, 5013, 12, 97], [189, 19, 4, 760, 480, 12, 4, 312], [84, 10, 179, 8, 1726], [95, 183, 8, 3513, 4270, 213, 10, 7139, 4, 442], [426, 414, 2577, 22, 524, 7, 31, 1779, 1628, 1328, 319], [95, 11, 8, 6744], [12, 694, 149, 2754, 130, 19, 647, 34, 9, 1358, 399, 130, 19, 647, 34], [3826, 16, 4, 77, 7, 4, 3556, 135, 4400, 4, 1334, 12, 4, 2916], [125, 166, 1279, 595, 1616, 12, 4, 1410, 1282], [83, 84, 7, 3535, 75, 469, 729], [11, 13, 3508, 5874, 291, 815, 99, 187, 7, 3290, 18, 4117, 43], [1836, 1737, 6, 84], [95, 1137, 7, 1901, 11, 8, 1089, 6470, 6, 243], [135, 3790, 22, 4, 1607, 32, 11, 95, 32, 11], [95, 11, 8177, 2551], [125, 166, 11, 3586, 120, 3822, 3819], [95, 11, 8, 5994, 365], [125, 166, 183, 32, 221, 10, 3447, 8, 2337, 2938], [95, 11, 4, 824, 1280, 7, 918, 561, 1130], [201, 1000, 12, 251, 1177], [95, 52, 414, 315], [

# Step X - Setup of the Two tower model

First lets start by defining the two tower calss

In [None]:
class TwoTowerModel(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, vocab_size):
        super(TwoTowerModel, self).__init__()

        # Embedding layer for the first tower, in case needed here for demonstration
        # It seems you've already set up embeddings externally, so this might be redundant
        # self.embeddings = nn.Embedding(vocab_size, embedding_size)

        # First tower - Query Encoder RNN
        self.query_encoder_rnn = nn.RNN(embedding_feature_dimension, hidden_size, batch_first=True)
        
        # Example initialization for the second tower - placeholder
        # self.other_model = SomeOtherModel(...)
        
        # Example output layer - Adjust according to your needs
        # This could be used if you're combining the outputs before a comparison
        self.output_layer = nn.Linear(hidden_size, output_size)


        
    def forward(self, query_tokens):
        # Assuming query_tokens are already embedded. If not, embed them first
        # query_embeddings = self.embeddings(query_tokens)  # If starting with token IDs
        
        # Process query through RNN
        query_output, hidden_state = self.query_encoder_rnn(query_tokens)
        
        # With a basic RNN, the hidden state is simpler compared to LSTM
        # Just use the last hidden state for simplicity, though there are more sophisticated methods
        query_repr = hidden_state[-1]  # Taking the last layer's hidden state for comparison
        
        # Placeholder for processing through the second tower and combining the results
        # other_output = self.other_model(other_input)
        
        # Example: combining outputs with a linear layer (adjust as needed)
        # final_output = self.output_layer(query_repr)  # Example output processing
        
        return query_repr  # Or return final_output based on your architecture needs

Now instantiate the model and perform training

In [None]:
# Create model instance
model = TwoTowerModel(embedding_feature_dimension, hidden_size, output_size, vocab_size)

# Example usage
# Assume query_tokens is a batch of embedded query tokens with shape (batch_size, seq_length, embedding_size)
# query_repr = model(query_tokens)
# print(query_repr)  # This would be the representation you'd compare across towers
