# Development notes

Actions:
* Right now word2vec uses all of the data to train its embeddings. This should probably be validated using a train/validation/test split.

# Step 0a - Configuration

In [89]:
# Configuration
# When True, the data-loading cell truncates every split to a few hundred rows for fast debugging.
should_use_small_data = True
# Plain-text file the combined corpus (queries + passages) is written to.
corpus_filename = "./datasets/ms_marco_corpus.txt"

# SentencePiece is trained with this prefix; the model file loaded later is "<prefix>.model".
sentencepiece_model_prefix = "ms_marco"
sentencepiece_model_file_name = f"{sentencepiece_model_prefix}.model"

# Hyper parameters
# Vocabulary size passed to the SentencePiece trainer.
sentence_piece_vocab_size = 10_000

# Step 0 - Load dependencies

In [99]:
import os
import sentencepiece as spm
from datasets import list_datasets
from datasets import load_dataset
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

# Step 1 - Load the datasets

In [90]:
# NOTE(review): `datasets_list` is not read by any cell visible here, and
# `list_datasets` appears to be deprecated in recent `datasets` releases.
# Kept so a later cell that may rely on it still works — confirm and remove.
datasets_list = list_datasets()

# Download / load the MS MARCO v1.1 splits.
ms_df_dict = load_dataset("ms_marco", "v1.1")

# Convert each split straight to pandas, so the `*_df` names only ever hold DataFrames.
ms_train_df = pd.DataFrame(ms_df_dict['train'])
ms_validation_df = pd.DataFrame(ms_df_dict['validation'])
ms_test_df = pd.DataFrame(ms_df_dict['test'])

print(ms_train_df.head())

if should_use_small_data:
    # Setup a smaller dataset for debugging purposes: keep only the leading rows
    # of every split. All three splits use positional `.iloc` slicing.
    print("Running with smaller datasets enabled")
    ms_train_df = ms_train_df.iloc[0:1000]
    ms_validation_df = ms_validation_df.iloc[0:600]
    ms_test_df = ms_test_df.iloc[0:200]

                                             answers  \
0  [Results-Based Accountability is a disciplined...   
1                                              [Yes]   
2                                    [20-25 minutes]   
3                       [$11 to $22 per square foot]   
4                      [Due to symptoms in the body]   

                                            passages  \
0  {'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...   
1  {'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...   
2  {'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...   
3  {'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...   
4  {'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...   

                                               query  query_id   query_type  \
0                                        what is rba     19699  description   
1                       was ronald reagan a democrat     19700  description   
2  how long do you need for sydney and surroundin...     19701      numeric   
3         

# Step X - Sentence piece preparation

First we create the corpus from the combination of the queries and the documents (both positive and negative).

In [91]:
# Column / key names used to pull text out of the MS MARCO records
query_string = "query"
passage_string = "passages"
passage_text_string = "passage_text"

# Gather all of the queries into one flat list: train first, then validation, then test
all_queries = [
    query
    for frame in (ms_train_df, ms_validation_df, ms_test_df)
    for query in frame[query_string]
]

# Sanity check: exactly one query per row across the three splits
assert len(all_queries) == sum(map(len, (ms_train_df, ms_validation_df, ms_test_df)))

# Helper function for reading out the data from a given container.
def read_passage_texts_from_data(data_frame):
    """Return the passage texts of every row, one entry per row.

    For each value in the `passage_string` column, the entry stored under the
    `passage_text_string` key is collected. The per-row grouping is preserved
    (the result is not flattened).
    """
    texts_per_row = []
    for row_passages in data_frame[passage_string]:
        texts_per_row.append(row_passages[passage_text_string])
    return texts_per_row

# Per-row passage texts for each split (extracted in the order train, test, validation)
train_texts, test_texts, validation_texts = (
    read_passage_texts_from_data(frame)
    for frame in (ms_train_df, ms_test_df, ms_validation_df)
)

# Now let's read out all of the documents for the corpus
def read_all_documents(data_frame):
    """Return one flat list with every passage text found in `data_frame`.

    Each value of the `passage_string` column contributes all entries stored
    under its `passage_text_string` key, in row order.
    """
    documents = []
    for row_passages in data_frame[passage_string]:
        documents.extend(row_passages[passage_text_string])
    return documents

# Flatten the documents of every split (train, then test, then validation) into one list
all_documents = [
    document
    for frame in (ms_train_df, ms_test_df, ms_validation_df)
    for document in read_all_documents(frame)
]

# The corpus is every query followed by every document
corpus = [*all_queries, *all_documents]

Write the corpus to a file for preview

In [92]:
# open(..., "w") does not create missing parent folders, so make sure the
# target directory exists first (falls back to "." for a bare file name).
os.makedirs(os.path.dirname(corpus_filename) or ".", exist_ok=True)

# Write one corpus entry per line.
# - Explicit UTF-8 so the result does not depend on the platform's default encoding.
# - "\n" instead of os.linesep: the file is opened in text mode, which already
#   translates "\n" to the platform line ending; writing os.linesep would
#   produce "\r\r\n" on Windows.
with open(corpus_filename, "w", encoding="utf-8") as corpus_file:
    for sentence in corpus:
        corpus_file.write(sentence + "\n")

Train and generate the sentence piece model using the corpus text

In [93]:
# Train SentencePiece on the corpus file, using the prefix and vocabulary size
# from the configuration cell.
spm.SentencePieceTrainer.train(
    input=corpus_filename,
    model_prefix=sentencepiece_model_prefix,
    vocab_size=sentence_piece_vocab_size,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./datasets/ms_marco_corpus.txt
  input_format: 
  model_prefix: ms_marco
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential

# Step X - Generate tokens for the queries and the documents using the SentencePiece model

In [94]:
# Load the SentencePiece model file named in the configuration cell.
sp_model = spm.SentencePieceProcessor()
sp_model.load(sentencepiece_model_file_name)

# Tokenize every query into its list of sub-word pieces.
# NOTE(review): these are string pieces. SkipGramDataset converts each pair
# member with torch.tensor(..., dtype=torch.long), which needs integers, so the
# pieces will have to be mapped to ids before they are used for skip-gram
# pairs — confirm against the downstream cells.
tokenized_queries = [sp_model.encode_as_pieces(query) for query in all_queries]

# Print only a short preview; dumping every tokenized query floods the cell output.
print(tokenized_queries[:5])



# Step X - Skipgram Word2Vec model implementation

Helper function and dataset class that prepare the input data for the skip-gram model

In [100]:
def generate_skip_gram_pairs(tokenized_sentences, window_size=2):
    """Build (center, context) training pairs for a skip-gram model.

    Args:
        tokenized_sentences: iterable of indexable token sequences.
        window_size: how many positions to the left and to the right of the
            center token are treated as context.

    Returns:
        A list of ``(center_token, context_token)`` tuples. Sentences are
        processed in order; within a sentence the center position moves left
        to right, and for each center the context positions are emitted in
        ascending order. A token is never paired with its own position.
    """
    pairs = []
    for tokens in tokenized_sentences:
        n_tokens = len(tokens)
        for center_pos in range(n_tokens):
            # Clamp the window to the sentence boundaries instead of
            # filtering out-of-range positions one by one.
            first = max(0, center_pos - window_size)
            stop = min(n_tokens, center_pos + window_size + 1)
            pairs.extend(
                (tokens[center_pos], tokens[context_pos])
                for context_pos in range(first, stop)
                if context_pos != center_pos
            )
    return pairs

class SkipGramDataset(Dataset):
    """Dataset wrapper around a sequence of (center, context) pairs.

    Each item is returned as two ``torch.long`` tensors built from the pair
    stored at the requested index.
    """

    def __init__(self, pairs):
        # Keep a reference to the caller's pair sequence; nothing is copied.
        self.pairs = pairs

    def __len__(self):
        """Number of (center, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the pair at `index` as (center_tensor, context_tensor)."""
        center_id, context_id = self.pairs[index]
        center_tensor = torch.tensor(center_id, dtype=torch.long)
        context_tensor = torch.tensor(context_id, dtype=torch.long)
        return center_tensor, context_tensor