# Step 0a - Configuration

In [61]:
# When True, the loading cell truncates each split to a small slice for quick debugging runs.
should_use_small_data = True

# Step 0 - Load dependencies

In [62]:
import os
import sentencepiece as spm
from datasets import list_datasets
from datasets import load_dataset
import pandas as pd

# Step 1 - Load the datasets

In [63]:
# NOTE(review): `datasets_list` is not used by any of the cells visible here;
# confirm whether this listing call is still needed before keeping it.
datasets_list = list_datasets()

# Load the MS MARCO v1.1 dataset and pick out its three splits.
ms_df_dict = load_dataset("ms_marco", "v1.1")
ms_train_df = ms_df_dict['train']
ms_validation_df = ms_df_dict['validation']
ms_test_df = ms_df_dict['test']

# Convert to Pandas
ms_train_df = pd.DataFrame(ms_train_df)
ms_validation_df = pd.DataFrame(ms_validation_df)
ms_test_df = pd.DataFrame(ms_test_df)

print(ms_train_df.head())

if should_use_small_data:
    # Setup a smaller dataset for debugging purposes.
    # All three splits are truncated positionally with `.iloc` so the slicing
    # is explicit and consistent across splits.
    print("Running with smaller datasets enabled")
    ms_train_df = ms_train_df.iloc[0:1000]
    ms_validation_df = ms_validation_df.iloc[0:600]
    ms_test_df = ms_test_df.iloc[0:200]

# Step 2 - SentencePiece preparation

First, we create the corpus by combining the queries and the documents (both positive and negative).

In [None]:
# Column / key names used to pull text out of the data frames.
query_string = "query"
passage_string = "passages"
passage_text_string = "passage_text"

# Collect the queries of every split into one flat list
# (train first, then validation, then test).
all_queries = [
    query
    for frame in (ms_train_df, ms_validation_df, ms_test_df)
    for query in frame[query_string]
]

# Sanity check: exactly one query per row across the three splits.
assert len(all_queries) == sum(
    len(frame) for frame in (ms_train_df, ms_validation_df, ms_test_df)
)

# Helper that pulls the passage texts out of a data frame, row by row.
def read_passage_texts_from_data(data_frame):
    """Return the passage-text entry of each row in ``data_frame``.

    For every value in the ``passage_string`` column, the item stored under
    ``passage_text_string`` is looked up and appended to the result, so the
    returned list has one element per row.
    """
    texts = []
    for row_passages in data_frame[passage_string]:
        texts.append(row_passages[passage_text_string])
    return texts

# Per-split passage texts, read in the order train, test, validation.
train_texts, test_texts, validation_texts = map(
    read_passage_texts_from_data,
    (ms_train_df, ms_test_df, ms_validation_df),
)

# Read out every document of a data frame for the corpus.
def read_all_documents(data_frame):
    """Return all passage-text entries of ``data_frame`` as one flat list.

    Each row's ``passage_string`` value is indexed with
    ``passage_text_string`` and the resulting entries are appended, in row
    order, to a single list.
    """
    documents = []
    for row_passages in data_frame[passage_string]:
        documents.extend(row_passages[passage_text_string])
    return documents

# Flatten the documents of every split (train, test, validation) into one list.
all_documents = [
    document
    for frame in (ms_train_df, ms_test_df, ms_validation_df)
    for document in read_all_documents(frame)
]

# The corpus is all queries followed by all documents.
corpus = [*all_queries, *all_documents]

Write the corpus to a file for preview

In [60]:
corpus_filename = "./datasets/ms_marco_corpus.txt"

# open() does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(corpus_filename), exist_ok=True)

# Write one corpus entry per line.
# - "\n" is used instead of os.linesep: the file is opened in text mode, which
#   already translates "\n" to the platform line ending, so writing os.linesep
#   would yield "\r\r\n" on Windows.
# - The encoding is set explicitly so the output does not depend on the
#   platform default, which may be unable to encode some characters.
with open(corpus_filename, "w", encoding="utf-8") as corpus_file:
    for sentence in corpus:
        corpus_file.write(sentence + "\n")

Train and generate the sentence piece model using the corpus text

In [None]:
spm.Sentece