# Prepare data for training
- Create a dataset from raw data:
  - (doc, query, relevance)
  - relevance is 0 (not relevant), 1 (relevant), 2 (user-selected)

In [1]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import collections  

In [2]:
data = load_dataset('ms_marco', 'v1.1', split='train')

In [3]:
def preprocess(text: str) -> list[str]:
  text = text.lower()
  text = text.replace('.',  ' <PERIOD> ')
  text = text.replace(',',  ' <COMMA> ')
  text = text.replace('"',  ' <QUOTATION_MARK> ')
  text = text.replace(';',  ' <SEMICOLON> ')
  text = text.replace('!',  ' <EXCLAMATION_MARK> ')
  text = text.replace('?',  ' <QUESTION_MARK> ')
  text = text.replace('(',  ' <LEFT_PAREN> ')
  text = text.replace(')',  ' <RIGHT_PAREN> ')
  text = text.replace('--', ' <HYPHENS> ')
  text = text.replace('?',  ' <QUESTION_MARK> ')
  text = text.replace(':',  ' <COLON> ')
  words = text.split()
  stats = collections.Counter(words)
  words = [word for word in words if stats[word] > 5]
  return words


In [4]:
df_raw = pd.DataFrame(data)


In [5]:
df_raw.head()

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[]
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[]
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[]
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[]
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[]


In [6]:
def unwrap_passages(row: pd.Series) -> pd.DataFrame:
  df_psg = pd.DataFrame(row['passages'])
  df_psg['query'] = row['query']
  df_psg['query_id'] = row['query_id']
  df_psg['query_type'] = row['query_type']
  # answers = row['answers']
  # df_psg['answers'] = [answers] * len(df_psg) if answers and any(answers) else [[]] * len(df_psg)
  return df_psg


In [7]:
chunks = []
for i, row in df_raw.iterrows():
    chunks.append(unwrap_passages(row))
df_pos = pd.concat(chunks).reset_index(drop=True)
df_pos['relevance'] = df_pos['is_selected'] + 1
# TODO: vectorize this, do it much faster


In [8]:
df_pos.head(50)

Unnamed: 0,is_selected,passage_text,url,query,query_id,query_type,relevance
0,0,"Since 2007, the RBA's outstanding reputation h...",https://en.wikipedia.org/wiki/Reserve_Bank_of_...,what is rba,19699,description,1
1,0,The Reserve Bank of Australia (RBA) came into ...,https://en.wikipedia.org/wiki/Reserve_Bank_of_...,what is rba,19699,description,1
2,0,RBA Recognized with the 2014 Microsoft US Regi...,http://acronyms.thefreedictionary.com/RBA,what is rba,19699,description,1
3,0,The inner workings of a rebuildable atomizer a...,https://www.slimvapepen.com/rebuildable-atomiz...,what is rba,19699,description,1
4,0,Results-Based Accountability® (also known as R...,http://rba-africa.com/about/what-is-rba/,what is rba,19699,description,1
5,1,Results-Based Accountability® (also known as R...,http://resultsleadership.org/what-is-results-b...,what is rba,19699,description,2
6,0,"RBA uses a data-driven, decision-making proces...",http://rba-africa.com/about/what-is-rba/,what is rba,19699,description,1
7,0,vs. NetIQ Identity Manager. Risk-based authent...,http://searchsecurity.techtarget.com/definitio...,what is rba,19699,description,1
8,0,"A rebuildable atomizer (RBA), often referred t...",https://www.slimvapepen.com/rebuildable-atomiz...,what is rba,19699,description,1
9,0,Get To Know Us. RBA is a digital and technolog...,http://www.rbaconsulting.com/,what is rba,19699,description,1


In [9]:
def create_negative_samples(df_queries, df, n_samples=10):
    # Create a DataFrame with repeated rows from df_queries
    df_neg = df_queries.loc[df_queries.index.repeat(n_samples)].reset_index(drop=True)
    
    # Sample random passages for each query
    random_passages = df[['passage_text', 'url']].sample(n=len(df_neg), replace=True).reset_index(drop=True)
    
    # Add the random passages and set relevance to 0
    df_neg['passage_text'] = random_passages['passage_text']
    df_neg['url'] = random_passages['url']
    df_neg['relevance'] = 0
    
    return df_neg

In [10]:
df_queries = df_pos[['query', 'query_id', 'query_type']].drop_duplicates().reset_index(drop=True)


In [11]:
df_neg = create_negative_samples(df_queries, df_pos, 10) # do we want to adjust the  number of negative samples??

In [12]:
df_training = pd.concat([df_pos, df_neg])
df_training.sort_values(by=['query_id', 'relevance'], 
                        ascending=[True, False], 
                        inplace=True)
df_training = df_training.reset_index(drop=True)
df_training = df_training.rename(columns={'passage_text': 'document'})



In [15]:
df_training[5:15]

Unnamed: 0,is_selected,document,url,query,query_id,query_type,relevance
5,0.0,Results-Based Accountability® (also known as R...,http://rba-africa.com/about/what-is-rba/,what is rba,19699,description,1
6,0.0,"RBA uses a data-driven, decision-making proces...",http://rba-africa.com/about/what-is-rba/,what is rba,19699,description,1
7,0.0,vs. NetIQ Identity Manager. Risk-based authent...,http://searchsecurity.techtarget.com/definitio...,what is rba,19699,description,1
8,0.0,"A rebuildable atomizer (RBA), often referred t...",https://www.slimvapepen.com/rebuildable-atomiz...,what is rba,19699,description,1
9,0.0,Get To Know Us. RBA is a digital and technolog...,http://www.rbaconsulting.com/,what is rba,19699,description,1
10,,1 Saturated fat. 2 This is a type of fat that...,http://www.mayoclinic.org/healthy-lifestyle/nu...,what is rba,19699,description,0
11,,1 (Of an action or decision) performed by or a...,http://www.oxforddictionaries.com/definition/e...,what is rba,19699,description,0
12,,The primary cause of pterygium is cumulative U...,https://www.clearvieweyes.com/pterygium/causes,what is rba,19699,description,0
13,,Obsessive-Compulsive Disorder (OCD) is charact...,https://www.anxiety.org/obsessive-compulsive-d...,what is rba,19699,description,0
14,,Schedule provides the applicable tariff rates ...,https://www.aacb.com/resources/tariff-lookup/,what is rba,19699,description,0


In [16]:
df_training.to_parquet('./training.parquet')