# Prepare data for training

- Create a dataset from the raw data:
  - each example is a `(doc, query, relevance)` triple
  - `relevance` is 0 (not relevant), 1 (relevant), or 2 (user-selected)


In [29]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import random
import numpy as np

In [2]:
# Load the train split of the "v1.1" configuration of the "ms_marco" dataset
# (downloaded on first use; see the progress output below).
data = load_dataset("ms_marco", "v1.1", split="train")

README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

In [3]:
def preprocess(text: str, min_count: int = 5) -> list[str]:
    """Lower-case and tokenize ``text``, then drop infrequent tokens.

    Punctuation marks are replaced by named placeholder tokens (for example
    ``"."`` becomes ``"<PERIOD>"``) so that they survive whitespace splitting
    as separate words.

    Args:
        text: The raw text to tokenize.
        min_count: Frequency threshold. Only tokens that occur strictly more
            than ``min_count`` times within ``text`` itself are kept.

    Returns:
        The remaining tokens, in their original order (repeats included).
    """
    # (symbol, placeholder) pairs. None of the placeholders contains any of
    # the symbols, so the order of replacement does not affect the result.
    punctuation_tokens = (
        (".", "<PERIOD>"),
        (",", "<COMMA>"),
        ('"', "<QUOTATION_MARK>"),
        (";", "<SEMICOLON>"),
        ("!", "<EXCLAMATION_MARK>"),
        ("?", "<QUESTION_MARK>"),
        ("(", "<LEFT_PAREN>"),
        (")", "<RIGHT_PAREN>"),
        ("--", "<HYPHENS>"),
        (":", "<COLON>"),
    )

    # Lower-case first so the upper-case placeholders stay distinguishable
    # from ordinary words.
    text = text.lower()
    for symbol, token in punctuation_tokens:
        text = text.replace(symbol, f" {token} ")

    words = text.split()
    # Counts are computed over this single text, not over a whole corpus.
    counts = collections.Counter(words)
    return [word for word in words if counts[word] > min_count]

In [16]:
# Materialize the loaded split as a pandas DataFrame (one row per query).
df_raw = pd.DataFrame(data)

In [17]:
# Preview the raw rows; `passages` is a nested dict holding parallel lists
# (e.g. `is_selected`) for all candidate passages of the query.
df_raw.head()

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[]
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[]
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[]
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[]
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[]


In [6]:
def unwrap_passages(row: pd.Series) -> pd.DataFrame:
    """Expand one raw query row into a frame with one row per passage.

    Args:
        row: A raw row providing ``passages`` (a mapping of column name to a
            list of per-passage values) plus the scalar fields ``query``,
            ``query_id`` and ``query_type``.

    Returns:
        A DataFrame built from ``row["passages"]`` in which every passage row
        additionally carries the query text, query id and query type.
    """
    passages = pd.DataFrame(row["passages"])
    # Scalars are broadcast to every passage row of this query.
    return passages.assign(
        query=row["query"],
        query_id=row["query_id"],
        query_type=row["query_type"],
    )

In [18]:
# Flatten every raw query row into its per-passage rows, then stack all of
# them into a single frame with a fresh 0..n-1 index.
chunks = [unwrap_passages(raw_row) for _, raw_row in df_raw.iterrows()]
df = pd.concat(chunks).reset_index(drop=True)

In [36]:
# Peek at one flattened row: a single passage together with its query fields.
df.head(1)

Unnamed: 0,is_selected,passage_text,url,query,query_id,query_type
0,0,"Since 2007, the RBA's outstanding reputation h...",https://en.wikipedia.org/wiki/Reserve_Bank_of_...,what is rba,19699,description


In [38]:
import os

# Save the flattened frame to data/processed/flattened_train.parquet.
file_path = os.path.join("data", "processed", "flattened_train.parquet")
# The parquet writer does not create missing parent directories, so make sure
# the target folder exists first (otherwise a fresh checkout fails here).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df.to_parquet(file_path)

In [None]:
# Reload the flattened frame from data/processed/flattened_train.parquet.
# NOTE(review): presumably here so the notebook can resume without repeating
# the flattening step; `os` is imported in the save cell above — confirm that
# cell (or an equivalent import) has been run first.
file_path = os.path.join("data", "processed", "flattened_train.parquet")
df = pd.read_parquet(file_path)

In [28]:
# Total number of flattened (query, passage) rows.
len(df)

676193

In [57]:
# Take the first 1000 flattened rows; the next cell draws its queries from
# this subset (presumably to keep the pair-building loop short — confirm).
dfc = df.head(1000)

In [58]:
# Build (query, relevant passage, irrelevant passage) pairs.
#
# For every unique query in `dfc`, each passage listed under that query is
# treated as "relevant" and is paired with one passage drawn at random from
# the passages of all *other* queries ("irrelevant").
#
# NOTE(review): queries are taken from the `dfc` subset, but both relevant and
# irrelevant passages are looked up in the full `df`, exactly as before —
# confirm this is intended rather than using `dfc` throughout.

# Verify the columns this cell relies on instead of overwriting the column
# labels positionally, which would silently mislabel data if the column order
# ever changed.
required_columns = ["is_selected", "passage_text", "url", "query", "query_id", "query_type"]
missing_columns = [col for col in required_columns if col not in df.columns]
assert not missing_columns, f"df is missing expected columns: {missing_columns}"

# Seeded generator so that the sampled negatives are reproducible across runs.
NEGATIVE_SAMPLING_SEED = 42
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# Create arrays to hold new data
query_list = []
query_id_list = []
relevant_doc_list = []
irrelevant_doc_list = []
relevant_doc_is_selected_list = []

# Plain array of passage texts: sampled negatives are picked by position,
# which avoids copying almost the whole frame for every query.
all_passages = df["passage_text"].to_numpy()

# Iterate through each unique query
unique_queries = dfc["query"].unique()

for query in unique_queries:
    is_same_query = (df["query"] == query).to_numpy()

    # All passages listed under the current query
    query_relevant_docs = df[is_same_query]
    n_relevant = len(query_relevant_docs)
    if n_relevant == 0:
        continue

    # Row positions of passages that belong to other queries
    irrelevant_positions = np.flatnonzero(~is_same_query)
    if irrelevant_positions.size == 0:
        raise ValueError(f"No passages from other queries to sample for query {query!r}")

    # One independent draw (with replacement) per relevant passage
    sampled_positions = rng.choice(irrelevant_positions, size=n_relevant)

    # Add data to lists
    query_list.extend([query] * n_relevant)
    query_id_list.extend(query_relevant_docs["query_id"].tolist())
    relevant_doc_list.extend(query_relevant_docs["passage_text"].tolist())
    irrelevant_doc_list.extend(all_passages[sampled_positions].tolist())
    relevant_doc_is_selected_list.extend(query_relevant_docs["is_selected"].tolist())

In [59]:
# Assemble the collected lists into one pair table; each list becomes a
# column, in the order given here.
new_df = pd.DataFrame(
    dict(
        query=query_list,
        query_id=query_id_list,
        relevant_document=relevant_doc_list,
        irrelevant_document=irrelevant_doc_list,
        is_selected=relevant_doc_is_selected_list,
    )
)

In [60]:
# Preview the first pairs; `is_selected` is carried over from the row of the
# relevant passage.
new_df.head(30)

Unnamed: 0,query,query_id,relevant_document,irrelevant_document,is_selected
0,what is rba,19699,"Since 2007, the RBA's outstanding reputation h...",Cover the grill again and allow to cook for an...,0
1,what is rba,19699,The Reserve Bank of Australia (RBA) came into ...,Submit. · just now. Report Abuse. cloud is vib...,0
2,what is rba,19699,RBA Recognized with the 2014 Microsoft US Regi...,Determining Flag Size. The length of the flag ...,0
3,what is rba,19699,The inner workings of a rebuildable atomizer a...,"1 On average, a chair lift can cost anywhere f...",0
4,what is rba,19699,Results-Based Accountability® (also known as R...,n. A seismic wave that travels through the ear...,0
5,what is rba,19699,Results-Based Accountability® (also known as R...,Brief History of Maytag & Washing Machine Inno...,1
6,what is rba,19699,"RBA uses a data-driven, decision-making proces...",Function. The cardiac skeleton has four major ...,0
7,what is rba,19699,vs. NetIQ Identity Manager. Risk-based authent...,Types of counseling most often used to treat s...,0
8,what is rba,19699,"A rebuildable atomizer (RBA), often referred t...","Most homeowners report spending between $3,675...",0
9,what is rba,19699,Get To Know Us. RBA is a digital and technolog...,Definition. Patellar tendinitis is an injury t...,0


In [61]:
# Write the pair table to data/processed/query_rel_doc.parquet
# (the data/processed directory must already exist).
new_df.to_parquet("data/processed/query_rel_doc.parquet")