# Construct Corpus data dump for Retrieval

In [2]:
import pandas as pd
import numpy as np

Load 2WikiMultiHopQA data

In [2]:
# Read the three 2WikiMultiHopQA splits from disk.
df_wiki_train = pd.read_json('data/QA-dataset/data/2wikimultihopQA/train.json')
df_wiki_dev = pd.read_json('data/QA-dataset/data/2wikimultihopQA/dev.json')
df_wiki_test = pd.read_json('data/QA-dataset/data/2wikimultihopQA/test.json')

# Stack the splits, discard the columns not needed for the corpus, and give
# every entry of each question's `context` list its own row.
df_wiki = (
    pd.concat([df_wiki_train, df_wiki_dev, df_wiki_test])
    .drop(columns=['type', 'supporting_facts', 'evidences'])
    .explode('context', ignore_index=True)
)

# Each context entry is indexed as [0] (prefix text) and [1] (list of strings);
# render it as "<entry[0]> - <entry[1] joined by spaces>" and keep one copy of
# every distinct passage.
df_wiki_corpus = (
    df_wiki['context']
    .apply(lambda passage: ' - '.join((passage[0], ' '.join(passage[1]))))
    .drop_duplicates(ignore_index=True)
)

df_wiki_corpus

0         Stuart Rosenberg - Stuart Rosenberg (August 11...
1         Méditerranée (1963 film) - Méditerranée is a 1...
2         Move (1970 film) - Move is a 1970 American com...
3         Ian Barry (director) - Ian Barry is an Austral...
4         Peter Levin - Peter Levin is an American direc...
                                ...                        
430220    Karl Friedrich, Duke of Saxe-Meiningen - Karl ...
430221    Josip Cindro - Josip Cindro was a Dalmatian po...
430222    Puven Pather - Puven Pather is an Australian f...
430223    Jessica Gower - Jessica Gower (born 1977 in Me...
430224    Never Trouble Trouble Until Trouble Troubles Y...
Name: context, Length: 430225, dtype: object

Load MuSiQue data

In [3]:
# Read the three MuSiQue splits (JSON Lines: one record per line).
df_musique_train = pd.read_json('data/QA-dataset/data/musique/musique_full_v1.0_train.jsonl', lines=True)
df_musique_dev = pd.read_json('data/QA-dataset/data/musique/musique_full_v1.0_dev.jsonl', lines=True)
df_musique_test = pd.read_json('data/QA-dataset/data/musique/musique_full_v1.0_test.jsonl', lines=True)

# Stack the splits, discard the columns not needed for the corpus, and give
# every entry of each question's `paragraphs` list its own row.
df_musique = (
    pd.concat([df_musique_train, df_musique_dev, df_musique_test])
    .drop(columns=['question_decomposition', 'answer_aliases', 'answerable'])
    .explode('paragraphs', ignore_index=True)
)

# Render each paragraph record as "<title> - <paragraph_text>" and keep one
# copy of every distinct passage.
df_musique_corpus = (
    df_musique['paragraphs']
    .apply(lambda para: ' - '.join((para['title'], para['paragraph_text'])))
    .drop_duplicates(ignore_index=True)
)

df_musique_corpus

0         All Things in Time - All Things in Time is an ...
1         Goin' Out of My Head - ``Goin 'Out of My Head ...
2         Every Little Thing U Do - ``Every Little Thing...
3         She Don't Love You - ``She Do n't Love You ''i...
4         Crazy Desire - Crazy Desire (originally titled...
                                ...                        
133194    Land reform in India - In land reform in Keral...
133195    A Place in England - A Place in England is a n...
133196    Luis Sales - Luis Sales (1745–1807) served as ...
133197    Joseph Merklin - Joseph Merklin (17 February 1...
133198    SM UB-94 - SM "UB-94" was a German Type UB III...
Name: paragraphs, Length: 133199, dtype: object

Combine both into a single corpus consisting of the passages to be retrieved, and save it as an `.npy` file (input for RealmRetriever)

In [45]:
# Merge both passage collections, then shuffle with a fixed seed so the block
# order is reproducible from run to run.
df_corpus = pd.concat([df_wiki_corpus.astype(str), df_musique_corpus.astype(str)])
df_corpus = df_corpus.sample(frac=1, random_state=42)
df_corpus = df_corpus.reset_index(drop=True)

# Convert to numpy of byte strings.
# A fresh object array is built instead of encoding element-by-element inside
# the array returned by `to_numpy()`: that array may share memory with
# `df_corpus` (the in-place loop would then silently turn the Series into bytes
# too) and it is read-only when pandas Copy-on-Write is active, in which case
# item assignment raises. `dtype=object` keeps variable-length `bytes` objects
# rather than a fixed-width 'S' array.
np_corpus = np.array([passage.encode() for passage in df_corpus], dtype=object)

# NOTE: np.save pickles object arrays, so loading this file requires
# np.load(..., allow_pickle=True).
np.save('data/block_records.npy', np_corpus)

np_corpus

array([b'Ishberda - Ishberda is a rural locality( a selo) and the administrative center of Ishberdinsky Selsoviet, Baymaksky District, Bashkortostan, Russia. The population was 638 as of 2010. There are 6 streets.',
       b'Santiago Giordana - Santiago Giordana( born 3 May 1995) is an Argentine professional footballer who plays as a forward for Primera B Nacional side Club Atl\xc3\xa9tico Alvarado.',
       b'Dermide Leclerc - Dermide Louis Napol\xc3\xa9on Leclerc (20 April 1798 \xe2\x80\x93 14 August 1804) was the only child of Pauline Bonaparte (later "suo jure" Duchess of Guastalla) and her husband, French Army general Charles Leclerc. Through his mother, Dermide was a nephew of the future Emperor Napoleon I. In 1802, during the Haitian Revolution, Dermide arrived on the island-colony of Saint-Domingue with his parents, as part of the Saint-Domingue expedition. After his father\'s death of yellow fever later during the year, Dermide and Pauline were brought back to France. In 1803,