# Data preprocessing notebook

For `Vaswani`, you have to convert files into `.tsv` files.  
This notebook is a tool for the conversion.  

If you run the all cells below, you can get below `.tsv` files.
- `doc-text.tsv`
- `query-text.tsv`
- `rlv-ass.tsv`
- `qrel.tsv`

## required packages
- pandas

In [1]:
import pandas as pd
import csv
import os

In [2]:
# utility functions

def delete_first_row_tsv(file_path):
    temp_file_path = file_path + '.temp'

    with open(file_path, 'r', newline='') as f_input, open(temp_file_path, 'w', newline='') as f_output:
        reader = csv.reader(f_input, delimiter='\t')
        writer = csv.writer(f_output, delimiter='\t')

        next(reader)

        for row in reader:
            writer.writerow(row)

    os.rename(temp_file_path, file_path)

def delete_last_row_tsv(file_path):
    temp_file_path = file_path + '.temp'

    with open(file_path, 'r', newline='') as f_input:
        rows = list(csv.reader(f_input, delimiter='\t'))[:-1]

    with open(temp_file_path, 'w', newline='') as f_output:
        writer = csv.writer(f_output, delimiter='\t')
        writer.writerows(rows)

    os.rename(temp_file_path, file_path)

In [9]:
# doc-text(collection) preprocess

SEPERATOR = '/'

with open("/home/requirements/data/Vaswani/doc-text", "r") as file:
    docs = file.read().replace('\n', '')

seperated_docs = docs.split(SEPERATOR)

data = []
count_digits = lambda number: len(str(number))

for pid, passage in enumerate(seperated_docs, 0):
    processed_passage = passage[count_digits(pid+1):]
    data.append((pid, processed_passage))

df = pd.DataFrame(data, columns=['pid', 'passage'])
df.to_csv('doc-text.tsv', sep='\t', index=False)
delete_first_row_tsv('doc-text.tsv')
delete_last_row_tsv('doc-text.tsv')
os.rename('doc-text.tsv', 'Vaswani/doc-text.tsv')

In [13]:
# query-text(query) preprocess

SEPERATOR = '/'

with open("/home/requirements/data/Vaswani/query-text", "r") as file:
    docs = file.read().replace('\n', '')

seperated_docs = docs.split(SEPERATOR)

data = []
count_digits = lambda number: len(str(number))

for pid, passage in enumerate(seperated_docs, 0):
    processed_passage = passage[count_digits(pid+1):]
    data.append((pid+1, processed_passage))

df = pd.DataFrame(data, columns=['qid', 'query'])
df.to_csv('query-text.tsv', sep='\t', index=False)
delete_first_row_tsv('query-text.tsv')
delete_last_row_tsv('query-text.tsv')
os.rename('query-text.tsv', 'Vaswani/query-text.tsv')

In [14]:
# rlv-ass(relevant result assessment) preprocess

result_array = []
with open("/home/requirements/data/Vaswani/rlv-ass", "r") as file:
    temp_list = []

    for line in file:
        line = line.strip()

        if line:
            numbers = line.split()
            temp_list.extend(numbers)
            if '/' in line:
                result_array.append(temp_list)
                temp_list = []

data = []
for array in result_array:
    array = [int(i) for i in array[:-1]]
    for x in array[1:]:
        data.append((array[0], x))

df = pd.DataFrame(data, columns=['qid', 'related passage'])
df.to_csv('rlv-ass.tsv', sep='\t', index=False)
os.rename('rlv-ass.tsv', 'Vaswani/rlv-ass.tsv')

In [19]:
# qrel preprocess

result_array = []
with open("/home/requirements/data/Vaswani/rlv-ass", "r") as file:
    temp_list = []

    for line in file:
        line = line.strip()

        if line:
            numbers = line.split()
            temp_list.extend(numbers)
            if '/' in line:
                result_array.append(temp_list)
                temp_list = []

data = []
for array in result_array:
    array = [int(i) for i in array[:-1]]
    for x in array[1:]:
        data.append((array[0], 0, x, 1))

df = pd.DataFrame(data, columns=['qid', '0', 'related passage', '1'])
df.to_csv('qrel.tsv', sep='\t', index=False)
delete_first_row_tsv('qrel.tsv')
os.rename('qrel.tsv', 'Vaswani/qrel.tsv')

For `MS MARCO`, in order to make reasonable analysis on candidate set/retrieved set, ...

Create `quries.short.dev.tsv` and `top1000.short.dev`

In [4]:
# short preprocess

long_df = pd.read_csv("/home/requirements/data/MSMARCO/top1000.dev", sep='\t', names=['qid', 'pid', 'query', 'passage'])

short_qids = long_df['qid'].unique()[100:200]
short_df = long_df[long_df['qid'].isin(short_qids)]

# top1000.short.dev
top1000_short_df = short_df[['qid', 'pid', 'query', 'passage']]
top1000_short_df.to_csv('top1000.short.dev', sep='\t', index=False)
delete_first_row_tsv('top1000.short.dev')
os.rename('top1000.short.dev', 'MSMARCO/top1000.short.dev')

# queries.short.dev.tsv
query_short_df = short_df[['qid', 'query']]
query_short_df.drop_duplicates(inplace=True)
query_short_df.to_csv('queries.short.dev.tsv', sep='\t', index=False)
delete_first_row_tsv('queries.short.dev.tsv')
os.rename('queries.short.dev.tsv', 'MSMARCO/queries.short.dev.tsv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_short_df.drop_duplicates(inplace=True)


In [3]:
long_df = pd.read_csv("/home/requirements/data/MSMARCO/top1000.dev", sep='\t', names=['qid', 'pid', 'query', 'passage'])
qrel_long_df = pd.read_csv("/home/requirements/data/MSMARCO/qrels.dev.tsv", sep='\t', names=['qid', '0', 'pid', '1'])


# qrels.short.dev.tsv

short_qids = long_df['qid'].unique()[100:200]
qrel_long_df = qrel_long_df[qrel_long_df['qid'].isin(short_qids)]
qrel_long_df.to_csv('qrels.short.dev.tsv', sep='\t', index=False)
delete_first_row_tsv('qrels.short.dev.tsv')
os.rename('qrels.short.dev.tsv', 'MSMARCO/qrels.short.dev.tsv')

In [5]:
long_df = pd.read_csv("/home/requirements/data/MSMARCO/top1000.dev", sep='\t', names=['qid', 'pid', 'query', 'passage'])

short_qids = long_df['qid'].unique()[100:200]
short_df = long_df[long_df['qid'].isin(short_qids)]

# qreltransformed.top1000.short.dev

qreltransformed_top1000_short_df = pd.DataFrame({
    'qid': short_df['qid'].tolist(),
    '0': [0 for i in range(100000)],
    'pid': short_df['pid'].tolist(),
    '1': [1 for i in range(100000)]
})
qreltransformed_top1000_short_df.to_csv('qreltransformed.top1000.short.dev', sep='\t', index=False)
delete_first_row_tsv('qreltransformed.top1000.short.dev')
os.rename('qreltransformed.top1000.short.dev', 'MSMARCO/qreltransformed.top1000.short.dev')

collection 7130310 to 7409929

In [4]:
import pandas as pd
collection_df = pd.read_csv("/home/requirements/data/MSMARCO/collection.tsv", sep='\t', names=['pid', 'passage'])

subset_df = collection_df.iloc[7130310:7409930]
subset_df.to_csv('collection.51to52.tsv', sep='\t', index=False)
delete_first_row_tsv('collection.51to52.tsv')
os.rename('collection.51to52.tsv', 'MSMARCO/collection.51to52.tsv')
