# Data preprocessing notebook

The raw `Vaswani` dataset files must be converted into `.tsv` files before use.  
This notebook performs that conversion.  

Running all the cells below produces the following `.tsv` files in the `Vaswani/` directory:
- `doc-text.tsv`
- `query-text.tsv`
- `rlv-ass.tsv`
- `qrel.tsv`

## required packages
- pandas

In [5]:
import pandas as pd
import csv
import os

In [8]:
# utility functions

def delete_first_row_tsv(file_path):
    """Delete the first row of a tab-separated file in place.

    Every row after the first is copied through ``csv`` into
    ``<file_path>.temp``, which then replaces the original file.

    Args:
        file_path: Path of the TSV file to rewrite.

    An empty file is left empty rather than raising ``StopIteration``.
    """
    temp_file_path = file_path + '.temp'

    with open(file_path, 'r', newline='') as f_input, open(temp_file_path, 'w', newline='') as f_output:
        reader = csv.reader(f_input, delimiter='\t')
        writer = csv.writer(f_output, delimiter='\t')

        # The default keeps next() from raising StopIteration on an empty
        # file, which would abort the rewrite and leave the temp file behind.
        next(reader, None)
        writer.writerows(reader)

    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists.
    os.replace(temp_file_path, file_path)

def delete_last_row_tsv(file_path):
    """Delete the last row of a tab-separated file in place.

    All rows are read into memory, every row except the final one is written
    to ``<file_path>.temp``, and that file then replaces the original.

    Args:
        file_path: Path of the TSV file to rewrite.

    A file with zero or one row ends up empty.
    """
    temp_file_path = file_path + '.temp'

    with open(file_path, 'r', newline='') as f_input:
        rows = list(csv.reader(f_input, delimiter='\t'))

    with open(temp_file_path, 'w', newline='') as f_output:
        # Slicing an empty list is safe, so an empty file needs no special case.
        csv.writer(f_output, delimiter='\t').writerows(rows[:-1])

    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists.
    os.replace(temp_file_path, file_path)

In [9]:
# doc-text(collection) preprocess
#
# Splits the raw `doc-text` file on '/' into one passage per row and writes
# `Vaswani/doc-text.tsv` with the columns (pid, passage) and no header row.
# NOTE(review): pid starts at 0 here, while the rlv-ass/qrel cells keep the
# ids exactly as they appear in the source file - confirm downstream code
# expects this offset.

SEPERATOR = '/'

with open("/home/required_pool/data/Vaswani/doc-text", "r") as file:
    # Join wrapped lines with a space: joining them with '' would fuse the
    # last word of one line with the first word of the next.
    docs = file.read().replace('\n', ' ')

seperated_docs = docs.split(SEPERATOR)

# Whatever follows the final separator is normally blank. Drop it only in that
# case, so a file that does not end with a separator keeps its last passage.
if seperated_docs and not seperated_docs[-1].strip():
    seperated_docs.pop()


def count_digits(number):
    """Return how many characters the decimal form of `number` takes."""
    return len(str(number))


data = []
for pid, passage in enumerate(seperated_docs, 0):
    # Each record is assumed to begin with its 1-based running number, so that
    # many leading characters are cut off before the text itself.
    processed_passage = passage.strip()[count_digits(pid + 1):].strip()
    data.append((pid, processed_passage))

df = pd.DataFrame(data, columns=['pid', 'passage'])
df.to_csv('doc-text.tsv', sep='\t', index=False)
# Remove the header row that to_csv wrote.
delete_first_row_tsv('doc-text.tsv')

# Create the target directory if needed; os.replace (unlike os.rename) also
# overwrites an earlier result on Windows when the notebook is re-run.
os.makedirs('Vaswani', exist_ok=True)
os.replace('doc-text.tsv', 'Vaswani/doc-text.tsv')

In [13]:
# query-text(query) preprocess
#
# Splits the raw `query-text` file on '/' into one query per row and writes
# `Vaswani/query-text.tsv` with the columns (qid, passage) and no header row.
# qid is the 1-based position of the query in the file.

SEPERATOR = '/'

with open("/home/required_pool/data/Vaswani/query-text", "r") as file:
    # Join wrapped lines with a space: joining them with '' would fuse the
    # last word of one line with the first word of the next.
    docs = file.read().replace('\n', ' ')

seperated_docs = docs.split(SEPERATOR)

# Whatever follows the final separator is normally blank. Drop it only in that
# case, so a file that does not end with a separator keeps its last query.
if seperated_docs and not seperated_docs[-1].strip():
    seperated_docs.pop()


def count_digits(number):
    """Return how many characters the decimal form of `number` takes."""
    return len(str(number))


data = []
for pid, passage in enumerate(seperated_docs, 0):
    # Each record is assumed to begin with its 1-based running number, so that
    # many leading characters are cut off before the text itself.
    processed_passage = passage.strip()[count_digits(pid + 1):].strip()
    data.append((pid + 1, processed_passage))

df = pd.DataFrame(data, columns=['qid', 'passage'])
df.to_csv('query-text.tsv', sep='\t', index=False)
# Remove the header row that to_csv wrote.
delete_first_row_tsv('query-text.tsv')

# Create the target directory if needed; os.replace (unlike os.rename) also
# overwrites an earlier result on Windows when the notebook is re-run.
os.makedirs('Vaswani', exist_ok=True)
os.replace('query-text.tsv', 'Vaswani/query-text.tsv')

In [14]:
# rlv-ass(relevant result assessment) preprocess
#
# Reads '/'-terminated groups of whitespace-separated integers from `rlv-ass`.
# The first number of a group is used as the qid, every following number as a
# related passage id. Writes one (qid, related passage) pair per row to
# `Vaswani/rlv-ass.tsv`, keeping the header row.

result_array = []
with open("/home/required_pool/data/Vaswani/rlv-ass", "r") as file:
    temp_list = []

    for line in file:
        # Treat '/' as whitespace so a terminator glued to a number ("12/")
        # neither breaks int() nor causes that number to be dropped.
        temp_list.extend(line.replace('/', ' ').split())
        if '/' in line:
            result_array.append(temp_list)
            temp_list = []

    # Keep a final group that is missing its terminating '/' instead of
    # silently discarding it.
    if temp_list:
        result_array.append(temp_list)

data = []
for array in result_array:
    array = [int(i) for i in array]
    # An empty group yields no rows: the slice is empty, so array[0] is
    # never evaluated.
    for x in array[1:]:
        data.append((array[0], x))

df = pd.DataFrame(data, columns=['qid', 'related passage'])
df.to_csv('rlv-ass.tsv', sep='\t', index=False)

# Create the target directory if needed; os.replace (unlike os.rename) also
# overwrites an earlier result on Windows when the notebook is re-run.
os.makedirs('Vaswani', exist_ok=True)
os.replace('rlv-ass.tsv', 'Vaswani/rlv-ass.tsv')

In [19]:
# qrel preprocess
#
# Reads the same '/'-terminated groups from `rlv-ass` and writes one row
# (qid, 0, related passage, 1) per relevant pair to `Vaswani/qrel.tsv`, without
# a header row.
# NOTE(review): the constant 0 and 1 columns presumably follow the TREC qrels
# layout (iteration, relevance) - confirm against the consumer of this file.

result_array = []
with open("/home/required_pool/data/Vaswani/rlv-ass", "r") as file:
    temp_list = []

    for line in file:
        # Treat '/' as whitespace so a terminator glued to a number ("12/")
        # neither breaks int() nor causes that number to be dropped.
        temp_list.extend(line.replace('/', ' ').split())
        if '/' in line:
            result_array.append(temp_list)
            temp_list = []

    # Keep a final group that is missing its terminating '/' instead of
    # silently discarding it.
    if temp_list:
        result_array.append(temp_list)

data = []
for array in result_array:
    array = [int(i) for i in array]
    # An empty group yields no rows: the slice is empty, so array[0] is
    # never evaluated.
    for x in array[1:]:
        data.append((array[0], 0, x, 1))

df = pd.DataFrame(data, columns=['qid', '0', 'related passage', '1'])
df.to_csv('qrel.tsv', sep='\t', index=False)
# Remove the header row that to_csv wrote.
delete_first_row_tsv('qrel.tsv')

# Create the target directory if needed; os.replace (unlike os.rename) also
# overwrites an earlier result on Windows when the notebook is re-run.
os.makedirs('Vaswani', exist_ok=True)
os.replace('qrel.tsv', 'Vaswani/qrel.tsv')