In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
pd.__version__

'1.5.2'

In [2]:
# Anyone using this notebook: adjust these paths to match your machine.
# TRAIN_DATASET is the input file; the SAMPLE_* paths are the CSVs this
# notebook writes.

TRAIN_DATASET = r"./train_data.csv"
SAMPLE_INPUT_PARA = r"./dataset/input_paragraph.csv"
SAMPLE_INPUT_QUESTION = r"./dataset/input_question.csv"
SAMPLE_THEME_INTERVAL = r"./dataset/theme_interval.csv"
SAMPLE_GROUND_TRUTH = r"./dataset/ground_truth.csv"

# DataFrame.to_csv does not create missing directories, so make sure every
# output folder exists before a later cell tries to write into it.
for _output_csv in (
    SAMPLE_INPUT_PARA,
    SAMPLE_INPUT_QUESTION,
    SAMPLE_THEME_INTERVAL,
    SAMPLE_GROUND_TRUTH,
):
    Path(_output_csv).parent.mkdir(parents=True, exist_ok=True)

## Reading Data from train_data.csv

The file can be downloaded from: [https://drive.google.com/file/d/1Z-yb752A3o7b9dqrGt24XU0sl53FVqya/view](https://drive.google.com/file/d/1Z-yb752A3o7b9dqrGt24XU0sl53FVqya/view)

In [3]:
# Load the training set; the unnamed first CSV column is exposed as 'id'.
train_data = pd.read_csv(TRAIN_DATASET).rename(columns={'Unnamed: 0': 'id'})
print(train_data.shape)
train_data.head()

(75055, 7)


Unnamed: 0,id,Theme,Paragraph,Question,Answer_possible,Answer_text,Answer_start
0,2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,True,['2003'],[526]
1,6,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,True,['Dangerously in Love'],[505]
2,7,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,True,['Mathew Knowles'],[360]
3,8,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,True,['late 1990s'],[276]
4,9,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,True,['lead singer'],[290]


## Making input_questions csv

In [4]:
# Keep only the question text and its theme, with lower-case column names.
sample_input_questions = train_data[['Question', 'Theme']].rename(
    columns={'Question': 'question', 'Theme': 'theme'}
)
print(sample_input_questions.size)

# Question ids are 1-based and follow the row order of train_data.
n_questions = len(sample_input_questions)
sample_input_questions.index = np.arange(1, n_questions + 1)
sample_input_questions.to_csv(SAMPLE_INPUT_QUESTION, index_label='id')

150110


## Making input_paragraph csv

In [5]:
# Distinct theme values, in order of first appearance in train_data.
sample_input_para_theme = pd.unique(train_data['Theme'])

In [6]:
# One row per distinct paragraph within each theme.
# Themes keep their order of first appearance (sort=False) and paragraphs keep
# their order of first appearance inside the theme, so the output matches what
# a per-theme loop would produce -- but in a single pass instead of growing a
# DataFrame with pd.concat on every iteration.
sample_input_para = (
    train_data
    .groupby('Theme', sort=False)['Paragraph']
    .unique()               # array of distinct paragraphs per theme
    .explode()              # one row per (theme, paragraph)
    .rename('paragraph')
    .rename_axis('theme')
    .reset_index()
)

# Paragraph ids are 1-based.
sample_input_para.index = np.arange(1, len(sample_input_para) + 1)
sample_input_para.to_csv(SAMPLE_INPUT_PARA, encoding='utf-8', index_label='id')

## Making theme_interval csv

In [7]:
# Expose the 1-based question id (currently the index) as a regular column.
sample_input_questions = sample_input_questions.assign(id=sample_input_questions.index)

In [8]:
# One row per theme: 'start' is the id of the theme's first question and 'end'
# the id of its last question, both taken in row order.
# NOTE(review): the [start, end] range only covers exactly that theme's
# questions if each theme's rows are contiguous in the input -- confirm.
#
# A single groupby replaces the per-theme filter + pd.concat loop: it is one
# pass over the data, keeps start/end as integers regardless of how pandas
# treats empty frames in concat, and does not raise IndexError when a theme
# value is missing (rows with a NaN theme are skipped by groupby).
sample_theme_interval = (
    sample_input_questions
    .groupby('theme', sort=False)['id']
    .agg(start='first', end='last')
    .reset_index()
)

sample_theme_interval.to_csv(SAMPLE_THEME_INTERVAL, encoding='utf-8', index=False)

## Making ground_truth csv

In [9]:
# Lower-case the three text column names in one step.
# Caution: after this cell runs, train_data no longer has 'Theme', 'Paragraph'
# or 'Question' columns, so cells that read those names need train_data to be
# reloaded before they are re-run.
train_data = train_data.rename(
    columns={
        "Paragraph": "paragraph",
        "Theme": "theme",
        "Question": "question",
    }
)

In [10]:
# question_id holds the ids from sample_input_questions; the answers column is
# attached by index alignment against train_data's own index.
sample_ground_truth = pd.DataFrame(
    {'question_id': sample_input_questions.index}
).assign(answers=train_data['Answer_text'])

# TODO: Nirmit will add the paragraph_id

sample_ground_truth.to_csv(SAMPLE_GROUND_TRUTH, encoding='utf-8', index=False)