## Build Chatbot using Google NarrativeQA Reading Comprehension Dataset



### This notebook ingests, cleans, and saves the questions in one CSV and the answers in another CSV.
INGEST: The original dataset provided by Google has the following schema:
* qaps.csv (the code below reads it from a local file named `qas.csv`): document_id, set, question, answer1, answer2, question_tokenized, answer1_tokenized, answer2_tokenized.

EXPORT: I cleaned and parsed the dataset to export the following:
* questions_list.csv: document_id and question
* answers_list.csv: answer1 and answer2

In [7]:
import nltk
import re
import os
import io
import pandas as pd
import time
import csv
import pickle
import string
# Fetch the NLTK "stopwords" corpus into the local nltk_data directory
# (NLTK reports "already up-to-date" when it is present, as in the output below).
# NOTE(review): the stopwords corpus is not referenced in the cells shown here;
# presumably it is needed elsewhere — confirm before removing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joelhaas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Wall-clock timestamp (seconds since the epoch) taken before processing starts.
# NOTE(review): not read in the cells shown here; presumably meant for an
# elapsed-time report — confirm.
startTime = time.time()

In [9]:
# Clean each question and answer

def cleaning_document(document):
    """Normalize one question/answer string before it is written to CSV.

    The text is lowercased, a fixed list of symbols and dataset artifacts is
    removed or turned into spaces, and every run of spaces is collapsed to a
    single space. Leading/trailing spaces are not stripped (a run at either
    end is reduced to one space).

    Parameters
    ----------
    document : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text with no consecutive spaces.
    """
    # Applied in order; later entries see the result of earlier ones.
    replacements = (
        (" •", ""),        # drop bullet markers
        (" .", "."),       # re-attach a period separated from its word
        ("-", " "),
        ("#", ""),
        ("/", " "),
        ("\\", " "),
        ("\n", " "),
        ("\t", " "),
        ("test,\"", " "),  # leftover "set" column values from the raw rows
        ("train,\"", " "),
    )
    # NOTE(review): an earlier `replace("TM", " ")` step was dropped because it
    # ran after lower() and therefore could never match. If trademark markers
    # must be stripped, confirm the intended pattern and add it back explicitly.

    document = document.lower()

    # Collapse space runs up front so the " •" and " ." patterns match even
    # when the raw text has several spaces before the symbol.
    document = re.sub(' +', ' ', document)

    for old, new in replacements:
        document = document.replace(old, new)

    # The replacements above introduce new spaces (e.g. "a - b" -> "a   b"),
    # so collapse again. A single str.replace("  ", " ") pass is not enough
    # for runs of three or more spaces.
    document = re.sub(' +', ' ', document)

    return document

In [10]:
# Recursively flatten arbitrarily nested lists into a single flat list

def nested_flatten(inputList):
    """Return a flat list with the leaf items of ``inputList`` in order.

    Any element that is itself a ``list`` is expanded recursively; every other
    element (including tuples and strings) is kept as a single item.
    """
    flattened = []

    for element in inputList:
        if not isinstance(element, list):
            # Leaf value: keep as-is.
            flattened.append(element)
            continue
        # Nested list: splice its flattened contents in place.
        flattened.extend(nested_flatten(element))

    return flattened

In [11]:
def _qa_fields(row):
    """Return the list of columns for one row produced by ``csv.reader``.

    Historically the whole record was expected inside the first CSV field and
    was split on commas by hand; that path is tried first so existing inputs
    behave exactly as before. When the first field does not hold a full
    record, the columns already split by ``csv.reader`` are used instead.

    Raises ``IndexError`` for an empty row (e.g. a blank line).
    """
    min_columns = 5  # document_id, set, question, answer1, answer2

    legacy_fields = str(row[0]).split(',')
    if len(legacy_fields) >= min_columns:
        return legacy_fields
    return row


def main(input_path='qas.csv',
         questions_path='questions_list.csv',
         answers_path='answers_list.csv'):
    """Read the raw Q&A CSV, clean it, and export questions and answers.

    For every usable input row, one cleaned string "document_id, question"
    is collected for the questions file and one cleaned string
    "answer1, answer2" for the answers file. Rows that do not contain the
    first five columns are skipped and counted; a one-line summary is printed
    at the end.

    Parameters
    ----------
    input_path : str
        CSV file to ingest. Columns are read by position:
        0 = document id, 2 = question, 3 = answer1, 4 = answer2.
    questions_path : str
        Destination CSV with a single ``questions`` column.
    answers_path : str
        Destination CSV with a single ``answers`` column.

    Returns
    -------
    None
    """
    rows_read = 0
    rows_skipped = 0

    answers_list = []
    questions_list = []

    # Ingest the raw file. The encoding is explicit so the read matches the
    # UTF-8 export below regardless of the platform default.
    # NOTE(review): assumes the dataset file is UTF-8 — confirm.
    with open(input_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            rows_read += 1

            try:
                fields = _qa_fields(row)

                answerID = fields[0]  # equivalent to doc_id
                question = fields[2]  # question being asked

                answer1 = fields[3]   # annotated answer by participant
                answer2 = fields[4]   # annotated answer by participant
            except IndexError:
                # Blank or truncated row: skip it, but keep count so the
                # loss is visible instead of silently swallowed.
                rows_skipped += 1
                continue

            # NOTE(review): a header row, if present, is not treated
            # specially and is exported like any other row — confirm whether
            # that is intended.

            # Clean both strings before appending either, so the two lists
            # always stay the same length.
            updatedQuestion = cleaning_document(answerID + ', ' + question + ' ')
            answers = cleaning_document(answer1 + ', ' + answer2 + ' ')

            questions_list.append(updatedQuestion)
            answers_list.append(answers)

    # Export questions and answers to CSVs
    questions_df = pd.DataFrame(data={"questions": questions_list})
    questions_df.to_csv(questions_path, sep=',', encoding='utf-8', index=False)

    answers_df = pd.DataFrame(data={"answers": answers_list})
    answers_df.to_csv(answers_path, sep=',', encoding='utf-8', index=False)

    print("Rows read: {}; exported: {}; skipped: {}".format(
        rows_read, len(questions_list), rows_skipped))


In [12]:
# Run the ingest/clean/export pipeline when this code is executed directly
# (the condition is also true inside a notebook kernel), not when imported.
if __name__ == '__main__':
    main()