<a href="https://colab.research.google.com/github/karvesaket/retroqa/blob/master/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def preprocess_csv(newsqa_df, mode="split"):
  print('Total number of questions:', len(newsqa_df))
  # Eliminate bad questions
  bad_question_length = len(newsqa_df[newsqa_df['is_question_bad'] == '1.0'])
  newsqa_df = newsqa_df[newsqa_df['is_question_bad'] != '1.0']
  newsqa_df.index = range(len(newsqa_df))
  print("You have eliminated {} bad questions.".format(bad_question_length))
  print("There are {} rows remaining.".format(len(newsqa_df)))
  # Remove "(CNN) --" patterns in the beginning of story text and eliminate rows where the min answer index becomes negative after removing (CNN) pattern
  import re
  cnn_list = []
  indices_to_drop = []
  for index, line in newsqa_df.iterrows():
    if re.search('\(CNN\) +-', line['story_text']):
      end_index = line['story_text'].index('(CNN)')
      end_index += 5
      while line['story_text'][end_index] == ' ':
        end_index += 1
      end_index += 2
      while line['story_text'][end_index] == ' ':
        end_index += 1
      cnn = line['story_text'][line['story_text'].index('(CNN)'):end_index]
      if cnn not in cnn_list:
        cnn_list.append(cnn)
      all_answers = line['answer_char_ranges'].split('|')
      all_answers = [answer for answer in all_answers if answer != 'None']
      stop = False
      for i, answers in enumerate(all_answers):
        answers = answers.split(',')
        for j, answer in enumerate(answers):
          answer = answer.split(':')
          if int(answer[0])-(line['story_text'].index(cnn)+len(cnn)) < 0:
            stop = True
            indices_to_drop.append(index)
            break
        if stop:
          break
    else:
      indices_to_drop.append(index)
  print("Here are the eliminated (CNN) patterns:", cnn_list)
  newsqa_df = newsqa_df.drop(newsqa_df.index[indices_to_drop])
  newsqa_df.index = range(len(newsqa_df))
  print("You have eliminated {} rows where the min answer index was negative after removing the (CNN) patterns.".format(len(indices_to_drop)))
  print("There are {} rows remaining.".format(len(newsqa_df)))
  # Preprocess according to tokenization mode
  if mode == "spacy":
    import spacy
    nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'textcat'])
  elif mode == "bert":
    !pip install transformers
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  print("Preprocessing...")
  answer1_min_char_index = []
  answer1_max_char_index = []
  answer1_char = []
  answer2_min_char_index = []
  answer2_max_char_index = []
  answer2_char = []
  answer1_min_word_index = []
  answer1_max_word_index = []
  answer1_word = []
  answer2_min_word_index = []
  answer2_max_word_index = []
  answer2_word = []
  new_story_text = []
  for index, line in newsqa_df.iterrows():
    all_answers = line['answer_char_ranges'].split('|')
    all_answers = [answer for answer in all_answers if answer != 'None']
    if len(all_answers) < 2:
      answer2_min_char_index.append('None')
      answer2_max_char_index.append('None')
      answer2_char.append('None')
      answer2_min_word_index.append('None')
      answer2_max_word_index.append('None')
      answer2_word.append('None')
      if len(all_answers) == 0:
        answer1_min_char_index.append('None')
        answer1_max_char_index.append('None')
        answer1_char.append('None')
        answer1_min_word_index.append('None')
        answer1_max_word_index.append('None')
        answer1_word.append('None')
    else:
      all_answers = all_answers[:2]
    for cnn in cnn_list:
      if cnn in line['story_text']:
        for i, answers in enumerate(all_answers):
          min_answers = ''
          max_answers = ''
          answers = answers.split(',')
          for j, answer in enumerate(answers):
            answer = answer.split(':')
            answer[0] = int(answer[0])-(line['story_text'].index(cnn)+len(cnn))
            min_answers += str(answer[0])
            answer[1] = int(answer[1])-(line['story_text'].index(cnn)+len(cnn))
            max_answers += str(answer[1]-1)
            answers[j] = str(answer[0])+':'+str(answer[1]-1)
            if j < len(answers) - 1:
              answers[j] += ','
              min_answers += ','
              max_answers += ','
          all_answers[i] = ''.join(answers)
          if i == 0:
            answer1_min_char_index.append(min_answers)
            answer1_max_char_index.append(max_answers)
          elif i == 1:
            answer2_min_char_index.append(min_answers)
            answer2_max_char_index.append(max_answers)
        line['story_text'] = line['story_text'][line['story_text'].index(cnn)+len(cnn):]
    new_story_text.append(line['story_text'])
    for i, answers in enumerate(all_answers):
      char_result = ''
      min_answers = ''
      max_answers = ''
      word_result = ''
      answers = answers.split(',')
      for j, answer in enumerate(answers):
        answer = answer.split(':')
        char_result += line['story_text'][int(answer[0]):int(answer[1])+1]
        story_text_split = line['story_text'].split(line['story_text'][int(answer[0]):int(answer[1])+1])
        if mode == 'split':
          before_split = story_text_split[0].split()
          min_word_index = len(before_split)
          answer_split = line['story_text'][int(answer[0]):int(answer[1])+1].split()
          max_word_index = min_word_index + len(answer_split)
          word_result += ' '.join(line['story_text'].split()[min_word_index:max_word_index]) + ' '
        elif mode == 'spacy':
          before_split = nlp(story_text_split[0])
          before_tokens = [token.text for token in before_split]
          min_word_index = len(before_tokens)
          answer_split = nlp(line['story_text'][int(answer[0]):int(answer[1])+1])
          answer_tokens = [token.text for token in answer_split]
          max_word_index = min_word_index + len(answer_tokens)
          word_split = nlp(line['story_text'])
          word_tokens = [token.text for token in word_split]
          word_result += ' '.join(word_tokens[min_word_index:max_word_index]) + ' '
        elif mode == 'bert':
          before_split = tokenizer.tokenize(story_text_split[0])
          min_word_index = len(before_split)
          answer_split = tokenizer.tokenize(line['story_text'][int(answer[0]):int(answer[1])+1])
          max_word_index = min_word_index + len(answer_split)
          word_split = tokenizer.tokenize(line['story_text'])
          final_word_split = []
          for token in word_split[min_word_index:max_word_index]:
            if token[:2] == '##':
              final_word_split.append(token[2:])
            else:
              final_word_split.append(token)
          word_result += ' '.join(final_word_split) + ' '
        min_answers += str(min_word_index)
        max_answers += str(max_word_index-1)
        if j < len(answers) - 1:
          min_answers += ','
          max_answers += ','
      if i == 0:
        answer1_char.append(char_result)
        answer1_min_word_index.append(min_answers)
        answer1_max_word_index.append(max_answers)
        answer1_word.append(word_result)
      elif i == 1:
        answer2_char.append(char_result)
        answer2_min_word_index.append(min_answers)
        answer2_max_word_index.append(max_answers)
        answer2_word.append(word_result)
  newsqa_df = newsqa_df.drop('answer_char_ranges', axis=1)
  newsqa_df['char_start_index_1'] = answer1_min_char_index
  newsqa_df['char_end_index_1'] = answer1_max_char_index
  newsqa_df['char_text_1'] = answer1_char
  newsqa_df['char_start_index_2'] = answer2_min_char_index
  newsqa_df['char_end_index_2'] = answer2_max_char_index
  newsqa_df['char_text_2'] = answer2_char
  newsqa_df['word_start_index_1'] = answer1_min_word_index
  newsqa_df['word_end_index_1'] = answer1_max_word_index
  newsqa_df['word_text_1'] = answer1_word
  newsqa_df['word_start_index_2'] = answer2_min_word_index
  newsqa_df['word_end_index_2'] = answer2_max_word_index
  newsqa_df['word_text_2'] = answer2_word
  newsqa_df['story_text'] = new_story_text
  # Remove rows where the char answer does not match the word answer
  indices_to_drop = []
  for index, line in newsqa_df.iterrows():
    real_char_answer_1 = [a.lower() for a in newsqa_df['char_text_1'][index] if a not in ' \n']
    real_word_answer_1 = [a.lower() for a in newsqa_df['word_text_1'][index] if a not in ' \n']
    real_char_answer_2 = [a.lower() for a in newsqa_df['char_text_2'][index] if a not in ' \n']
    real_word_answer_2 = [a.lower() for a in newsqa_df['word_text_2'][index] if a not in ' \n']
    if real_char_answer_1 != real_word_answer_1 or real_char_answer_2 != real_word_answer_2:
      indices_to_drop.append(index)
  newsqa_df = newsqa_df.drop(newsqa_df.index[indices_to_drop])
  newsqa_df.index = range(len(newsqa_df))
  print("You have eliminated {} rows where the char answer did not match the word answer.".format(len(indices_to_drop)))
  print("There are {} rows remaining.".format(len(newsqa_df)))
  return newsqa_df

In [0]:
import pandas as pd
newsqa_df = pd.read_csv('gdrive/Shared drives/CIS 700-1 Final Project/Data/combined-newsqa-data-v1.csv')
newsqa_df = preprocess_csv(newsqa_df, "spacy")
newsqa_df

Total number of questions: 119633
You have eliminated 6646 bad questions.
There are 112987 rows remaining.
Here are the eliminated (CNN) patterns: ['(CNN) -- ', '(CNN)  -- ', '(CNN)      -- ', '(CNN) --  ', '(CNN) --   ', '(CNN)  --  ', '(CNN)    -- ', '(CNN)   -- ', '(CNN) --     ', '(CNN)  --   ']
You have eliminated 10713 rows where the min answer index was negative after removing the (CNN) patterns.
There are 102274 rows remaining.
You have eliminated 544 rows where the char answer did not match the word answer.
There are 101730 rows remaining.


Unnamed: 0,story_id,question,is_answer_absent,is_question_bad,validated_answers,story_text,char_start_index_1,char_end_index_1,char_text_1,char_start_index_2,char_end_index_2,char_text_2,word_start_index_1,word_end_index_1,word_text_1,word_start_index_2,word_end_index_2,word_text_2
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,What was the amount of children murdered?,0.0,0.0,"{""none"": 1, ""294:297"": 2}",A high court in northern India on Friday acqui...,268,270,19,,,,54,54,19,,,
1,./cnn/stories/c48228a52f26aca65c31fad273e66164...,Where was one employee killed?,0.0,0.0,,Fighting in the volatile Sudanese region of Da...,25,50,Sudanese region of Darfur,1601,1608,"Seleia,",4,7,Sudanese region of Darfur,302,303,"Seleia ,"
2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,who did say South Africa did not issue a visa ...,0.0,0.0,"{""839:853"": 1, ""103:127"": 2}",Miffed by a visa delay that led the Dalai Lama...,81,104,Archbishop Desmond Tutu,92,104,Desmond Tutu,18,20,Archbishop Desmond Tutu,19,20,Desmond Tutu
3,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,How many years old was the businessman?,0.0,0.0,,England international footballer Steven Gerrar...,528,539,29-year-old,528,539,29-year-old,99,101,29-year - old,99,101,29-year - old
4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,What frightened the families?,0.0,0.0,"{""688:791"": 2, ""690:742"": 1}","At least 6,000 Christians have fled the northe...",666,717,series of killings and threats by Muslim extre...,664,766,a series of killings and threats by Muslim ext...,125,132,series of killings and threats by Muslim extre...,124,143,a series of killings and threats by Muslim ext...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101725,./cnn/stories/7c06e091d7294c87ba42df50008783d9...,what is this pattern is all about?,0.0,0.5,"{""1624:1829"": 2}",They feature characters such as hat-wearing ca...,1561158016151821217121922317,1571160218191894218723122406,"reassuring of 'home, away, home,' ""The basic p...",,,,317319332380463469494,317327379394467492514,"reassuring of ' home , away , home , ' "" The b...",,,
101726,./cnn/stories/4424c8580952975a3e367176a215c787...,is toyota under fire issues on sticking gas pe...,1.0,0.0,,Without issuing a recall of its iconic Prius h...,,,,,,,,,,,,
101727,./cnn/stories/7b2b414d8cbc968f4df05bcefb2f9f0f...,what are the men being detained for,0.0,0.0,"{""2386:2435"": 2}",Three of five Americans contractors detained i...,2363,2411,suspects in connection with Kitterman's slayin...,1123,1143,"""illegal substances""",444,452,suspects in connection with Kitterman 's slayi...,214,217,""" illegal substances """
101728,./cnn/stories/4566e90ca5e65f0323c41319030ca434...,In what year didIvory Coast exit in group stag...,0.0,0.0,,Didier Drogba is backing his Ivory Coast team ...,1250,1254,2006,1250,1254,2006,257,257,2006,257,257,2006


In [0]:
newsqa_df.to_csv('gdrive/Shared drives/CIS 700-1 Final Project/Data/combined-newsqa-data-v2.csv', index=False)

In [0]:
newsqa_df[:10].to_csv('gdrive/Shared drives/CIS 700-1 Final Project/Data/mini-combined-newsqa-data-v2.csv', index=False)

In [0]:
def preprocess_json(data, mode="split", only_answerable=False):
  # Convert json data to csv format
  newsqa_data2 = {'story_id': [], 'question': [], 'answer_char_ranges': [], 'is_answer_absent': [], 'is_question_bad': [], 'validated_answers': [], 'story_text': []}
  for story in data['data']:
    for question in story['questions']:
      newsqa_data2['story_id'].append(story['storyId'])
      newsqa_data2['question'].append(question['q'])
      if 'badQuestion' in question['consensus'] and question['consensus']['badQuestion'] == True or 'noAnswer' in question['consensus'] and question['consensus']['noAnswer'] == True:
        newsqa_data2['answer_char_ranges'].append('None')
      else:
        newsqa_data2['answer_char_ranges'].append(str(question['consensus']['s'])+':'+str(question['consensus']['e']))
      newsqa_data2['is_answer_absent'].append(str(question['isAnswerAbsent']))
      if 'isQuestionBad' not in question:
        newsqa_data2['is_question_bad'].append('0.0')
      else:
        newsqa_data2['is_question_bad'].append(str(question['isQuestionBad']))
      validated_answers = ''
      if 'validatedAnswers' not in question:
        validated_answers = 'NaN'
      else:
        num_answer_found = False
        for index, answer in enumerate(question['validatedAnswers']):
          if index == 0:
            validated_answers += '{'
          if 's' in answer and 'e' in answer:
            validated_answers += '\"' + str(answer['s']) + ':' + str(answer['e']) + '\": ' + str(answer['count'])
            num_answer_found = True
          elif 'badQuestion' in answer or 'noAnswer' in answer:
            validated_answers += '\"none\": ' + str(answer['count'])
          if index < len(question['validatedAnswers'])-1:
            validated_answers += ', '
          else:
            validated_answers += '}'
        if not num_answer_found:
          validated_answers = 'NaN'
      newsqa_data2['validated_answers'].append(validated_answers)
      newsqa_data2['story_text'].append(story['text'])
  newsqa_df2 = pd.DataFrame.from_dict(newsqa_data2)
  print('Total number of questions:', len(newsqa_df2))
  # Eliminate bad questions
  bad_question_length = len(newsqa_df2[newsqa_df2['is_question_bad'] == '1.0'])
  newsqa_df2 = newsqa_df2[newsqa_df2['is_question_bad'] != '1.0']
  newsqa_df2.index = range(len(newsqa_df2))
  print("You have eliminated {} bad questions.".format(bad_question_length))
  print("There are {} rows remaining.".format(len(newsqa_df2)))
  # Remove "(CNN) --" patterns in the beginning of story text and eliminate rows where the min answer index becomes negative after removing (CNN) pattern
  import re
  cnn_list = []
  indices_to_drop = []
  for index, line in newsqa_df2.iterrows():
    if re.search('\(CNN\) +-', line['story_text']):
      end_index = line['story_text'].index('(CNN)')
      end_index += 5
      while line['story_text'][end_index] == ' ':
        end_index += 1
      end_index += 2
      while line['story_text'][end_index] == ' ':
        end_index += 1
      cnn = line['story_text'][line['story_text'].index('(CNN)'):end_index]
      if cnn not in cnn_list:
        cnn_list.append(cnn)
      if line['answer_char_ranges'] != 'None':
        answer = line['answer_char_ranges'].split(':')
        if int(answer[0])-(line['story_text'].index(cnn)+len(cnn)) < 0:
            indices_to_drop.append(index)
    else:
      indices_to_drop.append(index)
  print("Here are the eliminated (CNN) patterns:", cnn_list)
  newsqa_df2 = newsqa_df2.drop(newsqa_df2.index[indices_to_drop])
  newsqa_df2.index = range(len(newsqa_df2))
  print("You have eliminated {} rows where the min answer index was negative after removing the (CNN) patterns.".format(len(indices_to_drop)))
  print("There are {} rows remaining.".format(len(newsqa_df2)))
  # Preprocess according to tokenization mode
  if mode == "spacy":
    import spacy
    nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'textcat'])
  elif mode == "bert":
    !pip install transformers
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  print("Preprocessing...")
  answer_min_char_index = []
  answer_max_char_index = []
  answer_char = []
  answer_min_word_index = []
  answer_max_word_index = []
  answer_word = []
  new_story_text = []
  for index, line in newsqa_df2.iterrows():
    if line['answer_char_ranges'] == 'None':
      answer_min_char_index.append('None')
      answer_max_char_index.append('None')
      answer_char.append('None')
      answer_min_word_index.append('None')
      answer_max_word_index.append('None')
      answer_word.append('None')
    for cnn in cnn_list:
      if cnn in line['story_text']:
        if line['answer_char_ranges'] != 'None':
          answer = line['answer_char_ranges'].split(':')
          answer[0] = int(answer[0])-(line['story_text'].index(cnn)+len(cnn))
          answer[1] = int(answer[1])-(line['story_text'].index(cnn)+len(cnn))
          line['answer_char_ranges'] = str(answer[0])+':'+str(answer[1]-1)
          answer_min_char_index.append(str(answer[0]))
          answer_max_char_index.append(str(answer[1]-1))
        line['story_text'] = line['story_text'][line['story_text'].index(cnn)+len(cnn):]
    new_story_text.append(line['story_text'])
    if line['answer_char_ranges'] == 'None':
      continue
    answer = line['answer_char_ranges'].split(':')
    char_result = line['story_text'][int(answer[0]):int(answer[1])+1]
    story_text_split = line['story_text'].split(line['story_text'][int(answer[0]):int(answer[1])+1])
    if mode == 'split':
      before_split = story_text_split[0].split()
      min_word_index = len(before_split)
      answer_split = line['story_text'][int(answer[0]):int(answer[1])+1].split()
      max_word_index = min_word_index + len(answer_split)
      word_result = ' '.join(line['story_text'].split()[min_word_index:max_word_index]) + ' '
    elif mode == 'spacy':
      before_split = nlp(story_text_split[0])
      before_tokens = [token.text for token in before_split]
      min_word_index = len(before_tokens)
      answer_split = nlp(line['story_text'][int(answer[0]):int(answer[1])+1])
      answer_tokens = [token.text for token in answer_split]
      max_word_index = min_word_index + len(answer_tokens)
      word_split = nlp(line['story_text'])
      word_tokens = [token.text for token in word_split]
      word_result = ' '.join(word_tokens[min_word_index:max_word_index]) + ' '
    elif mode == 'bert':
      before_split = tokenizer.tokenize(story_text_split[0])
      min_word_index = len(before_split)
      answer_split = tokenizer.tokenize(line['story_text'][int(answer[0]):int(answer[1])+1])
      max_word_index = min_word_index + len(answer_split)
      word_split = tokenizer.tokenize(line['story_text'])
      final_word_split = []
      for token in word_split[min_word_index:max_word_index]:
        if token[:2] == '##':
          final_word_split.append(token[2:])
        else:
          final_word_split.append(token)
      word_result = ' '.join(final_word_split) + ' '
    min_answers = str(min_word_index)
    max_answers = str(max_word_index-1)
    answer_char.append(char_result)
    answer_min_word_index.append(str(min_word_index))
    answer_max_word_index.append(str(max_word_index-1))
    answer_word.append(word_result)
  newsqa_df2 = newsqa_df2.drop('answer_char_ranges', axis=1)
  newsqa_df2['char_start_index'] = answer_min_char_index
  newsqa_df2['char_end_index'] = answer_max_char_index
  newsqa_df2['char_text'] = answer_char
  newsqa_df2['word_start_index'] = answer_min_word_index
  newsqa_df2['word_end_index'] = answer_max_word_index
  newsqa_df2['word_text'] = answer_word
  newsqa_df2['story_text'] = new_story_text
  # Remove rows where the char answer does not match the word answer
  indices_to_drop = []
  for index, line in newsqa_df2.iterrows():
    real_char_answer = [a.lower() for a in newsqa_df2['char_text'][index] if a not in ' \n']
    real_word_answer = [a.lower() for a in newsqa_df2['word_text'][index] if a not in ' \n']
    if real_char_answer != real_word_answer:
      indices_to_drop.append(index)
  newsqa_df2 = newsqa_df2.drop(newsqa_df2.index[indices_to_drop])
  newsqa_df2.index = range(len(newsqa_df2))
  print("You have eliminated {} rows where the char answer did not match the word answer.".format(len(indices_to_drop)))
  print("There are {} rows remaining.".format(len(newsqa_df2)))
  if only_answerable:
    newsqa_df2 = newsqa_df2[newsqa_df2['char_text'] != 'None']
    newsqa_df2.index = range(len(newsqa_df2))
    print("There are {} answerable questions remaining.".format(len(newsqa_df2)))
  return newsqa_df2

In [0]:
import json
with open('gdrive/Shared drives/CIS 700-1 Final Project/Data/combined-newsqa-data-v1.json') as f:
  data = json.load(f)
newsqa_df2 = preprocess_json(data, "split", True)
newsqa_df2

Total number of questions: 119633
You have eliminated 6646 bad questions.
There are 112987 rows remaining.
Here are the eliminated (CNN) patterns: ['(CNN) -- ', '(CNN)  -- ', '(CNN)      -- ', '(CNN) --  ', '(CNN) --   ', '(CNN) --     ', '(CNN)  --  ', '(CNN)    -- ', '(CNN)   -- ', '(CNN)  --   ']
You have eliminated 9956 rows where the min answer index was negative after removing the (CNN) patterns.
There are 103031 rows remaining.
Preprocessing...
You have eliminated 121 rows where the char answer did not match the word answer.
There are 102910 rows remaining.
There are 77693 answerable questions remaining.


Unnamed: 0,story_id,question,is_answer_absent,is_question_bad,validated_answers,story_text,char_start_index,char_end_index,char_text,word_start_index,word_end_index,word_text
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,What was the amount of children murdered?,0.0,0.0,"{""none"": 1, ""294:297"": 2}",A high court in northern India on Friday acqui...,268,270,19,48,48,19
1,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,When was Pandher sentenced to death?,0.0,0.0,,A high court in northern India on Friday acqui...,235,244,February.\n,42,42,February.
2,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,The court aquitted Moninder Singh Pandher of w...,0.0,0.0,"{""624:640"": 2}",A high court in northern India on Friday acqui...,598,613,rape and murder,104,106,rape and murder
3,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,who was acquitted,0.0,0.0,,A high court in northern India on Friday acqui...,169,191,Moninder Singh Pandher,30,32,Moninder Singh Pandher
4,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,who was sentenced,0.33333333333299997,0.0,"{""195:218"": 2}",A high court in northern India on Friday acqui...,169,191,Moninder Singh Pandher,30,32,Moninder Singh Pandher
...,...,...,...,...,...,...,...,...,...,...,...,...
77688,./cnn/stories/a065926962ac486d89602e4e7d774f47...,Charges were dropped against which professor?,0.0,0.333333333333,,"The commissioner of the Cambridge, Massachuset...",161,183,"Henry Louis Gates Jr.,",21,24,"Henry Louis Gates Jr.,"
77689,./cnn/stories/a065926962ac486d89602e4e7d774f47...,What did the officer say?,0.0,0.0,"{""4928:5076"": 2}","The commissioner of the Cambridge, Massachuset...",4919,5066,"""While I was led to believe that Gates was law...",772,797,"""While I was led to believe that Gates was law..."
77690,./cnn/stories/a065926962ac486d89602e4e7d774f47...,What happened to the professor?,0.0,0.0,"{""751:897"": 2}","The commissioner of the Cambridge, Massachuset...",742,887,Gates was arrested for disorderly conduct afte...,118,141,Gates was arrested for disorderly conduct afte...
77691,./cnn/stories/52a68e9f8f4d36a669e207d66273a2d7...,what did cooey say,0.0,0.0,,An Ohio death row inmate who says he is too ov...,34,69,he is too overweight to be executed,7,13,he is too overweight to be executed


In [0]:
newsqa_df2.to_csv('gdrive/Shared drives/CIS 700-1 Final Project/Data/combined-newsqa-data-v2-json.csv', index=False)

In [0]:
newsqa_df2[:10].to_csv('gdrive/Shared drives/CIS 700-1 Final Project/Data/mini-combined-newsqa-data-v2-json.csv', index=False)