In [1]:
import json
import sys
import random
sys.path.append('../')
data_path = '../new_dataset/'
train_data = json.load(open(data_path +'/drop_dataset_train.json'))
dev_data = json.load(open(data_path +'/drop_dataset_dev.json'))

In [2]:
def convert_dict_to_string(dict_: dict):
    running_sentence = ""
    for k,v in dict_.items():
        running_sentence +=f' {k}:{v}'
    return running_sentence


def simplify_dataset_pack(data_pack):
    simplified_dataset =[]
    for _,data in data_pack.items():
        passage = data['passage']
        qa_pairs = data['qa_pairs']
        questions_answers = []
        answer_spans = []
        for qa in qa_pairs:
            question= qa['question']

            answer = ''

            # Check if the answer is a span or nor
            if len(qa['answer']['spans'])>0:
                answer = qa['answer']['spans']
            elif qa['answer']['number']!="":
                answer = qa['answer']['number']
            elif qa['answer']['date']['day']!="":
                answer = convert_dict_to_string(qa['answer']['date'])
            elif qa['answer']['date']['month']!="":
                answer = convert_dict_to_string(qa['answer']['date'])
            elif qa['answer']['date']['day']!="":
                answer = convert_dict_to_string(qa['answer']['date'])
            
            if answer!='':
                questions_answers.append((question,answer))
            
        if len(questions_answers)>0:
            simplified_dataset.append({'passage': passage, 'qa_pairs': questions_answers})   
    return simplified_dataset

In [3]:
train_simplified_dataset = simplify_dataset_pack(train_data)
dev_simplified_dataset = simplify_dataset_pack(dev_data)

In [43]:
random.choice([23,45])

45

In [5]:
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
def selectContext(articles, answer_index,n=4):
    vals = list(ngrams(range(len(articles)),n))
    for l in vals:
        if answer_index in l:
            return l

def get_span_sentence(span_answer,passage,n=4):
    answer = span_answer.strip()
    context = passage.strip()
    contexts = sent_tokenize(context)
    
    answer_index = [idx for idx, s in enumerate(contexts) if answer in s]
    answer_sentences = [s for idx, s in enumerate(contexts) if answer in s]

    if len(contexts) < 5 and len(answer_index) >0:
        return passage.strip(), " ".join(answer_sentences)
    if len(answer_index) < 1:
        return None
    else:
        if len(answer_index)==1:
            selected_context_idxs = selectContext(contexts, answer_index, n=n)
            if selected_context_idxs is not None:
                answer_context = ' '.join([contexts[sid] for sid in selected_context_idxs])
            else:
                answer_context = passage.strip()
        else:
            answer_context =  " ".join(answer_sentences)
    return answer_context,  " ".join(answer_sentences)

In [6]:
from src.dataset_processor import QuestionGenerationData
from src.config import GenerationTasks
def generateDropQuestions(simplified_dataset):
    data_pack = []
    for data in simplified_dataset:
        passage = data['passage']

        for q,a in data['qa_pairs']:
            if isinstance(a,list):
                a = random.choice(a)
            dob_q = QuestionGenerationData(task=GenerationTasks.context_question_gen, input_text= passage, output_text=q,contextual_text= a.strip())
            """
            dob_q = DatasetObject(question=q, context=passage.strip(),
                                  answer=a.strip(),
                                  input_text= passage,
                                  answer_sentence=a.strip(), 
                                  fact=passage, 
                                  task=GenerationTasks.context_question_gen, 
                                  task_id=3)
            """
            data_pack.append(dob_q)
    return data_pack

def generateDropQuestions_wo_contextdef(simplified_dataset):
    data_pack = []
    for data in simplified_dataset:
        passage = data['passage']
        for q,a in data['qa_pairs']:
            if isinstance(a,list):
                
                a = random.choice(a)
                obj = get_span_sentence(a,passage,n=4)
                if obj is not None:
                    #print(answer_sentence)
                    fact, answer_sentence = obj

                    dob_q = QuestionGenerationData(task=GenerationTasks.vanilla_question_gen, input_text= fact, 
                                                   output_text=q,contextual_text= answer_sentence.strip())
                    """
                    dob_q = DatasetObject(question=q, input_text=fact,
                             context=data['passage'].strip(), answer=answer_sentence, answer_sentence=answer_sentence.strip(), 
                             fact=fact, task=GenerationTasks.vanilla_question_gen, task_id=1)
                    """
                    
                    data_pack.append(dob_q)

                    #dob_a = DatasetObject(question=q, context=data['passage'].strip(), answer=answer_sentence, answer_sentence=answer_sentence.strip(), fact=fact, task='<generate_answers> ', task_id=2)
                    #data_pack.append(dob_a)
    return data_pack

In [7]:
train_data_obj1 = generateDropQuestions(simplified_dataset=train_simplified_dataset) +  generateDropQuestions_wo_contextdef(simplified_dataset=train_simplified_dataset)
#train_data_obj2 = generateDropQuestions_wo_contextdef(simplified_dataset=train_simplified_dataset)

In [8]:
dev_data_obj = generateDropQuestions(simplified_dataset=dev_simplified_dataset) +  generateDropQuestions_wo_contextdef(simplified_dataset=dev_simplified_dataset)

In [9]:
from dataclass_csv import DataclassReader, dateformat,DataclassWriter

with open("../curated_data/drop_train.csv", "w",encoding='utf-8') as f:
    w = DataclassWriter(f, train_data_obj1, QuestionGenerationData)
    w.write()

with open("../curated_data/drop_dev.csv", "w",encoding='utf-8') as f:
    w = DataclassWriter(f, dev_data_obj, QuestionGenerationData)
    w.write()

In [10]:
len(train_data_obj1)

104803

In [30]:
len(train_simplified_dataset),len(dev_simplified_dataset)

(5559, 582)

In [38]:
train_simplified_dataset[0]['passage'].split('. ')

['To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers',
 'The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson',
 'The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercepted a pass from Matthew Stafford and ran it in 28 yards',
 'The Lions responded with a 28-yard field goal',
 'In the second quarter, Detroit took the lead with a 36-yard touchdown catch by Calvin Johnson, and later added more points when Tony Scheffler caught an 11-yard TD pass',
 'Tampa Bay responded with a 31-yard field goal just before halftime',
 'The second half was relatively quiet, with each team only scoring one touchdown',
 "First, Detroit's Calvin Johnson caught a 1-yard pass in the third quarter",
 "The game's final points came when Mike Williams of Tampa Bay caught a 5-yard pass",
 ' The Lions won their regular season opener for the first time since 2007']

In [37]:
train_simplified_dataset[0]['qa_pairs'][0]

('How many points did the buccaneers need to tie in the first?', '3')

In [23]:
dev_dataset

[{'passage': " Hoping to rebound from their loss to the Patriots, the Raiders stayed at home for a Week 16 duel with the Houston Texans.  Oakland would get the early lead in the first quarter as quarterback JaMarcus Russell completed a 20-yard touchdown pass to rookie wide receiver Chaz Schilens.  The Texans would respond with fullback Vonta Leach getting a 1-yard touchdown run, yet the Raiders would answer with kicker Sebastian Janikowski getting a 33-yard and a 30-yard field goal.  Houston would tie the game in the second quarter with kicker Kris Brown getting a 53-yard and a 24-yard field goal. Oakland would take the lead in the third quarter with wide receiver Johnnie Lee Higgins catching a 29-yard touchdown pass from Russell, followed up by an 80-yard punt return for a touchdown.  The Texans tried to rally in the fourth quarter as Brown nailed a 40-yard field goal, yet the Raiders' defense would shut down any possible attempt.",
  'qa_pairs': [('Who scored the first touchdown of t

In [14]:
question

'How many yards longer was the longest field goal than the shortest?'

In [11]:
data

{'passage': " Hoping to rebound from their loss to the Patriots, the Raiders stayed at home for a Week 16 duel with the Houston Texans.  Oakland would get the early lead in the first quarter as quarterback JaMarcus Russell completed a 20-yard touchdown pass to rookie wide receiver Chaz Schilens.  The Texans would respond with fullback Vonta Leach getting a 1-yard touchdown run, yet the Raiders would answer with kicker Sebastian Janikowski getting a 33-yard and a 30-yard field goal.  Houston would tie the game in the second quarter with kicker Kris Brown getting a 53-yard and a 24-yard field goal. Oakland would take the lead in the third quarter with wide receiver Johnnie Lee Higgins catching a 29-yard touchdown pass from Russell, followed up by an 80-yard punt return for a touchdown.  The Texans tried to rally in the fourth quarter as Brown nailed a 40-yard field goal, yet the Raiders' defense would shut down any possible attempt.",
 'qa_pairs': [{'question': 'Who scored the first touc