# Pre-process SQuAD v1.1 QA training and dev data

In [0]:
# Mount Google Drive at /gdrive; every data path used below lives under '/gdrive/My Drive'.
from google.colab import drive
drive.mount('/gdrive')

In [0]:
import numpy as np
import tensorflow_datasets as tfds
import pandas as pd

In [0]:
# Sanity check for the Drive mount: write a small text file into the data folder
# and print it back with a shell `cat`.
with open('/gdrive/My Drive/Colab Notebooks/SQuad/data/foo.txt', 'w') as f:
  f.write('Hello Google Drive!')
!cat '/gdrive/My Drive/Colab Notebooks/SQuad/data/foo.txt'

Hello Google Drive!

## Common Functions

In [0]:
# Load a SQuAD-format JSON file into a pandas DataFrame
import json
def get_pd_data (filepath):
    """Flatten a SQuAD-style JSON file into one DataFrame row per question.

    Parameters
    ----------
    filepath : str
        Path to a JSON file with a top-level ``data`` list of articles. Each
        article has ``paragraphs``; each paragraph has a ``context`` string and
        a ``qas`` list whose items carry ``id``, ``question`` and ``answers``
        (each answer having ``text`` and ``answer_start``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Context``, ``Question``, ``Answers``, ``Start`` with a
        0..n-1 index. ``ID`` and ``Question`` are one-element lists of str,
        ``Context`` is the paragraph string, ``Answers`` is a list of answer
        texts and ``Start`` the matching list of integer character offsets.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If an expected key is missing from the JSON structure.
    """
    # Read explicitly as UTF-8 so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filepath, encoding='utf-8') as f:
        Squad = json.load(f)

    columns = ['ID', 'Context', 'Question', 'Answers', 'Start']

    # Collect plain records and build the frame once: appending to a DataFrame
    # inside the loop copies the whole frame on every question (quadratic) and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for article in Squad['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for question in paragraph['qas']:
                answers = question['answers']
                records.append({
                    'ID': [str(question['id'])],
                    'Context': context,
                    'Question': [str(question['question'])],
                    'Answers': [str(answer['text']) for answer in answers],
                    'Start': [int(answer['answer_start']) for answer in answers],
                })

    # Passing `columns` keeps the expected header even when there are no rows.
    return pd.DataFrame(records, columns=columns)

In [0]:
# Peek at the raw question/answer entries of the first paragraph of the training file.
# The JSON is loaded here explicitly: inside get_pd_data `Squad` is only a local
# variable, so this cell would raise NameError on a fresh kernel without it.
with open('/gdrive/My Drive/Colab Notebooks/SQuad/data/train-v1.1.json', encoding='utf-8') as f:
    Squad = json.load(f)
Squad['data'][0]['paragraphs'][0]['qas']

[{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}],
  'id': '5733be284776f41900661182',
  'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'},
 {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ'}],
  'id': '5733be284776f4190066117f',
  'question': 'What is in front of the Notre Dame Main Building?'},
 {'answers': [{'answer_start': 279, 'text': 'the Main Building'}],
  'id': '5733be284776f41900661180',
  'question': 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?'},
 {'answers': [{'answer_start': 381,
    'text': 'a Marian place of prayer and reflection'}],
  'id': '5733be284776f41900661181',
  'question': 'What is the Grotto at Notre Dame?'},
 {'answers': [{'answer_start': 92,
    'text': 'a golden statue of the Virgin Mary'}],
  'id': '5733be284776f4190066117e',
  'question': 'What sits on top of the Main Building at Notre Dame?'}]

## Pre-process training data


In [0]:
# Rebind `data` to an empty string first (presumably to drop any previously loaded
# frame before loading — TODO confirm), then parse the training JSON into a DataFrame.
data=''
mypath='/gdrive/My Drive/Colab Notebooks/SQuad/data/train-v1.1.json'
data=get_pd_data(mypath)

In [0]:
data.tail(3)

Unnamed: 0,ID,Context,Question,Answers,Start
87596,[5735d259012e2f140011a09f],"Kathmandu Metropolitan City (KMC), in order to...",[With what Belorussian city does Kathmandu hav...,[Minsk],[476]
87597,[5735d259012e2f140011a0a0],"Kathmandu Metropolitan City (KMC), in order to...",[In what year did Kathmandu create its initial...,[1975],[199]
87598,[5735d259012e2f140011a0a1],"Kathmandu Metropolitan City (KMC), in order to...",[What is KMC an initialism of?],[Kathmandu Metropolitan City],[0]


In [0]:
data.iloc[87596]['Question']

['With what Belorussian city does Kathmandu have a relationship?']

In [0]:
# Install NLTK, download the Punkt models and load the English sentence tokenizer.
# `sent_detector` is the global tokenizer used by the prepare_mod1_tar* functions below.
!pip install nltk
import nltk.data
nltk.download('punkt')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def prepare_mod1_tar(row):
    """Split a row's context into sentences and one-hot encode the answer sentence.

    Parameters
    ----------
    row : pandas.Series
        A row with ``Context`` (paragraph string) and ``Start`` (list of
        integer character offsets into that string).

    Returns
    -------
    pandas.Series
        The same row with ``Context`` replaced by the list of sentences
        produced by the module-level ``sent_detector`` and a new ``Target``
        entry: a float numpy array of length ``len(sentences)`` holding 1 at
        the sentence containing the answer start and 0 elsewhere. When
        ``Start`` has several offsets only the last one is kept; when it is
        empty ``Target`` stays ``[]``.
    """
    context = row['Context']

    # span_tokenize yields the (start, end) character offsets of the sentences,
    # so an answer offset maps straight onto a sentence index. This replaces
    # the insert-a-marker-and-re-tokenize heuristic, whose +/-1 correction
    # could yield index -1 (silently marking the last sentence) or run past
    # the end of the target array.
    spans = list(sent_detector.span_tokenize(context))
    sentences = [context[begin:end] for begin, end in spans]

    row['Target'] = []
    for startn in row['Start']:
        one_hot = np.zeros(len(sentences))
        if spans:
            # First sentence ending after the offset. An offset in the
            # whitespace between two sentences goes to the following one, and
            # an offset past the last sentence goes to the last sentence.
            hit = next(
                (idx for idx, (_, end) in enumerate(spans) if startn < end),
                len(spans) - 1,
            )
            one_hot[hit] = 1
        row['Target'] = one_hot
    row['Context'] = sentences
    return row


In [0]:
test=data.copy()

In [0]:
# Row by row: replace Context with its sentence list and add the Target column.
test=test.apply(prepare_mod1_tar,axis=1)


In [0]:
test.tail(4)

Unnamed: 0,ID,Context,Question,Answers,Start,Target
87595,[5735d259012e2f140011a09e],"[Kathmandu Metropolitan City (KMC), in order t...",[What was Yangon previously known as?],[Rangoon],[414],"[0.0, 0.0, 1.0, 0.0]"
87596,[5735d259012e2f140011a09f],"[Kathmandu Metropolitan City (KMC), in order t...",[With what Belorussian city does Kathmandu hav...,[Minsk],[476],"[0.0, 0.0, 1.0, 0.0]"
87597,[5735d259012e2f140011a0a0],"[Kathmandu Metropolitan City (KMC), in order t...",[In what year did Kathmandu create its initial...,[1975],[199],"[0.0, 1.0, 0.0, 0.0]"
87598,[5735d259012e2f140011a0a1],"[Kathmandu Metropolitan City (KMC), in order t...",[What is KMC an initialism of?],[Kathmandu Metropolitan City],[0],"[1.0, 0.0, 0.0, 0.0]"


In [0]:
# Persist the processed training frame to Drive as a pickle.
test.to_pickle('/gdrive/My Drive/Colab Notebooks/SQuad/data/train_v1_pd.pkl')

## Pre-process Model V4 dev data


In [0]:
# Load a SQuAD-format JSON file into a pandas DataFrame.
# NOTE: this duplicates the get_pd_data defined under "Common Functions".
import json
def get_pd_data (filepath):
    """Flatten a SQuAD-style JSON file into one DataFrame row per question.

    Parameters
    ----------
    filepath : str
        Path to a JSON file with a top-level ``data`` list of articles. Each
        article has ``paragraphs``; each paragraph has a ``context`` string and
        a ``qas`` list whose items carry ``id``, ``question`` and ``answers``
        (each answer having ``text`` and ``answer_start``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Context``, ``Question``, ``Answers``, ``Start`` with a
        0..n-1 index. ``ID`` and ``Question`` are one-element lists of str,
        ``Context`` is the paragraph string, ``Answers`` is a list of answer
        texts and ``Start`` the matching list of integer character offsets.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If an expected key is missing from the JSON structure.
    """
    # Read explicitly as UTF-8 so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filepath, encoding='utf-8') as f:
        Squad = json.load(f)

    columns = ['ID', 'Context', 'Question', 'Answers', 'Start']

    # Collect plain records and build the frame once: appending to a DataFrame
    # inside the loop copies the whole frame on every question (quadratic) and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for article in Squad['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for question in paragraph['qas']:
                answers = question['answers']
                records.append({
                    'ID': [str(question['id'])],
                    'Context': context,
                    'Question': [str(question['question'])],
                    'Answers': [str(answer['text']) for answer in answers],
                    'Start': [int(answer['answer_start']) for answer in answers],
                })

    # Passing `columns` keeps the expected header even when there are no rows.
    return pd.DataFrame(records, columns=columns)

In [0]:
# Rebind `data` (dropping the reference to the training frame), then parse the
# dev JSON into a DataFrame.
data=''
mypath='/gdrive/My Drive/Colab Notebooks/SQuad/data/dev-v1.1.json'
data=get_pd_data(mypath)

In [0]:
data.tail(10)

Unnamed: 0,ID,Context,Question,Answers,Start
10560,[5737a9afc3c5551400e51f61],The connection between macroscopic nonconserva...,[In what treatment are nonconservative and con...,"[statistical mechanics, statistical mechanics,...","[134, 134, 110, 110]"
10561,[5737a9afc3c5551400e51f62],The connection between macroscopic nonconserva...,[What changes macroscopic closed system energi...,"[nonconservative forces, internal energies of ...","[188, 229, 188, 188]"
10562,[5737a9afc3c5551400e51f63],The connection between macroscopic nonconserva...,[What is the exchange of heat associated with?],"[nonconservative forces, nonconservative force...","[188, 188, 188, 188]"
10563,[5737a9afc3c5551400e51f64],The connection between macroscopic nonconserva...,[What is the law of thermodynamics associated ...,"[Second, Second law of thermodynamics, Second ...","[331, 331, 331, 331]"
10564,[5737a9afc3c5551400e51f65],The connection between macroscopic nonconserva...,[What makes energy changes in a closed system?],"[nonconservative forces, nonconservative force...","[361, 361, 361, 361]"
10565,[5737aafd1c456719005744fb],"The pound-force has a metric counterpart, less...",[What is the metric term less used than the Ne...,"[kilogram-force, pound-force, kilogram-force (...","[82, 4, 82, 82, 78]"
10566,[5737aafd1c456719005744fc],"The pound-force has a metric counterpart, less...",[What is the kilogram-force sometimes reffered...,"[kilopond, kilopond, kilopond, kilopond, kilop...","[114, 114, 114, 114, 114]"
10567,[5737aafd1c456719005744fd],"The pound-force has a metric counterpart, less...",[What is a very seldom used unit of mass in th...,"[slug, metric slug, metric slug, metric slug, ...","[274, 267, 267, 267, 263]"
10568,[5737aafd1c456719005744fe],"The pound-force has a metric counterpart, less...",[What seldom used term of a unit of force equa...,"[kip, kip, kip, kip, kip]","[712, 712, 712, 712, 712]"
10569,[5737aafd1c456719005744ff],"The pound-force has a metric counterpart, less...",[What is the seldom used force unit equal to o...,"[sthène, sthène, sthène, sthène, sthène]","[665, 665, 665, 665, 665]"


In [0]:
def prepare_mod1_tar_dev(row):
    """Split a row's context into sentences and one-hot encode every answer's sentence.

    Parameters
    ----------
    row : pandas.Series
        A row with ``Context`` (paragraph string) and ``Start`` (list of
        integer character offsets into that string, one per reference answer).

    Returns
    -------
    pandas.Series
        The same row with ``Context`` replaced by the list of sentences
        produced by the module-level ``sent_detector`` and a new ``Target``
        entry: a list with one element per offset in ``Start``, each a list of
        floats of length ``len(sentences)`` holding 1.0 at the sentence
        containing that answer start and 0.0 elsewhere.
    """
    context = row['Context']

    # span_tokenize yields the (start, end) character offsets of the sentences,
    # so each answer offset maps straight onto a sentence index. This replaces
    # the insert-a-marker-and-re-tokenize heuristic, whose +/-1 correction
    # could yield index -1 (silently marking the last sentence) or run past
    # the end of the target array, and which re-tokenized once per answer.
    spans = list(sent_detector.span_tokenize(context))
    sentences = [context[begin:end] for begin, end in spans]

    targets = []
    for startn in row['Start']:
        one_hot = np.zeros(len(sentences))
        if spans:
            # First sentence ending after the offset. An offset in the
            # whitespace between two sentences goes to the following one, and
            # an offset past the last sentence goes to the last sentence.
            hit = next(
                (idx for idx, (_, end) in enumerate(spans) if startn < end),
                len(spans) - 1,
            )
            one_hot[hit] = 1
        targets.append(list(one_hot))

    row['Target'] = targets
    row['Context'] = sentences
    return row


In [0]:
test=data.copy()

In [0]:
# Row by row: replace Context with its sentence list and add one Target vector per answer.
test=test.apply(prepare_mod1_tar_dev,axis=1)

In [0]:
test.tail(5)

Unnamed: 0,ID,Context,Question,Answers,Start,Target
10565,[5737aafd1c456719005744fb],"[The pound-force has a metric counterpart, les...",[What is the metric term less used than the Ne...,"[kilogram-force, pound-force, kilogram-force (...","[82, 4, 82, 82, 78]","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
10566,[5737aafd1c456719005744fc],"[The pound-force has a metric counterpart, les...",[What is the kilogram-force sometimes reffered...,"[kilopond, kilopond, kilopond, kilopond, kilop...","[114, 114, 114, 114, 114]","[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
10567,[5737aafd1c456719005744fd],"[The pound-force has a metric counterpart, les...",[What is a very seldom used unit of mass in th...,"[slug, metric slug, metric slug, metric slug, ...","[274, 267, 267, 267, 263]","[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
10568,[5737aafd1c456719005744fe],"[The pound-force has a metric counterpart, les...",[What seldom used term of a unit of force equa...,"[kip, kip, kip, kip, kip]","[712, 712, 712, 712, 712]","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [..."
10569,[5737aafd1c456719005744ff],"[The pound-force has a metric counterpart, les...",[What is the seldom used force unit equal to o...,"[sthène, sthène, sthène, sthène, sthène]","[665, 665, 665, 665, 665]","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [..."


In [0]:
def prepare_inputs(row):
    """Add the ``Input`` and ``Validation`` entries to a row.

    ``Input`` is ``row['Question'] + row['Context']``. ``Validation`` is a list
    holding ``np.argmax`` of each element of ``row['Target']``, in order.
    The row is modified in place and returned.
    """
    row['Input'] = row['Question'] + row['Context']
    row['Validation'] = [np.argmax(one_hot) for one_hot in row['Target']]
    return row
# Add the Input and Validation columns to every dev row.
test=test.apply(prepare_inputs,axis=1)

In [0]:
test.tail(3)

Unnamed: 0,ID,Context,Question,Answers,Start,Target,Input,Validation
10567,[5737aafd1c456719005744fd],"[The pound-force has a metric counterpart, les...",[What is a very seldom used unit of mass in th...,"[slug, metric slug, metric slug, metric slug, ...","[274, 267, 267, 267, 263]","[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",[What is a very seldom used unit of mass in th...,"[1, 1, 1, 1, 1]"
10568,[5737aafd1c456719005744fe],"[The pound-force has a metric counterpart, les...",[What seldom used term of a unit of force equa...,"[kip, kip, kip, kip, kip]","[712, 712, 712, 712, 712]","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [...",[What seldom used term of a unit of force equa...,"[3, 3, 3, 3, 3]"
10569,[5737aafd1c456719005744ff],"[The pound-force has a metric counterpart, les...",[What is the seldom used force unit equal to o...,"[sthène, sthène, sthène, sthène, sthène]","[665, 665, 665, 665, 665]","[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 0.0, 1.0], [...",[What is the seldom used force unit equal to o...,"[3, 3, 3, 3, 3]"


In [0]:
# Persist the processed dev frame to Drive as a pickle.
test.to_pickle('/gdrive/My Drive/Colab Notebooks/SQuad/data/dev_v1_pd.pkl')