In [1]:
import itertools
import pandas as pd
import collections
import numpy as np
import re
import pickle

df_train = pd.read_csv('./data/WikiQACorpus/WikiQA-train.tsv', sep='\t')
df_test = pd.read_csv('./data/WikiQACorpus/WikiQA-test.tsv', sep='\t')
print('Shape_train: ', df_train.shape)
print('Shape_test: ', df_test.shape)
df_train.head(12)

Shape_train:  (20347, 7)
Shape_test:  (6116, 7)


Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0
5,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-0,"In physics , circular motion is a movement of ...",0
6,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-1,"It can be uniform, with constant angular rate ...",0
7,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-2,The rotation around a fixed axis of a three-di...,0
8,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-3,The equations of motion describe the movement ...,0
9,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-4,Examples of circular motion include: an artifi...,0


In [2]:
def saveobj(obj, fname):
    with open(fname, 'wb') as fin:
        pickle.dump(obj, fin, pickle.HIGHEST_PROTOCOL)

In [3]:
sub = r"[^A-Za-z]+"
df_train.loc[:,'Question'] = df_train.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.loc[:,'Sentence'] = df_train.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Question'] = df_test.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Sentence'] = df_test.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0


In [4]:
class Voc:
    def __init__(self):
        self.token2index = {'<PAD>':0, '<SOS>':1, '<EOS>':2, '<UNK>':3}
        self.index2token = {v:k for k, v in self.token2index.items()}
        self.voclen = len(self.token2index)
        self.__lookslike_len__ = 10
    
    def extend_vocab(self, iterable):
        if not isinstance(iterable, collections.Iterable):
            raise ValueError('Value must be an iterable.')
        else:
            iterable = set(iterable)
            iterable = iterable - self.token2index.keys()
            ids = range(self.voclen, len(iterable)+self.voclen)
            self.token2index.update(dict(zip(iterable, ids)))
            self.index2token = {v:k for k, v in self.token2index.items()}
            self.voclen = len(self.token2index)
            
    def __call__(self):
        print('Vocabulary size: ', self.voclen)
        print('token2index looks like: ', list(self.token2index.items())[:self.__lookslike_len__], ', ...')
        print('index2token looks like: ', list(self.index2token.items())[:self.__lookslike_len__], ', ...')

In [5]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0


In [6]:
df_test.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0


In [7]:
voc = Voc()
for colname in ['Question', 'Sentence']:
    print('Name of column: ', colname)
    s = map(lambda x: x.split(), df_train.loc[:, colname])
    s = set(itertools.chain.from_iterable(s))
    print('Size before vocabulary extending: ', voc.voclen)
    voc.extend_vocab(s)
    print('Size after vocabulary extending: ', voc.voclen)

Name of column:  Question
Size before vocabulary extending:  4
Size after vocabulary extending:  3947
Name of column:  Sentence
Size before vocabulary extending:  3947
Size after vocabulary extending:  29340


In [8]:
%%time
df_train.loc[:, 'Question_encoded'] = df_train.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_train.loc[:, 'Sentence_encoded'] = df_train.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

df_test.loc[:, 'Question_encoded'] = df_test.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_test.loc[:, 'Sentence_encoded'] = df_test.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

CPU times: user 572 ms, sys: 16 ms, total: 588 ms
Wall time: 584 ms


In [9]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,"[2622, 3473, 2381, 524, 3311]","[1878, 4448, 10810, 2381, 21552, 1328, 24841, ..."
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0,"[2622, 3473, 2381, 524, 3311]","[474, 40, 24270, 383, 13999, 2461, 2387]"
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,"[2622, 3473, 2381, 524, 3311]","[40, 11084, 1433, 474, 25610, 2381, 21552]"
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,"[2622, 3473, 2381, 524, 3311]","[1878, 2381, 21552, 383, 1878, 21552, 3311, 18..."
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0,"[2622, 3473, 2381, 524, 3311]","[2381, 524, 3473, 988, 354, 40, 524, 21447, 38..."


In [10]:
df_test.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625]","[3075, 26695, 2756, 474, 12, 3583, 20351, 2756..."
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625]","[474, 1775, 3075, 1433, 474, 8908, 52, 3814, 2..."
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625]","[3579, 474, 26695, 950, 2682, 2942, 52, 2756, ..."
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625]","[3075, 2937, 1433, 474, 12, 3583, 3449, 3579, ..."
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625]","[498, 2431, 2270, 3579, 1976, 432, 21255, 2113..."


In [11]:
Q_MAXLEN = df_train.Question_encoded.apply(len).max()
S_MAXLEN = df_train.Sentence_encoded.apply(len).max()

def pad_sequence(seq, max_len, padding=0, cut_last=True):
    seq = seq + [padding] * (max_len - len(seq))
    if cut_last:
        seq = seq[:max_len]
    else:
        seq = seq[-max_len:]
    assert len(seq) == max_len
    return np.array(seq)

df_train.loc[:, 'Question_encoded'] = df_train.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_train.loc[:, 'Sentence_encoded'] = df_train.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

df_test.loc[:, 'Question_encoded'] = df_test.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_test.loc[:, 'Sentence_encoded'] = df_test.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

In [12]:
df_train

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[1878, 4448, 10810, 2381, 21552, 1328, 24841, ..."
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[474, 40, 24270, 383, 13999, 2461, 2387, 0, 0,..."
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[40, 11084, 1433, 474, 25610, 2381, 21552, 0, ..."
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[1878, 2381, 21552, 383, 1878, 21552, 3311, 18..."
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[2381, 524, 3473, 988, 354, 40, 524, 21447, 38..."
5,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-0,in physics circular motion is a movement of an...,0,"[2622, 3473, 474, 2925, 52, 474, 1776, 950, 94...","[1433, 1923, 3342, 2592, 383, 1878, 1575, 52, ..."
6,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-1,it can be uniform with constant angular rate o...,0,"[2622, 3473, 474, 2925, 52, 474, 1776, 950, 94...","[569, 3124, 2028, 1634, 3033, 3681, 24467, 103..."
7,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-2,the rotation around a fixed axis of a three di...,0,"[2622, 3473, 474, 2925, 52, 474, 1776, 950, 94...","[474, 11650, 68, 1878, 2385, 16332, 52, 1878, ..."
8,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-3,the equations of motion describe the movement ...,0,"[2622, 3473, 474, 2925, 52, 474, 1776, 950, 94...","[474, 27996, 52, 2592, 3562, 474, 1575, 52, 47..."
9,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-4,examples of circular motion include an artific...,0,"[2622, 3473, 474, 2925, 52, 474, 1776, 950, 94...","[19830, 52, 3342, 2592, 2431, 2260, 17521, 168..."


In [13]:
df_test

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[3075, 26695, 2756, 474, 12, 3583, 20351, 2756..."
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[474, 1775, 3075, 1433, 474, 8908, 52, 3814, 2..."
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[3579, 474, 26695, 950, 2682, 2942, 52, 2756, ..."
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[3075, 2937, 1433, 474, 12, 3583, 3449, 3579, ..."
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[498, 2431, 2270, 3579, 1976, 432, 21255, 2113..."
5,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-5,as such african immigrants are to be distingui...,1,"[2622, 3075, 1688, 2123, 3, 2756, 474, 1625, 0...","[1805, 20586, 3075, 2937, 3473, 2756, 2028, 13..."
6,Q3,how large were early jails,D3,Prison,D3-0,a prison from old french prisoun also known as...,0,"[2622, 3394, 2123, 1781, 26316, 0, 0, 0, 0, 0,...","[1878, 8612, 3579, 1401, 21418, 3, 13521, 18, ..."
7,Q3,how large were early jails,D3,Prison,D3-1,imprisonment or incarceration is a legal penal...,0,"[2622, 3394, 2123, 1781, 26316, 0, 0, 0, 0, 0,...","[28217, 968, 23839, 383, 1878, 1200, 22268, 23..."
8,Q3,how large were early jails,D3,Prison,D3-2,other terms used are penitentiary correctional...,0,"[2622, 3394, 2123, 1781, 26316, 0, 0, 0, 0, 0,...","[1602, 3821, 568, 3473, 26491, 22705, 28556, 3..."
9,Q3,how large were early jails,D3,Prison,D3-3,in some legal systems some of these terms have...,0,"[2622, 3394, 2123, 1781, 26316, 0, 0, 0, 0, 0,...","[1433, 641, 1200, 933, 641, 52, 28989, 3821, 2..."


In [15]:
df_train.to_pickle('./data/processed/wikiqa_df_train.pickle')
df_test.to_pickle('./data/processed/wikiqa_df_test.pickle')

In [16]:
voc_items = {'index2token':voc.index2token,
             'token2index':voc.token2index,
             'voc_len':voc.voclen}
saveobj(voc_items, './data/processed/vocabulary.pickle')