In [20]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pandas as pd
import logging
import collections
import numpy as np
from model.model import QAMatching

logging.basicConfig(level=logging.INFO)
USE_CUDA = torch.cuda.is_available()
logging.info('USE_CUDA: {}'.format(USE_CUDA))
device = torch.device("cuda" if USE_CUDA else "cpu")

df_train = pd.read_csv('./data/WikiQACorpus/WikiQA-train.tsv', sep='\t')
df_test = pd.read_csv('./data/WikiQACorpus/WikiQA-test.tsv', sep='\t')
print('Shape_train: ', df_train.shape)
print('Shape_test: ', df_test.shape)
df_train.head(12)

INFO:root:USE_CUDA: True


Shape_train:  (20347, 7)
Shape_test:  (6116, 7)


Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed?,D1,Glacier cave,D1-0,A partly submerged glacier cave on Perito More...,0
1,Q1,how are glacier caves formed?,D1,Glacier cave,D1-1,The ice facade is approximately 60 m high,0
2,Q1,how are glacier caves formed?,D1,Glacier cave,D1-2,Ice formations in the Titlis glacier cave,0
3,Q1,how are glacier caves formed?,D1,Glacier cave,D1-3,A glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed?,D1,Glacier cave,D1-4,"Glacier caves are often called ice caves , but...",0
5,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-0,"In physics , circular motion is a movement of ...",0
6,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-1,"It can be uniform, with constant angular rate ...",0
7,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-2,The rotation around a fixed axis of a three-di...,0
8,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-3,The equations of motion describe the movement ...,0
9,Q2,How are the directions of the velocity and for...,D2,Circular motion,D2-4,Examples of circular motion include: an artifi...,0


In [21]:
sub = r"[^A-Za-z]+"
df_train.loc[:,'Question'] = df_train.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.loc[:,'Sentence'] = df_train.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Question'] = df_test.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Sentence'] = df_test.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0


In [22]:
class Voc:
    def __init__(self):
        self.token2index = {'<PAD>':0, '<SOS>':1, '<EOS>':2, '<UNK>':3}
        self.index2token = {v:k for k, v in self.token2index.items()}
        self.voclen = len(self.token2index)
        self.__lookslike_len__ = 10
    
    def extend_vocab(self, iterable):
        if not isinstance(iterable, collections.Iterable):
            raise ValueError('Value must be an iterable.')
        else:
            iterable = set(iterable)
            iterable = iterable - self.token2index.keys()
            ids = range(self.voclen, len(iterable)+self.voclen)
            self.token2index.update(dict(zip(iterable, ids)))
            self.index2token = {v:k for k, v in self.token2index.items()}
            self.voclen = len(self.token2index)
            
    def __call__(self):
        print('Vocabulary size: ', self.voclen)
        print('token2index looks like: ', list(self.token2index.items())[:self.__lookslike_len__], ', ...')
        print('index2token looks like: ', list(self.index2token.items())[:self.__lookslike_len__], ', ...')

In [23]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0


In [24]:
df_test.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0


In [14]:
voc = Voc()
for colname in ['Question', 'Sentence']:
    print('Name of column: ', colname)
    s = map(lambda x: x.split(), df_train.loc[:, colname])
    s = set(itertools.chain.from_iterable(s))
    print('Size before vocabulary extending: ', voc.voclen)
    voc.extend_vocab(s)
    print('Size after vocabulary extending: ', voc.voclen)

Name of column:  Question
Size before vocabulary extending:  4
Size after vocabulary extending:  3947
Name of column:  Sentence
Size before vocabulary extending:  3947
Size after vocabulary extending:  29340


In [33]:
%%time
df_train.loc[:, 'Question_encoded'] = df_train.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_train.loc[:, 'Sentence_encoded'] = df_train.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

df_test.loc[:, 'Question_encoded'] = df_test.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_test.loc[:, 'Sentence_encoded'] = df_test.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

CPU times: user 480 ms, sys: 20 ms, total: 500 ms
Wall time: 509 ms


In [34]:
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,"[1634, 2170, 732, 2720, 3143]","[1896, 24391, 16338, 732, 5902, 2422, 28026, 8..."
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0,"[1634, 2170, 732, 2720, 3143]","[71, 3216, 19270, 908, 21302, 544, 1069]"
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,"[1634, 2170, 732, 2720, 3143]","[3216, 17771, 98, 71, 11296, 732, 5902]"
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,"[1634, 2170, 732, 2720, 3143]","[1896, 732, 5902, 908, 1896, 5902, 3143, 157, ..."
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0,"[1634, 2170, 732, 2720, 3143]","[732, 2720, 2170, 2858, 595, 3216, 2720, 15509..."


In [35]:
df_test.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751]","[3915, 10943, 3045, 71, 240, 1689, 5124, 3045,..."
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751]","[71, 1968, 3915, 98, 71, 7008, 704, 1199, 1748..."
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751]","[808, 71, 10943, 2741, 2828, 1790, 704, 3045, ..."
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751]","[3915, 3173, 98, 71, 240, 1689, 2439, 808, 290..."
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751]","[1589, 60, 2100, 808, 2937, 2075, 12822, 1933,..."


In [36]:
Q_MAXLEN = df_train.Question_encoded.apply(len).max()
S_MAXLEN = df_train.Sentence_encoded.apply(len).max()

def pad_sequence(seq, max_len, padding=0, cut_last=True):
    seq = seq + [padding] * (max_len - len(seq))
    if cut_last:
        seq = seq[:max_len]
    else:
        seq = seq[-max_len:]
    assert len(seq) == max_len
    return np.array(seq)

df_train.loc[:, 'Question_encoded'] = df_train.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_train.loc[:, 'Sentence_encoded'] = df_train.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

df_test.loc[:, 'Question_encoded'] = df_test.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_test.loc[:, 'Sentence_encoded'] = df_test.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

In [38]:
df_train

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,"[1634, 2170, 732, 2720, 3143, 0, 0, 0, 0, 0, 0...","[1896, 24391, 16338, 732, 5902, 2422, 28026, 8..."
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0,"[1634, 2170, 732, 2720, 3143, 0, 0, 0, 0, 0, 0...","[71, 3216, 19270, 908, 21302, 544, 1069, 0, 0,..."
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,"[1634, 2170, 732, 2720, 3143, 0, 0, 0, 0, 0, 0...","[3216, 17771, 98, 71, 11296, 732, 5902, 0, 0, ..."
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,"[1634, 2170, 732, 2720, 3143, 0, 0, 0, 0, 0, 0...","[1896, 732, 5902, 908, 1896, 5902, 3143, 157, ..."
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0,"[1634, 2170, 732, 2720, 3143, 0, 0, 0, 0, 0, 0...","[732, 2720, 2170, 2858, 595, 3216, 2720, 15509..."
5,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-0,in physics circular motion is a movement of an...,0,"[1634, 2170, 71, 872, 704, 71, 2595, 2741, 288...","[98, 2273, 3713, 2654, 908, 1896, 2218, 704, 5..."
6,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-1,it can be uniform with constant angular rate o...,0,"[1634, 2170, 71, 872, 704, 71, 2595, 2741, 288...","[525, 81, 1900, 2398, 2110, 2396, 9191, 1965, ..."
7,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-2,the rotation around a fixed axis of a three di...,0,"[1634, 2170, 71, 872, 704, 71, 2595, 2741, 288...","[71, 12995, 1967, 1896, 9, 28936, 704, 1896, 3..."
8,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-3,the equations of motion describe the movement ...,0,"[1634, 2170, 71, 872, 704, 71, 2595, 2741, 288...","[71, 5737, 704, 2654, 2569, 71, 2218, 704, 71,..."
9,Q2,how are the directions of the velocity and for...,D2,Circular motion,D2-4,examples of circular motion include an artific...,0,"[1634, 2170, 71, 872, 704, 71, 2595, 2741, 288...","[9482, 704, 3713, 2654, 60, 557, 28648, 8024, ..."


In [39]:
df_test

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-0,african immigration to the united states refer...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[3915, 10943, 3045, 71, 240, 1689, 5124, 3045,..."
1,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-1,the term african in the scope of this article ...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[71, 1968, 3915, 98, 71, 7008, 704, 1199, 1748..."
2,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-2,from the immigration and nationality act of to...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[808, 71, 10943, 2741, 2828, 1790, 704, 3045, ..."
3,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-3,african immigrants in the united states come f...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[3915, 3173, 98, 71, 240, 1689, 2439, 808, 290..."
4,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-4,they include people from different national li...,0,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[1589, 60, 2100, 808, 2937, 2075, 12822, 1933,..."
5,Q0,how african americans were immigrated to the us,D0,African immigration to the United States,D0-5,as such african immigrants are to be distingui...,1,"[1634, 3915, 1380, 2315, 3, 3045, 71, 1751, 0,...","[2663, 24159, 3915, 3173, 2170, 3045, 1900, 25..."
6,Q3,how large were early jails,D3,Prison,D3-0,a prison from old french prisoun also known as...,0,"[1634, 1983, 2315, 3488, 19437, 0, 0, 0, 0, 0,...","[1896, 4331, 808, 2818, 28614, 3, 16845, 2033,..."
7,Q3,how large were early jails,D3,Prison,D3-1,imprisonment or incarceration is a legal penal...,0,"[1634, 1983, 2315, 3488, 19437, 0, 0, 0, 0, 0,...","[9933, 2282, 15137, 908, 1896, 3165, 16429, 20..."
8,Q3,how large were early jails,D3,Prison,D3-2,other terms used are penitentiary correctional...,0,"[1634, 1983, 2315, 3488, 19437, 0, 0, 0, 0, 0,...","[2460, 3532, 189, 2170, 15271, 21480, 22066, 3..."
9,Q3,how large were early jails,D3,Prison,D3-3,in some legal systems some of these terms have...,0,"[1634, 1983, 2315, 3488, 19437, 0, 0, 0, 0, 0,...","[98, 463, 3165, 3501, 463, 704, 5591, 3532, 20..."
