In [7]:
import itertools
import pandas as pd
import collections
import numpy as np
import re
import pickle
from tqdm import tqdm


path_embedding = './data/embedding/'
df_train = pd.read_csv('./data/WikiQACorpus/WikiQA-train.tsv', sep='\t')
df_test = pd.read_csv('./data/WikiQACorpus/WikiQA-test.tsv', sep='\t')
print('Shape_train: ', df_train.shape)
print('Shape_test: ', df_test.shape)
df_train.head(12)

def saveobj(obj, fname):
    with open(fname, 'wb') as fin:
        pickle.dump(obj, fin, pickle.HIGHEST_PROTOCOL)

Shape_train:  (20347, 7)
Shape_test:  (6116, 7)


In [8]:
word2index = {}
index2word = {}
padding_index = 0
padding_vector = [0]
index2vector = {padding_index:padding_vector}
with open(f'{path_embedding}glove.6B.50d.txt', 'rb') as f:
    for i, l in enumerate(tqdm(f)):
        line = l.decode().split()
        word = line[0]
        vect = line[1:]
        # Так как "0" зарезервирован под <PAD>, =+1 каждому idx.
        idx = i+1
        word2index[word] = idx
        index2word[idx] = word
        index2vector[idx] = vect
        
index2vector[0] = [0.]*len(vect)
        

400001it [00:07, 54563.88it/s]


In [9]:
index2word

{1: 'the',
 2: ',',
 3: '.',
 4: 'of',
 5: 'to',
 6: 'and',
 7: 'in',
 8: 'a',
 9: '"',
 10: "'s",
 11: 'for',
 12: '-',
 13: 'that',
 14: 'on',
 15: 'is',
 16: 'was',
 17: 'said',
 18: 'with',
 19: 'he',
 20: 'as',
 21: 'it',
 22: 'by',
 23: 'at',
 24: '(',
 25: ')',
 26: 'from',
 27: 'his',
 28: "''",
 29: '``',
 30: 'an',
 31: 'be',
 32: 'has',
 33: 'are',
 34: 'have',
 35: 'but',
 36: 'were',
 37: 'not',
 38: 'this',
 39: 'who',
 40: 'they',
 41: 'had',
 42: 'i',
 43: 'which',
 44: 'will',
 45: 'their',
 46: ':',
 47: 'or',
 48: 'its',
 49: 'one',
 50: 'after',
 51: 'new',
 52: 'been',
 53: 'also',
 54: 'we',
 55: 'would',
 56: 'two',
 57: 'more',
 58: "'",
 59: 'first',
 60: 'about',
 61: 'up',
 62: 'when',
 63: 'year',
 64: 'there',
 65: 'all',
 66: '--',
 67: 'out',
 68: 'she',
 69: 'other',
 70: 'people',
 71: "n't",
 72: 'her',
 73: 'percent',
 74: 'than',
 75: 'over',
 76: 'into',
 77: 'last',
 78: 'some',
 79: 'government',
 80: 'time',
 81: '$',
 82: 'you',
 83: 'years',
 8

In [None]:
sub = r"[^A-Za-z]+"
df_train.loc[:,'Question'] = df_train.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.loc[:,'Sentence'] = df_train.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Question'] = df_test.loc[:, 'Question'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_test.loc[:,'Sentence'] = df_test.loc[:, 'Sentence'].apply(lambda x: re.sub(sub, ' ', x).strip().lower())
df_train.head()

In [None]:
class Voc:
    def __init__(self):
        self.token2index = {'<PAD>':0, '<SOS>':1, '<EOS>':2, '<UNK>':3}
        self.index2token = {v:k for k, v in self.token2index.items()}
        self.voclen = len(self.token2index)
        self.__lookslike_len__ = 10
    
    def extend_vocab(self, iterable):
        if not isinstance(iterable, collections.Iterable):
            raise ValueError('Value must be an iterable.')
        else:
            iterable = set(iterable)
            iterable = iterable - self.token2index.keys()
            ids = range(self.voclen, len(iterable)+self.voclen)
            self.token2index.update(dict(zip(iterable, ids)))
            self.index2token = {v:k for k, v in self.token2index.items()}
            self.voclen = len(self.token2index)
            
    def __call__(self):
        print('Vocabulary size: ', self.voclen)
        print('token2index looks like: ', list(self.token2index.items())[:self.__lookslike_len__], ', ...')
        print('index2token looks like: ', list(self.index2token.items())[:self.__lookslike_len__], ', ...')

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
voc = Voc()
for colname in ['Question', 'Sentence']:
    print('Name of column: ', colname)
    s = map(lambda x: x.split(), df_train.loc[:, colname])
    s = set(itertools.chain.from_iterable(s))
    print('Size before vocabulary extending: ', voc.voclen)
    voc.extend_vocab(s)
    print('Size after vocabulary extending: ', voc.voclen)

In [None]:
%%time
df_train.loc[:, 'Question_encoded'] = df_train.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_train.loc[:, 'Sentence_encoded'] = df_train.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

df_test.loc[:, 'Question_encoded'] = df_test.loc[:, 'Question'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])
df_test.loc[:, 'Sentence_encoded'] = df_test.loc[:, 'Sentence'].apply(
    lambda x: [voc.token2index.get(i, voc.token2index['<UNK>']) for i in x.split()])

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
Q_MAXLEN = df_train.Question_encoded.apply(len).max()
S_MAXLEN = df_train.Sentence_encoded.apply(len).max()

def pad_sequence(seq, max_len, padding=0, cut_last=True):
    seq = seq + [padding] * (max_len - len(seq))
    if cut_last:
        seq = seq[:max_len]
    else:
        seq = seq[-max_len:]
    assert len(seq) == max_len
    return np.array(seq)

df_train.loc[:, 'Question_encoded'] = df_train.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_train.loc[:, 'Sentence_encoded'] = df_train.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

df_test.loc[:, 'Question_encoded'] = df_test.Question_encoded.apply(lambda x: pad_sequence(x, Q_MAXLEN))
df_test.loc[:, 'Sentence_encoded'] = df_test.Sentence_encoded.apply(lambda x: pad_sequence(x, S_MAXLEN))

In [None]:
df_train

In [None]:
df_test

In [None]:
df_train.to_pickle('./data/processed/wikiqa_df_train.pickle')
df_test.to_pickle('./data/processed/wikiqa_df_test.pickle')

In [None]:
voc_items = {'index2token':voc.index2token,
             'token2index':voc.token2index,
             'voc_len':voc.voclen}
saveobj(voc_items, './data/processed/vocabulary.pickle')