In [1]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_glove_weights

In [36]:
def load_task(dataset_path):
    ret_data = []
    with open(dataset_path) as f:
        data = json.load(f)
        ver = data['version']
        print('dataset version:', ver)
        data = data['data']
        for i, d in enumerate(data):
            if i % 100 == 0: print('load_task:', i, '/', len(data))
            # print('load', d['title'], i, '/', len(data))
            for p in d['paragraphs']:
                c = word_tokenize(p['context'])
                cc = [list(w) for w in c]
                q, a = [], []
                for qa in p['qas']:
                    q = word_tokenize(qa['question'])
                    qc = [list(w) for w in q]
                    a = [ans['text'] for ans in qa['answers']]
                    ret_data.append((c, cc, qa['id'], q, qc, a)) # TODO context redandancy
#                 break
            break
    return ret_data

In [159]:
train_data = load_task('./dataset/train-v1.1.json')
# dev_data = load_task('./dataset/dev-v1.1.json')
data = train_data # + dev_data
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer in train_data+dev_data:
    vocab_w |= set(ctx_w + q_w + answer)
    flatten_c = [c for chars in ctx_c for c in chars]
    flatten_q = [c for chars in q_c for c in chars]

    vocab_c |= set(flatten_c + flatten_q) # TODO

vocab_w = list(sorted(vocab_w))
vocab_c = list(sorted(vocab_c))

w2i_w = dict((w, i) for i, w in enumerate(vocab_w, 0))
i2w_w = dict((i, w) for i, w in enumerate(vocab_w, 0))
w2i_c = dict((c, i) for i, c in enumerate(vocab_c, 0))
i2w_c = dict((i, c) for i, c in enumerate(vocab_c, 0))
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w = len(vocab_w)
vocab_size_c = len(vocab_c)

ctx_sent_maxlen = max([len(c) for c, _, _, _, _, _ in data])
query_sent_maxlen = max([len(q) for _, _, _, q, _, _ in data])
ctx_word_maxlen = max([len(w) for _, cc, _, _, _, _ in data for w in cc])
query_word_maxlen = max([len(w) for _, _, _, _, qc, _ in data for w in qc])
print('----')
print('n_train', len(train_data))
print('n_dev', len(dev_data))
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_sent_maxlen:', ctx_sent_maxlen)
print('query_sent_maxlen:', query_sent_maxlen)
print('ctx_word_maxlen:', ctx_word_maxlen)
print('query_word_maxlen:', query_word_maxlen)

dataset version: 1.1
load_task: 0 / 442
----
n_train 269
n_dev 30
vocab_size_w: 2845
vocab_size_c: 89
ctx_sent_maxlen: 333
query_sent_maxlen: 25
ctx_word_maxlen: 22
query_word_maxlen: 14


In [160]:
# sent1 = list('i have a cat')
# sent2 = list('i had a aieu')
# data = [sent1, sent2]
# vocab = set(sent1+sent2)
# vocab_size = len(vocab)
# print(data)
# print('vocab_size', vocab_size)
# w2i = {w:i for i, w in enumerate(vocab)}
# i2w = {i:w for i, w in enumerate(vocab)}

# max_word_len = max([len(word) for word in data])
# print('max_word_len', max_word_len)
data[0][4]

[['T', 'o'],
 ['w', 'h', 'o', 'm'],
 ['d', 'i', 'd'],
 ['t', 'h', 'e'],
 ['V', 'i', 'r', 'g', 'i', 'n'],
 ['M', 'a', 'r', 'y'],
 ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'],
 ['a', 'p', 'p', 'e', 'a', 'r'],
 ['i', 'n'],
 ['1', '8', '5', '8'],
 ['i', 'n'],
 ['L', 'o', 'u', 'r', 'd', 'e', 's'],
 ['F', 'r', 'a', 'n', 'c', 'e'],
 ['?']]

In [168]:
# In : (N, sentence_len, word_len, vocab_size_c)
# Out: (N, sentence_len, embd_size)
# hoge = []
class CharEmbedding(nn.Module):
    def __init__(self, vocab_size, embd_size, out_chs, filters):
        super(CharEmbedding, self).__init__()
        self.embd_size = embd_size
        self.embedding = nn.Embedding(vocab_size, embd_size)
        # nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (f[0], f[1])) for f in filters])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filters), 1)
        
    def forward(self, x):
        print('x', x.size()) # (N, seq_len, word_len)
        bs = x.size(0)
        seq_len = x.size(1)
        word_len = x.size(2)
        embd = Variable(torch.zeros(bs, seq_len, self.embd_size))
        for i, elm in enumerate(x): # every sample
            for j, chars in enumerate(elm): # every sentence. [ [‘w’, ‘h’, ‘o’, 0], [‘i’, ‘s’, 0, 0], [‘t’, ‘h’, ‘i’, ‘s’] ]
                chars_embd = self.embedding(chars.unsqueeze(0)) # (N, word_len, embd_size) [‘w’,‘h’,‘o’,0]
                chars_embd = torch.sum(chars_embd, 1) # (N, embd_size). sum each char's embedding
                embd[i,j] = chars_embd[0] # set char_embd as word-like embedding

        x = embd # (N, seq_len, embd_dim)
        x = embd.unsqueeze(1) # (N, Cin, seq_len, embd_dim), insert Channnel-In dim
        # Conv2d
        #    Input : (N,Cin, Hin, Win )
        #    Output: (N,Cout,Hout,Wout) 
        x = [F.relu(conv(x)) for conv in self.conv] # (N, Cout, seq_len, embd_dim-filter_w+1). stride == 1
        
        # [(N,Cout,Hout,Wout) -> [(N,Cout,Hout*Wout)] * len(filter_heights)
        # [(N, seq_len, embd_dim-filter_w+1, Cout)] * len(filter_heights)
        x = [xx.view((xx.size(0), xx.size(2), xx.size(3), xx.size(1))) for xx in x]
        
        # maxpool like
        # [(N, seq_len, Cout)] * len(filter_heights)
        x = [torch.sum(xx, 2) for xx in x]
        out = torch.cat(x, 1) # (N, seq_len, Cout)
        return out

embd_size = 100
n_out_ch = 100
filters = [[1, 5]]
tmp_data = data[0][4]
max_len = max([len(chars) for chars in tmp_data])
tmp_var = torch.zeros(1, query_sent_maxlen, query_word_maxlen).type(torch.LongTensor)
print('tmp_var.size()=', tmp_var.size())
for i, chars in enumerate(tmp_data):
    for j, ch in enumerate(chars):
        tmp_var[0][i][j] = w2i_c[ch]
char_embd_net = CharEmbedding(vocab_size_c, embd_size, n_out_ch, filters)
print(char_embd_net)
out = char_embd_net(Variable(tmp_var))
print('out', out.size())

tmp_var.size()= torch.Size([1, 25, 14])
CharEmbedding (
  (embedding): Embedding(89, 100)
  (conv): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(1, 5), stride=(1, 1))
  )
  (dropout): Dropout (p = 0.5)
  (fc1): Linear (100 -> 1)
)
x torch.Size([1, 25, 14])
out torch.Size([1, 25, 100])


In [154]:

glove_embd_w = torch.from_numpy(load_glove_weights('./', embd_size, vocab_size, w2i))

Found 400000 word vectors.
embed_matrix.shape (6, 100)


In [169]:
sent1 = 'i have a cat'.split(' ')
sent2 = 'i had a aieu'.split(' ')
data = [sent1, sent2]
vocab = set(sent1+sent2)
vocab_size = len(vocab)
print(data)
print('vocab_size', vocab_size)
w2i = {w:i for i, w in enumerate(vocab)}
i2w = {i:w for i, w in enumerate(vocab)}

max_word_len = max([len(word) for word in data])
print('max_word_len', max_word_len)

[['i', 'have', 'a', 'cat'], ['i', 'had', 'a', 'aieu']]
vocab_size 6
max_word_len 4


In [170]:
# In : (N, sentence_len, vocab_size_w)
# Out: (N, sentence_len, embd_size)
class WordEmbedding(nn.Module):
    def __init__(self, vocab_size, embd_size, is_train_embd=False, pre_embd_w=None):
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        if pre_embd_w is not None:
            self.embedding.weight = nn.Parameter(pre_embd_w, requires_grad=is_train_embd)
        
    def forward(self, x):
        x = self.embedding(x)
        out = F.relu(x)
        print('out', out.size())
        return out

word_embd_net = WordEmbedding(vocab_size, embd_size, False, glove_embd_w)
word_var = Variable(torch.LongTensor([[w2i[w] for w in data[0]]]))
out = word_embd_net(word_var)
print(out.size())


out torch.Size([1, 4, 100])
torch.Size([1, 4, 100])


In [131]:
print(type(hoge[0]))
print(hoge[0].size())
# print(hoge[1])
# print(hoge[0]+hoge[1])

<class 'torch.autograd.variable.Variable'>
torch.Size([1, 14, 100])


In [139]:
a = torch.randn(2,3, 4)
a


(0 ,.,.) = 
 -0.8029  0.0716  0.8087 -1.8352
 -1.5496  0.7203  0.9974 -0.5096
  0.1004 -0.0163 -1.1995  0.1710

(1 ,.,.) = 
  1.7253  0.6494  0.9882 -1.8014
 -0.1006  1.2877  0.2056 -1.6416
 -0.7419  0.4738  1.6681 -0.5003
[torch.FloatTensor of size 2x3x4]

In [140]:
torch.sum(a, dim=1)


-2.2521  0.7757  0.6066 -2.1738
 0.8828  2.4109  2.8619 -3.9433
[torch.FloatTensor of size 2x4]