In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_task, load_glove_weights
from process_data import to_var, make_word_vector, make_char_vector
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway
from config import Config

In [3]:
train_data, train_ctx_maxlen = load_task('./dataset/train-v1.1.json')
train_data = train_data[:int(len(train_data)*0.7)]
dev_data, dev_ctx_maxlen = load_task('./dataset/dev-v1.1.json')
data = train_data + dev_data
ctx_maxlen = max(train_ctx_maxlen, dev_ctx_maxlen)
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer, _, _ in data:
    vocab_w |= set(ctx_w + q_w + answer)
    flatten_c = [c for chars in ctx_c for c in chars]
    flatten_q = [c for chars in q_c for c in chars]

    vocab_c |= set(flatten_c + flatten_q) # TODO

vocab_w = list(sorted(vocab_w))
vocab_c = list(sorted(vocab_c))

w2i_w = dict((w, i) for i, w in enumerate(vocab_w, 0))
i2w_w = dict((i, w) for i, w in enumerate(vocab_w, 0))
w2i_c = dict((c, i) for i, c in enumerate(vocab_c, 0))
i2w_c = dict((i, c) for i, c in enumerate(vocab_c, 0))
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w = len(vocab_w)
vocab_size_c = len(vocab_c)

ctx_sent_maxlen = max([len(c) for c, _, _, _, _, _, _, _ in data])
query_sent_maxlen = max([len(q) for _, _, _, q, _, _, _, _ in data])
ctx_word_maxlen = max([len(w) for _, cc, _, _, _, _, _, _ in data for w in cc])
query_word_maxlen = max([len(w) for _, _, _, _, qc, _, _, _ in data for w in qc])
print('----')
print('n_train', len(train_data))
# print('n_dev', len(dev_data))
print('ctx_maxlen', ctx_maxlen)
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_sent_maxlen:', ctx_sent_maxlen)
print('query_sent_maxlen:', query_sent_maxlen)
print('ctx_word_maxlen:', ctx_word_maxlen)
print('query_word_maxlen:', query_word_maxlen)

dataset version: 1.1
load_task: 0 / 442
dataset version: 1.1
load_task: 0 / 48
----
n_train 188
ctx_maxlen 2060
vocab_size_w: 4086
vocab_size_c: 91
ctx_sent_maxlen: 375
query_sent_maxlen: 34
ctx_word_maxlen: 22
query_word_maxlen: 16


In [4]:
embd_size = 100
glove_embd_w = torch.from_numpy(load_glove_weights('./dataset', embd_size, vocab_size_w, w2i_w))

Found 400000 word vectors.
embed_matrix.shape (4086, 100)


In [22]:
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway

args = {
    'embd_size': embd_size,
    'vocab_size_c': vocab_size_c,
    'vocab_size_w': vocab_size_w,
    'pre_embd_w': glove_embd_w, # word embedding
    'filters': [[1, 5]], # char embedding
    'out_chs': 100, # char embedding
    'ans_size': ctx_maxlen
}
args = Config(**args)


class AttentionNet(nn.Module):
    def __init__(self, args):
        super(AttentionNet, self).__init__()
        self.embd_size = args.embd_size
        self.ans_size = args.ans_size
        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)
        self.highway_net = Highway(args.embd_size*2)# TODO share is ok?
        self.ctx_embd_layer = nn.GRU(args.embd_size*2, args.embd_size*2, bidirectional=True)
        self.W = nn.Parameter(torch.rand(3*2*2* args.embd_size, 1).type(torch.FloatTensor), requires_grad=True)
#         self.beta = nn.Parameter(torch.rand(8*2*2* args.embd_size).type(torch.FloatTensor).view(1, -1), requires_grad=True)
        self.modeling_layer = nn.GRU(args.embd_size*2*8, args.embd_size*2, bidirectional=True)
        self.p1_layer = nn.Linear(args.embd_size*2*10, args.ans_size)
        self.p2_lstm_layer = nn.GRU(args.embd_size*2*2, args.embd_size*2*2, bidirectional=True)
        self.p2_layer = nn.Linear(args.embd_size*2*12, args.ans_size)
        
    def build_contextual_embd(self, x_c, x_w):
        # 1. Caracter Embedding Layer
        self.char_embd_net = self.char_embd_net.cuda()
        char_embd = self.char_embd_net(x_c) # (N, seq_len, embd_size)
        print('build_context')
        if torch.cuda.is_available():
            print('------char cuda')
            char_embd = char_embd.cuda()
        else:
            print('not cuda ')
        # 2. Word Embedding Layer
        word_embd = self.word_embd_net(x_w) # (N, seq_len, embd_size)
        if torch.cuda.is_available():
            word_embd = word_embd.cuda()
        # Highway Networks of 1. and 2.
        embd = torch.cat((char_embd, word_embd), 2) # (N, seq_len, embd_size*2)
        embd = self.highway_net(embd)
        
        # 3. Contextual  Embedding Layer
        ctx_embd_out, ctx_embd_h = self.ctx_embd_layer(embd)
        return ctx_embd_out
        
    def forward(self, ctx_c, ctx_w, query_c, query_w):
        batch_size = ctx_c.size(0)
        
        # 1. Caracter Embedding Layer 
        # 2. Word Embedding Layer
        # 3. Contextual  Embedding Layer
        embd_context = self.build_contextual_embd(ctx_c, ctx_w) # (N, T, 2d)
        ctx_len = embd_context.size(1)
        embd_query   = self.build_contextual_embd(query_c, query_w) # (N, J, 2d)
        query_len = embd_query.size(1)
        
        # 4. Attention Flow Layer
        # Context2Query
        shape = (batch_size, ctx_len, query_len, self.embd_size*2*2) # (N, T, J, 2d)
        embd_context_ex = embd_context.unsqueeze(2) # (N, T, 1, 2d)
        embd_context_ex = embd_context_ex.expand(shape)
        embd_query_ex = embd_query.unsqueeze(1) # (N, 1, J, 2d)
        embd_query_ex = embd_query_ex.expand(shape)
        a_elmwise_mul_b = torch.mul(embd_context_ex, embd_query_ex) # (N, T, J, 2d)
        cat_data = torch.cat((embd_context_ex, embd_query_ex, a_elmwise_mul_b), 3) # (N, T, J, 6d)
        cat_data = cat_data.view(batch_size, -1, 6*2*self.embd_size)
        S = torch.bmm(cat_data, self.W.unsqueeze(0).expand(batch_size, 6*2*self.embd_size, 1))
        S = S.view(batch_size, ctx_len, query_len)
        
        c2q = torch.bmm(S, embd_query) # (N, T, 2d)
        # Query2Context
        tmp_b = torch.max(S, 2)[0]
        b = torch.stack([F.softmax(tmp_b[i]) for i in range(batch_size)], 0) # (N, T)
        q2c = torch.bmm(b.unsqueeze(1), embd_context).squeeze() # (N, 2d)
        q2c = q2c.unsqueeze(1) # (N, 1, 2d)
        q2c = q2c.repeat(1, ctx_len, 1) # (N, T, 2d)
        
        G = torch.cat((embd_context, c2q, embd_context.mul(c2q), embd_context.mul(q2c)), 2) # (N, T, 8d)
        
        # 5. Modeling Layer
        M, _ = self.modeling_layer(G) # M: (N, T, 2d)
        
        # 5. Output Layer
        G_M = torch.cat((G, M), 2) # (N, T, 10d)
        G_M = G_M.sum(1) #(N, 10d)
        p1 = F.softmax(self.p1_layer(G_M)) # (N, T)
        
        M2, _ = self.p2_lstm_layer(M) # (N, T, 4d)
        G_M2 = torch.cat((G, M2), 2) # (N, T, 12d)
        G_M2 = G_M2.sum(1) # (N, 12d)(N, T)
        p2 = F.softmax(self.p2_layer(G_M2)) # (N, T)
        
        return p1, p2
        
def train(model, optimizer, n_epoch=10, batch_size=1):
    for epoch in range(n_epoch):
        for i in range(0, len(data)-batch_size, batch_size): # TODO shuffle, last elms
            print('batch', i, '/', len(data))
            batch_data = data[i:i+batch_size]
            c = [d[0] for d in batch_data]
            cc = [d[1] for d in batch_data]
            q = [d[3] for d in batch_data]
            qc = [d[4] for d in batch_data]
            a_beg = to_var(torch.LongTensor([d[6] for d in batch_data]).squeeze())
            a_end = to_var(torch.LongTensor([d[7] for d in batch_data]).squeeze())
            c_char_var = make_char_vector(cc, w2i_c, ctx_sent_maxlen, ctx_word_maxlen)
            c_word_var = make_word_vector(c, w2i_w, ctx_sent_maxlen)
            q_char_var = make_char_vector(qc, w2i_c, query_sent_maxlen, query_word_maxlen)
            q_word_var = make_word_vector(q, w2i_w, query_sent_maxlen)
            p1, p2 = model(c_char_var, c_word_var, q_char_var, q_word_var)
            loss_p1 = nn.NLLLoss()(p1, a_beg)
            loss_p2 = nn.NLLLoss()(p2, a_end)
            model.zero_grad()
#             print('loss.backward()')
            (loss_p1+loss_p2).backward()
            optimizer.step()
            
#             break
model = AttentionNet(args)
print(torch.cuda.is_available())
if torch.cuda.is_available():
    model.cuda()
# print(model)
optimizer = torch.optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()), lr=0.5)
train(model, optimizer)
print('finish train')

True
batch 0 / 998
build_context
------char cuda
build_context
------char cuda
batch 1 / 998
build_context
------char cuda
build_context
------char cuda
batch 2 / 998
build_context
------char cuda
build_context
------char cuda
batch 3 / 998
build_context
------char cuda
build_context
------char cuda
batch 4 / 998
build_context
------char cuda
build_context
------char cuda
batch 5 / 998
build_context
------char cuda
build_context
------char cuda
batch 6 / 998
build_context
------char cuda


  return a.mul(b)


build_context
------char cuda
batch 7 / 998
build_context
------char cuda
build_context
------char cuda
batch 8 / 998
build_context
------char cuda
build_context
------char cuda
batch 9 / 998
build_context
------char cuda
build_context
------char cuda
batch 10 / 998
build_context
------char cuda
build_context
------char cuda
batch 11 / 998
build_context
------char cuda
build_context
------char cuda
batch 12 / 998
build_context
------char cuda
build_context
------char cuda
batch 13 / 998
build_context
------char cuda
build_context
------char cuda
batch 14 / 998
build_context
------char cuda
build_context
------char cuda
batch 15 / 998
build_context
------char cuda
build_context
------char cuda
batch 16 / 998
build_context
------char cuda
build_context
------char cuda
batch 17 / 998
build_context
------char cuda
build_context
------char cuda
batch 18 / 998
build_context
------char cuda
build_context
------char cuda
batch 19 / 998
build_context
------char cuda
build_context
------char cud

batch 122 / 998
build_context
------char cuda
build_context
------char cuda
batch 123 / 998
build_context
------char cuda
build_context
------char cuda
batch 124 / 998
build_context
------char cuda
build_context
------char cuda
batch 125 / 998
build_context
------char cuda
build_context
------char cuda
batch 126 / 998
build_context
------char cuda
build_context
------char cuda
batch 127 / 998
build_context
------char cuda
build_context
------char cuda
batch 128 / 998
build_context
------char cuda
build_context
------char cuda
batch 129 / 998
build_context
------char cuda
build_context
------char cuda
batch 130 / 998
build_context
------char cuda
build_context
------char cuda
batch 131 / 998
build_context
------char cuda
build_context
------char cuda
batch 132 / 998
build_context
------char cuda
build_context
------char cuda
batch 133 / 998
build_context
------char cuda
build_context
------char cuda
batch 134 / 998
build_context
------char cuda
build_context
------char cuda
batch 135 / 

RuntimeError: invalid argument 2: mismatch between the batch size of input (1) and that of target (3) at /pytorch/torch/lib/THCUNN/generic/ClassNLLCriterion.cu:41

In [None]:
N = 16
hid_dim = 50
a_seq_len = 10
b_seq_len = 20
a = torch.randn(N, a_seq_len, hid_dim)
b = torch.randn(N, b_seq_len, hid_dim)
shape = (N, a_seq_len, b_seq_len, hid_dim)

result = torch.zeros(shape)

a_dash = a.unsqueeze(2) # (N, a_len, 1,     hid_dim)
b_dash = b.unsqueeze(1) # (N, 1,     b_len, hid_dim)
a_dash = a_dash.expand(shape)
b_dash = b_dash.expand(shape)
mul = a_dash * b_dash

print(a_dash.size(), b_dash.size(), mul.size())
print(torch.cat((a_dash, b_dash, mul), 3).size())

In [None]:
N = 10
x_len = 3
y_len = 5
hid_dim = 8
data = torch.randn(N, x_len, y_len, hid_dim)
W = torch.randn(hid_dim)
print(torch.mul(data, W).size())

In [None]:
hid_dim = 54
tdata = torch.Tensor([[
    [1] * hid_dim,
    [2] * hid_dim,
    [3] * hid_dim 
],
[
    [1] * hid_dim,
    [2] * hid_dim,
    [3] * hid_dim 
]])
print('data', tdata.size())
tW = torch.randn(hid_dim).view(-1, 1) # assume trainable parameters via nn.Parameter
print('W', tW.size())

print(tdata.size(), tW.unsqueeze(0).size())
print(torch.bmm(tdata, tW.unsqueeze(0)).squeeze())

In [None]:
hid_dim = 32
data = torch.randn(10, 6, hid_dim)
# data = tdata.view(10, 2*3, hid_dim)
W = torch.randn(hid_dim, 1) # assume trainable parameters via nn.Parameter
print(W.size())
W = W.unsqueeze(0).expand(10, hid_dim, 1)
print(W.size())
result = torch.bmm(data, W).squeeze() # error, want (N, 6)
print(result.size())


In [None]:
hid_dim = 32
data = torch.randn(10, 2, 3, hid_dim)
data = tdata.view(10, 2*3, hid_dim)
W = torch.randn(hid_dim, 1) # assume trainable parameters via nn.Parameter
print(W.size())
W = W.unsqueeze(0).expand(10, hid_dim, 1)
print(W.size())
result = torch.bmm(data, W).squeeze() # error, want (N, 6)
result = result.view(10, 2, 3)
print(result.size())
