In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_task, load_glove_weights
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway
from config import Config

In [3]:
train_data = load_task('./dataset/train-v1.1.json')
# dev_data = load_task('./dataset/dev-v1.1.json')
data = train_data # + dev_data
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer in data:
    vocab_w |= set(ctx_w + q_w + answer)
    flatten_c = [c for chars in ctx_c for c in chars]
    flatten_q = [c for chars in q_c for c in chars]

    vocab_c |= set(flatten_c + flatten_q) # TODO

vocab_w = list(sorted(vocab_w))
vocab_c = list(sorted(vocab_c))

w2i_w = dict((w, i) for i, w in enumerate(vocab_w, 0))
i2w_w = dict((i, w) for i, w in enumerate(vocab_w, 0))
w2i_c = dict((c, i) for i, c in enumerate(vocab_c, 0))
i2w_c = dict((i, c) for i, c in enumerate(vocab_c, 0))
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w = len(vocab_w)
vocab_size_c = len(vocab_c)

ctx_sent_maxlen = max([len(c) for c, _, _, _, _, _ in data])
query_sent_maxlen = max([len(q) for _, _, _, q, _, _ in data])
ctx_word_maxlen = max([len(w) for _, cc, _, _, _, _ in data for w in cc])
query_word_maxlen = max([len(w) for _, _, _, _, qc, _ in data for w in qc])
print('----')
print('n_train', len(train_data))
# print('n_dev', len(dev_data))
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_sent_maxlen:', ctx_sent_maxlen)
print('query_sent_maxlen:', query_sent_maxlen)
print('ctx_word_maxlen:', ctx_word_maxlen)
print('query_word_maxlen:', query_word_maxlen)

dataset version: 1.1
load_task: 0 / 442
----
n_train 269
vocab_size_w: 2783
vocab_size_c: 89
ctx_sent_maxlen: 333
query_sent_maxlen: 25
ctx_word_maxlen: 22
query_word_maxlen: 14


In [6]:
embd_size = 100
glove_embd_w = torch.from_numpy(load_glove_weights('./dataset', embd_size, vocab_size_w, w2i_w))

Found 400000 word vectors.
embed_matrix.shape (2783, 100)


In [None]:
# from layers.char_embedding import CharEmbedding
# from layers.word_embedding import WordEmbedding
# from layers.highway import Highway

# args = {
#     'embd_size': embd_size,
#     'vocab_size_c': vocab_size_c,
#     'vocab_size_w': vocab_size_w,
#     'pre_embd_w': glove_embd_w, # word embedding
#     'filters': [[1, 5]], # char embedding
#     'out_chs': 100, # char embedding
# }
# conf = Config(**args)
# def make_word_vector(data, w2i_w, query_len):
#     vec_data = []
#     for sentence in data:
#         index_vec = [w2i_w[w] for w in sentence]
#         pad_len = max(0, query_len - len(index_vec))
#         index_vec += [0] * pad_len
#         index_vec = index_vec[:query_len]
#         vec_data.append(index_vec)
    
#     var = Variable(torch.LongTensor(vec_data))
#     return var

# def make_char_vector(data, w2i_c, query_len, word_len):
#     tmp = torch.zeros(len(data), query_len, word_len).type(torch.LongTensor)
#     for i, words in enumerate(data):
#         for j, word in enumerate(words):
#             for k, ch in enumerate(word):
#                 tmp[i][j][k] = w2i_c[ch]
#     return Variable(tmp)
    
# class AttentionNet(nn.Module):
#     def __init__(self, args):
#         super(AttentionNet, self).__init__()
#         self.embd_size = args.embd_size
#         self.char_embd_net = CharEmbedding(args)
#         self.word_embd_net = WordEmbedding(args)
#         self.highway_net = Highway(args.embd_size*2)# TODO share is ok?
#         self.ctx_embd_layer = nn.GRU(args.embd_size*2, args.embd_size*4)
#         self.W = nn.Parameter(torch.rand(3 * args.embd_size).type(torch.DoubleTensor).view(1, -1), requires_grad=True)
# #         self.W = Variable(torch.randn(2*2*args.embd_size),     requires_grad=True)
#         print(self.W.size())
#         print(type(self.W))
    
#     def build_contextual_embd(self, x_c, x_w):
#         char_embd = self.char_embd_net(x_c) # (N, seq_len, embd_size)
#         word_embd = self.word_embd_net(x_w) # (N, seq_len, embd_size)
#         embd = torch.cat((char_embd, word_embd), 2) # (N, seq_len, embd_size*2)
#         embd = self.highway_net(embd)
#         ctx_embd_out, ctx_embd_h = self.ctx_embd_layer(embd)
#         print('ctx_embd_out', ctx_embd_out.size())
#         print('ctx_embd_h', ctx_embd_h.size())
#         print('type', type(ctx_embd

In [None]:
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway

args = {
    'embd_size': embd_size,
    'vocab_size_c': vocab_size_c,
    'vocab_size_w': vocab_size_w,
    'pre_embd_w': glove_embd_w, # word embedding
    'filters': [[1, 5]], # char embedding
    'out_chs': 100, # char embedding
}
conf = Config(**args)
def make_word_vector(data, w2i_w, query_len):
    vec_data = []
    for sentence in data:
        index_vec = [w2i_w[w] for w in sentence]
        pad_len = max(0, query_len - len(index_vec))
        index_vec += [0] * pad_len
        index_vec = index_vec[:query_len]
        vec_data.append(index_vec)
    
    var = Variable(torch.LongTensor(vec_data))
    return var

def make_char_vector(data, w2i_c, query_len, word_len):
    tmp = torch.zeros(len(data), query_len, word_len).type(torch.LongTensor)
    for i, words in enumerate(data):
        for j, word in enumerate(words):
            for k, ch in enumerate(word):
                tmp[i][j][k] = w2i_c[ch]
    return Variable(tmp)
    
class AttentionNet(nn.Module):
    def __init__(self, args):
        super(AttentionNet, self).__init__()
        self.embd_size = args.embd_size
        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)
        self.highway_net = Highway(args.embd_size*2)# TODO share is ok?
        self.ctx_embd_layer = nn.GRU(args.embd_size*2, args.embd_size*4)
        self.W = nn.Parameter(torch.rand(3*2*2* args.embd_size).type(torch.FloatTensor).view(1, -1), requires_grad=True)
        print(self.W.size())
        print(type(self.W))
    
    def build_contextual_embd(self, x_c, x_w):
        char_embd = self.char_embd_net(x_c) # (N, seq_len, embd_size)
        word_embd = self.word_embd_net(x_w) # (N, seq_len, embd_size)
        embd = torch.cat((char_embd, word_embd), 2) # (N, seq_len, embd_size*2)
        embd = self.highway_net(embd)
        ctx_embd_out, ctx_embd_h = self.ctx_embd_layer(embd)
        print('ctx_embd_out', ctx_embd_out.size())
        print('ctx_embd_h', ctx_embd_h.size())
        print('type', type(ctx_embd_out))
        return ctx_embd_out
        
    def forward(self, ctx_c, ctx_w, query_c, query_w):
        batch_size = ctx_c.size(0)
        embd_context = self.build_contextual_embd(ctx_c, ctx_w) # (N, T, 2d)
        ctx_len = embd_context.size(1)
        embd_query   = self.build_contextual_embd(query_c, query_w) # (N, J, 2d)
        query_len = embd_query.size(1)
        
        # Context2Query
        print('-----------')
        print('ctx_embd_context', embd_context.size())
        print('ctx_embd_query', embd_query.size())
        a_elmwise_mul_b = Variable(torch.zeros(batch_size, ctx_len, query_len, 2*2*self.embd_size).type(torch.FloatTensor))
        S = Variable(torch.zeros(batch_size, ctx_len, query_len).type(torch.DoubleTensor))
        for sample in range(batch_size): # TODO
            for ai in range(ctx_len):
                for bi in range(query_len):
                    a_elmwise_mul_b[sample, ai, bi] = torch.mul(embd_context[sample, ai], embd_query[sample, bi])
                    x = torch.cat((embd_context[sample, ai], embd_query[sample, bi], a_elmwise_mul_b[sample, ai, bi]), 0) # (1, 3*2*embd_dim)
                    S[sample, ai, bi] = torch.mm(self.W, x.unsqueeze(1))[0][0]
#         print('cat_data', cat_data.size())
                
def train(model, n_epoch=1, batch_size=16):
    for epoch in range(n_epoch):
        for i in range(0, len(data)-batch_size, batch_size): # TODO shuffle, last elms
            batch_data = data[i:i+batch_size]
            c = [d[0] for d in batch_data]
            cc = [d[1] for d in batch_data]
            q = [d[3] for d in batch_data]
            qc = [d[4] for d in batch_data]
            c_char_var = make_char_vector(cc, w2i_c, ctx_sent_maxlen, ctx_word_maxlen)
            c_word_var = make_word_vector(c, w2i_w, ctx_sent_maxlen)
            q_char_var = make_char_vector(qc, w2i_c, query_sent_maxlen, query_word_maxlen)
            q_word_var = make_word_vector(q, w2i_w, query_sent_maxlen)
            model(c_char_var, c_word_var, q_char_var, q_word_var)
            break
            
attn = AttentionNet(conf)
# print(attn)
train(attn)
print('finish train')

torch.Size([1, 1200])
<class 'torch.nn.parameter.Parameter'>
ctx_embd_out torch.Size([16, 333, 400])
ctx_embd_h torch.Size([1, 333, 400])
type <class 'torch.autograd.variable.Variable'>
ctx_embd_out torch.Size([16, 25, 400])
ctx_embd_h torch.Size([1, 25, 400])
type <class 'torch.autograd.variable.Variable'>
-----------
ctx_embd_context torch.Size([16, 333, 400])
ctx_embd_query torch.Size([16, 25, 400])


In [None]:
# # %load layers/char_embedding.py
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.autograd import Variable

# # In : (N, sentence_len, word_len, vocab_size_c)
# # Out: (N, sentence_len, embd_size)
# class CharEmbedding(nn.Module):
#     def __init__(self, args):
#         super(CharEmbedding, self).__init__()
#         self.embd_size = args.embd_size
#         self.embedding = nn.Embedding(args.vocab_size_c, args.embd_size)
#         # nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
#         self.conv = nn.ModuleList([nn.Conv2d(1, args.out_chs, (f[0], f[1])) for f in args.filters])
#         self.dropout = nn.Dropout(.5)
#         self.fc1 = nn.Linear(args.out_chs*len(args.filters), 1)
        
#     def forward(self, x):
#         # x: (N, seq_len, word_len)
#         input_shape = x.size()
#         bs = x.size(0)
#         seq_len = x.size(1)
#         word_len = x.size(2)
#         x = x.view(-1, word_len) # (N*seq_len, word_len)
#         x = self.embedding(x) # (N*seq_len, word_len, embd_size)
#         x = x.view(*input_shape, -1) # (N, seq_len, word_len, embd_size)
#         x = x.sum(2) # (N, seq_len, embd_size)
        
#         return x
# net = CharEmbedding(args)
# bs = 10
# seq_len = 7
# word_len = 5
# input = Variable(torch.zeros(bs, seq_len, word_len)).long()
# out = net(input)
# print('out', out.size())

In [None]:
# CharEmbedding test
embd_size = 100
n_out_ch = 100
filters = [[1, 5]]
tmp_data = data[0][4]
max_len = max([len(chars) for chars in tmp_data])
tmp_var = torch.zeros(1, query_sent_maxlen, query_word_maxlen).type(torch.LongTensor)
print('tmp_var.size()=', tmp_var.size())
for i, chars in enumerate(tmp_data):
    for j, ch in enumerate(chars):
        tmp_var[0][i][j] = w2i_c[ch]
char_embd_net = CharEmbedding(vocab_size_c, embd_size, n_out_ch, filters)
print(char_embd_net)
out = char_embd_net(Variable(tmp_var))
print(out)
print('out', out.size())

In [None]:
# WordEmbedding Test
word_embd_net = WordEmbedding(vocab_size_w, embd_size, False, glove_embd_w)
word_var = Variable(torch.LongTensor([[w2i_w[w] for w in data[0][3]]]))
out = word_embd_net(word_var)
print(out.size())

In [None]:
batch_size = 16
embd_dim = 10
a_len = 7
b_len = 4

a = torch.rand(batch_size, a_len, embd_dim).type(torch.DoubleTensor)  # dummy input1
b = torch.rand(batch_size, b_len, embd_dim).type(torch.DoubleTensor)  # dummy input2
# a_elmwise_mul_b: (N, a_len, b_len, embd_dim)   dummy-code
a_elmwise_mul_b = torch.zeros(batch_size, a_len, b_len, embd_dim).type(torch.DoubleTensor)
S = torch.zeros(batch_size, a_len, b_len).type(torch.DoubleTensor)
W = torch.rand(3 * embd_dim).type(torch.DoubleTensor).view(1, -1) # must be trainable params
# I think there are better way than below
for sample in range(batch_size):
    for ai in range(a_len):
        for bi in range(b_len):
            a_elmwise_mul_b[sample, ai, bi] = torch.mul(a[sample, ai], b[sample, bi])
            x = torch.cat((a[sample, ai], b[sample, bi], a_elmwise_mul_b[sample, ai, bi])) # (1, 3*embd_dim)
            S[sample, ai, bi] = torch.mm(W, x.unsqueeze(1))[0][0]


In [None]:
batch_size = 16
embd_dim = 10
a_len = 7
b_len = 4

a = torch.rand(batch_size, a_len, embd_dim).type(torch.DoubleTensor)  # dummy input1
b = torch.rand(batch_size, b_len, embd_dim).type(torch.DoubleTensor)  # dummy input2
# a_elmwise_mul_b: (N, a_len, b_len, embd_dim)   dummy-code
a_elmwise_mul_b = torch.zeros(batch_size, a_len, b_len, embd_dim).type(torch.DoubleTensor)
S = torch.zeros(batch_size, a_len, b_len).type(torch.DoubleTensor)
W = torch.rand(3 * embd_dim).type(torch.DoubleTensor).view(1, -1) # must be trainable params
# for sample in range(batch_size):
#     for ai in range(a_len):
#         for bi in range(b_len):
#             a_elmwise_mul_b[sample, ai, bi] = torch.mul(a[sample, ai], b[sample, bi])
#             x = torch.cat((a[sample, ai], b[sample, bi], a_elmwise_mul_b[sample, ai, bi])) # (1, 3*embd_dim)
#             S[sample, ai, bi] = torch.mm(W, x.unsqueeze(1))[0][0]
S = torch.bmm(a, b.transpose(1, 2))