In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_task, load_glove_weights
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway
from config import Config

In [3]:
train_data, train_ctx_maxlen = load_task('./dataset/train-v1.1.json')
# dev_data = load_task('./dataset/dev-v1.1.json')
data = train_data # + dev_data
ctx_maxlen = train_ctx_maxlen
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer, _, _ in data:
    vocab_w |= set(ctx_w + q_w + answer)
    flatten_c = [c for chars in ctx_c for c in chars]
    flatten_q = [c for chars in q_c for c in chars]

    vocab_c |= set(flatten_c + flatten_q) # TODO

vocab_w = list(sorted(vocab_w))
vocab_c = list(sorted(vocab_c))

w2i_w = dict((w, i) for i, w in enumerate(vocab_w, 0))
i2w_w = dict((i, w) for i, w in enumerate(vocab_w, 0))
w2i_c = dict((c, i) for i, c in enumerate(vocab_c, 0))
i2w_c = dict((i, c) for i, c in enumerate(vocab_c, 0))
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w = len(vocab_w)
vocab_size_c = len(vocab_c)

ctx_sent_maxlen = max([len(c) for c, _, _, _, _, _, _, _ in data])
query_sent_maxlen = max([len(q) for _, _, _, q, _, _, _, _ in data])
ctx_word_maxlen = max([len(w) for _, cc, _, _, _, _, _, _ in data for w in cc])
query_word_maxlen = max([len(w) for _, _, _, _, qc, _, _, _ in data for w in qc])
print('----')
print('n_train', len(train_data))
# print('n_dev', len(dev_data))
print('ctx_maxlen', ctx_maxlen)
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_sent_maxlen:', ctx_sent_maxlen)
print('query_sent_maxlen:', query_sent_maxlen)
print('ctx_word_maxlen:', ctx_word_maxlen)
print('query_word_maxlen:', query_word_maxlen)

dataset version: 1.1
load_task: 0 / 442
----
n_train 269
ctx_maxlen 1786
vocab_size_w: 2783
vocab_size_c: 89
ctx_sent_maxlen: 333
query_sent_maxlen: 25
ctx_word_maxlen: 22
query_word_maxlen: 14


In [4]:
embd_size = 100
glove_embd_w = torch.from_numpy(load_glove_weights('./dataset', embd_size, vocab_size_w, w2i_w))

Found 400000 word vectors.
embed_matrix.shape (2783, 100)


In [15]:
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway

args = {
    'embd_size': embd_size,
    'vocab_size_c': vocab_size_c,
    'vocab_size_w': vocab_size_w,
    'pre_embd_w': glove_embd_w, # word embedding
    'filters': [[1, 5]], # char embedding
    'out_chs': 100, # char embedding
    'ans_size': ctx_maxlen
}
args = Config(**args)

def to_var(x):
    # TODO CUDA
    return Variable(x)

def make_word_vector(data, w2i_w, query_len):
    vec_data = []
    for sentence in data:
        index_vec = [w2i_w[w] for w in sentence]
        pad_len = max(0, query_len - len(index_vec))
        index_vec += [0] * pad_len
        index_vec = index_vec[:query_len]
        vec_data.append(index_vec)
    
    return to_var(torch.LongTensor(vec_data))

def make_char_vector(data, w2i_c, query_len, word_len):
    tmp = torch.zeros(len(data), query_len, word_len).type(torch.LongTensor)
    for i, words in enumerate(data):
        for j, word in enumerate(words):
            for k, ch in enumerate(word):
                tmp[i][j][k] = w2i_c[ch]
    return to_var(tmp)

def make_one_hot(data, data_size):
    tmp = torch.zeros(len(data), data_size).type(torch.LongTensor)
    for i in range(len(data)):
        tmp[i, data[i][0]] = 1
    return to_var(tmp)
    
class AttentionNet(nn.Module):
    def __init__(self, args):
        super(AttentionNet, self).__init__()
        self.embd_size = args.embd_size
        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)
        self.highway_net = Highway(args.embd_size*2)# TODO share is ok?
        self.ctx_embd_layer = nn.GRU(args.embd_size*2, args.embd_size*2, bidirectional=True)
        self.W = nn.Parameter(torch.rand(3*2*2* args.embd_size).type(torch.FloatTensor).view(1, -1), requires_grad=True)
#         self.beta = nn.Parameter(torch.rand(8*2*2* args.embd_size).type(torch.FloatTensor).view(1, -1), requires_grad=True)
        self.modeling_layer = nn.GRU(args.embd_size*2*8, args.embd_size*2, bidirectional=True)
        self.p1_layer = nn.Linear(args.embd_size*2*10, args.ans_size)
        self.p2_lstm_layer = nn.GRU(args.embd_size*2*2, args.embd_size*2*2, bidirectional=True)
        self.p2_layer = nn.Linear(args.embd_size*2*12, args.ans_size)
    
    def build_contextual_embd(self, x_c, x_w):
        char_embd = self.char_embd_net(x_c) # (N, seq_len, embd_size)
        word_embd = self.word_embd_net(x_w) # (N, seq_len, embd_size)
        embd = torch.cat((char_embd, word_embd), 2) # (N, seq_len, embd_size*2)
        embd = self.highway_net(embd)
        ctx_embd_out, ctx_embd_h = self.ctx_embd_layer(embd)
        print('ctx_embd_out', ctx_embd_out.size())
        print('ctx_embd_h', ctx_embd_h.size())
        return ctx_embd_out
        
    def forward(self, ctx_c, ctx_w, query_c, query_w):
        batch_size = ctx_c.size(0)
        embd_context = self.build_contextual_embd(ctx_c, ctx_w) # (N, T, 2d)
        ctx_len = embd_context.size(1)
        embd_query   = self.build_contextual_embd(query_c, query_w) # (N, J, 2d)
        query_len = embd_query.size(1)
        
        # 4. Attention Flow Layer
        # Context2Query
        print('-----------')
        print('ctx_embd_context', embd_context.size())
        print('ctx_embd_query', embd_query.size())
        a_elmwise_mul_b = Variable(torch.zeros(batch_size, ctx_len, query_len, 2*2*self.embd_size).type(torch.FloatTensor))
        S = Variable(torch.zeros(batch_size, ctx_len, query_len).type(torch.FloatTensor))
        for sample in range(batch_size): # TODO
            for ci in range(ctx_len):
                for qi in range(query_len):
                    a_elmwise_mul_b[sample, ci, qi] = torch.mul(embd_context[sample, ci], embd_query[sample, qi])
                    x = torch.cat((embd_context[sample, ci], embd_query[sample, qi], a_elmwise_mul_b[sample, ci, qi]), 0) # (1, 3*2*embd_dim)
                    S[sample, ci, qi] = torch.mm(self.W, x.unsqueeze(1))[0][0]
                    break
                S[sample, ci] = F.softmax(S[sample, ci]) # softmax(in, dim) is only available in newer version                
                break
            break
            
        c2q = torch.bmm(S, embd_query) # (N, T, 2d)
    
        tmp_b = torch.max(S, 2)[0]
        b = torch.stack([F.softmax(tmp_b[i]) for i in range(batch_size)], 0) # (N, T)
        q2c = torch.bmm(b.unsqueeze(1), embd_context).squeeze() # (N, 2d)
        q2c = q2c.unsqueeze(1) # (N, 1, 2d)
        q2c = q2c.repeat(1, ctx_len, 1) # (N, T, 2d)
        
        G = torch.cat((embd_context, c2q, embd_context.mul(c2q), embd_context.mul(q2c)), 2) # (N, T, 8d)
        
        # 5. Modeling Layer
        M, _ = self.modeling_layer(G) # M: (N, T, 2d)
        
        # 5. Output Layer
        G_M = torch.cat((G, M), 2) # (N, T, 10d)
        G_M = G_M.sum(1) #(N, 10d)
        p1 = F.softmax(self.p1_layer(G_M)) # (N, T)
        
        M2, _ = self.p2_lstm_layer(M) # (N, T, 4d)
        G_M2 = torch.cat((G, M2), 2) # (N, T, 12d)
        G_M2 = G_M2.sum(1) # (N, 12d)(N, T)
        p2 = F.softmax(self.p2_layer(G_M2)) # (N, T)
        
        return p1, p2
        
def train(model, optimizer, loss_fn, n_epoch=1, batch_size=16):
    for epoch in range(n_epoch):
        for i in range(0, len(data)-batch_size, batch_size): # TODO shuffle, last elms
            batch_data = data[i:i+batch_size]
            c = [d[0] for d in batch_data]
            cc = [d[1] for d in batch_data]
            q = [d[3] for d in batch_data]
            qc = [d[4] for d in batch_data]
            a_beg = to_var(torch.LongTensor([d[6] for d in batch_data]).squeeze())
            a_end = to_var(torch.LongTensor([d[7] for d in batch_data]).squeeze())
            c_char_var = make_char_vector(cc, w2i_c, ctx_sent_maxlen, ctx_word_maxlen)
            c_word_var = make_word_vector(c, w2i_w, ctx_sent_maxlen)
            q_char_var = make_char_vector(qc, w2i_c, query_sent_maxlen, query_word_maxlen)
            q_word_var = make_word_vector(q, w2i_w, query_sent_maxlen)
            p1, p2 = model(c_char_var, c_word_var, q_char_var, q_word_var)
            
            loss_p1 = loss_fn(p1, a_beg)
#             loss_p2 = loss_fn(p2, a_end)
            
            model.zero_grad()
            loss_p1.backward()
#             loss_p2.backward()
            optimizer.step()
            
            break

model = AttentionNet(args)
# print(model)
optimizer = torch.optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()), lr=0.5)
loss_fn = nn.NLLLoss()
train(model, optimizer, loss_fn)
print('finish train')

ctx_embd_out torch.Size([16, 333, 400])
ctx_embd_h torch.Size([2, 333, 200])
ctx_embd_out torch.Size([16, 25, 400])
ctx_embd_h torch.Size([2, 25, 200])
-----------
ctx_embd_context torch.Size([16, 333, 400])
ctx_embd_query torch.Size([16, 25, 400])


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

In [None]:
# # %load layers/char_embedding.py
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.autograd import Variable

# # In : (N, sentence_len, word_len, vocab_size_c)
# # Out: (N, sentence_len, embd_size)
# class CharEmbedding(nn.Module):
#     def __init__(self, args):
#         super(CharEmbedding, self).__init__()
#         self.embd_size = args.embd_size
#         self.embedding = nn.Embedding(args.vocab_size_c, args.embd_size)
#         # nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
#         self.conv = nn.ModuleList([nn.Conv2d(1, args.out_chs, (f[0], f[1])) for f in args.filters])
#         self.dropout = nn.Dropout(.5)
#         self.fc1 = nn.Linear(args.out_chs*len(args.filters), 1)
        
#     def forward(self, x):
#         # x: (N, seq_len, word_len)
#         input_shape = x.size()
#         bs = x.size(0)
#         seq_len = x.size(1)
#         word_len = x.size(2)
#         x = x.view(-1, word_len) # (N*seq_len, word_len)
#         x = self.embedding(x) # (N*seq_len, word_len, embd_size)
#         x = x.view(*input_shape, -1) # (N, seq_len, word_len, embd_size)
#         x = x.sum(2) # (N, seq_len, embd_size)
        
#         return x
# net = CharEmbedding(args)
# bs = 10
# seq_len = 7
# word_len = 5
# input = Variable(torch.zeros(bs, seq_len, word_len)).long()
# out = net(input)
# print('out', out.size())

In [None]:
# CharEmbedding test
embd_size = 100
n_out_ch = 100
filters = [[1, 5]]
tmp_data = data[0][4]
max_len = max([len(chars) for chars in tmp_data])
tmp_var = torch.zeros(1, query_sent_maxlen, query_word_maxlen).type(torch.LongTensor)
print('tmp_var.size()=', tmp_var.size())
for i, chars in enumerate(tmp_data):
    for j, ch in enumerate(chars):
        tmp_var[0][i][j] = w2i_c[ch]
char_embd_net = CharEmbedding(vocab_size_c, embd_size, n_out_ch, filters)
print(char_embd_net)
out = char_embd_net(Variable(tmp_var))
print(out)
print('out', out.size())

In [None]:
# WordEmbedding Test
word_embd_net = WordEmbedding(vocab_size_w, embd_size, False, glove_embd_w)
word_var = Variable(torch.LongTensor([[w2i_w[w] for w in data[0][3]]]))
out = word_embd_net(word_var)
print(out.size())

In [None]:
batch_size = 16
embd_dim = 10
a_len = 7
b_len = 4

a = torch.rand(batch_size, a_len, embd_dim).type(torch.DoubleTensor)  # dummy input1
b = torch.rand(batch_size, b_len, embd_dim).type(torch.DoubleTensor)  # dummy input2
# a_elmwise_mul_b: (N, a_len, b_len, embd_dim)   dummy-code
a_elmwise_mul_b = torch.zeros(batch_size, a_len, b_len, embd_dim).type(torch.DoubleTensor)
S = torch.zeros(batch_size, a_len, b_len).type(torch.DoubleTensor)
W = torch.rand(3 * embd_dim).type(torch.DoubleTensor).view(1, -1) # must be trainable params
# I think there are better way than below
for sample in range(batch_size):
    for ai in range(a_len):
        for bi in range(b_len):
            a_elmwise_mul_b[sample, ai, bi] = torch.mul(a[sample, ai], b[sample, bi])
            x = torch.cat((a[sample, ai], b[sample, bi], a_elmwise_mul_b[sample, ai, bi])) # (1, 3*embd_dim)
            S[sample, ai, bi] = torch.mm(W, x.unsqueeze(1))[0][0]


In [None]:
batch_size = 16
embd_dim = 10
a_len = 7
b_len = 4

a = torch.rand(batch_size, a_len, embd_dim).type(torch.DoubleTensor)  # dummy input1
b = torch.rand(batch_size, b_len, embd_dim).type(torch.DoubleTensor)  # dummy input2
# a_elmwise_mul_b: (N, a_len, b_len, embd_dim)   dummy-code
a_elmwise_mul_b = torch.zeros(batch_size, a_len, b_len, embd_dim).type(torch.DoubleTensor)
S = torch.zeros(batch_size, a_len, b_len).type(torch.DoubleTensor)
W = torch.rand(3 * embd_dim).type(torch.DoubleTensor).view(1, -1) # must be trainable params
# for sample in range(batch_size):
#     for ai in range(a_len):
#         for bi in range(b_len):
#             a_elmwise_mul_b[sample, ai, bi] = torch.mul(a[sample, ai], b[sample, bi])
#             x = torch.cat((a[sample, ai], b[sample, bi], a_elmwise_mul_b[sample, ai, bi])) # (1, 3*embd_dim)
#             S[sample, ai, bi] = torch.mm(W, x.unsqueeze(1))[0][0]
S = torch.bmm(a, b.transpose(1, 2))

In [None]:
for c, _, _, _, _, _, _, _ in data:
    print(c)
    break

In [8]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=16, shuffle=True)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)

model = Net()
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        print(data.size())
        print(target.size())
        break
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0]))


for epoch in range(1, args.epochs + 1):
    train(epoch)

usage: ipykernel_launcher.py [-h] [--batch-size N] [--test-batch-size N]
                             [--epochs N] [--lr LR] [--momentum M] [--no-cuda]
                             [--seed S] [--log-interval N]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1000/jupyter/kernel-a818d404-51e0-4797-9902-76133a624891.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
