In [1]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_glove_weights

In [5]:
def load_task(dataset_path):
    ret_data = []
    with open(dataset_path) as f:
        data = json.load(f)
        ver = data['version']
        print('dataset version:', ver)
        data = data['data']
        for i, d in enumerate(data):
            if i % 100 == 0: print('load_task:', i, '/', len(data))
            # print('load', d['title'], i, '/', len(data))
            for p in d['paragraphs']:
                c = word_tokenize(p['context'])
                cc = list(p['context'])
                q, a = [], []
                for qa in p['qas']:
                    q = word_tokenize(qa['question'])
                    qc = list(qa['question'])
                    a = [ans['text'] for ans in qa['answers']]
                    ret_data.append((c, cc, qa['id'], q, qc, a))
            break
    return ret_data


In [7]:
train_data = load_task('./dataset/train-v1.1.json')
dev_data = load_task('./dataset/dev-v1.1.json')
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer in train_data+dev_data:
    vocab_w |= set(ctx_w + q_w + answer)
    vocab_c |= set(ctx_c + q_c + answer)

vocab_w = list(sorted(vocab_w))
vocab_c = list(sorted(vocab_c))

w2i_w = dict((c, i) for i, c in enumerate(vocab_w, 0))
i2w_w = dict((i, c) for i, c in enumerate(vocab_w, 0))
w2i_c = dict((c, i) for i, c in enumerate(vocab_c, 0))
i2w_c = dict((i, c) for i, c in enumerate(vocab_c, 0))
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w = len(vocab_w)
vocab_size_c = len(vocab_c)

ctx_w_maxlen = max(map(len, (c for c, _, _, _, _, _ in train_data)))
query_w_maxlen = max(map(len, (q for _, _, _, q, _, _ in train_data)))
ctx_c_maxlen = max(map(len, (cc for c, cc, _, _, _, _ in train_data)))
query_c_maxlen = max(map(len, (qc for _, _, _, q, qc, _ in train_data)))

print('n_train', len(train_data))
print('n_dev', len(dev_data))
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_w_maxlen:', ctx_w_maxlen)
print('query_w_maxlen:', query_w_maxlen)
print('ctx_c_maxlen:', ctx_c_maxlen)
print('query_c_maxlen:', query_c_maxlen)

dataset version: 1.1
load_task: 0 / 442
dataset version: 1.1
load_task: 0 / 48
n_train 269
n_dev 810
vocab_size_w: 4588
vocab_size_c: 936
ctx_w_maxlen: 333
query_w_maxlen: 25
ctx_c_maxlen: 1786
query_c_maxlen: 137


In [None]:
# sent1 = list('i have a cat')
# sent2 = list('i had a aieu')
# data = [sent1, sent2]
# vocab = set(sent1+sent2)
# vocab_size = len(vocab)
# print(data)
# print('vocab_size', vocab_size)
# w2i = {w:i for i, w in enumerate(vocab)}
# i2w = {i:w for i, w in enumerate(vocab)}

# max_word_len = max([len(word) for word in data])
# print('max_word_len', max_word_len)

In [None]:
class CharEmbedding(nn.Module):
    def __init__(self, vocab_size, embd_size, out_chs, filters):
        super(CharEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        # nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        self.conv = nn.ModuleList([nn.Conv2d(1, out_chs, (f[0], f[1])) for f in filters])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filters), 1)
        
    def forward(self, x):
        bs = x.size(0)
        x = self.embedding(x) # (N, seq_len, embd_dim)
        print('embd', x.size())
        x = x.unsqueeze(1) # (N, Cin, W, embd_dim), insert Channnel-In dim
        print('unsq', x.size())
        print('conv[0](x)', self.conv[0](x).size())
        # Conv2d
        #    Input : (N,Cin, Hin, Win )
        #    Output: (N,Cout,Hout,Wout) 
        x = [F.relu(conv(x)) for conv in self.conv]
        # (N,Cout,Hout,Wout) -> [(N,Cout,Hout*Wout)] * len(filter_heights)
        x = [xx.view((xx.size(0), xx.size(1), xx.size(2)*xx.size(3))) for xx in x]
        print('x[0]', x[0].size())
        # max_pool1d(input, kernel_size, ..
        # (N, Cout, Hout*Wout) --(max_pool1d)--> (N, Cout, 1) --(squeeze(2))--> (N, Cout)
        # [(N, Cout)]  len(filter_heights)
        x = [F.max_pool1d(xx, xx.size(2)).squeeze(2) for xx in x]
        print('after maxpool', x[0].size())
        out = torch.cat(x, 1) # (N, Cout*len(filter_heights))
        print('out', out.size())
        return out

embd_size = 100
n_out_ch = 100
filters = [[1, 5]]
char_embd_net = CharEmbedding(vocab_size, embd_size, n_out_ch, filters)
print(char_embd_net)
char_var = Variable(torch.LongTensor([[w2i[w] for w in data[0]]]))
out = char_embd_net(char_var)

In [None]:

glove_embd_w = torch.from_numpy(load_glove_weights('./', embd_size, vocab_size, w2i))

In [None]:
sent1 = 'i have a cat'.split(' ')
sent2 = 'i had a aieu'.split(' ')
data = [sent1, sent2]
vocab = set(sent1+sent2)
vocab_size = len(vocab)
print(data)
print('vocab_size', vocab_size)
w2i = {w:i for i, w in enumerate(vocab)}
i2w = {i:w for i, w in enumerate(vocab)}

max_word_len = max([len(word) for word in data])
print('max_word_len', max_word_len)

In [None]:
# In : (N, sentence_len, vocab_size)
# Out: (N, sentence_len, embd_size)
class WordEmbedding(nn.Module):
    def __init__(self, vocab_size, embd_size, is_train_embd=False, pre_embd_w=None):
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embd_size)
        if pre_embd_w is not None:
            self.embedding.weight = nn.Parameter(pre_embd_w, requires_grad=is_train_embd)
        
    def forward(self, x):
        x = self.embedding(x)
        out = F.relu(x)
        print('out', out.size())
        return out

word_embd_net = WordEmbedding(vocab_size, embd_size, False, glove_embd_w)
word_var = Variable(torch.LongTensor([[w2i[w] for w in data[0]]]))
out = word_embd_net(word_var)


In [9]:
vocab_c

[' ',
 '!',
 '"',
 '"Hymn for the Weekend"',
 '"Small Business Big Game"',
 '"Super Bowl City',
 '"Super Bowl City"',
 '"golden anniversary',
 '"golden anniversary"',
 '#',
 '$',
 '$1.2 billion',
 '$1.2 million',
 '$2 million',
 '$215 million',
 '$350 million',
 '$40 million',
 '$400m',
 '$5 million',
 '$5 million for a 30-second',
 '$5 million.',
 '$5,000,000',
 '$5,000,000,',
 '$9 million',
 '%',
 '&',
 "'",
 '(',
 '(1110 AM)',
 ')',
 ',',
 ', 23–16,',
 '-',
 '.',
 '. Jim Gray',
 '/',
 '0',
 '1',
 '1 Suffolk Street in Trafalgar Square',
 '1 million',
 '1,250',
 '10',
 '10 times',
 '10.',
 '105',
 '11',
 '1110 AM',
 '118',
 '11:28',
 '12',
 '12,179',
 '1240',
 '12–4',
 '13',
 '13 years',
 '13 years and 48 days',
 '136',
 '14',
 '15',
 '15–1',
 '17',
 '17 seconds',
 '17th',
 '17th of May',
 '18',
 '18-karat gold',
 '18-karat gold-plated',
 '1842',
 '1846',
 '1849',
 '1851–1921',
 '1854',
 '1865',
 '1873',
 '1879',
 '1882',
 '1883',
 '1887',
 '1896',
 '18th overall',
 '19',
 '19.7%',
 '