### 용어
context: question에 대한 hint sentence의 묶음 <br>
sentence: context를 이루고 있는 문장 하나 <br>
question: 질문 (문장 하나)

In [1]:
import itertools
import numpy as np
import pickle

import tensorflow as tf
from tensorflow.contrib.layers import batch_norm
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib import rnn
from tensorflow.contrib import slim
from tqdm import tqdm

In [2]:
# question, answer, context, label
with open('./babi_preprocessd/train_dataset_masked.pkl', 'rb') as f:
    train = pickle.load(f)
with open('./babi_preprocessd/val_dataset_masked.pkl', 'rb') as f:
    val = pickle.load(f)
with open('./babi_preprocessd/test_dataset_masked.pkl', 'rb') as f:
    test = pickle.load(f)   

In [3]:
with open('./babi_preprocessd/c_word_set.pkl', 'rb') as f:
    c_word_set = pickle.load(f)
with open('./babi_preprocessd/q_word_set.pkl', 'rb') as f:
    q_word_set = pickle.load(f)
with open('./babi_preprocessd/a_word_set.pkl', 'rb') as f:
    a_word_set = pickle.load(f)
with open('./babi_preprocessd/cqa_word_set.pkl', 'rb') as f:
    cqa_word_set = pickle.load(f)

In [4]:
[train_q, train_a, train_c, train_l, train_c_real_len, train_q_real_len] = train
[val_q, val_a, val_c, val_l, val_c_real_len, val_q_real_len] = val
[test_q, test_a, test_c, test_l, test_c_real_len, test_q_real_len] = test

In [5]:
c_max_len = 20
s_max_len = 12
q_max_len = 12
mask_index = 0

In [6]:
# parameters
learning_rate = 2e-4
batch_size = 64
iter_time = 6
display_step = 100
seed = 9

* context words: 124 <br>
* question words: 88 <br>
* answer words: 41 <br>
* all words: 159 <br>
* s,q max len: 12 <br>
* c max len: 20

#### LSTM for context
* 32 unit LSTM

#### LSTM for question
* 32 unit LSTM

In [7]:
# model parameter
s_input_step = s_max_len
s_hidden = 32
# ---
q_input_step = q_max_len
q_hidden = 32

In [8]:
# embed matrix
c_word_embed = 32
c_vocab_size = len(c_word_set)+1 # consider masking
c_word_embed_matrix = tf.Variable(tf.random_uniform(shape=[c_vocab_size, c_word_embed], minval=-1, maxval=1, seed= seed))
# ---
q_word_embed = 32
q_vocab_size = len(q_word_set)+1 # consider masking
q_word_embed_matrix = tf.Variable(tf.random_uniform(shape=[q_vocab_size, q_word_embed], minval=-1, maxval=1, seed=seed))

In [9]:
# input, output
c = tf.placeholder(dtype=tf.int32, shape=[batch_size, c_max_len, s_max_len])
c_real_len = tf.placeholder(dtype=tf.int32, shape=[batch_size, c_max_len])
s = tf.placeholder(dtype=tf.int32, shape=[batch_size, s_max_len]) 
q = tf.placeholder(dtype=tf.int32, shape=[batch_size, q_max_len])
q_real_len = tf.placeholder(dtype=tf.int32, shape=[batch_size])
l = tf.placeholder(dtype=tf.float32, shape=[batch_size, c_max_len, c_max_len])
a = tf.placeholder(dtype=tf.float32, shape=[batch_size, len(cqa_word_set)])

In [10]:
def contextLSTM(c, l, c_real_len, reuse=True, scope= "contextLSTM"):
    
    def sentenceLSTM(s, s_real_len, s_hidden=s_hidden, s_max_len= s_max_len, reuse=reuse, scope= "sentenceLSTM"):
        """
        embedding sentence

        Arguments
            s: sentence (word index list), shape = [batch_size, 12]
            s_real_len: length of the sentence before zero padding, int32

        Returns
            embedded_s: embedded sentence, shape = [batch_size, 32]
        """
        embedded_sent_word = tf.nn.embedding_lookup(c_word_embed_matrix, s)
        s_input = tf.unstack(embedded_sent_word, num=s_max_len, axis=1)
        lstm_cell = rnn.BasicLSTMCell(s_hidden, reuse=reuse)
        outputs, _ = rnn.static_rnn(lstm_cell, s_input, dtype=tf.float32, scope= scope)
        # 'outputs' is a list of output at every timestep, we pack them in a Tensor
        outputs = tf.stack(outputs)
        # and change back dimension to [batch_size(64), s_max_len(12), s_hidden(32)]
        outputs = tf.transpose(outputs, [1, 0, 2])
        # Start indices for each sample
        index = tf.range(0, batch_size) * (s_max_len) + (s_real_len-1)
        # Indexing
        outputs = tf.gather(tf.reshape(outputs, [-1, s_hidden]), index)
        return outputs
    
    """
    Args
        c: list of sentences, shape = [batch_size, 20, 12]
        l: list of labels, shape = [batch_size, 20, 20]
        c_real_len: list of real length, shape = [batch_size, 20]
        
    Returns
        tagged_c_objects: list of embedded sentence + label, shape = [batch_size, 52] 20개
        len(tagged_c_objects) = 20
    """
    context = tf.unstack(c, axis=1) 
    real_lens = tf.unstack(c_real_len, axis=1)
    labels = tf.unstack(l, axis=1)
    tagged_c_objects = []
    for sentence, real_len, label in zip(context, real_lens, labels):
        s_embedded = sentenceLSTM(sentence, real_len, reuse=reuse)
        c_embedded = tf.concat([s_embedded, label], axis= 1)
        tagged_c_objects.append(c_embedded)
                                 
    return tagged_c_objects

In [11]:
def questionLSTM(q, q_real_len, q_hidden=q_hidden, reuse=True, scope = "questionLSTM"):
    """
    Args
        q: zero padded qeustions, shape=[batch_size, q_max_len]
        q_real_len: original question length, shape = [batch_size, 1]
        
    Returns
        embedded_q: embedded questions, shape = [batch_size, q_hidden(32)]
    """
    embedded_q_word = tf.nn.embedding_lookup(q_word_embed_matrix, q)
    q_input = tf.unstack(embedded_q_word, num=q_max_len, axis=1)
    lstm_cell = rnn.BasicLSTMCell(q_hidden, reuse=reuse)
    outputs, _ = rnn.static_rnn(lstm_cell, q_input, dtype=tf.float32, scope = scope)
    # 'outputs' is a list of output at every timestep, we pack them in a Tensor
    outputs = tf.stack(outputs)
    # and change back dimension to [batch_size(64), q_max_len(12), s_hidden(32)]
    outputs = tf.transpose(outputs, [1, 0, 2])
    # Start indices for each sample
    index = tf.range(0, batch_size) * (q_max_len) + (q_real_len-1)
    # Indexing
    outputs = tf.gather(tf.reshape(outputs, [-1, s_hidden]), index)
    return outputs

In [12]:
def convert_to_RN_input(embedded_c, embedded_q):
    """
    
    Args
        embedded_c: output of contextLSTM, 20 length list of embedded sentences
        embedded_q: output of questionLSTM, embedded question
        
    Returns
        RN_input: input for RN g_theta, shape = [batch_size*190, (52+52+32)]
        considered batch_size and all combinations
    """
    # 20 combination 2 --> total 190 object pairs
    object_pairs = list(itertools.combinations(embedded_c, 2))
    # concatenate with question
    RN_inputs = []
    for object_pair in object_pairs:
        RN_input = tf.concat([object_pair[0], object_pair[1], embedded_q], axis=1)
        RN_inputs.append(RN_input)
    
    return tf.concat(RN_inputs, axis=0)

#### RN
* $g_\theta$: 4 layer, all 256 units MLP, ReLU
* $f_\phi$: 3 layer, 256/512/159 units MLP, ReLU, sotmax

In [13]:
g_units = [256,256,256,256]
f_units = [256,512,159]

In [14]:
def fc(inputs, output_shape, activation_fcn = tf.nn.relu, name="fc"):
    output = slim.fully_connected(inputs, int(output_shape), activation_fn=activation_fcn)
    return output

In [15]:
def batch_norm_relu(inputs, output_shape, phase=True, scope=None, activation= True):
    with tf.variable_scope(scope):
        h1 = fully_connected(inputs, output_shape, activation_fn=None, scope='dense')
        h2 = batch_norm(h1, center=True, scale=True, is_training=phase, scope='bn')
        if activation:
            o = tf.nn.relu(h2, 'relu')
        else:
            o = h2
        return o

In [16]:
def g_theta(RN_input, scope= 'g_theta', reuse= True): 
    """
    Args
        RN_input: [o_i, o_j, q], shape = [batch_size*190, 136]
        
    Returns
        g_output: shape = [190, batch_size, 256]
    """
    input_dim = RN_input.shape[1]
    with tf.variable_scope(scope, reuse= reuse) as scope:
        # if not reuse: log.warn(scope.name): reuse하는지 확인하기 위한 출력옵션
#         if not reuse:
#             print(scope.name)
        g_1 = batch_norm_relu(RN_input, g_units[0], scope= "g_1")
        g_2 = batch_norm_relu(g_1, g_units[1], scope= "g_2")
        g_3 = batch_norm_relu(g_2, g_units[2], scope= "g_3")
        g_4 = batch_norm_relu(g_3, g_units[3], scope= "g_4")
    g_output = tf.reshape(g_4, shape=[190, batch_size, g_units[3]])
    return g_output

In [17]:
def f_phi(g, scope= 'f_phi', reuse=True):
    """
    Args
        g: g_theta result, shape = [190, batch_size, 256]
        
    Returns
        f_output: shape = [batch_size, 159]
    """
    f_input = tf.reduce_sum(g, axis=0)
    with tf.variable_scope(scope, reuse=reuse) as scope:
        f_1 = batch_norm_relu(f_input, f_units[0], scope= "f_1")
        f_2 = batch_norm_relu(f_1, f_units[1], scope= "f_2")
        f_3 = batch_norm_relu(f_2, f_units[2], activation= None, scope= "f_3")
    return f_3

reuse...

In [22]:
def model(c, q, l, c_real_len, q_real_len):
    embedded_c = contextLSTM(c, l, c_real_len, reuse=True)
    embedded_q = questionLSTM(q, q_real_len, reuse=None)
    RN_input = convert_to_RN_input(embedded_c, embedded_q)
    f_input = g_theta(RN_input, reuse=None)
    prediction = f_phi(f_input, reuse=None)
    return prediction

In [23]:
prediction = model(c,q,l,c_real_len, q_real_len)

In [24]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(125, 32) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(89, 32) dtype=float32_ref>,
 <tf.Variable 'sentenceLSTM/basic_lstm_cell/weights:0' shape=(64, 128) dtype=float32_ref>,
 <tf.Variable 'sentenceLSTM/basic_lstm_cell/biases:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'questionLSTM/basic_lstm_cell/weights:0' shape=(64, 128) dtype=float32_ref>,
 <tf.Variable 'questionLSTM/basic_lstm_cell/biases:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_1/dense/weights:0' shape=(136, 256) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_1/dense/biases:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_1/bn/beta:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_1/bn/gamma:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_2/dense/weights:0' shape=(256, 256) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_2/dense/biases:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'g_theta/g_2/bn/beta:0' shape=(256,) dtype=flo

#### Optimization
* 64 mini batches
* cross-entropy loss function
* Adam optimizer
* learning rate: 2e-4

## Issues
1. multiple answer?

In [25]:
correct = tf.equal(tf.argmax(prediction, axis=1), tf.argmax(a, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [26]:
loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=a)

In [27]:
global_step = tf.Variable(0, name="global_step", trainable=False)

In [28]:
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = opt.minimize(loss, global_step=global_step)

## Train!

In [29]:
def batch_iter(c, q, l, a, c_real_len, q_real_len, batch_size=batch_size, num_epochs=iter_time, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    c = np.array(c)
    q = np.array(q)
    l = np.array(l)
    a = np.array(a)
    c_real_len = np.array(c_real_len)
    q_real_len = np.array(q_real_len)
    data_size = len(q)
    num_batches_per_epoch = int(data_size/batch_size) + 1
    for epoch in range(num_epochs):
        print("In epoch >> " + str(epoch + 1))
        print("num batches per epoch is: " + str(num_batches_per_epoch))
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            c_shuffled = c[shuffle_indices]
            q_shuffled = q[shuffle_indices]
            l_shuffled = l[shuffle_indices]
            a_shuffled = a[shuffle_indices]
            c_real_len_shuffled = c_real_len[shuffle_indices]
            q_real_len_shuffled = q_real_len[shuffle_indices]
        else:
            c_shuffled = c
            q_shuffled = q
            l_shuffled = l
            a_shuffled = a
            c_real_len_shuffled = c_real_len
            q_real_len_shuffled = q_real_len
        
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = (batch_num + 1) * batch_size
            if end_index < data_size:
                c_batch, q_batch, l_batch, a_batch, c_real_len_batch, q_real_len_batch = c_shuffled[start_index:end_index], q_shuffled[start_index:end_index], l_shuffled[start_index:end_index], a_shuffled[start_index:end_index], c_real_len_shuffled[start_index:end_index], q_real_len_shuffled[start_index:end_index]
            yield list(zip(c_batch, q_batch, l_batch, a_batch, c_real_len_batch, q_real_len_batch))

In [None]:
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    print("====training====")    
    batch_train = batch_iter(train_c, train_q, train_l, train_a, train_c_real_len, train_q_real_len)
    for train in batch_train:
        c_batch, q_batch, l_batch, a_batch, c_real_len_batch, q_real_len_batch = zip(*train)
        current_step = sess.run(global_step, feed_dict={c:c_batch, q:q_batch, l:l_batch, a:a_batch, c_real_len:c_real_len_batch, q_real_len:q_real_len_batch})
        optimizer.run(feed_dict={c:c_batch, q:q_batch, l:l_batch, a:a_batch, c_real_len:c_real_len_batch, q_real_len:q_real_len_batch})
        if current_step % (display_step) == 0:
            print("step: {}".format(current_step))
            print("====validation start====")
            batch_val = batch_iter(val_c, val_q, val_l, val_a, val_c_real_len, val_q_real_len)
            accs = []
            for val in batch_val:
                c_val, q_val, l_val, a_val, c_real_len_val, q_real_len_val = zip(*val)
                acc = accuracy.eval(feed_dict={c:c_val, q:q_val, l:l_val, a:a_val, c_real_len:c_real_len_val, q_real_len:q_real_len_val})
                accs.append(acc)
            print("Mean accuracy=" + str(sum(accs)/len(accs)))
            print("====training====")

====training====
In epoch >> 1
num batches per epoch is: 2812
step: 0
====validation start====
In epoch >> 1
num batches per epoch is: 312
In epoch >> 2
num batches per epoch is: 312
In epoch >> 3
num batches per epoch is: 312
In epoch >> 4
num batches per epoch is: 312
In epoch >> 5
num batches per epoch is: 312
In epoch >> 6
num batches per epoch is: 312
Mean accuracy=0.0100410657051
====training====
step: 100
====validation start====
In epoch >> 1
num batches per epoch is: 312
In epoch >> 2
num batches per epoch is: 312
In epoch >> 3
num batches per epoch is: 312
In epoch >> 4
num batches per epoch is: 312
In epoch >> 5
num batches per epoch is: 312
In epoch >> 6
num batches per epoch is: 312
Mean accuracy=0.308476896368
====training====
step: 200
====validation start====
In epoch >> 1
num batches per epoch is: 312
In epoch >> 2
num batches per epoch is: 312
In epoch >> 3
num batches per epoch is: 312
In epoch >> 4
num batches per epoch is: 312
In epoch >> 5
num batches per epoch is