In [None]:
# Import modules needed
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
import pickle

%matplotlib inline
# Notebook-wide matplotlib defaults (apply to every figure created below).
plt.rcParams['figure.figsize'] = (5.0, 5.0) # set default size of plots (width, height in inches)
plt.rcParams['image.interpolation'] = 'nearest' # show image pixels without smoothing
plt.rcParams['image.cmap'] = 'gray' # default colormap for single-channel images
plt.ion() # switch matplotlib to interactive mode

import util
from vqa_model import Encoder, Decoder, VQASystem, AttnGRU, AttnGRUCell
from squeezenet import SqueezeNet
from vgg16 import VGG16

from compact_bilinear_pooling import compact_bilinear_pooling_layer

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Load data (excluding images)

# Restrict the number of possible answers using this
# Decreasing this will increase the number of classes 
train_min_count = 99
# Forwarded to util.load_data_all as `val_cutoff`
# (presumably the size of the validation split -- confirm in util).
val_cutoff = 107183

# Load data
# The cutoff is passed by name so that editing `val_cutoff` above actually
# takes effect (it used to be hard-coded a second time in the call).
dataset = util.load_data_all(train_min_count, val_cutoff=val_cutoff, limit=100000000)
np_embeddings = np.load("data/glove.trimmed.100.npz")["glove"]
answer_to_id, id_to_answer = util.load_answer_map(min_count=train_min_count)
# NOTE: pickle.load can execute arbitrary code; only point this at a file
# produced by this project's own preprocessing.
with open('data/qid_to_anstype.dat', 'rb') as fp:
    qid_to_anstype = pickle.load(fp)


def _print_dataset_summary(dataset, np_embeddings, answer_to_id):
    """Print array shapes for each split plus an answer-id sanity check.

    For each of the train/val/test splits, prints the shapes of the question,
    mask, question-id, image-id, all-answers and answers arrays, then the
    embedding matrix shape, then the number of answer classes followed by
    (largest id + 1) for every answers / all_answers array so they can be
    compared by eye.
    """
    separator = "*" * 60
    splits = (("train", dataset.train), ("val", dataset.val), ("test", dataset.test))
    # (printed label prefix, attribute name on the split object)
    fields = (("Questions_", "questions"),
              ("Questions_mask_", "mask"),
              ("Questions_ids_", "question_ids"),
              ("Image_ids_", "image_ids"),
              ("All_answers_", "all_answers"),
              ("Answers_", "answers"))

    for split_name, split in splits:
        print(separator)
        for label, attr in fields:
            print(label + split_name, getattr(split, attr).shape)
    print(separator)
    print("np_embeddings", np_embeddings.shape)
    print(separator)
    print("There are", len(answer_to_id), "possible answers (including <unk>)")
    # Largest answer id + 1 in each array; same order as before:
    # answers for train/val/test, then all_answers for train/val/test.
    for attr in ("answers", "all_answers"):
        for _, split in splits:
            print("This should be less than or equal to above", np.max(getattr(split, attr)) + 1)
    print(separator)


_print_dataset_summary(dataset, np_embeddings, answer_to_id)

In [None]:
class Config:
    """Holds model hyperparams and data information.

    The class object itself (not an instance) is passed as ``config`` to the
    encoder, decoder, VQASystem and util.initialize_model, so every setting is
    a class attribute.
    """
    # --- Optimisation ---
    epochs = 10

    learning_rate = 0.001
    lr_decay = 0.9
    optimizer = tf.train.AdamOptimizer
    max_gradient_norm = 10.0
    clip_gradients = True
    dropout_keep_prob = 1.0
    l2_reg = 0.0
    # Passed to the slim VGG arg scope in BaselineEncoder.encode.
    weight_decay = 5e-4
    batch_size = 32
    train_all = True

    train_limit = 10000000000

    # --- Data / output dimensions ---
    max_question_length = 25
    num_answers_per_question = 10
    # One output unit per known answer; `answer_to_id` comes from the
    # data-loading cell, so that cell must run before this one.
    num_classes = len(answer_to_id)
    image_size = [224, 224, 3]
    cbpl_output_dim = 8000
    att_conv1_dim = 512
    num_levels = 2

    # Enables batch normalization in the memory update (and is forwarded to
    # AttnGRUCell).
    batch_normalize = False

    # Spatial grid of the VGG feature map used as image "facts". Defined once:
    # images_input_sequece_len below is derived from it.
    vgg_out_dim = [14, 14]

    # Number of image positions after flattening the grid (14 * 14 = 196).
    # The attribute name (including its spelling) is kept because the encoder
    # reads it under this name.
    images_input_sequece_len = vgg_out_dim[0] * vgg_out_dim[1]

    # --- Layer sizes ---
    rnn_hidden_size = 200 # RNN
    fc_state_size = 100 # Fully connected
    embedding_size = 100
    # Width each image position is projected to in get_image_seq.
    image_seq_size = 200
    fact_size = rnn_hidden_size

    # --- Evaluation / logging cadence ---
    num_evaluate = 5000

    eval_every = 50
    print_every = 100

    # Per-channel mean subtracted from the input images before VGG.
    VGG_MEAN = [123.68, 116.78, 103.94]

    # --- Checkpoint locations ---
    model_dir = "final_dmn"
    squeeze_net_dir = "sq_net_model/squeezenet.ckpt"
    vgg16_weight_file = "vgg_net_dir/vgg_16.ckpt"

    # VGG end-point names that BaselineEncoder.encode separates from the rest.
    vgg_exclude_names = ['qa/vgg_16/pool5', 'qa/vgg_16/fc6', 'qa/vgg_16/fc7', 'qa/vgg_16/fc8']

In [None]:
class BaselineEncoder(Encoder):
    """Episodic-memory encoder over VGG-16 image features and a GRU question code.

    ``encode`` runs the images through slim's VGG-16, turns the conv5_3 feature
    map into a sequence of per-position "facts", encodes the question with a
    GRU, and then refines a memory vector over ``config.num_levels`` attention
    episodes. The final memory vector is returned as the knowledge
    representation.
    """

    def get_image_seq(self, images):
        """Project a conv feature map to a sequence of image facts.

        Args:
            images: feature tensor that can be reshaped to
                (batch, config.images_input_sequece_len, 512).

        Returns:
            Tensor of shape (batch, config.images_input_sequece_len,
            config.image_seq_size): tanh of a shared linear projection applied
            to every position.
        """
        feature_depth = 512  # channel depth the input is reshaped to
        seq_len = self.config.images_input_sequece_len
        out_size = self.config.image_seq_size

        with tf.variable_scope('image_facts'):
            image_seq = tf.reshape(images, shape=(-1, seq_len, feature_depth))
            print("init_image_seq", image_seq)

            W_facts = tf.get_variable(name="W_facts",
                                      shape=(feature_depth, out_size),
                                      dtype=tf.float32,
                                      initializer=tf.contrib.layers.xavier_initializer())
            b_facts = tf.get_variable(name="b_facts",
                                      shape=(out_size,),
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0))

            # Collapse batch and position so one matmul projects every position.
            seq_reshaped = tf.reshape(image_seq, shape=[-1, feature_depth])
            print("seq_reshaped", seq_reshaped)
            seq_linear = tf.tanh(tf.matmul(seq_reshaped, W_facts) + b_facts)
            image_seq = tf.reshape(seq_linear, shape=(-1, seq_len, out_size))
            print("image_seq", image_seq)

            return image_seq

    def get_attenton_g(self, f, q, m_prev):
        """Compute one attention weight per image position.

        Args:
            f: facts, reshaped internally as (batch, positions, rnn_hidden_size)
                features.
            q: question vector, broadcast against ``f`` over the position axis.
            m_prev: previous memory vector, broadcast the same way.

        Returns:
            Tensor of shape (batch, config.images_input_sequece_len) holding the
            normalized attention weights.
        """
        hidden = self.config.rnn_hidden_size

        with tf.variable_scope('attention_g'):
            W_att_1 = tf.get_variable(name="W_att_1_weight",
                                      shape=(4 * hidden, hidden),
                                      dtype=tf.float32,
                                      initializer=tf.contrib.layers.xavier_initializer())
            W_att_2 = tf.get_variable(name="W_att_2_weight",
                                      shape=(hidden, 1),
                                      dtype=tf.float32,
                                      initializer=tf.contrib.layers.xavier_initializer())
            b_att_1 = tf.get_variable(name="b_att_1",
                                      shape=(hidden,),
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0))
            b_att_2 = tf.get_variable(name="b_att_2",
                                      shape=(),
                                      dtype=tf.float32,
                                      initializer=tf.constant_initializer(0.0))

            # Add a position axis so q and m_prev broadcast against every fact.
            q = tf.expand_dims(q, axis=1)
            m_prev = tf.expand_dims(m_prev, axis=1)
            print("q", q)
            print("m_prev", m_prev)

            # Interaction features between each fact and the question / memory.
            z = tf.concat((f * q, f * m_prev,
                           tf.abs(f - q), tf.abs(f - m_prev)),
                          axis=2)
            print("z", z)

            # Two-layer scorer applied to every (example, position) row.
            z_reshaped = tf.reshape(z, [-1, 4 * hidden])
            print("z_reshaped", z_reshaped)
            Z_temp = tf.tanh(tf.matmul(z_reshaped, W_att_1) + b_att_1)
            print("Z_temp", Z_temp)

            Z_reshaped = tf.matmul(Z_temp, W_att_2) + b_att_2
            print("Z_reshaped", Z_reshaped)

            Z = tf.reshape(Z_reshaped, [-1, self.config.images_input_sequece_len])
            print("Z", Z)

            # Softmax over positions, with a small epsilon in the denominator.
            # NOTE(review): exp() is taken on the raw scores, so very large
            # scores could overflow; kept as is so the numerics match existing
            # checkpoints.
            exp_Z = tf.exp(Z)
            g = exp_Z / (1e-6 + tf.reduce_sum(exp_Z, axis=1, keep_dims=True))
            print("g", g)

            return g

    def get_next_memory_state(self, m_prev, c, q):
        """Compute the next memory vector from previous memory, context and question.

        Args:
            m_prev: previous memory vector.
            c: context vector produced by the attention GRU for this episode.
            q: question vector.

        Returns:
            relu(W [m_prev; c; q] + b), with optional batch normalization of the
            linear term when ``config.batch_normalize`` is set. Requires
            ``self.is_training`` to have been set (done in ``encode``).
        """
        hidden = self.config.rnn_hidden_size

        with tf.variable_scope('compute_next_m'):
            combo = tf.concat((m_prev, c, q), axis=1)
            print("combo", combo)

            W_nm_1 = tf.get_variable(name="W_nm_1_weight",
                                     shape=(3 * hidden, hidden),
                                     dtype=tf.float32,
                                     initializer=tf.contrib.layers.xavier_initializer())
            b_nm_1 = tf.get_variable(name="b_nm_1",
                                     shape=(hidden,),
                                     dtype=tf.float32,
                                     initializer=tf.constant_initializer(0.0))
            m = tf.matmul(combo, W_nm_1)
            if self.config.batch_normalize:
                # Normalization is applied before the bias is added.
                m = tf.layers.batch_normalization(m, training=self.is_training)
            m = tf.nn.relu(m + b_nm_1)
            print("m", m)
        return m

    def encode(self, inputs, encoder_state_input, embeddings,
               dropout_keep_prob, is_training):
        """Build the encoder graph and return the final memory vector.

        Args:
            inputs: tuple ``(images, seq_len, questions, question_masks)``.
                ``seq_len`` is used as the sequence length for the RNNs that
                run over image positions; ``question_masks`` is used as the
                sequence length of the question GRU (presumably per-example
                question lengths despite the name -- confirm against caller).
            encoder_state_input: unused here.
            embeddings: embedding matrix used to look up question token ids.
            dropout_keep_prob: unused here.
            is_training: forwarded to batch normalization and AttnGRUCell.

        Returns:
            The last memory vector, after ``config.num_levels`` episodes
            (2 if the config does not define ``num_levels``).
        """
        images, seq_len, questions, question_masks = inputs
        self.batch_size = tf.shape(images)[0]
        self.is_training = is_training
        self.memory = []

        # Subtract the per-channel mean; broadcast over batch, height and width.
        means = tf.reshape(tf.constant(self.config.VGG_MEAN), [1, 1, 1, 3])
        images = images - means

        vgg = tf.contrib.slim.nets.vgg
        with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=self.config.weight_decay)):
            _, end_points = vgg.vgg_16(images, num_classes=1,
                                       is_training=False, dropout_keep_prob=1.0)
            # The 'qa/' prefix means this only works when encode() is called
            # inside a variable scope named 'qa' (presumably opened by
            # VQASystem -- confirm).
            self.vgg_out = end_points['qa/vgg_16/conv5/conv5_3']

            # Split the end points into kept and excluded ones. The excluded
            # list previously used `not in` as well, which made it an exact
            # copy of the included list.
            excluded_names = self.config.vgg_exclude_names
            self.vgg_include = [end_points[e] for e in end_points if e not in excluded_names]
            self.vgg_exclude = [end_points[e] for e in end_points if e in excluded_names]
            for e in self.vgg_include:
                print(e)
            print("vgg_out", self.vgg_out)

        with tf.variable_scope('vqa_additional_encode'):
            # Encode question with GRU
            questions_input = tf.nn.embedding_lookup(embeddings, questions)
            with tf.variable_scope('q_encoder'):
                gru_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
                outputs, state = tf.nn.dynamic_rnn(cell=gru_cell,
                                                   inputs=questions_input,
                                                   sequence_length=question_masks,
                                                   dtype=tf.float32)
            # Question representation
            self.q = state
            self.memory.append(state) # m_0 = q
            print("q", self.q)

            self.image_seq = self.get_image_seq(self.vgg_out)

            # Bidirectional GRU over the image positions; the forward and
            # backward outputs are summed so each fact sees both directions.
            gru_fw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            gru_bw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                                     cell_bw=gru_bw_cell,
                                                                     inputs=self.image_seq,
                                                                     sequence_length=seq_len,
                                                                     dtype=tf.float32)
            self.F_bidir = outputs[0] + outputs[1]
            print("F_bidir", self.F_bidir)

            # One memory episode per level. Scopes are named mem1, mem2, ... so
            # the default of two levels creates the same variables as the
            # previous hand-unrolled version.
            num_levels = getattr(self.config, 'num_levels', 2)
            for level in range(1, num_levels + 1):
                with tf.variable_scope('mem%d' % level):
                    m_prev = self.memory[-1]
                    g = self.get_attenton_g(f=self.F_bidir, q=self.q, m_prev=m_prev)
                    attn_cell = AttnGRUCell(num_units=self.config.rnn_hidden_size,
                                            attention=g,
                                            batch_normalize=self.config.batch_normalize,
                                            is_training=self.is_training)
                    # Final state of the attention GRU is the episode context.
                    _, c = tf.nn.dynamic_rnn(cell=attn_cell,
                                             inputs=self.F_bidir,
                                             sequence_length=seq_len,
                                             dtype=tf.float32)
                    print("c_%d" % level, c)
                    m = self.get_next_memory_state(m_prev=m_prev, c=c, q=self.q)
                    self.memory.append(m)

        return self.memory[-1]
            
            
            
class BaselineDecoder(Encoder):
    """Turns the encoder's knowledge representation into per-answer scores.

    NOTE(review): this class derives from ``Encoder`` even though ``Decoder``
    is imported from vqa_model at the top of the notebook -- confirm which
    base class is intended.
    """

    def decode(self, knowledge_rep, dropout_keep_prob, is_training):
        """Apply a single dense layer with ``config.num_classes`` outputs.

        ``dropout_keep_prob`` and ``is_training`` are accepted but not used.

        NOTE(review): ReLU is applied to the returned scores, so they are never
        negative; if these are fed to a softmax loss as logits, confirm that
        this is intended.
        """
        xavier_init = tf.contrib.layers.xavier_initializer()
        with tf.variable_scope('vqa_additional_decode'):
            return tf.layers.dense(
                inputs=knowledge_rep,
                units=self.config.num_classes,
                activation=tf.nn.relu,
                kernel_initializer=xavier_init,
            )

    def plot(self, attn):
        """Display ``attn`` as an image with matplotlib."""
        plt.imshow(attn)
        plt.show()

In [None]:
# Start from an empty default graph so that re-running this cell does not
# collide with variables created by an earlier run.
tf.reset_default_graph()

# Encoder, decoder and the system wrapper all read the same Config class.
vqa_encoder = BaselineEncoder(config=Config)
vqa_decoder = BaselineDecoder(config=Config)

vqa_system = VQASystem(
    encoder=vqa_encoder,
    decoder=vqa_decoder,
    pretrained_embeddings=np_embeddings,
    config=Config,
)

In [None]:
# Passed to util.initialize_model (presumably selects resuming from the
# checkpoint in Config.model_dir rather than starting fresh -- confirm in util).
train_saved_model = True

with tf.Session() as sess:
    util.initialize_model(
        sess, vqa_system, Config.model_dir, train_saved_model, config=Config
    )
    vqa_system.train(sess, dataset, best_score=0.0)
    # To evaluate on the test split instead, uncomment:
    # vqa_system.evaluate_data(session=sess, sample_size=10000,
    #                          dataset=dataset.test, qid_to_anstype=qid_to_anstype,
    #                          datatype="test")
