In [1]:
# Import modules needed
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import util
from vqa_model import Encoder, Decoder, VQASystem, AttnGRU
from squeezenet import SqueezeNet
from vgg16 import VGG16

from compact_bilinear_pooling import compact_bilinear_pooling_layer

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load data (excluding images)

# Restrict the number of possible answers using this
# Decreasing this will increase the number of classes 
train_min_count = 99
val_cutoff = 107183
# Load data
dataset = util.load_data_all(train_min_count, val_cutoff=107183, limit=100000000)
np_embeddings = np.load("data/glove.trimmed.100.npz")["glove"]
answer_to_id, id_to_answer = util.load_answer_map(min_count=train_min_count)
with open('data/qid_to_anstype.dat', 'rb') as fp:
    qid_to_anstype = pickle.load(fp)

print("*" * 60)
print("Questions_train", dataset.train.questions.shape)
print("Questions_mask_train", dataset.train.mask.shape)
print("Questions_ids_train", dataset.train.question_ids.shape)
print("Image_ids_train", dataset.train.image_ids.shape)
print("All_answers_train", dataset.train.all_answers.shape)
print("Answers_train", dataset.train.answers.shape)
print("*" * 60)
print("Questions_val", dataset.val.questions.shape)
print("Questions_mask_val", dataset.val.mask.shape)
print("Questions_ids_val", dataset.val.question_ids.shape)
print("Image_ids_val", dataset.val.image_ids.shape)
print("All_answers_val", dataset.val.all_answers.shape)
print("Answers_val", dataset.val.answers.shape)
print("*" * 60)
print("Questions_test", dataset.test.questions.shape)
print("Questions_mask_test", dataset.test.mask.shape)
print("Questions_ids_test", dataset.test.question_ids.shape)
print("Image_ids_test", dataset.test.image_ids.shape)
print("All_answers_test", dataset.test.all_answers.shape)
print("Answers_test", dataset.test.answers.shape)
print("*" * 60)
print("np_embeddings", np_embeddings.shape)
print("*" * 60)
print("There are", len(answer_to_id), "possible answers (including <unk>)")
print("This should be less than or equal to above", np.max(dataset.train.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.val.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.test.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.train.all_answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.val.all_answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.test.all_answers) + 1) 
print("*" * 60)

************************************************************
Questions_train (383270, 25)
Questions_mask_train (383270,)
Questions_ids_train (383270,)
Image_ids_train (383270,)
All_answers_train (383270, 10)
Answers_train (383270,)
************************************************************
Questions_val (92423, 25)
Questions_mask_val (92423,)
Questions_ids_val (92423,)
Image_ids_val (92423,)
All_answers_val (92423, 10)
Answers_val (92423,)
************************************************************
Questions_test (92162, 25)
Questions_mask_test (92162,)
Questions_ids_test (92162,)
Image_ids_test (92162,)
All_answers_test (92162, 10)
Answers_test (92162,)
************************************************************
np_embeddings (47382, 100)
************************************************************
There are 3004 possible answers (including <unk>)
This should be less than or equal to above 3004
This should be less than or equal to above 3001
This should be less than or equal to ab

In [3]:
class Config:
    """Holds model hyperparams and data information.
    """
    epochs = 20
    
    learning_rate = 5e-6
    lr_decay = 0.75
    optimizer = tf.train.AdamOptimizer
    max_gradient_norm = 10.0
    clip_gradients = True
    dropout_keep_prob = 1.0
    l2_reg = 0.0
    batch_size = 32
    train_all = False
    
    train_limit = 10000000
    
    max_question_length = 25    
    num_answers_per_question = 10
    num_classes = len(answer_to_id)
    image_size = [224, 224, 3]
    cbpl_output_dim = 512
    att_conv1_dim = 256
    
    vgg_out_dim = [14, 14]
    
    images_input_sequece_len = vgg_out_dim[0] * vgg_out_dim[1]
    
    rnn_hidden_size = 512 # RNN
    fc_state_size = 1600 # Fully connected
    embedding_size = 100
    
    num_evaluate = 5000
    
    eval_every = 1000
    print_every = 100 
    
    model_dir = "dmn_model"
    squeeze_net_dir = "sq_net_model/squeezenet.ckpt"
    vgg16_weight_file = "vgg_net_dir/vgg16_weights.npz"

In [4]:
class BaselineEncoder(Encoder):
    def compute_attention_g(self, level):
        with tf.variable_scope('compute_attention_g'):
            q = tf.expand_dims(self.q, axis=1)
            m_before = tf.expand_dims(self.memory[level-1], axis=1)
            print("q", q)
            print("m_before",m_before)
            z = tf.concat((self.F_bidir * q, self.F_bidir * m_before, 
                           tf.abs(self.F_bidir - q), tf.abs(self.F_bidir - m_before)), 
                          axis=2)
            out = tf.Print(z, [tf.shape(z)], 'z = ', summarize=20, first_n=7)
            print("z", level, z)
            
            W_att_1 = tf.get_variable(name="W_att_1_weight", 
                                      shape=(4*self.config.rnn_hidden_size, 
                                             self.config.rnn_hidden_size), 
                                      dtype=tf.float32, 
                                      initializer=tf.contrib.layers.xavier_initializer())
            W_att_2 = tf.get_variable(name="W_att_2_weight", 
                                             shape=(self.config.rnn_hidden_size, 1), 
                                             dtype=tf.float32, 
                                             initializer=tf.contrib.layers.xavier_initializer())
            b_att_1 = tf.get_variable(name="b_att_1", 
                                            shape=(self.config.rnn_hidden_size,), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            b_att_2 = tf.get_variable(name="b_att_2", 
                                            shape=(), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            
            z_reshaped = tf.reshape(z, [-1, 4*self.config.rnn_hidden_size])
            print("z_reshaped", level, z_reshaped)
            Z_temp = tf.tanh(tf.matmul(z_reshaped, W_att_1) + b_att_1)
            print("Z_temp", level, Z_temp)
            
            Z_reshaped = tf.matmul(Z_temp, W_att_2) + b_att_2
            print("Z_reshaped", level, Z_reshaped)
            
            Z = tf.reshape(Z_reshaped, [self.batch_size, -1])
            print("Z", level, Z)
            
            exp_Z = tf.exp(Z)
            g = exp_Z / (1e-6 + tf.reduce_sum(exp_Z, reduction_indices=1, keep_dims=True))
            print("g", g)
                        
            return g
        
    def get_next_memory_state(self, context, level):
        with tf.variable_scope('compute_next_m'):
            combo = tf.concat((self.memory[level-1], context, self.q), axis=1)
            print("combo", level, combo)
            
            W_nm_1 = tf.get_variable(name="W_nm_1_weight", 
                                      shape=(3*self.config.rnn_hidden_size, 
                                             self.config.rnn_hidden_size), 
                                      dtype=tf.float32, 
                                      initializer=tf.contrib.layers.xavier_initializer())
            b_nm_1 = tf.get_variable(name="b_nm_1", 
                                            shape=(self.config.rnn_hidden_size,), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            
            m = tf.nn.relu(tf.matmul(combo, W_nm_1) + b_nm_1) 
            print("m", level, m)
        return m
            
    
    def encode(self, inputs, encoder_state_input, embeddings, dropout_keep_prob):
        self.memory = []        
        
        images, seq_len, questions, question_masks = inputs
        
        self.batch_size = tf.shape(images)[0]
        
        self.vgg_net = VGG16(imgs=images, weights=Config.vgg16_weight_file)
        self.vgg_out = self.vgg_net.conv5_3
        print("vgg_out", self.vgg_out)
        
        with tf.variable_scope('vqa_additional') as vqa_add_scope:
            # Encode question with GRU
            questions_input = tf.nn.embedding_lookup(embeddings, questions)
            with tf.variable_scope('q_encoder') as scope:
                gru_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
                outputs, state = tf.nn.dynamic_rnn(cell=gru_cell,
                                                   inputs=questions_input,
                                                   sequence_length=question_masks,
                                                   #initial_state=image_feats,
                                                   dtype=tf.float32)
            # Question representation
            self.q = state
            self.memory.append(state) # m_0 = q
            print("q", self.q)
                
            F = tf.reshape(self.vgg_out, shape=(-1, 14*14, 512))
            
            print("F", F)
            
            # Forward direction cell
            gru_fw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            gru_bw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                                     cell_bw=gru_bw_cell,
                                                                     inputs=F,
                                                                     sequence_length=seq_len,
                                                                     dtype=tf.float32)
            self.F_bidir = outputs[0] + outputs[1]            
            print("F_bidir", self.F_bidir)
            
            g_1 = self.compute_attention_g(level=1)    
            c_1 = tf.reduce_sum(self.F_bidir * tf.expand_dims(g_1, axis=2), axis=1)
            print("c_1", c_1)
            m_1 = self.get_next_memory_state(context=c_1, level=1)
            self.memory.append(m_1)
            
            vqa_add_scope.reuse_variables()
            
            g_2 = self.compute_attention_g(level=2) 
            
            c_2 = tf.reduce_sum(self.F_bidir * tf.expand_dims(g_2, axis=2), axis=1)
            print("c_2", c_2)
            m_2 = self.get_next_memory_state(context=c_2, level=2)
            self.memory.append(m_2)
            
        return self.memory[-1]           
            
        #return state

class BaselineDecoder(Encoder):
    def decode(self, knowledge_rep, dropout_keep_prob):
        scores = tf.layers.dense(inputs=knowledge_rep, units=self.config.num_classes, 
                               activation=tf.nn.relu,
                               kernel_initializer=tf.contrib.layers.xavier_initializer())
        return scores

In [5]:
# clear old variables
tf.reset_default_graph()

vqa_encoder = BaselineEncoder(config=Config)
vqa_decoder = BaselineDecoder(config=Config)

vqa_system = VQASystem(encoder=vqa_encoder, decoder=vqa_decoder, 
                       pretrained_embeddings=np_embeddings, config=Config)   

Done Adding Placeholers!
Done Adding Embedding!
vgg_out Tensor("qa/conv5_3:0", shape=(?, 14, 14, 512), dtype=float32)
q Tensor("qa/vqa_additional/q_encoder/rnn/while/Exit_2:0", shape=(?, 512), dtype=float32)
F Tensor("qa/vqa_additional/Reshape:0", shape=(?, 196, 512), dtype=float32)
F_bidir Tensor("qa/vqa_additional/add:0", shape=(?, 196, 512), dtype=float32)
q Tensor("qa/vqa_additional/compute_attention_g/ExpandDims:0", shape=(?, 1, 512), dtype=float32)
m_before Tensor("qa/vqa_additional/compute_attention_g/ExpandDims_1:0", shape=(?, 1, 512), dtype=float32)
z 1 Tensor("qa/vqa_additional/compute_attention_g/concat:0", shape=(?, 196, 2048), dtype=float32)
z_reshaped 1 Tensor("qa/vqa_additional/compute_attention_g/Reshape:0", shape=(?, 2048), dtype=float32)
Z_temp 1 Tensor("qa/vqa_additional/compute_attention_g/Tanh:0", shape=(?, 512), dtype=float32)
Z_reshaped 1 Tensor("qa/vqa_additional/compute_attention_g/add_1:0", shape=(?, 1), dtype=float32)
Z 1 Tensor("qa/vqa_additional/compute_att

In [12]:
train_saved_model = True
with tf.Session() as sess:
    util.initialize_model(sess, vqa_system, Config.model_dir, train_saved_model,config=Config)    
    #vqa_system.train(sess, dataset)
    vqa_system.evaluate_data(session=sess, sample_size=10000, 
                             dataset=dataset.val, qid_to_anstype=qid_to_anstype, datatype="val")

Reading model parameters from dmn_model/model.ckpt
*************************
Overall accuracy (val): 32.92
Accuracy for (other): 14.402304368698992 (600/4166)
Accuracy for (yes/no): 52.17787913340935 (2288/4385)
Accuracy for (number): 27.88129744651484 (404/1449)
*************************


In [7]:
# tf.reset_default_graph()
# all_answers = tf.placeholder(tf.int64, [None, 5])
# answers = tf.placeholder(tf.int64, [None])

# def acc_count(t, val):
#     t = tf.reshape(t, shape=(-1, 1))
#     elements_equal_to_value = tf.equal(t, val)
#     as_ints = tf.cast(elements_equal_to_value, tf.int32)
#     count = tf.reduce_sum(as_ints, axis=1)
#     accuracy = 1.0 * tf.minimum(count / 3, 1)
#     return accuracy

# with tf.Session() as sess:
#     v = sess.run([acc_count(answers, all_answers)], feed_dict={answers : np.array([1,2,3]),
#                                                        all_answers : np.array([[1,2,1,1,1], [0,1,1,2,0], [3,3,3,1,0]])})
#     print(v)

In [8]:
#print(np.sum(dataset.train.answers == 0))

In [9]:
# print(util.Progbar)
# prog = util.Progbar(target=100)
# for i in range(100):
#     prog.update(i + 1)

In [10]:
# a = tf.placeholder(tf.int32, [1,4,5])
# b = tf.placeholder(tf.int32, [1,5])
# func = tf.reduce_sum

# with tf.Session() as sess:
#     #res = sess.run(func, {a : np.array([[1,2,3,4], [8,7,6,5], [9,10,11,12], [16,15,14,13]])})
#     res = sess.run(func, {a : np.array([[[1,2,3,4,1], [8,7,6,5,1], [9,10,11,12,1], [16,15,14,13,1]]]),
#                           b : np.array([[2,3,4,2,2]])})
#     print(res)