In [1]:
# Import modules needed
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import util
from vqa_model import Encoder, Decoder, VQASystem
from squeezenet import SqueezeNet
from vgg16 import VGG16

from compact_bilinear_pooling import compact_bilinear_pooling_layer

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load data (excluding images)

# Restrict the number of possible answers using this
# Decreasing this will increase the number of classes
train_min_count = 25
# Forwarded to util.load_data_all below; change it here rather than in the call.
val_cutoff = 107183

# Load data
dataset = util.load_data_all(train_min_count, val_cutoff=val_cutoff, limit=100000000)
np_embeddings = np.load("data/glove.trimmed.100.npz")["glove"]
answer_to_id, id_to_answer = util.load_answer_map(min_count=train_min_count)

separator = "*" * 60
splits = (("train", dataset.train), ("val", dataset.val), ("test", dataset.test))

# Shape summary, one banner-delimited section per split.
for split_name, split in splits:
    print(separator)
    print("Questions_" + split_name, split.questions.shape)
    print("Questions_mask_" + split_name, split.mask.shape)
    print("Image_ids_" + split_name, split.image_ids.shape)
    print("All_answers_" + split_name, split.all_answers.shape)
    print("Answers_" + split_name, split.answers.shape)
print(separator)
print("np_embeddings", np_embeddings.shape)
print(separator)

# Sanity check: the largest answer id (+1) in every split must not exceed the
# size of the answer vocabulary. Printed for `answers` of train/val/test first,
# then for `all_answers` of train/val/test.
print("There are", len(answer_to_id), "possible answers (including <unk>)")
for field in ("answers", "all_answers"):
    for _, split in splits:
        print("This should be less than or equal to above", np.max(getattr(split, field)) + 1)
print(separator)

************************************************************
Questions_train (402672, 25)
Questions_mask_train (402672,)
Image_ids_train (402672,)
All_answers_train (402672, 10)
Answers_train (402672,)
************************************************************
Questions_val (97112, 25)
Questions_mask_val (97112,)
Image_ids_val (97112,)
All_answers_val (97112, 10)
Answers_val (97112,)
************************************************************
Questions_test (96963, 25)
Questions_mask_test (96963,)
Image_ids_test (96963,)
All_answers_test (96963, 10)
Answers_test (96963,)
************************************************************
np_embeddings (47382, 100)
************************************************************
There are 8387 possible answers (including <unk>)
This should be less than or equal to above 8387
This should be less than or equal to above 8385
This should be less than or equal to above 8385
This should be less than or equal to above 8387
This should be less than or 

In [3]:
# print((dataset.train.answers == 0).sum())
# print((dataset.val.answers == 0).sum())
# print((dataset.test.answers == 0).sum())
# print((dataset.train.questions[dataset.train.answers == 0].shape))

In [4]:
class Config:
    """Hyperparameters, data dimensions and file locations for the VQA model.

    Every setting is a plain class attribute. The encoder and decoder defined in
    this notebook read them through ``self.config``; the remaining attributes are
    presumably consumed by ``VQASystem`` (not visible here -- confirm there).
    """

    # ---- Optimisation -----------------------------------------------------
    epochs = 20
    batch_size = 32
    optimizer = tf.train.AdamOptimizer
    learning_rate = 5e-4
    lr_decay = 0.9
    clip_gradients = True
    max_gradient_norm = 10.0
    dropout_keep_prob = 1.0
    l2_reg = 0.0
    train_all = False
    train_limit = 10000000

    # ---- Question / answer data ------------------------------------------
    max_question_length = 25
    num_answers_per_question = 10
    # One class per entry of the answer map loaded earlier in the notebook.
    num_classes = len(answer_to_id)
    embedding_size = 100

    # ---- Image features ---------------------------------------------------
    image_size = [224, 224, 3]
    # Spatial size (height, width) of the VGG feature map used by the encoder.
    vgg_out_dim = [14, 14]
    # Number of spatial locations in that feature map. The spelling of this
    # attribute is kept as-is because the encoder looks it up by this name.
    images_input_sequece_len = vgg_out_dim[0] * vgg_out_dim[1]

    # ---- Layer widths -----------------------------------------------------
    rnn_hidden_size = 512  # RNN
    fc_state_size = 100  # Fully connected
    cbpl_output_dim = 512  # output_dim of the compact bilinear pooling layers
    att_conv1_dim = 256  # output channels of the first attention convolution

    # ---- Evaluation / logging cadence --------------------------------------
    num_evaluate = 5000
    eval_every = 500
    print_every = 100

    # ---- Paths ------------------------------------------------------------
    model_dir = "bilinear_model"
    squeeze_net_dir = "sq_net_model/squeezenet.ckpt"
    vgg16_weight_file = "vgg_net_dir/vgg16_weights.npz"

In [5]:
class BaselineEncoder(Encoder):
    """Encoder that fuses VGG16 image features with a GRU question encoding.

    The question state is combined with the conv5_3 feature map through compact
    bilinear pooling, two 3x3 convolutions turn the result into a spatial
    soft-attention map, and the attention-weighted image vector is pooled with the
    question state a second time to form the tensor returned to the caller.
    """

    def _attention_conv(self, inputs, in_dim, out_dim, weight_name, bias_name):
        """Apply a 3x3, stride-1, SAME-padded convolution plus bias (no activation).

        Returns:
            ``(kernel, bias, output)`` so the caller can keep references to the
            variables as well as to the result.
        """
        kernel = tf.Variable(tf.truncated_normal([3, 3, in_dim, out_dim],
                                                 dtype=tf.float32,
                                                 stddev=1e-1),
                             name=weight_name)
        bias = tf.Variable(tf.constant(0.0, shape=[out_dim], dtype=tf.float32),
                           trainable=True, name=bias_name)
        output = tf.nn.conv2d(inputs, kernel,
                              strides=[1, 1, 1, 1], padding='SAME') + bias
        return kernel, bias, output

    def encode(self, inputs, encoder_state_input, embeddings, dropout_keep_prob):
        """Build the encoder graph and return the fused question/image tensor.

        Args:
            inputs: tuple ``(images, questions, question_masks)``. ``questions`` is
                used as indices into ``embeddings`` and ``question_masks`` is passed
                to ``tf.nn.dynamic_rnn`` as ``sequence_length``.
            encoder_state_input: not used by this encoder.
            embeddings: table handed to ``tf.nn.embedding_lookup``.
            dropout_keep_prob: keep probability applied to the question state and
                to the attended image vector.

        Returns:
            The output of the second compact bilinear pooling, reshaped to
            ``[-1, config.cbpl_output_dim]``.
        """
        images, questions, question_masks = inputs
        self.batch_size = tf.shape(images)[0]

        # Read the weight path from the injected config, like every other setting
        # in this class, instead of from the notebook-global Config.
        self.vgg_net = VGG16(imgs=images, weights=self.config.vgg16_weight_file)
        self.vgg_out = self.vgg_net.conv5_3
        print("vgg_out", self.vgg_out)

        # Channel count of the image feature map. It is a property of the CNN, not
        # of the RNN, so it is taken from the tensor's static shape. The fallback
        # reproduces the previous behaviour if the static shape is unknown.
        image_channels = self.vgg_out.get_shape().as_list()[-1]
        if image_channels is None:
            image_channels = self.config.rnn_hidden_size

        with tf.variable_scope('vqa_additional'):
            # Encode question with GRU
            questions_input = tf.nn.embedding_lookup(embeddings, questions)
            with tf.variable_scope('q_encoder') as scope:
                gru_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
#                 gru_cell = tf.contrib.rnn.DropoutWrapper(gru_cell,
#                                                          input_keep_prob=dropout_keep_prob,
#                                                          output_keep_prob=dropout_keep_prob)
                outputs, state = tf.nn.dynamic_rnn(cell=gru_cell,
                                                   inputs=questions_input,
                                                   sequence_length=question_masks,
                                                   dtype=tf.float32)
            # Question representation: final GRU state, with dropout
            self.q_enc = tf.nn.dropout(state, keep_prob=dropout_keep_prob)
            print("q_enc", self.q_enc)

            # Repeat the question vector at every spatial location of the feature map
            tile_temp = tf.reshape(self.q_enc,
                                   shape=(self.batch_size, 1, 1, self.config.rnn_hidden_size))
            self.q_tile = tf.tile(tile_temp, [1] + self.config.vgg_out_dim + [1])
            print("q_tile", self.q_tile)

            # Per-location fusion of image features and question
            self.q_im_attn = compact_bilinear_pooling_layer(bottom1=self.vgg_out,
                                                            bottom2=self.q_tile,
                                                            output_dim=self.config.cbpl_output_dim,
                                                            sum_pool=False)
#             self.q_im_attn = tf.sign(self.q_im_attn) * tf.sqrt(tf.abs(self.q_im_attn))
#             self.q_im_attn = tf.nn.l2_normalize(self.q_im_attn, 0)
            print("q_im_attn", self.q_im_attn)

            # Two 3x3 convolutions reduce the fused map to one attention logit per location
            self.att_Wconv1, self.att_bconv1, conv1_out = self._attention_conv(
                self.q_im_attn,
                self.config.cbpl_output_dim, self.config.att_conv1_dim,
                'att_Wconv1_weight', 'att_bconv1')
            self.attn_conv1 = tf.nn.relu(conv1_out)
            print("attn_conv1", self.attn_conv1)

            self.att_Wconv2, self.att_bconv2, self.attn_conv2 = self._attention_conv(
                self.attn_conv1,
                self.config.att_conv1_dim, 1,
                'att_Wconv2_weight', 'att_bconv2')
            print("attn_conv2", self.attn_conv2)

            # Softmax over all spatial locations, then back to the map layout
            self.attn_flat = tf.reshape(self.attn_conv2,
                                        shape=[-1, self.config.images_input_sequece_len])
            self.alpha = tf.nn.softmax(self.attn_flat)
            self.alpha = tf.reshape(self.alpha, shape=[-1] + self.config.vgg_out_dim + [1])
            print("alpha", self.alpha)

            # Attention-weighted sum of the image features over height and width
            weighted = self.alpha * self.vgg_out
            print("weighted",weighted)
            self.attended_image = tf.reduce_sum(weighted, axis=(1,2))
            self.attended_image = tf.nn.dropout(self.attended_image, keep_prob=dropout_keep_prob)
            print("attended_image", self.attended_image)

            # The pooling layer takes 4-D inputs, so give both vectors a 1x1 spatial
            # extent. Each is reshaped with its OWN width: the image vector has the
            # CNN channel count, the question vector has the RNN state size.
            attended_4d = tf.reshape(self.attended_image,
                                     shape=(-1, 1, 1, image_channels), name="HERE1")
            question_4d = tf.reshape(self.q_enc,
                                     shape=(-1, 1, 1, self.config.rnn_hidden_size), name="HERE2")
            print("a", attended_4d)
            print("b", question_4d)

            self.attd_im_q = compact_bilinear_pooling_layer(bottom1=attended_4d,
                                                            bottom2=question_4d,
                                                            output_dim=self.config.cbpl_output_dim,
                                                            sum_pool=False)
#             self.attd_im_q = tf.sign(self.attd_im_q) * tf.sqrt(tf.abs(self.attd_im_q))
#             self.attd_im_q = tf.nn.l2_normalize(self.attd_im_q, 0)

            # The pooled tensor is output_dim wide, so flatten with cbpl_output_dim
            # rather than with the RNN size.
            self.attd_im_q = tf.reshape(self.attd_im_q, shape=[-1, self.config.cbpl_output_dim])
            print("attd_im_q", self.attd_im_q)

        return self.attd_im_q

class BaselineDecoder(Encoder):
    """Maps the encoder's fused representation to one score per answer class."""
    # NOTE(review): this subclasses Encoder although vqa_model.Decoder is imported
    # at the top of the notebook. The base class is left unchanged because
    # Decoder's constructor is not visible from here -- confirm it accepts
    # ``config=`` and sets ``self.config`` before switching.

    def decode(self, knowledge_rep, dropout_keep_prob):
        """Project ``knowledge_rep`` onto ``config.num_classes`` answer scores.

        Args:
            knowledge_rep: tensor returned by the encoder.
            dropout_keep_prob: accepted for interface compatibility; not used here.

        Returns:
            The output of a single dense layer with ``config.num_classes`` units.
        """
        # The scoring layer is linear. A ReLU here would clamp every negative
        # class score to zero and stop its gradient, so such a class could not be
        # pushed back up during training.
        scores = tf.layers.dense(inputs=knowledge_rep,
                                 units=self.config.num_classes,
                                 activation=None,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer())
        return scores

In [None]:
# Start from an empty default graph so re-running this cell does not pile new
# ops and variables on top of the ones from a previous run.
tf.reset_default_graph()

# Encoder and decoder share the same configuration class.
vqa_encoder, vqa_decoder = BaselineEncoder(config=Config), BaselineDecoder(config=Config)

vqa_system = VQASystem(
    encoder=vqa_encoder,
    decoder=vqa_decoder,
    pretrained_embeddings=np_embeddings,
    config=Config,
)

Done Adding Placeholers!
Done Adding Embedding!
vgg_out Tensor("qa/conv5_3:0", shape=(?, 14, 14, 512), dtype=float32)
q_enc Tensor("qa/vqa_additional/dropout/mul:0", shape=(?, 512), dtype=float32)
q_tile Tensor("qa/vqa_additional/Tile:0", shape=(?, 14, 14, 512), dtype=float32)
q_im_attn Tensor("qa/vqa_additional/Reshape_3:0", shape=(?, ?, ?, ?), dtype=float32)
attn_conv1 Tensor("qa/vqa_additional/Relu:0", shape=(?, ?, ?, 256), dtype=float32)
attn_conv2 Tensor("qa/vqa_additional/add_1:0", shape=(?, ?, ?, 1), dtype=float32)
alpha Tensor("qa/vqa_additional/Reshape_5:0", shape=(?, 14, 14, 1), dtype=float32)
weighted Tensor("qa/vqa_additional/mul:0", shape=(?, 14, 14, 512), dtype=float32)
attended_image Tensor("qa/vqa_additional/dropout_1/mul:0", shape=(?, 512), dtype=float32)
a Tensor("qa/vqa_additional/HERE1:0", shape=(?, 1, 1, 512), dtype=float32)
b Tensor("qa/vqa_additional/HERE2:0", shape=(?, 1, 1, 512), dtype=float32)
attd_im_q Tensor("qa/vqa_additional/Reshape_9:0", shape=(?, 512), d

In [None]:
train_saved_model = False

with tf.Session() as sess:
    # Initialise every variable first ...
    init_op = tf.global_variables_initializer()
    sess.run(init_op)

    # ... and only then load the VGG16 weight file, so that the initialiser
    # cannot run over whatever load_weights puts into the session.
    vgg_net = vqa_system.encoder.vgg_net
    vgg_net.load_weights(weight_file=Config.vgg16_weight_file, sess=sess)

    vqa_system.train(sess, dataset)


0 conv1_1_W (3, 3, 3, 64)
1 conv1_1_b (64,)
2 conv1_2_W (3, 3, 64, 64)
3 conv1_2_b (64,)
4 conv2_1_W (3, 3, 64, 128)
5 conv2_1_b (128,)
6 conv2_2_W (3, 3, 128, 128)
7 conv2_2_b (128,)
8 conv3_1_W (3, 3, 128, 256)
9 conv3_1_b (256,)
10 conv3_2_W (3, 3, 256, 256)
11 conv3_2_b (256,)
12 conv3_3_W (3, 3, 256, 256)
13 conv3_3_b (256,)
14 conv4_1_W (3, 3, 256, 512)
15 conv4_1_b (512,)
16 conv4_2_W (3, 3, 512, 512)
17 conv4_2_b (512,)
18 conv4_3_W (3, 3, 512, 512)
19 conv4_3_b (512,)
20 conv5_1_W (3, 3, 512, 512)
21 conv5_1_b (512,)
22 conv5_2_W (3, 3, 512, 512)
23 conv5_2_b (512,)
24 conv5_3_W (3, 3, 512, 512)
25 conv5_3_b (512,)
Number of params: 25879196 (retreival took 2.297210 secs)
Epoch 1 out of 20
   43/12584 [..............................] - ETA: 18622s - Training: 21.0000

In [None]:
# tf.reset_default_graph()
# all_answers = tf.placeholder(tf.int64, [None, 5])
# answers = tf.placeholder(tf.int64, [None])

# def acc_count(t, val):
#     t = tf.reshape(t, shape=(-1, 1))
#     elements_equal_to_value = tf.equal(t, val)
#     as_ints = tf.cast(elements_equal_to_value, tf.int32)
#     count = tf.reduce_sum(as_ints, axis=1)
#     accuracy = 1.0 * tf.minimum(count / 3, 1)
#     return accuracy

# with tf.Session() as sess:
#     v = sess.run([acc_count(answers, all_answers)], feed_dict={answers : np.array([1,2,3]),
#                                                        all_answers : np.array([[1,2,1,1,1], [0,1,1,2,0], [3,3,3,1,0]])})
#     print(v)

In [None]:
#print(np.sum(dataset.train.answers == 0))

In [None]:
# print(util.Progbar)
# prog = util.Progbar(target=100)
# for i in range(100):
#     prog.update(i + 1)

In [None]:
# a = tf.placeholder(tf.int32, [1,4,5])
# b = tf.placeholder(tf.int32, [1,5])
# func = tf.reduce_sum

# with tf.Session() as sess:
#     #res = sess.run(func, {a : np.array([[1,2,3,4], [8,7,6,5], [9,10,11,12], [16,15,14,13]])})
#     res = sess.run(func, {a : np.array([[[1,2,3,4,1], [8,7,6,5,1], [9,10,11,12,1], [16,15,14,13,1]]]),
#                           b : np.array([[2,3,4,2,2]])})
#     print(res)
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)