In [2]:
# Import modules needed
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import util
from vqa_model import Encoder, Decoder, VQASystem
from squeezenet import SqueezeNet
from vgg16 import VGG16

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load data (excluding images)

# Restrict the number of possible answers using this
# Decreasing this will increase the number of classes 
train_min_count = 20
val_cutoff = 107183
# Load data
dataset = util.load_data_all(train_min_count, val_cutoff=107183, limit=1000)
np_embeddings = np.load("data/glove.trimmed.100.npz")["glove"]
answer_to_id, id_to_answer = util.load_answer_map(min_count=train_min_count)

print("*" * 60)
print("Questions_train", dataset.train.questions.shape)
print("Questions_mask_train", dataset.train.mask.shape)
print("Image_ids_train", dataset.train.image_ids.shape)
print("All_answers_train", dataset.train.all_answers.shape)
print("Answers_train", dataset.train.answers.shape)
print("*" * 60)
print("Questions_val", dataset.val.questions.shape)
print("Questions_mask_val", dataset.val.mask.shape)
print("Image_ids_val", dataset.val.image_ids.shape)
print("All_answers_val", dataset.val.all_answers.shape)
print("Answers_val", dataset.val.answers.shape)
print("*" * 60)
print("Questions_test", dataset.test.questions.shape)
print("Questions_mask_test", dataset.test.mask.shape)
print("Image_ids_test", dataset.test.image_ids.shape)
print("All_answers_test", dataset.test.all_answers.shape)
print("Answers_test", dataset.test.answers.shape)
print("*" * 60)
print("np_embeddings", np_embeddings.shape)
print("*" * 60)
print("There are", len(answer_to_id), "possible answers (including <unk>)")
print("This should be less than or equal to above", np.max(dataset.train.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.val.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.test.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.train.all_answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.train.answers) + 1) 
print("This should be less than or equal to above", np.max(dataset.train.answers) + 1) 
print("*" * 60)

************************************************************
Questions_train (1000, 25)
Questions_mask_train (1000,)
Image_ids_train (1000,)
All_answers_train (1000, 10)
Answers_train (1000,)
************************************************************
Questions_val (1000, 25)
Questions_mask_val (1000,)
Image_ids_val (1000,)
All_answers_val (1000, 10)
Answers_val (1000,)
************************************************************
Questions_test (1000, 25)
Questions_mask_test (1000,)
Image_ids_test (1000,)
All_answers_test (1000, 10)
Answers_test (1000,)
************************************************************
np_embeddings (47382, 100)
************************************************************
There are 10051 possible answers (including <unk>)
This should be less than or equal to above 9862
This should be less than or equal to above 8832
This should be less than or equal to above 9640
This should be less than or equal to above 9938
This should be less than or equal to above 9862

In [1]:
class Config:
    """Holds model hyperparams and data information.
    """
    epochs = 20
    
    learning_rate = 5e-4
    optimizer = tf.train.AdamOptimizer
    max_gradient_norm = 10.0
    clip_gradients = False
    dropout = 0.5
    batch_size = 32
    
    max_question_length = 25    
    num_answers_per_question = 10
    num_classes = len(answer_to_id)
    image_size = [224, 224, 3]
    
    images_input_sequece_len = 14*14
    
    rnn_hidden_size = 100 # RNN
    fc_state_size = 100 # Fully connected
    embedding_size = 100
    
    num_evaluate = 100
    
    print_every = 100 
    
    model_dir = "skeleton_model"
    squeeze_net_dir = "sq_net_model/squeezenet.ckpt"
    vgg16_weight_file = "vgg_net_dir/vgg16_weights.npz"

NameError: name 'tf' is not defined

In [4]:
class BaselineEncoder(Encoder):
    def compute_attention_g(self, level):
        with tf.variable_scope('compute_attention_g'):
            q = tf.expand_dims(self.q, axis=1)
            m_before = tf.expand_dims(self.memory[level-1], axis=1)
            z = tf.concat((self.F_bidir * q, self.F_bidir * m_before, 
                           tf.abs(self.F_bidir - q), tf.abs(self.F_bidir - m_before)), 
                          axis=2)
            out = tf.Print(z, [tf.shape(z)], 'z = ', summarize=20, first_n=7)
            print("z", level, z)
            
            W_att_1 = tf.get_variable(name="W_att_1", 
                                      shape=(4*self.config.rnn_hidden_size, 
                                             self.config.rnn_hidden_size), 
                                      dtype=tf.float32, 
                                      initializer=tf.contrib.layers.xavier_initializer())
            W_att_2 = tf.get_variable(name="W_att_2", 
                                             shape=(self.config.rnn_hidden_size, 1), 
                                             dtype=tf.float32, 
                                             initializer=tf.contrib.layers.xavier_initializer())
            b_att_1 = tf.get_variable(name="b_att_1", 
                                            shape=(self.config.rnn_hidden_size,), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            b_att_2 = tf.get_variable(name="b_att_2", 
                                            shape=(), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            
            z_reshaped = tf.reshape(z, [-1, 4*self.config.rnn_hidden_size])
            print("z_reshaped", level, z_reshaped)
            Z_temp = tf.tanh(tf.matmul(z_reshaped, W_att_1) + b_att_1)
            print("Z_temp", level, Z_temp)
            
            Z_reshaped = tf.matmul(Z_temp, W_att_2) + b_att_2
            print("Z_reshaped", level, Z_reshaped)
            
            Z = tf.reshape(Z_reshaped, [self.batch_size, -1])
            print("Z", level, Z)
            
            exp_Z = tf.exp(Z)
            g = exp_Z / (1e-6 + tf.reduce_sum(exp_Z, reduction_indices=1, keep_dims=True))
            print("g", g)
                        
            return g
        
    def get_next_memory_state(self, context, level):
        with tf.variable_scope('compute_next_m'):
            combo = tf.concat((self.memory[level-1], context, self.q), axis=1)
            print("combo", level, combo)
            
            W_nm_1 = tf.get_variable(name="W_nm_1", 
                                      shape=(3*self.config.rnn_hidden_size, 
                                             self.config.rnn_hidden_size), 
                                      dtype=tf.float32, 
                                      initializer=tf.contrib.layers.xavier_initializer())
            b_nm_1 = tf.get_variable(name="b_nm_1", 
                                            shape=(self.config.rnn_hidden_size,), 
                                            dtype=tf.float32, 
                                            initializer=tf.constant_initializer(0.0))
            
            m = tf.nn.relu(tf.matmul(combo, W_nm_1) + b_nm_1) 
            print("m", level, m)
        return m
            
    
    def encode(self, inputs, encoder_state_input, embeddings, dropout):
        self.memory = []
        
        
        
        images, seq_len, questions, question_masks = inputs
        
        self.batch_size = tf.shape(images)[0]
        
        self.vgg_net = VGG16(imgs=images, weights=Config.vgg16_weight_file)
        #print("vgg_conv5_3", self.vgg_net.conv5_3)
        print("vgg_pool5", self.vgg_net.pool5)
        self.vgg_out = self.vgg_net.conv5_3
        
        
        
        #_, H, W, C = tf.shape(squeeze_net.features)
        #print(tf.shape(squeeze_net.features))
        with tf.variable_scope('vqa_additional'):
            # Encode question with GRU
            questions_input = tf.nn.embedding_lookup(embeddings, questions)
            with tf.variable_scope('q_encoder') as scope:
                gru_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
                outputs, state = tf.nn.dynamic_rnn(cell=gru_cell,
                                                   inputs=questions_input,
                                                   sequence_length=question_masks,
                                                   #initial_state=image_feats,
                                                   dtype=tf.float32)
            # Question representation
            self.q = state
            self.memory.append(state) # m_0 = q
            print("q", self.q)
                
            F = tf.reshape(self.vgg_out, shape=(-1, 14*14, 512))
            
            print("F", F)
            
            # Forward direction cell
            gru_fw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            gru_bw_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                                     cell_bw=gru_bw_cell,
                                                                     inputs=F,
                                                                     sequence_length=seq_len,
                                                                     dtype=tf.float32)
            #print("outputs", outputs)
            #print("output_states", output_states)
            self.F_bidir = outputs[0] + outputs[1]            
            print("F_bidir", self.F_bidir)
            
            g_1 = self.compute_attention_g(level=1)       
            c_1 = tf.reduce_sum(self.F_bidir * tf.expand_dims(g_1, axis=2), axis=1)
            print("c_1", c_1)
            self.memory.append(self.get_next_memory_state(context=c_1, level=1))
            
            
#             h_flat = tf.contrib.layers.flatten(self.vgg_out)
#             print("h_flat", h_flat)
#             image_feats = tf.layers.dense(inputs=h_flat,
#                                           units=self.config.rnn_hidden_size,
#                                           activation=tf.nn.relu)
            
#             print("image_features", image_feats)

            
        return self.memory[-1]
            
            
        #return state

class BaselineDecoder(Encoder):
    def decode(self, knowledge_rep, dropout):
        scores = tf.layers.dense(inputs=knowledge_rep, units=self.config.num_classes, 
                               activation=tf.nn.relu,
                               kernel_initializer=tf.contrib.layers.xavier_initializer())
        return scores

In [None]:
# clear old variables
tf.reset_default_graph()

vqa_encoder = BaselineEncoder(config=Config)
vqa_decoder = BaselineDecoder(config=Config)

vqa_system = VQASystem(encoder=vqa_encoder, decoder=vqa_decoder, 
                       pretrained_embeddings=np_embeddings, config=Config)   

Done Adding Placeholers!
Done Adding Embedding!
vgg_pool5 Tensor("qa/pool4_1:0", shape=(?, 7, 7, 512), dtype=float32)
q Tensor("qa/vqa_additional/q_encoder/rnn/while/Exit_2:0", shape=(?, 100), dtype=float32)
F Tensor("qa/vqa_additional/Reshape:0", shape=(?, 196, 512), dtype=float32)
F_bidir Tensor("qa/vqa_additional/add:0", shape=(?, 196, 100), dtype=float32)
z 1 Tensor("qa/vqa_additional/compute_attention_g/concat:0", shape=(?, 196, 400), dtype=float32)
z_reshaped 1 Tensor("qa/vqa_additional/compute_attention_g/Reshape:0", shape=(?, 400), dtype=float32)
Z_temp 1 Tensor("qa/vqa_additional/compute_attention_g/Tanh:0", shape=(?, 100), dtype=float32)
Z_reshaped 1 Tensor("qa/vqa_additional/compute_attention_g/add_1:0", shape=(?, 1), dtype=float32)
Z 1 Tensor("qa/vqa_additional/compute_attention_g/Reshape_1:0", shape=(?, ?), dtype=float32)
g Tensor("qa/vqa_additional/compute_attention_g/truediv:0", shape=(?, ?), dtype=float32)
c_1 Tensor("qa/vqa_additional/Sum:0", shape=(?, 100), dtype=floa

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
#     saver = tf.train.Saver()
#     saver.restore(sess, Config.squeeze_net_dir)
    vqa_system.encoder.vgg_net.load_weights(weight_file=Config.vgg16_weight_file, sess=sess)
    
    vqa_system.train(sess, dataset)


0 conv1_1_W (3, 3, 3, 64)
1 conv1_1_b (64,)
2 conv1_2_W (3, 3, 64, 64)
3 conv1_2_b (64,)
4 conv2_1_W (3, 3, 64, 128)
5 conv2_1_b (128,)
6 conv2_2_W (3, 3, 128, 128)
7 conv2_2_b (128,)
8 conv3_1_W (3, 3, 128, 256)
9 conv3_1_b (256,)
10 conv3_2_W (3, 3, 256, 256)
11 conv3_2_b (256,)
12 conv3_3_W (3, 3, 256, 256)
13 conv3_3_b (256,)
14 conv4_1_W (3, 3, 256, 512)
15 conv4_1_b (512,)
16 conv4_2_W (3, 3, 512, 512)
17 conv4_2_b (512,)
18 conv4_3_W (3, 3, 512, 512)
19 conv4_3_b (512,)
20 conv5_1_W (3, 3, 512, 512)
21 conv5_1_b (512,)
22 conv5_2_W (3, 3, 512, 512)
23 conv5_2_b (512,)
24 conv5_3_W (3, 3, 512, 512)
25 conv5_3_b (512,)
Number of params: 20966440 (retreival took 4.064116 secs)

In [None]:
# tf.reset_default_graph()
# all_answers = tf.placeholder(tf.int64, [None, 5])
# answers = tf.placeholder(tf.int64, [None])

# def acc_count(t, val):
#     t = tf.reshape(t, shape=(-1, 1))
#     elements_equal_to_value = tf.equal(t, val)
#     as_ints = tf.cast(elements_equal_to_value, tf.int32)
#     count = tf.reduce_sum(as_ints, axis=1)
#     accuracy = 1.0 * tf.minimum(count / 3, 1)
#     return accuracy

# with tf.Session() as sess:
#     v = sess.run([acc_count(answers, all_answers)], feed_dict={answers : np.array([1,2,3]),
#                                                        all_answers : np.array([[1,2,1,1,1], [0,1,1,2,0], [3,3,3,1,0]])})
#     print(v)

In [None]:
#print(np.sum(dataset.train.answers == 0))

In [None]:
# print(util.Progbar)
# prog = util.Progbar(target=100)
# for i in range(100):
#     prog.update(i + 1)

In [None]:
# a = tf.placeholder(tf.int32, [1,4,5])
# b = tf.placeholder(tf.int32, [1,5])
# func = tf.reduce_sum

# with tf.Session() as sess:
#     #res = sess.run(func, {a : np.array([[1,2,3,4], [8,7,6,5], [9,10,11,12], [16,15,14,13]])})
#     res = sess.run(func, {a : np.array([[[1,2,3,4,1], [8,7,6,5,1], [9,10,11,12,1], [16,15,14,13,1]]]),
#                           b : np.array([[2,3,4,2,2]])})
#     print(res)