In [None]:
# Import modules needed
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
import pickle

%matplotlib inline
# Notebook-wide matplotlib defaults: small square figures, no pixel
# interpolation, grayscale colormap for images.
plt.rcParams.update({
    'figure.figsize': (5.0, 5.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})
# Interactive mode on.
plt.ion()

import util
from vqa_model import Encoder, Decoder, VQASystem
from squeezenet import SqueezeNet
from vgg16 import VGG16

from compact_bilinear_pooling import compact_bilinear_pooling_layer

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Load data (excluding images)

# Restrict the number of possible answers using this.
# Decreasing this will increase the number of classes.
train_min_count = 99
val_cutoff = 107183

# Pass the named cutoff so the constant above stays the single source of truth.
dataset = util.load_data_all(train_min_count, val_cutoff=val_cutoff, limit=100000000)

# np.load on an .npz returns an archive that keeps the file open; indexing it
# materialises the array, so the archive can be closed right afterwards.
with np.load("data/glove.trimmed.100.npz") as glove_archive:
    np_embeddings = glove_archive["glove"]

answer_to_id, id_to_answer = util.load_answer_map(min_count=train_min_count)

# NOTE: pickle.load can execute arbitrary code. Only load this file if it was
# produced by a trusted preprocessing run.
with open('data/qid_to_anstype.dat', 'rb') as fp:
    qid_to_anstype = pickle.load(fp)

# ---- Sanity report: array shapes for every split -------------------------
split_names = ("train", "val", "test")
banner = "*" * 60

for split_name in split_names:
    split = getattr(dataset, split_name)
    print(banner)
    print("Questions_" + split_name, split.questions.shape)
    print("Questions_mask_" + split_name, split.mask.shape)
    print("Questions_ids_" + split_name, split.question_ids.shape)
    print("Image_ids_" + split_name, split.image_ids.shape)
    print("All_answers_" + split_name, split.all_answers.shape)
    print("Answers_" + split_name, split.answers.shape)
print(banner)
print("np_embeddings", np_embeddings.shape)
print(banner)

# ---- Sanity report: largest answer id per split vs. size of the answer map
print("There are", len(answer_to_id), "possible answers (including <unk>)")
for field_name in ("answers", "all_answers"):
    for split_name in split_names:
        answer_ids = getattr(getattr(dataset, split_name), field_name)
        print("This should be less than or equal to above", np.max(answer_ids) + 1)
print(banner)

In [None]:
class Config:
    """Holds model hyperparams and data information.

    The class is used as a plain namespace: it is passed (uninstantiated) as
    ``config=Config`` to the encoder, decoder, ``VQASystem`` and
    ``util.initialize_model``. Fields that are not read anywhere in this
    notebook are presumably consumed by those project modules -- verify there
    before changing or removing them.
    """
    # --- Optimisation settings (not read in this notebook; presumably used by
    # --- VQASystem.train -- TODO confirm).
    epochs = 1
    
    learning_rate = 0.01
    lr_decay = 0.9
    optimizer = tf.train.AdamOptimizer
    max_gradient_norm = 10.0
    clip_gradients = True
    dropout_keep_prob = 1.0
    l2_reg = 0.0
    batch_size = 32
    train_all = False
    
    train_limit = 10000
    
    # --- Data / tensor dimensions.
    max_question_length = 25    
    num_answers_per_question = 10
    # One class per entry of the answer map loaded in the data cell; this makes
    # the class definition depend on that cell having been run first.
    num_classes = len(answer_to_id)
    image_size = [224, 224, 3]
    # output_dim handed to compact_bilinear_pooling_layer in BaselineEncoder.
    cbpl_output_dim = 8000
    # Number of output channels of the first attention convolution.
    att_conv1_dim = 512
    
    # Spatial size used to tile the question vector and to reshape the
    # attention map in BaselineEncoder.
    vgg_out_dim = [14, 14]
    
    # Number of spatial positions (14 * 14); used to flatten the attention map
    # before the softmax. The misspelling ("sequece") is part of the attribute
    # name that BaselineEncoder references, so it must be kept as is.
    images_input_sequece_len = vgg_out_dim[0] * vgg_out_dim[1]
    
    # Size of the GRU cell that encodes the question.
    rnn_hidden_size = 512 # RNN
    fc_state_size = 100 # Fully connected
    embedding_size = 100
    
    # --- Evaluation / logging cadence (not read in this notebook; presumably
    # --- used by VQASystem -- TODO confirm).
    num_evaluate = 500
    
    eval_every = 100
    print_every = 100 
    
    # Per-channel values subtracted from the input images in
    # BaselineEncoder.encode before they are fed to VGG-16.
    VGG_MEAN = [123.68, 116.78, 103.94]
    
    # model_dir is passed to util.initialize_model in the training cell; the
    # two checkpoint paths below are not read in this notebook.
    model_dir = "final_bilinear"
    squeeze_net_dir = "sq_net_model/squeezenet.ckpt"
    vgg16_weight_file = "vgg_net_dir/vgg_16.ckpt"
    
    # VGG end-point names that BaselineEncoder.encode uses to split the
    # network's end points into "include" and "exclude" lists.
    vgg_exclude_names = ['qa/vgg_16/pool5', 'qa/vgg_16/fc6', 'qa/vgg_16/fc7', 'qa/vgg_16/fc8']

In [None]:
class BaselineEncoder(Encoder):
    """Image/question encoder: VGG-16 features + GRU question encoding,
    fused twice with compact bilinear pooling (once to compute spatial
    attention, once to produce the final joint representation).
    """

    def encode(self, inputs, encoder_state_input, embeddings, dropout_keep_prob):
        """Build the encoder sub-graph and return the joint representation.

        Args:
            inputs: 4-tuple unpacked as ``(images, _, questions,
                question_masks)``; the second element is ignored.
                ``question_masks`` is passed to ``tf.nn.dynamic_rnn`` as
                ``sequence_length``, so it is expected to hold per-example
                question lengths (presumably despite its name -- verify
                against the caller).
            encoder_state_input: Unused; kept for interface compatibility.
            embeddings: Embedding matrix indexed by ``questions``.
            dropout_keep_prob: Keep probability for the dropout applied to the
                question encoding and the attended image vector (also
                forwarded to ``vgg_16``, which is built with
                ``is_training=False``).

        Returns:
            ``self.attd_im_q``: tensor reshaped to
            ``[-1, config.cbpl_output_dim]`` and L2-normalised along axis 1.

        Side effects:
            Stores intermediate tensors on ``self`` (``vgg_out``, ``q_enc``,
            ``alpha``, ``attended_image``, ...) and prints several of them
            while the graph is being built.
        """
        images, _, questions, question_masks = inputs
        self.batch_size = tf.shape(images)[0]

        # ---- Image branch: mean-subtract, then run VGG-16 ----------------
        means = tf.reshape(tf.constant(self.config.VGG_MEAN), [1, 1, 1, 3])
        images = images - means

        vgg = tf.contrib.slim.nets.vgg
        with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=5e-4)):
            _, end_points = vgg.vgg_16(images,
                                       num_classes=1,
                                       is_training=False,
                                       dropout_keep_prob=dropout_keep_prob)
            # NOTE(review): the 'qa/' prefix implies this method is called
            # inside an enclosing scope named 'qa' -- confirm in VQASystem.
            self.vgg_out = end_points['qa/vgg_16/conv5/conv5_3']

            excluded_names = self.config.vgg_exclude_names
            self.vgg_include = [end_points[e] for e in end_points
                                if e not in excluded_names]
            # End points whose names ARE listed in vgg_exclude_names (the
            # complement of vgg_include).
            self.vgg_exclude = [end_points[e] for e in end_points
                                if e in excluded_names]
            for e in self.vgg_include:
                print(e)
            print("vgg_out", self.vgg_out)

        with tf.variable_scope('vqa_additional'):
            # ---- Question branch: embed, squash, encode with a GRU -------
            questions_input = tf.nn.embedding_lookup(embeddings, questions)
            questions_input = tf.tanh(questions_input)
            with tf.variable_scope('q_encoder') as scope:
                gru_cell = tf.contrib.rnn.GRUCell(self.config.rnn_hidden_size)
                outputs, state = tf.nn.dynamic_rnn(cell=gru_cell,
                                                   inputs=questions_input,
                                                   sequence_length=question_masks,
                                                   dtype=tf.float32)
            # The final GRU state is the question representation.
            self.q_enc = state
            self.q_enc = tf.nn.dropout(self.q_enc, keep_prob=dropout_keep_prob)
            print("q_enc", self.q_enc)

            # Replicate the question vector at every spatial location so it
            # can be pooled position-wise with the image feature map.
            tile_temp = tf.reshape(self.q_enc,
                                   shape=(self.batch_size, 1, 1, self.config.rnn_hidden_size))
            self.q_tile = tf.tile(tile_temp, [1] + self.config.vgg_out_dim + [1])
            print("q_tile", self.q_tile)

            # ---- Attention: bilinear pooling -> two 3x3 convs -> softmax -
            self.q_im_attn = compact_bilinear_pooling_layer(bottom1=self.vgg_out,
                                                            bottom2=self.q_tile,
                                                            output_dim=self.config.cbpl_output_dim,
                                                            sum_pool=False)
            self.q_im_attn = tf.reshape(
                self.q_im_attn,
                shape=[-1] + self.config.vgg_out_dim + [self.config.cbpl_output_dim])
            self.q_im_attn = tf.nn.l2_normalize(self.q_im_attn, dim=3)
            print("q_im_attn", self.q_im_attn)

            self.att_Wconv1 = tf.Variable(tf.truncated_normal([3, 3, self.config.cbpl_output_dim,
                                                               self.config.att_conv1_dim],
                                                              dtype=tf.float32,
                                                              stddev=1e-1),
                                          name='att_Wconv1_weight')
            self.att_bconv1 = tf.Variable(tf.constant(0.0, shape=[self.config.att_conv1_dim],
                                                      dtype=tf.float32),
                                          trainable=True, name='att_bconv1')
            self.attn_conv1 = tf.nn.conv2d(self.q_im_attn, self.att_Wconv1,
                                           strides=[1, 1, 1, 1], padding='SAME') + self.att_bconv1
            self.attn_conv1 = tf.tanh(self.attn_conv1)
            print("attn_conv1", self.attn_conv1)

            # Second conv collapses the channels to one attention logit per
            # spatial position.
            self.att_Wconv2 = tf.Variable(tf.truncated_normal([3, 3, self.config.att_conv1_dim, 1],
                                                              dtype=tf.float32,
                                                              stddev=1e-1),
                                          name='att_Wconv2_weight')
            self.att_bconv2 = tf.Variable(tf.constant(0.0, shape=[1], dtype=tf.float32),
                                          trainable=True, name='att_bconv2')
            self.attn_conv2 = tf.nn.conv2d(self.attn_conv1, self.att_Wconv2,
                                           strides=[1, 1, 1, 1], padding='SAME') + self.att_bconv2
            print("attn_conv2", self.attn_conv2)

            # Softmax over all spatial positions, then back to a map.
            self.attn_flat = tf.reshape(self.attn_conv2,
                                        shape=[-1, self.config.images_input_sequece_len])
            self.alpha = tf.nn.softmax(self.attn_flat)
            self.alpha = tf.reshape(self.alpha, shape=[-1] + self.config.vgg_out_dim + [1])
            print("alpha", self.alpha)

            # ---- Attention-weighted sum of the image features ------------
            weighted = self.alpha * self.vgg_out
            print("weighted", weighted)
            self.attended_image = tf.reduce_sum(weighted, axis=(1, 2))
            self.attended_image = tf.nn.dropout(self.attended_image, keep_prob=dropout_keep_prob)
            print("attended_image", self.attended_image)

            # ---- Final fusion of attended image and question -------------
            # The attended image has as many channels as the VGG feature map,
            # which is independent of the GRU size; read it from the static
            # shape instead of reusing rnn_hidden_size.
            image_channels = self.vgg_out.get_shape().as_list()[-1]
            a = tf.reshape(self.attended_image,
                           shape=(-1, 1, 1, image_channels), name="HERE1")
            b = tf.reshape(self.q_enc,
                           shape=(-1, 1, 1, self.config.rnn_hidden_size), name="HERE2")
            print("a", a)
            print("b", b)

            self.attd_im_q = compact_bilinear_pooling_layer(bottom1=a,
                                                            bottom2=b,
                                                            output_dim=self.config.cbpl_output_dim,
                                                            sum_pool=False)
            self.attd_im_q = tf.reshape(self.attd_im_q, shape=[-1, self.config.cbpl_output_dim])
            self.attd_im_q = tf.nn.l2_normalize(self.attd_im_q, dim=1)
            print("attd_im_q", self.attd_im_q)

        return self.attd_im_q

class BaselineDecoder(Encoder):
    """Maps the encoder's joint representation to per-answer class scores.

    NOTE(review): this class derives from ``Encoder`` although ``Decoder`` is
    imported from ``vqa_model`` and otherwise unused. That looks like a
    copy/paste slip, but ``Decoder`` is not visible here, so the base class is
    left unchanged -- confirm and switch if its constructor is compatible.
    """

    def decode(self, knowledge_rep, dropout_keep_prob):
        """Project ``knowledge_rep`` onto ``config.num_classes`` scores.

        Args:
            knowledge_rep: Tensor produced by the encoder.
            dropout_keep_prob: Unused; kept for interface compatibility.

        Returns:
            Unnormalised class scores with last dimension
            ``config.num_classes``.
        """
        # The output layer is linear: a ReLU here would clamp every score to
        # be non-negative, so no class could ever be scored below zero.
        # Assumes the caller applies softmax / cross-entropy to these scores
        # -- TODO confirm in VQASystem.
        scores = tf.layers.dense(inputs=knowledge_rep,
                                 units=self.config.num_classes,
                                 activation=None,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer())
        return scores

    def plot(self, attn):
        """Show ``attn`` as an image with matplotlib."""
        plt.imshow(attn)
        plt.show()

In [None]:
# Start from an empty default graph so re-running this cell does not pile
# duplicate variables on top of a previous build.
tf.reset_default_graph()

# Encoder first, then decoder; both read their hyperparameters from Config.
vqa_encoder, vqa_decoder = BaselineEncoder(config=Config), BaselineDecoder(config=Config)

vqa_system = VQASystem(
    encoder=vqa_encoder,
    decoder=vqa_decoder,
    pretrained_embeddings=np_embeddings,
    config=Config,
)

In [None]:
# Forwarded to util.initialize_model as its fourth positional argument
# (presumably chooses between resuming a saved model and a fresh start --
# verify in util).
train_saved_model = False

with tf.Session() as sess:
    util.initialize_model(
        sess,
        vqa_system,
        Config.model_dir,
        train_saved_model,
        config=Config,
    )
    vqa_system.train(sess, dataset)
    # To score the test split instead, call:
    #     vqa_system.evaluate_data(session=sess, sample_size=10000,
    #                              dataset=dataset.test, qid_to_anstype=qid_to_anstype, datatype="test")


In [None]:
# from PIL import Image
# im = np.asarray(Image.open('data/preprocessed_images_val/288944.jpeg'))

# print(im - np.array([123.68, 116.779, 103.939]))