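## Visual Question Answering

This notebook builds and trains a multiple-choice Visual Question Answering (VQA) model: each sample pairs an image and a question with one correct answer and three alternative options.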


In [12]:
import os
import re
import cv2
import csv
import string
import time
import sklearn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from helpers.config import *
from helpers.preprocessing import *
from helpers.utils_v2 import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Reset TensorFlow's default computational graph before anything is built,
# so the notebook starts from an empty graph.
tf.reset_default_graph()


In [14]:
# Instantiate the project configuration and echo the paths it points to.
cfg = Config()
for _caption, _configured_path in (
    ("Weights file is : ", cfg.weights_path),
    ("Config data path is: ", cfg.data_path),
    ("Glove vectors path is: ", cfg.glove_path),
):
    print(_caption, _configured_path)
print ("Glove vectors path is: ",cfg.glove_path)

Weights file is :  ../weights/vgg16_weights.npz
Config data path is:  ../data/dataset_v7w_telling.json
Glove vectors path is:  ../data/glove.6B.50d.txt


### Load the data required for the Question Answering

In [15]:
# Fixed seed for the train/validation split, so the same partition is
# produced on every Restart & Run All and validation numbers stay comparable.
SPLIT_SEED = 42

# cfg.data_path carries a leading "../" (see the printed config above).
# Taking the LAST piece after "../" strips that prefix and, unlike indexing
# with [1], also works when the prefix is absent or repeated.
samples = loadData(cfg.data_path.split('../')[-1])

# 80 % training / 20 % validation.
train_samples, val_samples = train_test_split(
    samples, test_size=0.2, random_state=SPLIT_SEED
)
print ("Total number of training samples are: ",len(train_samples))
print ("Validation examples number: ",len(val_samples))

Total number of training samples are:  111894
Validation examples number:  27974


### Loading the GloVe vectors

In [16]:
## Load the GloVe word vectors via the project helper (presumably from
## cfg.glove_path, which is the path echoed in the output -- confirm in helpers).
## W2VEC is later queried with W2VEC.get(word) inside vectorize(), i.e. it is
## used as a word -> vector lookup table.
W2VEC = load_glove(cfg)

../data/glove.6B.50d.txt


### Functionality provided by the helper modules

1. Encoding the image
2. Encoding the text
3. Loading the weights of the pretrained model
4. Creating the placeholders
5. Variables class


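## Generator
1. Loads the image, the question, the correct answer and the three alternative options for every sample in a batch.
2. Yields one batch at a time as the tuple `(images, questions, answers, options1, options2, options3, labels)`; every text entry is encoded word by word with GloVe vectors and zero-padded to a fixed length.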

In [17]:
W2VEC_LEN = 50            # length of one word vector (the glove.6B.50d file is used)
MAX_QUESTION_WORDS = 15   # questions are truncated / padded to this many words
MAX_ANSWER_WORDS = 5      # answers and options are truncated / padded to this many words

# Translation table that deletes every ASCII punctuation character.
# str.translate needs a table built with str.maketrans; passing
# string.punctuation directly does NOT strip punctuation in Python 3.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def vectorize(words_sequence, max_words=15, clean=False, embeddings=None):
    """
    Turn a sentence into a fixed-size matrix of word vectors.

    The sentence is lower-cased, stripped of ASCII punctuation, split on
    whitespace and truncated to ``max_words`` words. Each word is looked up in
    the embedding table; rows for missing positions are zero padding.

    Args:
        words_sequence: the sentence as a string.
        max_words: number of rows of the result; longer sentences are
            truncated, shorter ones are zero-padded at the end.
        clean: when True the sentence is first passed through
            ``_dataCleaning``. NOTE(review): this assumes ``_dataCleaning``
            is in scope and returns a string -- confirm against the helpers.
        embeddings: optional word -> vector mapping (anything with ``.get``).
            Defaults to the notebook-level ``W2VEC`` table.

    Returns:
        ``np.ndarray`` of shape ``(max_words, W2VEC_LEN)``.

    Words that are not in the table get a fresh ``np.random.rand`` vector on
    every call, so the same unknown word is not encoded consistently.
    """
    if clean:
        # The cleaned text is what has to be tokenised (previously the
        # result was stored in an unused variable and thrown away).
        words_sequence = _dataCleaning(words_sequence)

    if embeddings is None:
        embeddings = W2VEC

    # Tokenise and ignore words beyond max_words.
    words = words_sequence.lower().translate(_PUNCTUATION_TABLE).split()[:max_words]

    # Pre-allocate the padded result once instead of concatenating row by
    # row; rows that are never written stay as zero padding.
    words2vec = np.zeros((max_words, W2VEC_LEN))

    for row, word in enumerate(words):
        word2vec = embeddings.get(word)

        if word2vec is None:
            word2vec = np.random.rand(W2VEC_LEN)

        words2vec[row] = np.asarray(word2vec).reshape(W2VEC_LEN)

    return words2vec
    
def generator(train_samples, batch_size=32, image_dir="images/", image_size=(448, 448)):
    """
    Endlessly yield training batches of images and GloVe-encoded text.

    Each sample is indexed positionally:
    ``[0]`` image file name, ``[1]`` question, ``[2]`` answer,
    ``[3]``-``[5]`` the three alternative options.

    The text arrays have shape (N, T, D):

        N - number of samples in the batch
        T - time steps in the RNN (MAX_QUESTION_WORDS / MAX_ANSWER_WORDS)
        D - dimension of the word vector

    Args:
        train_samples: sequence of samples as described above.
        batch_size: maximum number of samples per batch; the last batch of a
            pass may be smaller.
        image_dir: directory the image file names are relative to.
        image_size: size passed to ``cv2.resize`` for every image.

    Yields:
        1. Images batch,
        2. Questions batch,
        3. Answers batch,
        4. option1 batch,
        5. option2 batch,
        6. option3 batch,
        7. labels of shape (N, 4) with column 0 set to 1 for every row.

    Raises:
        ValueError: if ``train_samples`` is empty (the loop would otherwise
            spin forever without yielding).
        FileNotFoundError: if an image cannot be read.
    """
    num_samples = len(train_samples)
    if num_samples == 0:
        raise ValueError("generator() needs at least one training sample")

    while True:

        # sklearn.utils.shuffle returns a shuffled COPY; the result must be
        # kept, otherwise every pass sees the samples in the same order.
        train_samples = sklearn.utils.shuffle(train_samples)

        for offset in range(0, num_samples, batch_size):

            batch_samples = train_samples[offset:offset + batch_size]

            train_images = []
            questions = []
            answers = []
            options1 = []
            options2 = []
            options3 = []

            for batch_sample in batch_samples:

                image_path, question, answer, choice1, choice2, choice3 = batch_sample[:6]

                full_image_path = os.path.join(image_dir, image_path)
                image = cv2.imread(full_image_path)
                if image is None:
                    # cv2.imread signals an unreadable/missing file by
                    # returning None; fail with the offending path instead
                    # of an opaque error from cv2.resize.
                    raise FileNotFoundError("Could not read image: " + full_image_path)
                train_images.append(cv2.resize(image, image_size))

                questions.append(vectorize(question, max_words = MAX_QUESTION_WORDS))

                answers.append(vectorize(answer, max_words = MAX_ANSWER_WORDS))
                options1.append(vectorize(choice1, max_words = MAX_ANSWER_WORDS))
                options2.append(vectorize(choice2, max_words = MAX_ANSWER_WORDS))
                options3.append(vectorize(choice3, max_words = MAX_ANSWER_WORDS))

            train_images = np.array(train_images)
            questions = np.array(questions)
            answers = np.array(answers)
            options1 = np.array(options1)
            options2 = np.array(options2)
            options3 = np.array(options3)

            # One label row per sample actually in this batch (the last
            # batch can be shorter than batch_size).
            labels = np.zeros([len(batch_samples), 4])
            labels[:, 0] = 1
            yield train_images, questions, answers, options1, options2, options3, labels
            

In [18]:
def compute_loss(logits, labels_placeholder):
    """
    Summed softmax cross-entropy between ``logits`` and ``labels_placeholder``.

    ``logits`` are treated as raw scores (no softmax applied beforehand); the
    softmax is taken along the last dimension and the resulting cross-entropy
    values are added up into a single tensor.
    """
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels = labels_placeholder,
        logits = logits,
        dim = -1,
    )
    return tf.reduce_sum(cross_entropy)

    

### Building the computational Graph

In [8]:
# Reset the default graph again right before the model-building cells below
# (same call as the reset cell near the top of the notebook).
tf.reset_default_graph()

In [None]:
# Step 1: configuration and the input placeholders of the graph.
cfg = Config()
(inputIm_placeholder,
 question_placeholder,
 answer_placeholder,
 option1_placeholder,
 option2_placeholder,
 option3_placeholder,
 labels_placeholder) = load_placeholders(cfg)

# Step 2: the image encoder object.
encode_image = encodeImage(cfg)

# Step 3: the session that the following cells run in.
sess = tf.Session()
print ("Weights path is: ",cfg.weights_path)

# Step 4: load the weight file into the encoder with this session as default.
with sess.as_default():
    encode_image.load_weights(cfg.weights_path, sess, is_Train = True)
    

Weights path is:  ../weights/vgg16_weights.npz
The weights are trainable
weight_file is:  weights/vgg16_weights.npz


In [None]:
"""
Builds the rest of the computational graph. Steps 1-4 were carried out in the
previous cell; the numbered comments below continue from step 5.

1. Loading placeholders for the computational graph.
2. Creating the objects for encoding the image and encoding the text.
3. Creating the default session in tensorflow.
4. As the CNN model is pre-trained, its weights are loaded into the encoder object within the default session.
5. The convolutional part of the graph is built on the image placeholder.
6. Encoding the question and the candidate answers using the computational graph.
"""

## 5. Image branch: run the image placeholder through the encoder.
final_conv_layer = encode_image.forward_pass(inputIm_placeholder)
print ("the shape of final_conv_layer is: ",final_conv_layer.get_shape())

## Flatten the conv output before the fully connected layer
## (the name final_conv_layer is reused for the flattened tensor).
final_conv_layer = tf.contrib.layers.flatten(final_conv_layer)
print ("final_conv_layer shape is: ", final_conv_layer.get_shape())

fully_connected_object = fullyConnected(cfg)
# NOTE(review): output_fully_connected is not referenced anywhere after this
# line, so as written the image features never reach the loss. Confirm whether
# it was meant to seed the text encoder (e.g. instead of init_state).
output_fully_connected = fully_connected_object.forward_pass(final_conv_layer)

# Zero-initialised tf.Variable used as the encoder input for the question.
# NOTE(review): its leading dimension is fixed to cfg.batch_size; if encode()
# uses it as the RNN start state, feeding a batch of any other size will
# presumably fail -- confirm.
init_state = tf.Variable(tf.zeros([cfg.batch_size, cfg.state_size], dtype = tf.float32))

## 6. Text branch: encode the question starting from init_state.
with tf.variable_scope( "question", reuse = tf.AUTO_REUSE):
    """
    Reuse permission is given to all the variables within this module. 
    """
    encode_text = encodeText(cfg)
    output_fw_q, final_state_fw_q = encode_text.encode( question_placeholder, encoder_input = init_state) 
    
# The correct answer and the three options all go through the same
# encode_answer object, each call receiving the question's final state as
# its second argument.
with tf.variable_scope("answers", reuse = tf.AUTO_REUSE):
    """
    Reuse Permission is given to the answer as well.
    """
    encode_answer = encodeText(cfg)
    output_fw_a, final_state_fw_a       = encode_answer.encode(answer_placeholder,  final_state_fw_q)
    output_fw_opt1, final_state_fw_opt1 = encode_answer.encode(option1_placeholder, final_state_fw_q)
    output_fw_opt2, final_state_fw_opt2 = encode_answer.encode(option2_placeholder, final_state_fw_q)
    output_fw_opt3, final_state_fw_opt3 = encode_answer.encode(option3_placeholder, final_state_fw_q)
   

"""
Now I have to do the dot product of the two outputs and then send it to the loss function. 
"""
# Score of each candidate: element-wise product of the question's final state
# with the candidate's final state, summed over axis 1.
pro_value1 = tf.reduce_sum(tf.multiply(final_state_fw_q, final_state_fw_a), axis = 1)
pro_value2 = tf.reduce_sum(tf.multiply(final_state_fw_q, final_state_fw_opt1), axis = 1)
pro_value3 = tf.reduce_sum(tf.multiply(final_state_fw_q, final_state_fw_opt2), axis = 1)
pro_value4 = tf.reduce_sum(tf.multiply(final_state_fw_q, final_state_fw_opt3), axis = 1)

print ("pro_value1 shape is: ",pro_value1.get_shape())

# Stack the four scores along axis 1 in the order answer, option1, option2,
# option3; the score of the correct answer is therefore in column 0.
pro_value = tf.stack([pro_value1, pro_value2, pro_value3, pro_value4],axis =1 )
print("pro_value shape is: ",pro_value.get_shape())
loss = compute_loss(pro_value, labels_placeholder)

# Adam optimiser with a learning rate of 3e-4 minimising that loss.
train_step = tf.train.AdamOptimizer(learning_rate = 3e-4).minimize(loss)


the shape of final_conv_layer is:  (?, 8, 8, 512)


In [None]:
## Train the model and checkpoint it.

saver = tf.train.Saver()
savefile = "models/model1.ckpt"

# Create the checkpoint directory up front: discovering that it is missing
# only when saver.save() runs would throw away the whole training run.
checkpoint_dir = os.path.dirname(savefile)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

with sess.as_default():

    # NOTE(review): this initialiser runs AFTER encode_image.load_weights().
    # If load_weights assigns values to already-created variables (rather
    # than baking them into the variables' initialisers), this call resets
    # them -- confirm against the helper implementation.
    sess.run(tf.global_variables_initializer())

    # Only full batches are consumed per epoch.
    total_iterations = len(train_samples) // cfg.batch_size

    for i in range(cfg.num_epochs):

        print ("Epoch Number: ",i)
        batch_generator = generator(train_samples, cfg.batch_size)

        for j in range(total_iterations):

            start_time = time.time()
            (batch_images_gen, batch_questions_gen, batch_answers_gen,
             batch_o1, batch_o2, batch_o3, labels) = next(batch_generator)

            # Built once per batch and shared by the training step and the
            # periodic loss evaluation.
            batch_feed = {
                inputIm_placeholder: batch_images_gen,
                question_placeholder: batch_questions_gen,
                answer_placeholder: batch_answers_gen,
                option1_placeholder: batch_o1,
                option2_placeholder: batch_o2,
                option3_placeholder: batch_o3,
                labels_placeholder: labels,
            }

            sess.run(train_step, feed_dict = batch_feed)

            if j % 50 == 0:
                # Loss on the batch that was just used for the update.
                loss_value = sess.run(loss, feed_dict = batch_feed)
                end_time = time.time()
                print ("Iter: ",j,' Total iter: ',total_iterations,"Loss value is: ",loss_value, " Time taken: ", end_time - start_time)

        # Checkpoint after every epoch (same file, overwritten) so an
        # interrupted run keeps the last completed epoch.
        saver.save(sess, savefile)
    