In [None]:
!python -m spacy download en_core_web_md #Restart runtime after executing

In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import json
import spacy

from datetime import datetime

from progressbar import Bar, ETA, Percentage, ProgressBar
from itertools import zip_longest
from keras.models import load_model
from os import listdir
from keras.utils import np_utils, generic_utils


from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import preprocess_input 

from sklearn.preprocessing import LabelEncoder


from keras import Input
from keras.models import *
from keras.layers import *

# Set the seed for random operations. 
# This let our experiments to be reproducible. 
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Get current working directory
cwd = os.getcwd()

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
!unzip /content/drive/My\ Drive/VQA_Dataset.zip

Archive:  /content/drive/My Drive/VQA_Dataset.zip
replace VQA_Dataset/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
os.chdir('/content/drive/MyDrive')

In [7]:
cwd

'/content'

## Creation of Feature Maps from images
In this section we create feature maps from the input images using VGG. We decided ti split this part from the rest of the model so it could be implemented in a separate notebook to reduce workload.


In [5]:
IMG_W =  175
IMG_H = 100
BS = 16

dataset_dir = os.path.join(cwd, "VQA_Dataset")
test_dir = os.path.join(dataset_dir, 'Images')

test_data_gen = ImageDataGenerator(rescale=1./255)

test_gen = test_data_gen.flow_from_directory(dataset_dir,
                                             batch_size=BS,
                                             color_mode="rgb",
                                             target_size=(IMG_H, IMG_W),
                                             shuffle=False)
test_gen.reset()

Found 29333 images belonging to 1 classes.


In [None]:
#VGG model
vgg = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(IMG_H, IMG_W, 3)) 
vgg.trainable = False
vgg.summary()

In [7]:
pre_model = tf.keras.Sequential()
pre_model.add(vgg)
pre_model.trainable = False
img_predictions = pre_model.predict(test_gen, len(test_gen), verbose=1)



In [8]:
feature_maps = {}
image_names = []

from os import listdir

for name in listdir(test_dir):
  image_names.append(name)

image_names.sort()

for i in range(len(image_names)):
  img_name=image_names[i].replace('.png' , '')
  feature_maps[img_name] = img_predictions[i]

## Creation on input Questions and answer vocab


In [9]:
# OPEN QUESTIONS
file=os.path.join(dataset_dir, 'train_questions_annotations.json')
f = open(file)
train_q = json.load(f)

#Load Spacy
nlp = spacy.load('en_core_web_md')

In [10]:
#Question list
questions = list([text for id in train_q.keys() for text in train_q[id]['question'].splitlines()])

#Answer list
answers = list([text for id in train_q.keys() for text in train_q[id]['answer'].splitlines()])

answer_vocab = list(set([text for id in train_q.keys() for text in train_q[id]['answer'].splitlines()]))
answer_vocab_size = len(answer_vocab)

ans_word_to_idx = { w: i for i, w in enumerate(answer_vocab) }
ans_idx_to_word = { i: w for i, w in enumerate(answer_vocab) }

#Img_id list
img_ids = list([text for id in train_q.keys() for text in train_q[id]['image_id'].splitlines()])

In [11]:
#Check correctness
print(list(train_q.items())[0])
print(answers[0])
print(img_ids[0])
print(questions[0])

('117792', {'question': 'Who looks happier?', 'image_id': '11779', 'answer': 'man'})
man
11779
Who looks happier?


In [12]:
### Find max len quesiton
max_len = 0
max_len_q_index = 0
for i in range(len(questions)):
  q_len = len(questions[i].strip().split(" "))
  if(q_len > max_len):
    max_len = q_len
    max_len_q_index = i

max_len = len(nlp(questions[max_len_q_index]))

print(max_len)
print(list(train_q.items())[max_len_q_index])

24
('81082', {'question': 'Is it likely that it is winter?  The human would say no since there are no coats on the coat rack?', 'image_id': '8108', 'answer': 'no'})


## Functions


In [13]:
#create inputs
def createInputs(text):
  '''
  Returns an array of one-hot vectors representing the words
  in the input text string.
  - text is a string
  - Each one-hot vector has shape (vocab_size, 1)
  '''
  inputs = []
  for w in text.split(' '):
    v = np.zeros((vocab_size, 1)) 
    v[word_to_idx[w]] = 1
    inputs.append(v)
  return inputs

def get_questions_tensor_timeseries(questions, nlp, timesteps):
    #assert not isinstance(questions, list) --- CHECK WHEN CREATING BATCHES
    nb_samples = len(questions)
    word_vec_dim = nlp(questions[0])[0].vector.shape[0]
    questions_tensor = np.zeros((nb_samples, timesteps, word_vec_dim))
    for i in range(len(questions)):
        tokens = nlp(questions[i])
        for j in range(len(tokens)):
            if j<timesteps:
                questions_tensor[i,j,:] = tokens[j].vector
    return questions_tensor

#Encode answers
def encodeAns(w, ans_word_to_idx):
  '''
  Returns a one-hot vectors representing the answer word in the answer vocab.
  Each one-hot vector has shape (answer_vocab_size, 1)
  '''
  ohe_word = np.zeros((answer_vocab_size, 1)) 
  ohe_word[ans_word_to_idx[w]] = 1
  return ohe_word 

#Decode answers
def decodeAns(ohe_w, ans_idx_to_word):
  '''
  Returns the word from the one-hot-encoding answer vector
  '''
  i = 0
  while(ohe_w[i] == 0):
    i = i+1
  return ans_idx_to_word[i] 

def get_questions_tensor_timeseries(questions, nlp, timesteps):
    #assert not isinstance(questions, list) #--- CHECK WHEN CREATING BATCHES
    nb_samples = len(questions)
    word_vec_dim = nlp(questions[0])[0].vector.shape[0]
    questions_tensor = np.zeros((nb_samples, timesteps, word_vec_dim))
    for i in range(len(questions)):
        tokens = nlp(questions[i])
        for j in range(len(tokens)):
            if j<timesteps:
                questions_tensor[i,j,:] = tokens[j].vector
    return questions_tensor

def get_question_tensor_timeseries(question, nlp, timesteps): 
    word_vec_dim = nlp(question)[0].vector.shape[0]
    question_tensor = np.zeros((timesteps, word_vec_dim))
    tokens = nlp(question)
    for j in range(len(tokens)):
      if j<timesteps:
        question_tensor[j,:] = tokens[j].vector
    return question_tensor 

def grouped(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
   #print(args)
    #for a in(zip_longest(*args, fillvalue=fillvalue)):
        #print(a)
    return zip_longest(*args, fillvalue=fillvalue)      

## Input Creation

In [15]:
def create_tuples(questions, answers, img_ids, max_len, nlp, ans_word_to_idx, feature_maps):
    
    new_answers_train = list()
    new_questions_train = list()
    new_images_train = list()
    for ans, ques, img_id in zip(answers, questions, img_ids):
      
      ohe_ans = encodeAns(ans, ans_word_to_idx) #one hot encoding of answer
      encoded_question = get_question_tensor_timeseries(ques, nlp, max_len)  #question embedding
      img_feature_maps = feature_maps[img_id] #feature maps of that img

      new_answers_train.append(ohe_ans)
      new_questions_train.append(encoded_question)
      new_images_train.append(img_feature_maps.flatten())

    return (new_questions_train, new_answers_train, new_images_train)

def create_test_tuples(questions, img_ids, max_len, nlp, feature_maps):
    
    new_questions_train = list()
    new_images_train = list()

    for ques, img_id in zip(questions, img_ids):
      
      encoded_question = get_question_tensor_timeseries(ques, nlp, max_len)  #question embedding
      img_feature_maps = feature_maps[img_id] #feature maps of that img

      new_questions_train.append(encoded_question)
      new_images_train.append(img_feature_maps.flatten())

    return (new_questions_train, new_images_train)    

In [16]:
#create lists of encoded items
en_questions, en_answers, en_img_ids = create_tuples(questions, answers, img_ids, max_len, nlp, ans_word_to_idx, feature_maps)

In [17]:
print (len(en_questions), len(en_answers),len(en_img_ids))

58832 58832 58832


## Image Model

In [18]:
#With load feature map from file should just be an empty model
image_model = Sequential()
image_model.add(Reshape(input_shape = (7680,), target_shape=(7680,))) ### set correct shape 7680 (175x100)
model1 = Model(inputs = image_model.input, outputs = image_model.output)
model1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_input (InputLayer)   [(None, 7680)]            0         
_________________________________________________________________
reshape (Reshape)            (None, 7680)              0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


## LSTM Model
LSTM model that takes the question as input and creates the output to send to our final dense model

In [19]:
# Question Model
# LSTM RNN
word2vec_dim = 300 
num_layers_lstm = 3
num_hidden_nodes_lstm = 512
output_dim = num_hidden_nodes_lstm

language_model = Sequential()
language_model.add(LSTM(units=output_dim, 
                        return_sequences=True, input_shape=(None, word2vec_dim)))

for i in range(num_layers_lstm-2):
    language_model.add(LSTM(units=output_dim, return_sequences=True))
language_model.add(LSTM(units=output_dim, return_sequences=False))

model2 = Model(language_model.input, language_model.output)
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_input (InputLayer)      [(None, None, 300)]       0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1665024   
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 512)         2099200   
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
Total params: 5,863,424
Trainable params: 5,863,424
Non-trainable params: 0
_________________________________________________________________


## Final model 
Model that merges the questions and the images and has dense layers after it

In [20]:
# Merge
combined = concatenate([image_model.output, language_model.output])
model = Dense(512, activation = 'relu')(combined)
model = Dropout(0.3)(model)

model = Dense(128, activation = 'relu')(model)
model = Dropout(0.3)(model)

model = Dense(58)(model)
model = Activation("softmax")(model)

model = Model(inputs=[image_model.input, language_model.input], outputs=model)

In [21]:
lr = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# Validation metrics

metric = tf.keras.metrics.CategoricalAccuracy()
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics = metric)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
lstm_input (InputLayer)         [(None, None, 300)]  0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 512)    1665024     lstm_input[0][0]                 
__________________________________________________________________________________________________
reshape_input (InputLayer)      [(None, 7680)]       0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 512)    2099200     lstm[0][0]                       
____________________________________________________________________________________________

## Model Training

In [None]:
train_questions = en_questions[:40000]
train_answers = en_answers[:40000]
train_image_id = en_img_ids[:40000]

valid_questions = en_questions[40000:]
valid_answers = en_answers[40000:]
valid_image_id = en_img_ids[40000:]

In [23]:
batch_size = 33
vbatch_size= 10
num_epochs = 10

for k in range(num_epochs):
    print("Epoch Number: ",k+1)
    progbar = generic_utils.Progbar(len(train_questions))
    for question_batch, ans_batch, im_batch, vquestion_batch, vans_batch, vim_batch, in zip(grouped(train_questions, batch_size, fillvalue=train_questions[-1]), 
                                                                                            grouped(train_answers, batch_size, fillvalue=train_answers[-1]),
                                                                                            grouped(train_image_id, batch_size, fillvalue=train_image_id[-1]),
                                                                                            grouped(valid_questions, vbatch_size, fillvalue=valid_questions[-1]), 
                                                                                            grouped(valid_answers, vbatch_size, fillvalue=valid_answers[-1]),
                                                                                            grouped(valid_image_id, vbatch_size, fillvalue=valid_image_id[-1])):
                                                X_ques_batch = question_batch
                                                X_img_batch = im_batch
                                                Y_batch = ans_batch
                                                X_vques_batch = vquestion_batch
                                                X_vimg_batch = vim_batch
                                                Y_vbatch = vans_batch
                                                np_X_ques_batch = np.array(X_ques_batch)
                                                np_X_img_batch = np.array(X_img_batch)
                                                np_Y_batch = np.array(Y_batch)
                                                np_X_vques_batch = np.array(X_vques_batch)
                                                np_X_vimg_batch = np.array(X_vimg_batch)
                                                np_Y_vbatch = np.array(Y_vbatch)
                                                loss, acc = model.train_on_batch(({'lstm_input' : np_X_ques_batch, 'reshape_input' : np_X_img_batch}), np_Y_batch, class_weight=None)
                                                vloss, vacc = model.test_on_batch(({'lstm_input' : np_X_vques_batch, 'reshape_input' : np_X_vimg_batch}), np_Y_vbatch)
                                                #loss = model.train_on_batch(({'lstm_1_input' : np_X_ques_batch, 'reshape_1_input' : np_X_img_batch}), np_Y_batch, class_weight=None)
                                                progbar.add(batch_size, values=[('train loss', loss),('train accuracy', acc),('val loss', vloss),('val accuracy', vacc)])

Epoch Number:  1
Epoch Number:  2
Epoch Number:  3
Epoch Number:  4
Epoch Number:  5
Epoch Number:  6
Epoch Number:  7
Epoch Number:  8
Epoch Number:  9
Epoch Number:  10


In [49]:
widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
pbar = ProgressBar(widgets=widgets)

In [None]:
from sklearn.preprocessing import LabelEncoder

#Check scores
valid_pred = []
batch_size = 1 

for qu_batch,an_batch,im_batch in pbar(zip(grouped(valid_questions, batch_size, 
                                                   fillvalue=valid_questions[0]), 
                                           grouped(valid_answers, batch_size, 
                                                   fillvalue=valid_answers[0]), 
                                           grouped(valid_image_id, batch_size, 
                                                   fillvalue=valid_image_id[0]))):
    X_ques_batch = qu_batch
    X_img_batch = im_batch
    np_X_ques_batch = np.array(X_ques_batch)
    np_X_img_batch = np.array(X_img_batch)
    valid_predict = model.predict(({'lstm_input' : np_X_ques_batch, 'reshape_input' : np_X_img_batch}))
    valid_predict = np.argmax(valid_predict,axis=1)
    valid_pred.extend(valid_predict)

In [None]:
#Decode answers to words
for i in range(len(valid_pred)):
  vector = np.zeros(len(answer_vocab))
  vector[valid_pred[i]] = 1
  valid_pred[i] = decodeAns(vector, ans_idx_to_word)

In [None]:
#Check same length
print(len(valid_pred))
print(len(valid_answers))
#Check correctness
print(valid_pred[2])
print(decodeAns(valid_answers[2], ans_idx_to_word))

In [None]:
#Manual accuracy over validation set
correct = 0
for j in range(len(valid_pred)):
  if(valid_pred[j] == decodeAns(valid_answers[j], ans_idx_to_word)):
    correct = correct + 1

print(correct)

accuracy = correct/len(valid_pred)
print(accuracy)

## Model Prediction

In [42]:
path = os.path.join(dataset_dir, 'test_questions.json')
d = open(path)
test_q = json.load(d)

In [43]:
#Test questions
test_questions = list([text for id in test_q.keys() for text in test_q[id]['question'].splitlines()])

#Test Img_id list 
test_img_ids = list([text for id in test_q.keys() for text in test_q[id]['image_id'].splitlines()])

In [45]:
test_max_len = 0
test_max_len_q_index = 0
for i in range(len(test_questions)):
  test_q_len = len(test_questions[i].strip().split(" "))
  if(test_q_len > test_max_len):
    test_max_len = test_q_len
    test_max_len_q_index = i

test_max_len = len(nlp(test_questions[test_max_len_q_index]))

print(test_max_len)
print(max_len)
print(list(test_q.items())[test_max_len_q_index])

19
24
('21471', {'question': 'Is the lady standing on the rug and the woman in the portrait wearing the same colored shirt?', 'image_id': '2147'})


In [29]:
#ONLY IF NEEDED To avoid running out of RAM after training delete encoded training questions 
del train_questions
del en_questions
del valid_questions

import gc
gc.collect

In [67]:
en_test_questions, en_test_img_ids = create_test_tuples(test_questions, test_img_ids, max_len, nlp, feature_maps)

In [47]:
print(len(test_questions))
print(len(en_test_questions))

6372
6372


In [103]:
test_batch_size = 1
y_pred = []


for qu_batch,im_batch in pbar(zip(grouped(en_test_questions, test_batch_size, 
                                                   fillvalue=en_test_questions[0]),                                            
                                           grouped(en_test_img_ids, test_batch_size, 
                                                   fillvalue=en_test_img_ids[0]))):

    X_ques_batch = qu_batch
    X_img_batch = im_batch
    np_X_ques_batch = np.array(X_ques_batch)
    np_X_img_batch = np.array(X_img_batch)
    y_predict = model.predict(({'lstm_input' : np_X_ques_batch, 'reshape_input' : np_X_img_batch}))
    y_predict = np.argmax(y_predict,axis=1)
    y_pred.extend(y_predict)
    

Evaluating N/A% [#                                             ] Time:  1:10:19

## Creation of submission csv

In [101]:
labels_dict = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        'apple': 6,
        'baseball': 7,
        'bench': 8,
        'bike': 9,
        'bird': 10,
        'black': 11,
        'blanket': 12,
        'blue': 13,
        'bone': 14,
        'book': 15,
        'boy': 16,
        'brown': 17,
        'cat': 18,
        'chair': 19,
        'couch': 20,
        'dog': 21,
        'floor': 22,
        'food': 23,
        'football': 24,
        'girl': 25,
        'grass': 26,
        'gray': 27,
        'green': 28,
        'left': 29,
        'log': 30,
        'man': 31,
        'monkey bars': 32,
        'no': 33,
        'nothing': 34,
        'orange': 35,
        'pie': 36,
        'plant': 37,
        'playing': 38,
        'red': 39,
        'right': 40,
        'rug': 41,
        'sandbox': 42,
        'sitting': 43,
        'sleeping': 44,
        'soccer': 45,
        'squirrel': 46,
        'standing': 47,
        'stool': 48,
        'sunny': 49,
        'table': 50,
        'tree': 51,
        'watermelon': 52,
        'white': 53,
        'wine': 54,
        'woman': 55,
        'yellow': 56,
        'yes': 57
}


In [61]:
def create_csv(results, results_dir='./'):

    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in results.items():
            f.write(key + ',' + str(value) + '\n')

In [105]:
#Decode answers to words
de_y_pred = y_pred
for i in range(len(de_y_pred)):
  vector = np.zeros(len(answer_vocab))
  vector[de_y_pred[i]] = 1
  de_y_pred[i] = decodeAns(vector, ans_idx_to_word)

In [107]:
#Re encode with correct dictionary
final_y_pred = de_y_pred
for i in range(len(final_y_pred)):
  final_y_pred[i] = labels_dict.get(final_y_pred[i])

In [109]:
#create submission
results = {}
test_ids = list(test_q.keys())
for i in range(len(test_ids)):
  results[test_ids[i]] = final_y_pred[i]

In [110]:
create_csv(results, '/content/drive/My Drive')