In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import tensorflow as tf
import numpy as np
import shutil
from tensorflow.keras.applications import InceptionResNetV2

# Set the seed for random operations. 
# This let our experiments to be reproducible. 
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Get current working directory
cwd = os.getcwd()

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
cwd = os.getcwd()
cwd

'c:\\Users\\enric\\Downloads\\AN2DL-1st-Project\\3rd Challenge'

In [4]:
dataset_dir = os.path.join(cwd, 'VQA_Dataset')
#if os.path.exists(dataset_dir):
#  shutil.rmtree(dataset_dir)

#!unzip  '/content/drive/MyDrive/anndl-2020-vqa.zip'


# Hyperparameters

In [5]:
img_w = 700
img_h = 400
batch_size = 4
lr = 1e-3

MAX_NUM_WORDS = 5000 # max number of unique words in dictionary

FEATURES = 1536 # size of feature vector for images and questions

UNITS = 512

PERC_DROP = 0.4

decay = 0.5
minimum = 1e-6

In [6]:
labels_dict = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        'apple': 6,
        'baseball': 7,
        'bench': 8,
        'bike': 9,
        'bird': 10,
        'black': 11,
        'blanket': 12,
        'blue': 13,
        'bone': 14,
        'book': 15,
        'boy': 16,
        'brown': 17,
        'cat': 18,
        'chair': 19,
        'couch': 20,
        'dog': 21,
        'floor': 22,
        'food': 23,
        'football': 24,
        'girl': 25,
        'grass': 26,
        'gray': 27,
        'green': 28,
        'left': 29,
        'log': 30,
        'man': 31,
        'monkey bars': 32,
        'no': 33,
        'nothing': 34,
        'orange': 35,
        'pie': 36,
        'plant': 37,
        'playing': 38,
        'red': 39,
        'right': 40,
        'rug': 41,
        'sandbox': 42,
        'sitting': 43,
        'sleeping': 44,
        'soccer': 45,
        'squirrel': 46,
        'standing': 47,
        'stool': 48,
        'sunny': 49,
        'table': 50,
        'tree': 51,
        'watermelon': 52,
        'white': 53,
        'wine': 54,
        'woman': 55,
        'yellow': 56,
        'yes': 57
}

num_answers = len(labels_dict)

# Functions

In [7]:
import json
def unwrap_weighted(path, split = 0.2):
    
    dataset_dir = os.path.join(path, 'train_questions_annotations.json')
    training_dir = os.path.join(path, 'training.json')
    validation_dir = os.path.join(path, 'validation.json')
        
    dic_images = None
    
    with open(dataset_dir) as f:
       dic_images = json.load(f)
        
    dict_keys = list(dic_images.keys())
    np.random.shuffle(dict_keys)
    questions = int(round(split*len(dict_keys)))
        
    dic_validations = { dict_keys[i]:dic_images[dict_keys[i]] for i in range(questions)}
    dic_training = {dict_keys[i]:dic_images[dict_keys[i]] for i in range(questions, len(dict_keys))}
        
    with open(training_dir, 'w') as fp:
       json.dump(dic_training, fp)
    with open(validation_dir, 'w') as fp:
       json.dump(dic_validations, fp)

path = os.getcwd()

In [8]:
def get_token_dic_quest(path, max_num_words = 5000):
    from tensorflow.keras.preprocessing.text import Tokenizer
    dataset_dir = os.path.join(path, 'train_questions_annotations.json')
    
    # Load dataset
    with open(dataset_dir) as f:
        dic_images = json.load(f)

    # Get all questions as strings in a list
    questions = [dic['question'] for dic in dic_images.values()]

    # Strip '?' from questions
    questions = [s.translate(str.maketrans('', '', '?')).lower() for s in questions if not s == '']
    questions_tokenizer = Tokenizer(num_words=max_num_words)
    questions_tokenizer.fit_on_texts(questions)

    questions_wtoi = questions_tokenizer.word_index # index 0 reserved for padding
    
    questions_tokenized = questions_tokenizer.texts_to_sequences(questions)
    max_question_length = max(len(sentence) for sentence in questions_tokenized)
    
    return questions_tokenizer, questions_wtoi, max_question_length


def from_questions_to_dict(path, dict_req, max_num_words = 5000):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    # Return dictionary q_wtoi
    tokenizer, wtoi, max_len = get_token_dic_quest(path, max_num_words = 5000)
    
    translated_dics = []
    
    for dic in dict_req:
        
        question = dic['question'].translate(str.maketrans('', '', '?')).lower()
        question = tokenizer.texts_to_sequences([question])
        question = pad_sequences(question, maxlen=max_len)
        dic['question'] = question[0]
        dic['answer'] = labels_dict[dic['answer']]
        translated_dics.append(dic)
    
    return translated_dics

In [9]:
from PIL import Image
    
# Patches Generator
class dataset_generator(tf.keras.utils.Sequence):

  def __init__(self, path, preprocessing, subset = "training", image_generator = None, batch_size = 5, max_num_words=5000):
    json_file = subset + ".json"
    dat_dir = os.path.join(path, 'VQA_Dataset')
    subset_file = os.path.join(dat_dir, json_file)
    
    with open(subset_file) as f:
       dictionaries = json.load(f)
       dictionaries = dictionaries.values()
       self.dictionary = from_questions_to_dict(dat_dir, dictionaries, max_num_words)
    
    self.batch_size = batch_size
    self.image_generator = image_generator
    self.preprocessing = preprocessing
    self.dat_dir = dat_dir
    self.gen = image_generator
    self.batch_size = batch_size
    self.max_num_words = max_num_words
    self.n = 0
    
  def __len__(self):
    return len(self.dictionary)//self.batch_size

  def __getitem__(self, index):
    lower_bound = index*self.batch_size
    upper_bound = (index+1)*self.batch_size
    
    batch_img = []
    batch_que = []
    batch_ans = []
    
    for idx in range(lower_bound, upper_bound):
        img, que, ans = self.__data_generation__(idx)
        batch_img.append(img)
        batch_que.append(que)
        batch_ans.append(ans)
        
    batch_img = np.stack(batch_img, axis=0)
    batch_que = np.stack(batch_que, axis=0)
    batch_ans = np.stack(batch_ans, axis=0)
    
    x = [batch_img, batch_que]
    y = batch_ans
    
    return x, y
    
    
  def __data_generation__(self, idx):
    actual_dict = self.dictionary[idx]
    
    img_name = actual_dict['image_id']
    answer = actual_dict['answer']
    question = actual_dict['question']
    
    actual_img = Image.open(os.path.join(self.dat_dir, "Images", img_name + ".png"))
    actual_img = actual_img.convert('RGB')
    img_arr = np.array(actual_img)
    img_arr = np.expand_dims(img_arr, axis=0)
    
    if self.image_generator is not None:
        img_arr = self.gen.random_transform(img_arr)
    
    if self.preprocessing is not None:
        img_arr = self.preprocessing(img_arr)
        
    img_arr = np.squeeze(img_arr, axis=0)
    
    return img_arr, question, answer

Datasets generation

In [10]:
unwrap_weighted(os.path.join(path, 'VQA_Dataset'))

In [11]:
preprocessing_function_inception = tf.keras.applications.inception_resnet_v2.preprocess_input

gen = dataset_generator(path = os.getcwd(), preprocessing = preprocessing_function_inception, 
                  subset = "training", image_generator = None, max_num_words=5000, batch_size = batch_size)

gen_val = dataset_generator(path = os.getcwd(), preprocessing = preprocessing_function_inception, 
                  subset = "validation", image_generator = None, max_num_words=5000, batch_size = batch_size)


# Image Encoder

In [12]:
image_encoder_incpetion = tf.keras.applications.InceptionResNetV2(
    include_top=False,
    weights="imagenet",
    input_shape=(img_h, img_w, 3)
)

for layer in image_encoder_incpetion.layers:
    layer.trainable = False

image_encoder_incpetion.summary()

______________________
batch_normalization_181 (BatchN (None, 11, 20, 224)  672         conv2d_181[0][0]                 
__________________________________________________________________________________________________
activation_181 (Activation)     (None, 11, 20, 224)  0           batch_normalization_181[0][0]    
__________________________________________________________________________________________________
conv2d_179 (Conv2D)             (None, 11, 20, 192)  399360      block8_4_ac[0][0]                
__________________________________________________________________________________________________
conv2d_182 (Conv2D)             (None, 11, 20, 256)  172032      activation_181[0][0]             
__________________________________________________________________________________________________
batch_normalization_179 (BatchN (None, 11, 20, 192)  576         conv2d_179[0][0]                 
______________________________________________________________________________________

# Question Encoder

Load questions into a List

In [13]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

dataset_dir = os.path.join(cwd, "VQA_Dataset", "train_questions_annotations.json")

# Load dataset
with open(dataset_dir) as f:
    dic_images = json.load(f)
            
# Get all questions as strings in a list
questions = [dic['question'] for dic in dic_images.values()]

# Strip '?' from questions
questions = [s.translate(str.maketrans('', '', '?')).lower() for s in questions if not s == '']
print(questions[12])

# max_words_in_sentence = max(len(question.split(' ')) for question in questions)
# print(max_words_in_sentence)

is there books on the bookshelf


Tokenize questions

In [14]:
# Create Tokenizer to convert words to integers
questions_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
questions_tokenizer.fit_on_texts(questions)

questions_tokenized = questions_tokenizer.texts_to_sequences(questions)
# each sentence into a sequence of tokens (in this case, only the 20000 most frequent)

# "hello raffaele" -> [9, 78] 

questions_wtoi = questions_tokenizer.word_index # index 0 reserved for padding
print('Total number of words:', len(questions_wtoi))

print(questions_tokenized[0])

Total number of words: 4640
[47, 797, 1903]


In [15]:
max_question_length = max(len(sentence) for sentence in questions_tokenized)
print('Max question length:', max_question_length)

# Pad to max question sentence length
padded_questions = pad_sequences(questions_tokenized, maxlen=max_question_length)

print("Padded questions shape:", padded_questions.shape)

Max question length: 21
Padded questions shape: (58832, 21)


Load pre-trained GloVe embedding

In [16]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

path_to_glove_file = os.path.join(cwd,'glove.6B\glove.6B.100d.txt')

embeddings_index = {}
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found", len(embeddings_index), "word vectors.")

Found 400000 word vectors.


In [17]:
num_tokens = len(questions_wtoi) + 1
embedding_dim = 100

hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in questions_wtoi.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 4496 words (144 misses)


Create question encoder

In [18]:
embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
    input_length=max_question_length
)

question_encoder = tf.keras.models.Sequential()
question_encoder.add(tf.keras.layers.Input(shape=(max_question_length,), dtype="int64"))
question_encoder.add(embedding_layer)
question_encoder.add(tf.keras.layers.Dropout(PERC_DROP))
question_encoder.add(tf.keras.layers.LSTM(units=FEATURES))

question_encoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 21, 100)           464100    
_________________________________________________________________
dropout (Dropout)            (None, 21, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 1536)              10057728  
Total params: 10,521,828
Trainable params: 10,057,728
Non-trainable params: 464,100
_________________________________________________________________


# Create complete model

Load indexes for answers

In [19]:
multiplied_features = tf.keras.layers.Multiply()([image_encoder_incpetion.layers[-1].output, question_encoder.layers[-1].output])
dense_1 = tf.keras.layers.Dense(UNITS, activation='tanh')(multiplied_features)
drop_1 = tf.keras.layers.Dropout(PERC_DROP)(dense_1)
out = tf.keras.layers.Dense(num_answers, activation='softmax')(drop_1)
network = tf.keras.models.Model(inputs=[image_encoder_incpetion.layers[0].input, question_encoder.layers[0].input], outputs=out)

network.summary()

             
                                                                 activation_182[0][0]             
__________________________________________________________________________________________________
block8_5_conv (Conv2D)          (None, 11, 20, 2080) 933920      block8_5_mixed[0][0]             
__________________________________________________________________________________________________
block8_5 (Lambda)               (None, 11, 20, 2080) 0           block8_4_ac[0][0]                
                                                                 block8_5_conv[0][0]              
__________________________________________________________________________________________________
block8_5_ac (Activation)        (None, 11, 20, 2080) 0           block8_5[0][0]                   
__________________________________________________________________________________________________
conv2d_184 (Conv2D)             (None, 11, 20, 192)  399360      block8_5_ac[0][0]             

In [20]:
learnings_r = [1e-3,2e-3,1e-4,2e-4,1e-6,2e-6,3e-6]
for lr in learnings_r:
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
  metrics = ['accuracy']
  loss = tf.keras.losses.SparseCategoricalCrossentropy()
  network.compile(optimizer=optimizer, loss=loss, metrics=metrics)
  import os
  from datetime import datetime

  cwd = os.getcwd()

  exps_dir = os.path.join('ResultsVQA', 'basic_model')
  if not os.path.exists(exps_dir):
      os.makedirs(exps_dir)

  now = datetime.now().strftime('%b%d_%H-%M-%S')

  exp_name = 'exp'

  exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))
  if not os.path.exists(exp_dir):
      os.makedirs(exp_dir)
      
  callbacks = []

  # Model checkpoint
  # ----------------
  ckpt_dir = os.path.join(exp_dir, 'ckpts')
  if not os.path.exists(ckpt_dir):
      os.makedirs(ckpt_dir)

  ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'), 
                                                    save_weights_only=True, save_best_only=True)  # False to save the model directly
  callbacks.append(ckpt_callback)

  # Early Stopping
  # --------------
  early_stop = True
  if early_stop:
      es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
      callbacks.append(es_callback)

      # Callback Reduce On Plateau
  # ------------------
  red_callback = tf.keras.callbacks.ReduceLROnPlateau(
      monitor="val_loss",
      factor=decay,
      patience=5,
      verbose=1,
      mode="min",
      cooldown=0,
      min_lr=minimum
  )

  callbacks.append(red_callback)

  # ---------------------------------
  network.fit(x=gen,
            epochs=10,
            steps_per_epoch=len(gen),
            validation_data=gen_val,
            validation_steps=len(gen_val),
            callbacks=callbacks)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 11766 steps, validate for 2941 steps
Epoch 1/10


InvalidArgumentError:  Incompatible shapes: [4,1] vs. [4,11,20]
	 [[node metrics/accuracy/Equal (defined at <ipython-input-20-f8b244083ab8>:63) ]] [Op:__inference_distributed_function_45774]

Function call stack:
distributed_function


# Predictions

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
from PIL import Image

path = './VQA_Dataset/test_questions.json'
max_question_length = 21
# preprocessing_function = tf.keras.applications.vgg16.preprocess_input
dat_dir = './VQA_Dataset/Images'
# ckpt_dir = os.path.join(cwd, 'Checkpoints', model_name)
# network.load_weights(os.path.join(ckpt_dir, 'basic_model_xception-weights-Jan08_22-02-28-epoch-04'))
#os.listdir('/content/drive/MyDrive/ResultsVQA/exp_Jan06_23-07-25/ckpts')
#network.load_weights('/content/drive/MyDrive/ResultsVQA/exp_Jan06_23-07-25/ckpts/cp_23')

with open(path) as f:
       dic_test = json.load(f)

dic_test_values = dic_test.values()
test_questions = [q['question'].lower().translate(str.maketrans('', '', string.punctuation)) for q in dic_test_values]

test_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
test_tokenizer.fit_on_texts(test_questions)

test_wtoi = test_tokenizer.word_index

test_tokenized = test_tokenizer.texts_to_sequences(test_questions)

# print(test_tokenized[45])
# max_question_length = max(len(sentence) for sentence in test_tokenized)        

results = dict()

count = 0
print('Prediction started, printing every 100 samples computed...')

for question_id in dic_test.keys():
    
    temp_dic = dic_test[question_id]
    # print(temp)
    
    # Get question text
    question = temp_dic['question'].lower().translate(str.maketrans('', '', string.punctuation))
    # print(question)
    
    # Get related image
    image_id = temp_dic['image_id']
    
    # Open image and apply preprocessing
    img = Image.open(os.path.join(dat_dir, image_id + ".png"))
    img = img.convert('RGB')
    img_arr = np.array(img)
    img_arr = np.expand_dims(img_arr, axis=0)
    img_arr = preprocessing_function(img_arr)
    
    # Tokenize question text with test tokenizer
    #question_tokenized = test_tokenizer.texts_to_sequences([question])
    
    # Tokenize question text with QUESITIONS tokenizer from training phase
    question_tokenized = questions_tokenizer.texts_to_sequences([question])
    
    # Pad question to correct length (21)
    padded_question = pad_sequences(question_tokenized, maxlen=max_question_length)
    # print(padded_question)
    
    # Get prediction
    result = network.predict([img_arr, padded_question], verbose=0, batch_size=1)
    
    # Get index with max probability
    result = tf.argmax(result[0])
    result = int(result)
    # itoa = {v: k for k, v in labels_dict.items()}
    # answer = itoa[result]
    # print(answer)
    
    # Add to results dictionary
    results[question_id] = result
    
    count += 1
    
    if (count%100==0):
        print(count)
    
    # print(question_id, question, image_id)

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):

    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in results.items():
            f.write(key + ',' + str(value) + '\n')

In [None]:
create_csv(results, results_dir='/content/drive/MyDrive/ResultsVQA')