# UTILITIES

## SHELL OUTPUT

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Make the notebook display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## MODULES IMPORT

In [None]:
import os
import tensorflow as tf
import numpy as np
import h5py
import pickle
import zipfile
import json
import pandas as pd

## UNZIP DATA

In [None]:
# Several steps below read from and write to disk, so define a utilities
# directory that holds the matrices, archives, etc.
utils_dir = '/content/drive/MyDrive/AN2DL-competitions/HW3'

In [None]:
# If the dataset directory does not exist yet, create it by extracting the archive.
# The zip file of the dataset (downloadable from Kaggle) must be present in the
# utils_dir defined above.
if not os.path.exists('/content/VQA_Dataset'):
  # Open the archive BEFORE creating the target directory: if the zip is missing
  # or unreadable, no empty directory is left behind that would make every later
  # run skip the extraction.
  with zipfile.ZipFile(os.path.join(utils_dir, 'VQA_Dataset.zip'), 'r') as zip_ref:
    os.makedirs('/content/VQA_Dataset')
    zip_ref.extractall('/content/VQA_Dataset')

In [None]:
# Extract the GloVe embeddings unless the txt file is already present.
# The GloVe zip file must be in the utils_dir defined above; it can be
# downloaded from http://nlp.stanford.edu/data/glove.42B.300d.zip
if not os.path.exists('/content/glove.42B.300d.txt'):
  with zipfile.ZipFile(os.path.join(utils_dir, 'glove.42B.300d.zip'), 'r') as zip_ref:
    zip_ref.extractall('/content/')

## PARAMS

In [None]:
# set dataset directory
dataset_dir = '/content/VQA_Dataset'
# set the path of the GloVe txt
glove_path = '/content/glove.42B.300d.txt'
# image size
IMG_H = 224
IMG_W = 224
# batch size
BS = 64
# number of epochs
EPOCHS = 30
# learning rate
LR = 5e-4
# seed
SEED = 1234
# early stopping
ES = False
# early-stopping patience, read by the EarlyStopping callback when ES is True.
# It was referenced but never defined, so enabling ES raised a NameError.
# NOTE(review): 5 is a placeholder default — tune it for the experiment.
PATIENCE = 5
# number of classes
NC = 58

## ANSWERS DICTIONARY

In [None]:
# Maps every possible answer string to its class index.
# The indices are simply the positions of the answers in the list below
# (0 .. 57), so the order of the list must not be changed.
LABELS_DICT = {
  label: class_id
  for class_id, label in enumerate([
    '0', '1', '2', '3', '4', '5',
    'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket', 'blue',
    'bone', 'book', 'boy', 'brown',
    'cat', 'chair', 'couch',
    'dog',
    'floor', 'food', 'football',
    'girl', 'grass', 'gray', 'green',
    'left', 'log',
    'man', 'monkey bars',
    'no', 'nothing',
    'orange',
    'pie', 'plant', 'playing',
    'red', 'right', 'rug',
    'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel', 'standing',
    'stool', 'sunny',
    'table', 'tree',
    'watermelon', 'white', 'wine', 'woman',
    'yellow', 'yes',
  ])
}

# GLOVE EMBEDDING FUNCTIONS & STRUCTURES

Build the embedding matrix and the word-to-index dictionary from the GloVe embeddings if they have not been stored yet; otherwise load them from disk (Google Drive).

In [None]:
# Build the GloVe structures once and cache them in utils_dir; on later runs
# simply reload them. Both files must exist for the cache to be used.
if not os.path.exists(os.path.join(utils_dir, 'embedding_matrix.h5')) or not os.path.exists(os.path.join(utils_dir,'word_idx.pickle')):

  # word -> row of the embedding matrix. Index 0 is reserved for words that are
  # not found in the vocabulary, so real words start at 1.
  word_idx = {}
  # coefficient vectors in file order: vectors[i] is stored in matrix row i+1
  vectors = []

  # explicit UTF-8 so decoding does not depend on the platform default encoding
  with open(glove_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
      # strip the line terminator so it never ends up in the last coefficient
      values = line.rstrip('\n').split(' ')
      word_idx[values[0]] = i+1
      vectors.append(np.asarray(values[1:], dtype='float32'))
      # stop after the first 299999 entries of the file (i = 0 .. 299998);
      # together with the reserved row 0 the matrix then has 300K rows
      if i==299998:
        break

  # DIM = (1 + number of entries read) x 300 (300 is the number of coefs).
  # Row i+1 holds the vector read on line i, i.e. exactly the row stored in
  # word_idx for that word; row 0 stays all zeros for unknown words.
  embedding_matrix = np.zeros((1+len(vectors), 300))
  if vectors:
    embedding_matrix[1:] = np.vstack(vectors)

  # let's store the embedding_matrix
  with h5py.File(os.path.join(utils_dir,'embedding_matrix.h5'), 'w') as hf:
    hf.create_dataset('embedding_matrix', data=embedding_matrix)

  # let's store the word indexes
  with open(os.path.join(utils_dir,'word_idx.pickle'),'wb') as f:
    pickle.dump(word_idx, f)

else:

  with h5py.File(os.path.join(utils_dir,'embedding_matrix.h5'),'r') as hf:
    data = hf.get('embedding_matrix')
    embedding_matrix = np.array(data)

  # NOTE: pickle.load executes whatever the file contains; only load the file
  # written by the branch above, never one from an untrusted source.
  with open(os.path.join(utils_dir,'word_idx.pickle'),'rb') as file:
    word_idx = pickle.load(file)

# DATASET

## FUNCTIONS & CLASS DEFINITION
Functions that build the question and answer matrices, plus the `CustomDataset` class that serves batches to the model.

In [None]:
from keras.preprocessing.sequence import pad_sequences
import nltk
from collections import defaultdict
from keras.preprocessing import image
from PIL import Image

# Download the 'punkt' tokenizer data needed by nltk.word_tokenize (used in
# get_question_matrix).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt')

def get_question_matrix(df, maxlen=None):
  """Turn the questions of `df` into a padded matrix of word indices.

  Each question is lower-cased, stripped of question marks, tokenized with
  nltk and mapped through the global `word_idx`; words that are not in
  `word_idx` get index 0.

  Args:
    df: DataFrame with a 'question' column containing strings.
    maxlen: optional fixed number of columns, forwarded to `pad_sequences`.
      With the default (None) the width is the length of the longest question
      in `df`, exactly as before. Passing the width the model was built with
      gives matrices of identical shape for different DataFrames (longer
      sequences are truncated by `pad_sequences`).

  Returns:
    The array returned by `pad_sequences`,
    DIM = num_questions x num_words_in_longest_question (or x maxlen).
  """
  seq_list = []
  for question in df['question'].tolist():
    words = nltk.word_tokenize(question.lower().replace("?", ""))
    seq_list.append([word_idx.get(word, 0) for word in words])
  return pad_sequences(seq_list, maxlen=maxlen)

def answer_to_onehot():
  """Build the answer -> one-hot vector lookup table.

  For every answer in LABELS_DICT a vector of length NC is created with a 1.0
  at the class index stored in LABELS_DICT. The position comes from the
  dictionary value rather than from the iteration order, so the encoding
  stays consistent with the class ids even if the dictionary is reordered.

  Returns:
    dict mapping each answer string to its one-hot numpy array of length NC.
  """
  onehot_by_answer = {}
  for word, class_idx in LABELS_DICT.items():
    onehot = np.zeros(NC)
    onehot[class_idx] = 1.0
    onehot_by_answer[word] = onehot
  return onehot_by_answer

def get_answer_matrix(df, answer_to_onehot):
  """Stack the one-hot encodings of the answers of `df` into a matrix.

  Args:
    df: DataFrame with an 'answer' column containing strings.
    answer_to_onehot: dict mapping a lower-case answer to its one-hot vector
      of length NC.

  Returns:
    numpy array, DIM = num_answers x NC, row i encoding the i-th answer.

  Raises:
    KeyError: if an answer is missing from `answer_to_onehot`. Previously the
      `None` returned by `dict.get` was written into the matrix, which hid
      the problem until training.
  """
  answers = df['answer'].tolist()
  answer_matrix = np.zeros((len(answers), NC))
  for i, answer in enumerate(answers):
    key = answer.lower()
    if key not in answer_to_onehot:
      raise KeyError("unknown answer %r at row %d" % (key, i))
    answer_matrix[i] = answer_to_onehot[key]
  return answer_matrix

class CustomDataset(tf.keras.utils.Sequence):
  """Keras Sequence that yields ([questions, images], answers) batches.

  Images are read from disk batch by batch; questions and answers are sliced
  from the matrices given to the constructor.
  """

  def __init__(self, images_dir, images_names, question_matrix, answer_matrix, batch_size):
    # images_names is indexed as images_names[k][0], i.e. a list whose items
    # hold the image id (without extension) in their first position
    self.images_dir = images_dir
    self.images_names = images_names
    self.question_matrix = question_matrix
    self.answer_matrix = answer_matrix
    self.batch_size = batch_size

  def __len__(self):
    # number of complete batches; a trailing partial batch is not served
    return len(self.images_names) // self.batch_size

  def __getitem__(self, index):
    # rows [first, last) of the dataset make up batch number `index`
    first = index * self.batch_size
    last = first + self.batch_size

    batch_images = np.zeros((self.batch_size, IMG_H, IMG_W, 3))
    for offset in range(self.batch_size):
      file_name = self.images_names[first + offset][0] + '.png'
      loaded = image.load_img(os.path.join(self.images_dir, file_name), target_size=(IMG_H, IMG_W))
      batch_images[offset] = image.img_to_array(loaded)

    return [self.question_matrix[first:last], batch_images], self.answer_matrix[first:last]

## TRAINING & VALIDATION DATASETS

In [None]:
from sklearn.utils import shuffle 

# Load the training annotations and shuffle them with the fixed SEED so that
# the train/validation split is the same on every run.
df = pd.read_json(os.path.join(dataset_dir,"train_questions_annotations.json")).transpose()
df = shuffle(df, random_state=SEED)

# 80% training / 20% validation, split by position
split_at = int(.8*len(df))
train_df, val_df = df.iloc[:split_at], df.iloc[split_at:]

# Store the lookup table under its own name: rebinding `answer_to_onehot` to
# the returned dict would overwrite the function and make this cell fail when
# it is run a second time.
onehot_by_answer = answer_to_onehot()

train_question_matrix = get_question_matrix(train_df)
train_answer_matrix = get_answer_matrix(train_df, onehot_by_answer)

val_question_matrix = get_question_matrix(val_df)
val_answer_matrix = get_answer_matrix(val_df, onehot_by_answer)

# Each matrix is padded to the longest question of its own split, so the two
# widths can differ. The model needs a fixed input size, therefore the
# narrower matrix gets extra zero columns on the left until both widths match.
# The zeros use the dtype of the matrix so the word indices are not turned
# into floats.
tqm = train_question_matrix.shape[1]
vqm = val_question_matrix.shape[1]

if tqm < vqm:
  train_question_matrix = np.hstack([np.zeros((train_question_matrix.shape[0], vqm-tqm),
                                              dtype=train_question_matrix.dtype),
                                     train_question_matrix])
elif vqm < tqm:
  val_question_matrix = np.hstack([np.zeros((val_question_matrix.shape[0], tqm-vqm),
                                            dtype=val_question_matrix.dtype),
                                   val_question_matrix])

#print(train_question_matrix.shape)
#print(train_answer_matrix.shape)
#print(val_question_matrix.shape)
#print(val_answer_matrix.shape)

dataset_training = CustomDataset(os.path.join(dataset_dir, 'Images'),
                                 train_df[['image_id']].values.tolist(),
                                 train_question_matrix,
                                 train_answer_matrix,
                                 BS)

dataset_validation = CustomDataset(os.path.join(dataset_dir, 'Images'),
                                   val_df[['image_id']].values.tolist(),
                                   val_question_matrix,
                                   val_answer_matrix,
                                   BS)

## DATASET TEST

In [None]:
# Smoke test: create an iterator over the training Sequence.
iterator = iter(dataset_training)

In [None]:
# Fetch one batch and print the shapes of its questions, images and answers
# to check that the dataset yields what the model expects.
[q, img], a = next(iterator)
print(q.shape)
print(img.shape)
print(a.shape)

## MODEL DEFINITION

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Reshape, Dropout, concatenate, Flatten, Input
from keras.utils import plot_model
from keras.applications import VGG19

def img_model():
  """Build the image branch of the network.

  A VGG19 with ImageNet weights (top included, every layer frozen) is cut at
  its second-to-last layer and followed by a Dense layer of 1024 ReLU units.

  Returns:
    A Sequential model taking images of shape (IMG_H, IMG_W, 3).
  """
  backbone = VGG19(weights='imagenet',
                   include_top=True,
                   input_shape=(IMG_H, IMG_W, 3))

  # freeze the pretrained network
  for frozen_layer in backbone.layers:
    frozen_layer.trainable = False

  # drop the last layer of VGG19 and expose the output of the one before it
  feature_extractor = Model(backbone.input, backbone.layers[-2].output)

  return Sequential([
    feature_extractor,
    Dense(1024, input_dim=4096, activation='relu'),
  ])

def question_model(embedding_matrix, seq_length):
  """Build the question branch of the network.

  A frozen Embedding layer initialised with `embedding_matrix` (index 0 is
  masked) feeds two stacked LSTMs of 512 units, each followed by 50% dropout,
  and a final Dense layer of 1024 ReLU units.

  Args:
    embedding_matrix: 2-D array, DIM = vocabulary size x embedding size.
    seq_length: number of word indices in every input question.

  Returns:
    A Sequential model encoding a question.
  """
  vocab_size, embedding_dim = embedding_matrix.shape

  stack = [
    Embedding(vocab_size, embedding_dim,
              weights=[embedding_matrix], input_length=seq_length,
              trainable=False, mask_zero=True),
    LSTM(units=512, return_sequences=True, input_shape=(seq_length, embedding_dim)),
    Dropout(0.5),
    LSTM(units=512, return_sequences=False),
    Dropout(0.5),
    Dense(1024, activation='relu'),
  ]

  return Sequential(stack)

# Image branch
image_input = Input(shape=(IMG_H, IMG_W, 3))
encoded_image = img_model()(image_input)

# Question branch. `shape` must be a tuple: `(n)` is just the integer n, while
# `(n,)` is the one-element tuple that Input expects.
question_input = Input(shape=(train_question_matrix.shape[1],))
encoded_question = question_model(embedding_matrix, train_question_matrix.shape[1])(question_input)

# Merge the two encodings and classify over the NC possible answers
merged = concatenate([encoded_question, encoded_image])
output = Dropout(0.3)(merged)
output = Dense(1024, activation='relu')(output)
output = Dropout(0.3)(output)
output = Dense(NC, activation='softmax')(output)

# The input order (question first, image second) matches the order in which
# the dataset classes return their inputs.
model = Model(inputs=[question_input, image_input], outputs=output)
model.summary()
#plot_model(model, '/content/model1.png', True, expand_nested=True)

## OPTIMIZATION PARAMS

In [None]:
# Loss -> categorical cross-entropy, because the answers are one-hot encoded
loss = tf.keras.losses.CategoricalCrossentropy()

# Optimizer
optimizer = tf.keras.optimizers.RMSprop(learning_rate=LR, rho=0.9)

# Metrics
metrics = ['accuracy']

# Set random seed
# NOTE(review): the model layers are created in the MODEL DEFINITION cell,
# i.e. before this call, so the seed presumably does not cover their weight
# initialisation — confirm whether it should be set earlier.
tf.random.set_seed(SEED)

# Compile Model
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## CALLBACKS

In [None]:
from datetime import datetime

# NOTE(review): `cwd` is not referenced anywhere else in the visible notebook;
# the directories below are built from utils_dir instead.
cwd = '/content/drive/MyDrive/AN2DL-competitions/HW3'

# Root directory that collects all the experiments
experiments_dir = os.path.join(utils_dir, 'experiments-singl-model')
if not os.path.exists(experiments_dir):
    os.makedirs(experiments_dir)

# Timestamp that makes the directory of this run unique
now = datetime.now().strftime('%b%d_%H-%M-%S')

model_name = 'single_proj'

# Directory of this run: <experiments_dir>/<model_name>_<timestamp>
proj_dir = os.path.join(experiments_dir, model_name + '_' + str(now))
if not os.path.exists(proj_dir):
    os.makedirs(proj_dir)
    
# List of Keras callbacks, filled by the following cells
callbacks = []

### Model Checkpoint

In [None]:
# Checkpoints of this run are written to <proj_dir>/checkpoints
ckpt_dir = os.path.join(proj_dir, 'checkpoints')
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

# The file name contains the epoch number (cp_01.ckpt, cp_02.ckpt, ...);
# only the weights are saved, not the whole model.
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'), 
                                                   save_weights_only=True)  # False to save the model directly
callbacks.append(ckpt_callback)

### Tensorboard

In [None]:
# TensorBoard logs of this run are written to <proj_dir>/tensorboard-logs
tb_dir = os.path.join(proj_dir, 'tensorboard-logs')
if not os.path.exists(tb_dir):
    os.makedirs(tb_dir)
    
# By default shows losses and metrics for both training and validation
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                             profile_batch=0,
                                             histogram_freq=1)
callbacks.append(tb_callback)

### Early Stopping

In [None]:
# Optionally stop the training when the validation loss stops improving.
# The PATIENCE constant must be defined before this cell runs whenever ES is
# True, otherwise the lookup below raises a NameError.
if ES:
  es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE)
  callbacks.append(es_callback)

## MODEL FIT

In [None]:
# Train the model on dataset_training for EPOCHS epochs, validating on
# dataset_validation and running the callbacks collected above. The step
# counts are the number of batches each dataset reports through len().
model.fit(dataset_training,
          epochs=EPOCHS,
          steps_per_epoch=len(dataset_training),
          validation_data=dataset_validation,
          validation_steps=len(dataset_validation), 
          callbacks=callbacks)

# SUBMISSION

## MODEL RELOADING

In [None]:
# Change this path so that it points to the checkpoint of the best model!
# NOTE(review): the path is hard-coded and does not live under the
# experiments directory created by the CALLBACKS cell — confirm it exists.
model.load_weights("/content/drive/MyDrive/HW3_Utilities/HW3_Experiments/Proj3_Jan31_00-38-05/Checkpoints/cp_15.ckpt")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0b93133a90>

## TEST CUSTOM DATASET

In [None]:
class TestCustomDataset(tf.keras.utils.Sequence):
  """Keras Sequence that yields the test samples one at a time.

  Every item is a [question, image] pair in which both arrays carry a leading
  batch dimension of size 1; there are no answers.
  """

  def __init__(self, images_dir, images_names, question_matrix):
    # images_names is indexed as images_names[k][0], i.e. a list whose items
    # hold the image id (without extension) in their first position
    self.images_dir = images_dir
    self.images_names = images_names
    self.question_matrix = question_matrix

  def __len__(self):
    # one item per test sample
    return len(self.images_names)

  def __getitem__(self, index):
    file_name = self.images_names[index][0] + '.png'
    picture = image.load_img(os.path.join(self.images_dir, file_name), target_size=(IMG_H, IMG_W))
    # add the batch axis in front of the image and of the question
    image_batch = image.img_to_array(picture)[np.newaxis, ...]
    question_batch = np.expand_dims(self.question_matrix[index], axis=0)

    return [question_batch, image_batch]

## TEST DATASET

In [None]:
# Load the test questions. They must NOT be shuffled: the predictions are
# matched to test_df.index by position when the csv is written.
test_df = pd.read_json(os.path.join(dataset_dir,"test_questions.json")).transpose()

test_question_matrix = get_question_matrix(test_df)

# The question input of the model was built with the width of the training
# matrix, so the test matrix is brought to that width instead of relying on a
# hard-coded number.
seq_len = train_question_matrix.shape[1]
tqm = test_question_matrix.shape[1]

if tqm < seq_len:
  # left-pad with zeros, keeping the dtype of the word indices
  test_question_matrix = np.hstack([np.zeros((test_question_matrix.shape[0], seq_len-tqm),
                                             dtype=test_question_matrix.dtype),
                                    test_question_matrix])
elif tqm > seq_len:
  # the sequences are left-padded, so the words sit at the right end of each
  # row: keep the last seq_len columns
  test_question_matrix = test_question_matrix[:, tqm-seq_len:]

print(test_question_matrix.shape)

test_generator = TestCustomDataset(os.path.join(dataset_dir, 'Images'),
                                  test_df[['image_id']].values.tolist(),
                                  test_question_matrix)

(6372, 23)


## PREDICTIONS & CSV

In [None]:
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped csv file.

    The file is named results_<timestamp>.csv, is created inside
    `results_dir` and contains the header 'Id,Category' followed by one
    'key,value' line per item of `results`.

    Args:
        results: dict mapping an id (string) to the predicted category.
        results_dir: directory in which the file is created.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    out_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(out_path, 'w') as out:
        out.write('Id,Category\n')
        out.writelines(
            sample_id + ',' + str(category) + '\n'
            for sample_id, category in results.items()
        )

# Run the model on every test sample and keep, for each question id (the index
# of test_df), the class with the highest predicted score.
pred = model.predict(test_generator)
results = {
  str(question_id): np.argmax(scores)
  for question_id, scores in zip(test_df.index, pred)
}

create_csv(results)