In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import tensorflow as tf
import numpy as np

# Set the seed for random operations. 
# This let our experiments to be reproducible. 
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Get current working directory
cwd = os.getcwd()

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
cwd = os.getcwd()
cwd

'/content'

In [4]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True) # mount the drive

Mounted at /content/drive


In [5]:
dataset_dir = os.path.join(cwd, 'VQA_Dataset')
if os.path.exists(dataset_dir):
  shutil.rmtree(dataset_dir)

!unzip  '/content/drive/MyDrive/anndl-2020-vqa.zip'


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
  inflating: VQA_Dataset/Images/5390.png  
  inflating: VQA_Dataset/Images/5391.png  
  inflating: VQA_Dataset/Images/5392.png  
  inflating: VQA_Dataset/Images/5393.png  
  inflating: VQA_Dataset/Images/5394.png  
  inflating: VQA_Dataset/Images/5395.png  
  inflating: VQA_Dataset/Images/5396.png  
  inflating: VQA_Dataset/Images/5397.png  
  inflating: VQA_Dataset/Images/5398.png  
  inflating: VQA_Dataset/Images/5399.png  
  inflating: VQA_Dataset/Images/54.png  
  inflating: VQA_Dataset/Images/540.png  
  inflating: VQA_Dataset/Images/5400.png  
  inflating: VQA_Dataset/Images/5401.png  
  inflating: VQA_Dataset/Images/5402.png  
  inflating: VQA_Dataset/Images/5403.png  
  inflating: VQA_Dataset/Images/5404.png  
  inflating: VQA_Dataset/Images/5405.png  
  inflating: VQA_Dataset/Images/5406.png  
  inflating: VQA_Dataset/Images/5407.png  
  inflating: VQA_Dataset/Images/5408.png  
  inflating: VQA_Dataset/Images/5409

# Hyperparameters

In [9]:
img_w = 700
img_h = 400
batch_size = 64
lr = 1e-4

MAX_NUM_WORDS = 5000 # max number of unique words in dictionary

FEATURES = 512 # size of feature vector for images and questions

UNITS = 64

In [10]:
labels_dict = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        'apple': 6,
        'baseball': 7,
        'bench': 8,
        'bike': 9,
        'bird': 10,
        'black': 11,
        'blanket': 12,
        'blue': 13,
        'bone': 14,
        'book': 15,
        'boy': 16,
        'brown': 17,
        'cat': 18,
        'chair': 19,
        'couch': 20,
        'dog': 21,
        'floor': 22,
        'food': 23,
        'football': 24,
        'girl': 25,
        'grass': 26,
        'gray': 27,
        'green': 28,
        'left': 29,
        'log': 30,
        'man': 31,
        'monkey bars': 32,
        'no': 33,
        'nothing': 34,
        'orange': 35,
        'pie': 36,
        'plant': 37,
        'playing': 38,
        'red': 39,
        'right': 40,
        'rug': 41,
        'sandbox': 42,
        'sitting': 43,
        'sleeping': 44,
        'soccer': 45,
        'squirrel': 46,
        'standing': 47,
        'stool': 48,
        'sunny': 49,
        'table': 50,
        'tree': 51,
        'watermelon': 52,
        'white': 53,
        'wine': 54,
        'woman': 55,
        'yellow': 56,
        'yes': 57
}

num_answers = len(labels_dict)

# Functions

In [11]:
import json
def unwrap_weighted(path, split = 0.2):
    
    dataset_dir = os.path.join(path, 'train_questions_annotations.json')
    training_dir = os.path.join(path, 'training.json')
    validation_dir = os.path.join(path, 'validation.json')
        
    dic_images = None
    
    with open(dataset_dir) as f:
       dic_images = json.load(f)
        
    dict_keys = list(dic_images.keys())
    np.random.shuffle(dict_keys)
    questions = int(round(split*len(dict_keys)))
        
    dic_validations = { dict_keys[i]:dic_images[dict_keys[i]] for i in range(questions)}
    dic_training = {dict_keys[i]:dic_images[dict_keys[i]] for i in range(questions, len(dict_keys))}
        
    with open(training_dir, 'w') as fp:
       json.dump(dic_training, fp)
    with open(validation_dir, 'w') as fp:
       json.dump(dic_validations, fp)

path = os.getcwd()

In [12]:
def get_token_dic_quest(path, max_num_words = 5000):
    from tensorflow.keras.preprocessing.text import Tokenizer
    dataset_dir = os.path.join(path, 'train_questions_annotations.json')
    
    # Load dataset
    with open(dataset_dir) as f:
        dic_images = json.load(f)

    # Get all questions as strings in a list
    questions = [dic['question'] for dic in dic_images.values()]

    # Strip '?' from questions
    questions = [s.translate(str.maketrans('', '', '?')).lower() for s in questions if not s == '']
    questions_tokenizer = Tokenizer(num_words=max_num_words)
    questions_tokenizer.fit_on_texts(questions)

    questions_wtoi = questions_tokenizer.word_index # index 0 reserved for padding
    
    questions_tokenized = questions_tokenizer.texts_to_sequences(questions)
    max_question_length = max(len(sentence) for sentence in questions_tokenized)
    
    return questions_tokenizer, questions_wtoi, max_question_length


def from_questions_to_dict(path, dict_req, max_num_words = 5000):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    # Return dictionary q_wtoi
    tokenizer, wtoi, max_len = get_token_dic_quest(path, max_num_words = 5000)
    
    translated_dics = []
    
    for dic in dict_req:
        
        question = dic['question'].translate(str.maketrans('', '', '?')).lower()
        question = tokenizer.texts_to_sequences([question])
        question = pad_sequences(question, maxlen=max_len)
        dic['question'] = question[0]
        dic['answer'] = labels_dict[dic['answer']]
        translated_dics.append(dic)
    
    return translated_dics

In [13]:
from PIL import Image
    
# Patches Generator
class dataset_generator(tf.keras.utils.Sequence):

  def __init__(self, path, preprocessing, subset = "training", image_generator = None, batch_size = 5, max_num_words=5000):
    json_file = subset + ".json"
    dat_dir = os.path.join(path, 'VQA_Dataset')
    subset_file = os.path.join(dat_dir, json_file)
    
    with open(subset_file) as f:
       dictionaries = json.load(f)
       dictionaries = dictionaries.values()
       self.dictionary = from_questions_to_dict(dat_dir, dictionaries, max_num_words)
    
    self.batch_size = batch_size
    self.image_generator = image_generator
    self.preprocessing = preprocessing
    self.dat_dir = dat_dir
    self.gen = image_generator
    self.batch_size = batch_size
    self.max_num_words = max_num_words
    self.n = 0
    
  def __len__(self):
    return len(self.dictionary)//self.batch_size

  def __getitem__(self, index):
    lower_bound = index*self.batch_size
    upper_bound = (index+1)*self.batch_size
    
    batch_img = []
    batch_que = []
    batch_ans = []
    
    for idx in range(lower_bound, upper_bound):
        img, que, ans = self.__data_generation__(idx)
        batch_img.append(img)
        batch_que.append(que)
        batch_ans.append(ans)
        
    batch_img = np.stack(batch_img, axis=0)
    batch_que = np.stack(batch_que, axis=0)
    batch_ans = np.stack(batch_ans, axis=0)
    
    x = [batch_img, batch_que]
    y = batch_ans
    
    return x, y
    
    
  def __data_generation__(self, idx):
    actual_dict = self.dictionary[idx]
    
    img_name = actual_dict['image_id']
    answer = actual_dict['answer']
    question = actual_dict['question']
    
    actual_img = Image.open(os.path.join(self.dat_dir, "Images", img_name + ".png"))
    actual_img = actual_img.convert('RGB')
    img_arr = np.array(actual_img)
    img_arr = np.expand_dims(img_arr, axis=0)
    
    if self.image_generator is not None:
        img_arr = self.gen.random_transform(img_arr)
    
    if self.preprocessing is not None:
        img_arr = self.preprocessing(img_arr)
        
    img_arr = np.squeeze(img_arr, axis=0)
    
    return img_arr, question, answer

Datasets generation

In [14]:
unwrap_weighted(os.path.join(path, 'VQA_Dataset'))

In [15]:
preprocessing_function = tf.keras.applications.vgg16.preprocess_input

gen = dataset_generator(path = os.getcwd(), preprocessing = preprocessing_function, 
                  subset = "training", image_generator = None, max_num_words=5000, batch_size = batch_size)

gen_val = dataset_generator(path = os.getcwd(), preprocessing = preprocessing_function, 
                  subset = "validation", image_generator = None, max_num_words=5000, batch_size = batch_size)

'''
dataset = tf.data.Dataset.from_generator(lambda: gen, output_types=([tf.float32, tf.uint8], tf.uint8), 
                                         output_shapes=([2,], ()))

dataset_val = tf.data.Dataset.from_generator(lambda: gen_val, output_types=([tf.float32, tf.uint8], tf.uint8), 
                                         output_shapes=([2,], ()))

dataset = dataset.batch(batch_size)
dataset = dataset.repeat()

dataset_val = dataset_val.batch(batch_size)
dataset_val = dataset_val.repeat()

iterator = iter(dataset)
giggino = next(iterator)
print(giggino)
'''

for f in gen:
    print(f)
    break
    

'\ndataset = tf.data.Dataset.from_generator(lambda: gen, output_types=([tf.float32, tf.uint8], tf.uint8), \n                                         output_shapes=([2,], ()))\n\ndataset_val = tf.data.Dataset.from_generator(lambda: gen_val, output_types=([tf.float32, tf.uint8], tf.uint8), \n                                         output_shapes=([2,], ()))\n\ndataset = dataset.batch(batch_size)\ndataset = dataset.repeat()\n\ndataset_val = dataset_val.batch(batch_size)\ndataset_val = dataset_val.repeat()\n\niterator = iter(dataset)\ngiggino = next(iterator)\nprint(giggino)\n'

([array([[[[150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         ...,
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ]],

        [[150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         ...,
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ]],

        [[150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         ...,
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ],
         [150.061    , 134.22101  ,  89.32     ]],

        ...,

        [[-99.939    ,   7.2210007, 

# Image Encoder

In [16]:
image_encoder = tf.keras.applications.VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(img_h, img_w, 3),
    pooling='avg'
)

for layer in image_encoder.layers:
    layer.trainable = False

image_encoder.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 400, 700, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 400, 700, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 400, 700, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 200, 350, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 200, 350, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 200, 350, 128)    

# Question Encoder

Load questions into a List

In [17]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

dataset_dir = os.path.join(cwd, "VQA_Dataset", "train_questions_annotations.json")

# Load dataset
with open(dataset_dir) as f:
    dic_images = json.load(f)
            
# Get all questions as strings in a list
questions = [dic['question'] for dic in dic_images.values()]

# Strip '?' from questions
questions = [s.translate(str.maketrans('', '', '?')).lower() for s in questions if not s == '']
print(questions[12])

# max_words_in_sentence = max(len(question.split(' ')) for question in questions)
# print(max_words_in_sentence)

is there books on the bookshelf


Tokenize questions

In [18]:
# Create Tokenizer to convert words to integers
questions_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
questions_tokenizer.fit_on_texts(questions)

questions_tokenized = questions_tokenizer.texts_to_sequences(questions)
# each sentence into a sequence of tokens (in this case, only the 20000 most frequent)

# "hello raffaele" -> [9, 78] 

questions_wtoi = questions_tokenizer.word_index # index 0 reserved for padding
print('Total number of words:', len(questions_wtoi))

print(questions_tokenized[0])

Total number of words: 4640
[47, 797, 1903]


In [19]:
max_question_length = max(len(sentence) for sentence in questions_tokenized)
print('Max question length:', max_question_length)

# Pad to max question sentence length
padded_questions = pad_sequences(questions_tokenized, maxlen=max_question_length)

print("Padded questions shape:", padded_questions.shape)

Max question length: 21
Padded questions shape: (58832, 21)


Load pre-trained GloVe embedding

In [20]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

path_to_glove_file = os.path.join(cwd,'glove.6B.100d.txt')

embeddings_index = {}
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found", len(embeddings_index), "word vectors.")

--2021-01-06 22:01:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-01-06 22:01:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-01-06 22:01:14--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [21]:
num_tokens = len(questions_wtoi) + 1
embedding_dim = 100

hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in questions_wtoi.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 4496 words (144 misses)


Create question encoder

In [22]:
embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
    input_length=max_question_length
)

question_encoder = tf.keras.models.Sequential()
question_encoder.add(tf.keras.layers.Input(shape=(max_question_length,), dtype="int64"))
question_encoder.add(embedding_layer)
question_encoder.add(tf.keras.layers.LSTM(units=FEATURES))

question_encoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 21, 100)           464100    
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1255424   
Total params: 1,719,524
Trainable params: 1,255,424
Non-trainable params: 464,100
_________________________________________________________________


# Create complete model

Load indexes for answers

In [23]:
multiplied_features = tf.keras.layers.Multiply()([image_encoder.layers[-1].output, question_encoder.layers[-1].output])
dense_1 = tf.keras.layers.Dense(UNITS, activation='tanh')(multiplied_features)
out = tf.keras.layers.Dense(num_answers, activation='softmax')(dense_1)

network = tf.keras.models.Model(inputs=[image_encoder.layers[0].input, question_encoder.layers[0].input], outputs=out)

network.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 400, 700, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 400, 700, 64) 1792        input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 400, 700, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 200, 350, 64) 0           block1_conv2[0][0]               
______________________________________________________________________________________________

In [24]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
metrics = ['accuracy']
loss = tf.keras.losses.SparseCategoricalCrossentropy()

In [25]:
network.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [26]:
import os
from datetime import datetime

cwd = os.getcwd()

exps_dir = os.path.join('Checkpoints', 'basic_model')
if not os.path.exists(exps_dir):
    os.makedirs(exps_dir)

now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_name = 'exp'

exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))
if not os.path.exists(exp_dir):
    os.makedirs(exp_dir)
    
callbacks = []

# Model checkpoint
# ----------------
ckpt_dir = os.path.join(exp_dir, 'ckpts')
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'), 
                                                   save_weights_only=True, save_best_only=True)  # False to save the model directly
callbacks.append(ckpt_callback)

# Early Stopping
# --------------
early_stop = True
if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    callbacks.append(es_callback)

decay = 0.1
min_lr = 1e-5


# Decay
decay_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=decay, patience=15, verbose=1,
    mode='auto', min_lr=min_lr)
callbacks.append(decay_callback)

# ---------------------------------

# Training

In [None]:
network.fit(x=gen,
            epochs=100,
            steps_per_epoch=len(gen),
            validation_data=gen_val,
            validation_steps=len(gen_val),
            callbacks=callbacks)

Epoch 1/100

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from PIL import Image

path = os.path.join(cwd, 'VQA_Dataset','test_questions.json')
max_num_words = 50000
max_question_length = 21
# preprocessing_function = tf.keras.applications.vgg16.preprocess_input
dat_dir = os.path.join(cwd, 'VQA_Dataset','Images')
# network = None

with open(path) as f:
       dic_test = json.load(f)

dic_test_values = dic_test.values()
test_questions = [q['question'].lower().translate(str.maketrans('', '', '?')) for q in dic_test_values]

test_tokenizer = Tokenizer(num_words=max_num_words)
test_tokenizer.fit_on_texts(test_questions)

test_wtoi = test_tokenizer.word_index

test_tokenized = test_tokenizer.texts_to_sequences(test_questions)

# print(test_tokenized[45])

# max_question_length = max(len(sentence) for sentence in test_tokenized)        

results = dict()

for question_id in dic_test.keys():
    
    temp_dic = dic_test[question_id]
    # print(temp)
    question = temp_dic['question'].lower().translate(str.maketrans('', '', '?'))
    # print(question)
    image_id = dic_test[question_id]['image_id']
    
    img = Image.open(os.path.join(dat_dir, image_id + ".png"))
    img = img.convert('RGB')
    img_arr = np.array(img)
    img_arr = np.expand_dims(img_arr, axis=0)
    img_arr = preprocessing_function(img_arr)
    
    question_tokenized = test_tokenizer.texts_to_sequences([question])
    
    padded_question = pad_sequences(question_tokenized, maxlen=max_question_length)
    # print(padded_question)
    
    result = network.predict([img_arr, padded_question], verbose=0, batch_size=1)
    
    result = tf.argmax(result[0])
    result = int(result)
    itoa = {v: k for k, v in labels_dict.items()}
    
    answer = itoa[result]
    
    print(answer)
    
    results[question_id] = answer
    
    # print(question_id, question, image_id)


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
dog
2
yes
yes
1
no
2
2
yes
yes
no
no
1
no
2
yes
tree
yes
dog
monkey bars
yes
no
2
yes
2
2
dog
3
2
1
no
yes
2
no
dog
dog
cat
yes
no
no
yes
yes
no
yes
2
cat
2
wine
no
yes
yes
2
yes
2
bike
plant
2
dog
yes
yes
5
couch
yes
yes
no
2
no
brown
dog
no
yes
dog
yes
2
yes
no
dog
yes
no
dog
dog
yes
red
2
yes
no
yes
yes
no
gray
yes
book
green
dog
dog
yes
cat
yes
dog
rug
yes
football
yes
book
football
no
2
nothing
football
cat
no
green
2
no
no
tree
boy
3
yes
2
no
no
no
2
1
brown
2
yes
no
rug
yes
3
no
no
grass
yes
no
woman
yes
yes
no
2
yes
2
no
cat
2
yes
dog
yes
no
stool
no
yes
yes
no
sandbox
no
2
grass
baseball
no
dog
no
yes
yes
2
yes
no
yes
stool
2
no
yellow
no
yes
no
2
bird
cat
monkey bars
3
grass
yes
bench
dog
no
yes
yes
gray
wine
no
yes
no
2
no
2
food
cat
dog
yes
cat
no
no
no
yes
yes
food
3
bird
yes
yes
yes
no
no
rug
sitting
yes
1
yes
yes
2
wine
no
2
yes
2
grass
1
yes
no
yes
2
rug
no
no
football
yes
nothing
tree
no
2
yes
yes
yes
no
y

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):

    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in results.items():
            f.write(key + ',' + str(value) + '\n')

In [28]:
create_csv(results, results_dir='/content/drive/MyDrive/')