This tutorial/hands-on session is based on the Visual Turing Test tutorial available here:
https://github.com/mateuszmalinowski/visual_turing_test-tutorial
Visual Turing Challenge
Mateusz Malinowski and Mario Fritz
Max Planck Institute

Mehdi Ghanimifard

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]= "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0"

import numpy as np
#from keras.applications.resnet50 import ResNet50
from keras.applications.imagenet_utils import preprocess_input
from keras.preprocessing import image as kimage

Using TensorFlow backend.


In [2]:
from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Embedding, Concatenate, Dropout
from keras.layers import Input
from keras.callbacks import EarlyStopping
import keras.backend as K


In [None]:
import glob

In [None]:
# Load the saved image model from 'image_model.h5'; img2vec calls its predict() to turn an image into a feature vector.
pretrained_cnn_model = load_model('image_model.h5')


In [None]:
# We can turn this into a function (from a file path to a feature vector).
# This function does not have ideal performance, but it is useful for the sake of this tutorial (mehdi)

def img2vec(image_path):
    """Return the flattened output of `pretrained_cnn_model` for one image file.

    The image at `image_path` is loaded at 48x48, converted to an array,
    wrapped in a batch of one, run through `preprocess_input` and passed to
    the model's `predict`.
    """
    # Load and resize the picture (the size was changed from 224x224).
    loaded = kimage.load_img(image_path, target_size=[48,48])
    # Wrap the single image array in an outer array so it forms a batch of one.
    batch = np.array([kimage.img_to_array(loaded)])
    # NOTE: one image per predict() call does not use the full capacity of
    # the GPU; a larger batch could be passed instead.
    features = pretrained_cnn_model.predict(preprocess_input(batch))
    return features.flatten()

In [None]:
# Run img2vec over every matching .png picture to get one feature vector per image.
# NOTE(review): glob.glob does not promise a sorted order, so the image order
# (and hence the later train/test split) may vary between machines - confirm
# whether that matters.

path = "pngs/happy/output*.png"
happy_images = [img2vec(png) for png in glob.glob(path)]
print('Happy faces:   ', len(happy_images))

path = "pngs/angry/output*.png"
angry_images = [img2vec(png) for png in glob.glob(path)]
print('Angry faces:   ', len(angry_images))
    

In [5]:
# Load the saved sentence arrays (the original note says they come from 'prerocessing.py' - presumably preprocessing.py; confirm the file name).
happy_sents=np.load('happy_sents.npy')# sentences with sentiment score above 0.6
angry_sents=np.load('angry_sents.npy')# sentences with sentiment score below 0.4

In [None]:
# Split every category (images and sentences) into train and test portions
# BEFORE pairing images with sentences. One image is paired with several
# sentences, so splitting after pairing could leak an image (or its answers)
# into the test set.
def _split_train_test(items, train_fraction=0.8):
    """Return (train, test): the first `train_fraction` of `items` and the rest."""
    cut = int(train_fraction * len(items))
    # The test part must be the tail [cut:], not the head again, otherwise
    # train and test would share the same entries.
    return items[:cut], items[cut:]


train_angry_images, test_angry_images = _split_train_test(angry_images)
train_happy_images, test_happy_images = _split_train_test(happy_images)

train_angry_sents, test_angry_sents = _split_train_test(angry_sents)
train_happy_sents, test_happy_sents = _split_train_test(happy_sents)

In [None]:
#np.save("angry_images.npy", angry_images)
#np.save("happy_images.npy", happy_images)

In [None]:
#angry_images = [
#    img2vec("pngs/angry/{0}.png".format(image_name.strip()))
#    for image_name in open('pngs/angry/{0}.png')
#]

In [None]:
# this code takes time:
#X_images = [
#    img2vec("data/daquar/images/{0}.png".format(image_name.strip()))
#    for image_name in open('data/daquar/qa.894.raw.train.format_triple.contexts')
#]

In [None]:
# Pair each image with a run of consecutive sentences from the same category.
def _pair_images_with_sentences(images, sents, sents_per_image, label):
    """Build (image, sentence, label) triples.

    Each image in `images` consumes the next `sents_per_image` entries of
    `sents`; only element [0] of each entry is kept as the sentence.

    Raises IndexError if `sents` holds fewer than
    `sents_per_image * len(images)` entries.
    """
    pairs = []
    sent_count = 0
    for image in images:
        for _ in range(sents_per_image):
            pairs.append((image, sents[sent_count][0], label))
            # Advance to the next sentence (must be `+=`; `=+1` would pin the
            # index at 1 and reuse the same sentence for nearly every pair).
            sent_count += 1
    return pairs


# Training images get 6 sentences each, test images 5 each.
train = (
    _pair_images_with_sentences(train_angry_images, train_angry_sents, 6, 'NEG')
    + _pair_images_with_sentences(train_happy_images, train_happy_sents, 6, 'POS')
)
test = (
    _pair_images_with_sentences(test_happy_images, test_happy_sents, 5, 'POS')
    + _pair_images_with_sentences(test_angry_images, test_angry_sents, 5, 'NEG')
)

print('total pairs: ',len(train)+len(test))

In [None]:
# `shuffle` is not imported by any earlier cell, so bring it into scope here.
# random.shuffle reorders the list in place, which is what the bare calls
# below rely on (their return value is not used).
from random import shuffle

# Mix the NEG and POS pairs, which were appended category by category.
shuffle(train)
shuffle(test)

In [None]:
#print('data points: ',len(all_data))

In [None]:
#shuffle(all_data)

In [None]:
#s=int((len(all_data)*0.8))#splits with 80% training, 20% test
#train, test=all_data[:s], all_data[s:]

In [None]:
# Separate the model inputs (image, sentence) from the targets.
xtrain=[(image,sent) for (image,sent,label) in train]
xtest=[(image,sent) for (image,sent,label) in test]

# The model is compiled with 'sparse_categorical_crossentropy', which needs
# integer class indices, so encode the string labels as a numpy integer array
# (a plain Python list of 'POS'/'NEG' strings is not a valid target).
label_to_index = {'NEG': 0, 'POS': 1}
ytrain = np.array([label_to_index[label] for (image,sent,label) in train])
ytest = np.array([label_to_index[label] for (image,sent,label) in test])

# Input sizes, read off the first training triple.
image_size=len(train[0][0])
sent_size=len(train[0][1])
#labels=set(list(l for (i,s,l) in all_data))

# zip(*pairs) gives (all images, all sentences); store them as
# [sentences, images] to match the order of the model's inputs.
xtrain2 = list(zip(*xtrain))
xtrain2 = [np.array(xtrain2[1]), np.array(xtrain2[0])]

xtest2 = list(zip(*xtest))
xtest2 = [np.array(xtest2[1]), np.array(xtest2[0])]

In [None]:
# Vocabulary loaded from 'vocab.npy'; len(vocab) is the first argument of the Embedding layer below.
vocab = list(np.load('vocab.npy'))
#image_size, sent_size = list(np.load('parameters.npy'))
# The two class labels; len(labels) sets the width of the final softmax layer.
labels = {'POS', 'NEG'}

In [None]:
#xtrain = [np.load('xtrain0.npy'), np.load('xtrain1.npy')]
#ytrain = np.load('ytrain.npy')
#xtest = [np.load('xtest0.npy'), np.load('xtest1.npy')]
#ytest = np.load('ytest.npy')



In [None]:
# Lengths of the image feature vector and of a sentence, taken from the
# first training triple.
image_size = len(train[0][0])
sent_size = len(train[0][1])
#scores=list(l for (i,s,l) in all_data)

# Unzip the (image, sentence) pairs into two parallel sequences and store
# them as [sentences, images].
train_images, train_sents = zip(*xtrain)
xtrain2 = [np.array(train_sents), np.array(train_images)]

test_images, test_sents = zip(*xtest)
xtest2 = [np.array(test_sents), np.array(test_images)]

In [None]:
# Two inputs: the sentence (length sent_size) and the image feature vector
# (length image_size).
input_question = Input(shape=(sent_size,))
input_context = Input(shape=(image_size,))

# Sentence branch: learned embeddings (size=50), encoded by an LSTM (50
# units), then projected to image_size with a tanh Dense layer.
q_embs = Embedding(len(vocab), 50)(input_question)
q_encoded = LSTM(50)(q_embs)
mlp_1 = Dense(image_size, activation='tanh')(q_encoded)

# Join the image features with the projected sentence encoding.
q_composed = Concatenate()([input_context, mlp_1])

# Hidden relu layer followed by dropout (0.2).
hidden = Dense(image_size, activation='relu')(q_composed)
mlp_2 = Dropout(0.2)(hidden)
#mlp_2 = Dropout(0.2)(Dense(image_size, activation='relu')(mlp_1))

# Softmax output with one unit per label.
final_a = Dense(len(labels), activation='softmax')(mlp_2)

model = Model([input_question, input_context], final_a)
model.summary()

In [None]:

# Configure training: 'adam' optimizer with the 'sparse_categorical_crossentropy' loss.
model.compile('adam', 'sparse_categorical_crossentropy')

In [None]:
# Train for up to 100 epochs in batches of 32, with 10% of the training data held out
# for validation; the EarlyStopping(patience=5) callback may end training sooner.
model.fit(xtrain2, ytrain, epochs=100, batch_size=32, validation_split=0.1, callbacks=[EarlyStopping(patience=5)])

In [None]:
model.save_weights('combi_model.h5')  # writes the model's weights to 'combi_model.h5'


In [None]:
# Evaluate the model on the test inputs (xtest2) against the test labels (ytest).
model.evaluate(xtest2, ytest)

In [None]:
# Predict for the first test example only. xtest2 holds the [sentences, images]
# arrays in the same order as the model's inputs, so take one row from each.
# (xtest itself is a list of (image, sentence) tuples and cannot be sliced
# this way.)
predictions = model.predict([xtest2[0][:1], xtest2[1][:1]])
#print('answer predictions', predictions)