This tutorial/handson session is designed based on the Visual Turing Test tutorial in here:
https://github.com/mateuszmalinowski/visual_turing_test-tutorial
Visual Turing Challenge
Mateusz Malinowski and Mario Fritz
Max-Plank Institute

Mehdi Ghanimifard

In [4]:
#import os
#os.environ["CUDA_DEVICE_ORDER"]= "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]= "0"

import numpy as np
#from keras.applications.resnet50 import ResNet50
from keras.applications.imagenet_utils import preprocess_input
from keras.preprocessing import image as kimage

from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Embedding, Concatenate, Dropout
from keras.layers import Input
from keras.callbacks import EarlyStopping
import keras.backend as K


import glob

from random import shuffle

The following cell is to show how Mehdi's code from the tutorial would work. In my case, my image model predicts classes (angry/happy), and not features. My bottleneck_fc_model.h5 predicts features, but these are actually already loaded in bottleneck_features_train and validation.npy (from run of eslp_image.py), which I load in the next cell.

In [110]:
# Loads my image_model (the one that predicts features, not classes) 

#pretrained_cnn_model = load_model('2bottleneck_fc_model.h5')
#pretrained_cnn_model = load_model('2bottleneck_fc_model.h5')

# a function (from image file path to feature vectors)
# From Mehdi's tutorial

def img2vec(image_path):
    x = kimage.load_img(image_path, target_size=[48,48])#from 224,224
    x_array = kimage.img_to_array(x)
    xs_array = np.array([x_array,])
    # notice that we are not using full capacity of the GPU when we are passing only one image per prediction.
    # we could have a larger batch.
    return pretrained_cnn_model.predict(preprocess_input(xs_array)).flatten()

#use model to extract features from .png format pictures with our function img2vec: 

#angry_images=[]
#happy_images=[]

#path = "pngs/happy/output*.png"
#for png in glob.glob(path):
#    happy_images.append(img2vec(png))
#print('Happy faces:   ', len(happy_images))  

#path = "pngs/angry/output*.png"
#for png in glob.glob(path):
#    angry_images.append(img2vec(png))
#print('Angry faces:   ', len(angry_images))

In [None]:
#angry_images = [
#    img2vec("pngs/angry/{0}.png".format(image_name.strip()))
#    for image_name in open('pngs/angry/{0}.png')
#]

In [None]:
#np.save("angry_images.npy", angry_images)
#np.save("happy_images.npy", happy_images)

In [111]:
b_train=np.load('bottleneck_features_train.npy')
b_val=np.load('bottleneck_features_validation.npy')

In [112]:
b_train.shape

(7904, 1, 1, 512)

In [113]:
b_val.shape

(1952, 1, 1, 512)

In [114]:
#these features are half angry, half happy, so I save them accordingly:
angry_images_train=b_train[:int((7904/2))]
angry_images_test=b_val[:int((1952/2))]
happy_images_train=b_train[int((7904/2)):]
happy_images_test=b_val[int((1952/2)):]

In [116]:
angry_images_train.shape


(3952, 1, 1, 512)

In [128]:
#Load sentences (from run of language_preprocessing.py): 

happy_sents=np.load('happy_sents.npy')#With sentiment score above 0.6
angry_sents=np.load('angry_sents.npy')#With sentiment score below 0.4

In [118]:
#Splits each category and input in train and test before mapping. 
#When mapping, one image is paired with some sentences, and we dont want to shuffle some anwers into the test set.

#train_angry_images=angry_images[:int(0.8*len(angry_images))]
#test_angry_images=angry_images[int(0.8*len(angry_images)):]

#train_happy_images=happy_images[:int(0.8*len(happy_images))]
#test_happy_images=happy_images[int(0.8*len(happy_images)):]

train_angry_sents=angry_sents[:int(0.8*len(angry_sents))]
test_angry_sents=angry_sents[int(0.8*len(angry_sents)):]

train_happy_sents=happy_sents[:int(0.8*len(happy_sents))]
test_happy_sents=happy_sents[int(0.8*len(happy_sents)):]

In [16]:
#Maps each image with with the chosen sentences. 
#Every image is mapped with more than one sentence, to grow the data size

#Angry face with negative sentences
#happy face with positive sentences
#Saves the sentiment category as 'NEG' or 'POS'


train=[]
test=[]

sent_count=0
for image in angry_images_train:    
    for step in range(6):
        train.append((image[0][0], train_angry_sents[sent_count][0], 'NEG'))
        sent_count=+1
sent_count=0
for image in happy_images_train:    
    for step in range(6):
        train.append((image[0][0], train_happy_sents[sent_count][0], 'POS'))
        sent_count=+1
        
sent_count=0        
for image in happy_images_test:
    for step in range(5):
        test.append((image[0][0], test_happy_sents[sent_count][0], 'POS'))
        sent_count=+1 
sent_count=0        

for image in angry_images_test:
    for step in range(5):
        test.append((image[0][0], test_angry_sents[sent_count][0], 'NEG'))
        sent_count=+1 
    
print('total pairs: ',len(train)+len(test))

total pairs:  57184


In [54]:
#Now we can shuffle in each set:
shuffle(train)
shuffle(test)

In [99]:
xtrain=[(image,sent) for (image,sent,label) in train]
ytrain=[label for (image,sent,label) in train]
xtest=[(image,sent) for (image,sent,label) in test]
ytest=[label for (image,sent,label) in test]

#image_size=len(train[0][0])
#sent_size=len(train[0][1])
#labels=set(list(l for (i,s,l) in all_data))

xtrain2 = list(zip(*xtrain))
xtrain2 = [np.array(xtrain2[1]), np.array(xtrain2[0])]

xtest2 = list(zip(*xtest))
xtest2 = [np.array(xtest2[1]), np.array(xtest2[0])]#[sents, images]

In [100]:
cat_codes = {
    'POS': 1,
    'NEG': 0
}

In [101]:
ytrain2 = np.array([cat_codes[c] for c in ytrain])
ytest2 = np.array([cat_codes[c] for c in ytest])

In [25]:
np.save("xtrain0.npy", xtrain2[0])#sents
np.save("xtrain1.npy", xtrain2[1])#images
np.save("ytrain.npy", ytrain2)#label
np.save("xtest0.npy", xtest2[0])#
np.save("xtest1.npy", xtest2[1])#
np.save("ytest.npy", ytest2)#label

In [62]:
image_size=len(xtrain2[1][0])
sent_size=len(xtrain2[0][0])
vocab = list(np.load('vocab.npy'))
labels = {'POS', 'NEG'}

In [63]:
#xtrain = [np.load('xtrain0.npy'), np.load('xtrain1.npy')]
#ytrain = np.load('ytrain.npy')
#xtest = [np.load('xtest0.npy'), np.load('xtest1.npy')]
#ytest = np.load('ytest.npy')



In [64]:
xtrain2[0]

array([[    0,     0,     0, ...,  3413,  5532,  8426],
       [    0,     0,     0, ...,  3413,  5532,  8426],
       [    0,     0,     0, ...,  8373, 17064, 14872],
       ...,
       [    0,     0,     0, ...,  3413,  5532,  8426],
       [    0,     0,     0, ...,  8373, 17064, 14872],
       [    0,     0,     0, ...,  3413,  5532,  8426]])

In [65]:
xtrain2[1]

array([[0.        , 0.        , 0.9062693 , ..., 0.4142326 , 0.5021698 ,
        0.00232075],
       [0.82107186, 0.        , 1.0568473 , ..., 0.        , 0.27432036,
        0.        ],
       [0.908764  , 0.        , 0.        , ..., 0.        , 0.96987593,
        0.        ],
       ...,
       [0.74720246, 0.        , 0.        , ..., 0.80306214, 0.3244789 ,
        0.        ],
       [0.9740577 , 0.        , 1.325321  , ..., 0.        , 0.        ,
        0.1857352 ],
       [0.37611964, 0.        , 0.        , ..., 0.2590153 , 0.12551779,
        0.        ]], dtype=float32)

In [66]:
image_size

512

In [67]:
sent_size

56

In [68]:
input_question = Input([sent_size,])
input_context = Input([image_size,])

# learn embedings (size=50 as we chose just now :D)
q_embs = Embedding(len(vocab), 50)(input_question)

# encode the question
q_encoded = LSTM(50)(q_embs)

mlp_1 = Dense(image_size, activation='tanh')(q_encoded)

q_composed = Concatenate()([input_context, mlp_1])

mlp_2 = Dropout(0.2)(Dense(image_size, activation='relu')(q_composed))
#mlp_2 = Dropout(0.2)(Dense(image_size, activation='relu')(mlp_1))

final_a = Dense(len(labels), activation='softmax')(mlp_2)

model = Model([input_question, input_context], final_a)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 56)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 56, 50)       905600      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 50)           20200       embedding_3[0][0]                
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 512)          0                                            
__________________________________________________________________________________________________
dense_7 (D

In [69]:

model.compile('adam', 'sparse_categorical_crossentropy', ['accuracy'])

In [None]:
history=model.fit(xtrain2, ytrain2, epochs=50, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=5)])

In [71]:
model.save('combi_model.h5')  # creates a HDF5 file 'my_model.h5'


In [72]:
print(history.history)
modelfile = open('./combihistory.txt',"w")
modelfile.write(str(history.history))
modelfile.close()

{'val_loss': [1.1920928244535389e-07, 1.1920928244535389e-07, 1.1920928244535389e-07, 1.1920928244535389e-07, 1.1920928244535389e-07, 1.1920928244535389e-07], 'val_acc': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'loss': [0.06765512260154777, 1.1920928243636347e-07, 1.1920928243636347e-07, 1.1920928243636347e-07, 1.1920928243636347e-07, 1.1920928243636347e-07], 'acc': [0.9666807254323071, 1.0, 1.0, 1.0, 1.0, 1.0]}


In [74]:
import matplotlib.pyplot as plt

In [75]:
### summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.savefig('combiloss.pdf')

In [76]:
### summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.savefig('combiaccuracy.pdf')

In [126]:
model.evaluate(xtest2, ytest2)



[8.059047769327632, 0.5]

In [127]:
scores = model.evaluate(xtest2, ytest2, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))



Accuracy: 50.00%
