In [1]:
import sys
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from os import listdir
from PIL import Image
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.applications import imagenet_utils
from keras.applications import VGG16

Using TensorFlow backend.


In [2]:
def load_doc(filename):
    """Read a text file and return its entire contents as a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full contents of the file as a ``str``.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version opened the file and never closed it.
    with open(filename, 'r') as file:
        return file.read()

In [5]:
# Collect the photo identifiers listed in a dataset split file
def load_set(filename):
    """Return the image identifiers named in ``filename``.

    The file is read with ``load_doc`` and split on newlines. Empty lines are
    ignored, and for every remaining line the identifier is the text that
    precedes its first '.'.

    Args:
        filename: Path of the split file listing one image name per line.

    Returns:
        A list of identifiers, in the order they appear in the file.
    """
    contents = load_doc(filename)
    # Drop blank lines, then keep only the part before the first dot
    identifiers = [
        row.split('.')[0]
        for row in contents.split('\n')
        if len(row) >= 1
    ]
    return identifiers

In [6]:
# Read the identifiers of the training split (6K images)
train_text_filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(train_text_filename)
train_count = len(train)
print('Train-Dataset: %d' % train_count)

Train-Dataset: 6000


In [10]:
# Build the '/'-joined path of every entry found in the image directory
images_directory = 'Flickr8k_Dataset'
image_names = [
    images_directory + '/' + name
    for name in listdir(images_directory)
]

In [12]:
# Keep only the image paths whose identifier is listed in the training split.
# Membership is tested against a set, so each lookup is a hash probe rather
# than a linear scan of the `train` list for every image path.
train_ids = set(train)
train_names = [
    path
    for path in image_names
    # identifier = last '/'-separated component, up to its first '.'
    if path.split('/')[-1].split('.')[0] in train_ids
]
print(len(train_names))

6000


In [16]:
# Load the identifiers of the test split
test_text_filename = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(test_text_filename)
print('Test-Dataset: %d' % len(test),'\n')

# Keep only the image paths whose identifier is listed in the test split.
# Membership is tested against a set, so each lookup is a hash probe rather
# than a linear scan of the `test` list for every image path.
test_ids = set(test)
test_names = [
    path
    for path in image_names
    # identifier = last '/'-separated component, up to its first '.'
    if path.split('/')[-1].split('.')[0] in test_ids
]

Test-Dataset: 1000 



In [17]:
def preprocess(image_path, target_size=(224, 224)):
    """Load an image from disk and prepare it as a one-image batch for VGG16.

    Args:
        image_path: Path of the image file to load.
        target_size: (height, width) the image is resized to while loading.
            Defaults to (224, 224), the size this notebook feeds to VGG16.

    Returns:
        A numpy array with a leading batch axis of length 1, preprocessed
        with the 'caffe' mode of ``imagenet_utils.preprocess_input``.
    """
    # Resize while loading so every image matches the model input size
    img = load_img(image_path, target_size=target_size)
    # Convert PIL image to numpy array of 3-dimensions
    x = img_to_array(img)
    # Add the batch dimension expected by model.predict
    x = np.expand_dims(x, axis=0)
    # The bare `preprocess_input` imported at the top of the notebook comes
    # from the inception_v3 module, which does not match the VGG16 network
    # used here. VGG16 uses the 'caffe' preprocessing mode instead.
    # NOTE: features cached with the previous preprocessing (the *.pkl files)
    # must be regenerated after this change.
    x = imagenet_utils.preprocess_input(x, mode='caffe')
    return x

In [18]:
# Load the VGG16 model with its ImageNet weights
model = VGG16(weights='imagenet')

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
# Create a new model that maps the VGG16 input to the output of its
# second-to-last layer, i.e. the network without its final (output) layer
model_new = Model(model.input, model.layers[-2].output)

In [20]:
# Function to encode a given image into a 1-D feature vector
def encode(image):
    """Return the ``model_new`` feature vector for the image at path ``image``.

    The image is prepared with ``preprocess`` and passed through
    ``model_new``; the single-row prediction is flattened to one dimension
    (4096 values in the recorded run of this notebook).

    Args:
        image: Path of the image file to encode.

    Returns:
        A 1-D numpy array holding the feature vector.
    """
    batch = preprocess(image)  # one-image batch ready for the network
    features = model_new.predict(batch, verbose=0)
    # Drop the batch axis: (1, n) -> (n, )
    return features.reshape(features.shape[1])

In [21]:
# Call the function to encode all the train images
start = time()
encoding_train = {}
# Report progress against the real number of images instead of a fixed 6000
total_train = len(train_names)
for count, img in enumerate(train_names, start=1):
    # Key each feature vector by the image file name (last path component)
    encoding_train[img.split('/')[-1]] = encode(img)
    # '\r' rewrites the same output line on every iteration
    sys.stdout.write('\r'+'Images encoded : '+str(count)+'/'+str(total_train))
    sys.stdout.flush()
# Leading newline keeps the timing off the progress line, matching the
# test-image encoding cell
print("\nTime taken in seconds =", time()-start)


Images encoded : 6000/6000Time taken in seconds = 235.5976688861847


In [24]:
# Persist the train image feature vectors to disk as a pickle
with open("encoded_train_images.pkl", "wb") as train_pickle_file:
    dump(encoding_train, train_pickle_file)

In [25]:
# Report the length of the first stored feature vector
first_train_features = list(encoding_train.values())[0]
print('output shape of the image features are: ', len(first_train_features))

output shape of the image features are:  4096


In [26]:
# Call the function to encode all the test images - Execute this only once
start = time()
encoding_test = {}
# Report progress against the real number of images instead of a fixed 1000
total_test = len(test_names)
for count, img in enumerate(test_names, start=1):
    # Key each feature vector by the image file name (last path component)
    encoding_test[img.split('/')[-1]] = encode(img)
    # '\r' rewrites the same output line on every iteration
    sys.stdout.write('\r'+'Images encoded : '+str(count)+'/'+str(total_test))
    sys.stdout.flush()
print("\nTime taken in seconds =", time()-start)

Images encoded : 1000/1000
Time taken in seconds = 38.65292692184448


In [28]:
# Persist the test image feature vectors to disk as a pickle
with open("encoded_test_images.pkl", "wb") as test_pickle_file:
    dump(encoding_test, test_pickle_file)