In [1]:
import glob
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from time import time
from keras.preprocessing import image
import numpy as np
from keras.applications.inception_v3 import preprocess_input
import pickle as p
from pickle import dump, load

Using TensorFlow backend.


In [2]:
# Directory holding the Flickr8k JPEGs. The trailing slash matters: paths are built and
# later stripped by plain string concatenation/slicing on this prefix.
images = '../../datasets/flickr8k/Flicker8k_Dataset/'
# Text files listing the image file names of the train and test splits (one name per line).
train_images_file = '../../datasets/flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
test_images_file = '../../datasets/flickr8k/Flickr8k_text/Flickr_8k.testImages.txt'
# Paths of every .jpg directly inside the image directory.
img = glob.glob(images + '*.jpg')

In [3]:
def load_descriptions(filename):
    """Read a captions file and group the captions by image identifier.

    Each usable line is split on whitespace. The first token is treated as the
    image file name; everything from its first '.' onward is dropped to form
    the identifier. The remaining tokens, re-joined with single spaces, form
    one caption.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping image identifier -> list of caption strings, in the
        order they appear in the file.
    """
    # Context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        doc = file.read()

    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip empty / one-character lines (original rule) and also
        # whitespace-only lines, which used to pass the length check and
        # then crash with IndexError on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

In [4]:
def preprocess(image_path):
    """Load an image file and turn it into a single-item batch for InceptionV3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The array produced by the inception module's preprocess_input(),
        with a leading batch axis of length 1.
    """
    # InceptionV3 expects 299x299 inputs, so resize while loading.
    pil_image = image.load_img(image_path, target_size=(299, 299))
    # PIL image -> 3-D numpy array, then prepend the batch axis.
    batch = image.img_to_array(pil_image)[np.newaxis, ...]
    # Apply the preprocessing that ships with the inception module.
    return preprocess_input(batch)

In [5]:
def encode(image):
    """Compute the feature vector of one image with `model_new`.

    Args:
        image: Path of the image file (handed straight to preprocess()).

    Returns:
        1-D array obtained by dropping the batch axis of the model output,
        e.g. (1, 2048) -> (2048,).
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    # Flatten the single-item batch into a plain vector.
    return np.reshape(features, (features.shape[1],))

In [6]:
def to_lines(descriptions):
    """Flatten a mapping of caption lists into one flat list of captions.

    Args:
        descriptions: dict mapping a key to a list of caption strings.

    Returns:
        A new list with every caption, in dict iteration order and, within
        each key, in list order.
    """
    # A flattening comprehension replaces the former comprehension that was
    # run only for its append() side effects.
    return [desc for captions in descriptions.values() for desc in captions]

In [7]:
# calculate the length of the description with the most words
def max_length(descriptions):
    """Return the word count of the longest caption.

    Args:
        descriptions: dict mapping a key to a list of caption strings.

    Returns:
        The largest number of whitespace-separated words found in any
        caption, or 0 when there are no captions at all (instead of the
        ValueError that max() raises on an empty sequence).
    """
    return max(
        (len(caption.split())
         for captions in descriptions.values()
         for caption in captions),
        default=0,
    )

### Get full path to all train images

In [8]:
# File names that belong to the training split (one per line in the split file).
# The context manager makes sure the handle is closed after reading.
with open(train_images_file, 'r') as split_file:
    train_images = set(split_file.read().strip().split('\n'))

# Keep the full paths whose file name (path minus the `images` prefix) is in the split.
train_img = [path for path in img if path[len(images):] in train_images]

### Get full path to all test images

In [9]:
# File names that belong to the test split (one per line in the split file).
# The context manager makes sure the handle is closed after reading.
with open(test_images_file, 'r') as split_file:
    test_images = set(split_file.read().strip().split('\n'))

# Keep the full paths whose file name (path minus the `images` prefix) is in the split.
test_img = [path for path in img if path[len(images):] in test_images]

In [10]:
# Captions of the training images, grouped per image id by load_descriptions().
# The printed number is the count of distinct image ids, not of captions.
train_descriptions = load_descriptions('train_captions.txt')
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


In [11]:
# Captions of the test images, grouped per image id by load_descriptions().
# The printed number is the count of distinct image ids, not of captions.
test_descriptions = load_descriptions('test_captions.txt')
print('Descriptions: test=%d' % len(test_descriptions))

Descriptions: test=1000


## Load InceptionV3

In [12]:
# Instantiate InceptionV3 with the 'imagenet' weights.
model = InceptionV3(weights='imagenet')

In [13]:
# Same input as `model`, but the output is taken from its second-to-last layer.
# encode() uses this model and treats the result as a (1, 2048) feature array.
model_new = Model(model.input, model.layers[-2].output)

## Extract feature vector of all train images

In [14]:
# Encode every training image; keys are the file names (path minus the `images` prefix).
start = time()
encoding_train = {}
# The loop variable is deliberately not called `img`: that name holds the list of
# all image paths, and overwriting it would break any re-run of the cells that
# select the train/test paths from it.
for img_path in train_img:
    encoding_train[img_path[len(images):]] = encode(img_path)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 754.1098401546478


In [15]:

# Save the bottleneck train features to disk:
# pickles the whole `encoding_train` dict; "wb" overwrites any existing file.
with open("./encoded_train_images.pkl", "wb") as encoded_pickle:
    p.dump(encoding_train, encoded_pickle)

## Extract feature vector of all test images

In [16]:
# Call the function to encode all the test images - Execute this only once
# Keys are the file names (path minus the `images` prefix).
start = time()
encoding_test = {}
# The loop variable is deliberately not called `img`: that name holds the list of
# all image paths, and overwriting it would break any re-run of the cells that
# select the train/test paths from it.
for img_path in test_img:
    encoding_test[img_path[len(images):]] = encode(img_path)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 129.94025707244873


In [17]:
# Save the bottleneck test features to disk:
# pickles the whole `encoding_test` dict; "wb" overwrites any existing file.
with open("./encoded_test_images.pkl", "wb") as encoded_pickle:
    p.dump(encoding_test, encoded_pickle)

# Build vocab

In [18]:
# Reload the training features written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open("./encoded_train_images.pkl", "rb") as encoded_pickle:
    train_features = load(encoded_pickle)
print('Photos: train=%d' % len(train_features))

Photos: train=6000


In [19]:
# Create a list of all the training captions.
# to_lines() already performs exactly this flattening, so reuse it instead of
# repeating the nested loop (which also carried an unused `key` variable).
all_train_captions = to_lines(train_descriptions)
len(all_train_captions)

30000

In [20]:
# # Consider only words which occur at least 10 times in the corpus
# word_count_threshold = 10
# word_counts = {}
# nsents = 0
# for sent in all_train_captions:
#     nsents += 1
#     for w in sent.split(' '):
#         word_counts[w] = word_counts.get(w, 0) + 1

# vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
# print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

In [21]:
# Collect the distinct tokens of all training captions (split on single spaces).
vocab = set()
for sent in all_train_captions:
    vocab.update(sent.split(' '))
# Sort instead of list(): iteration order of a set of strings changes between
# interpreter sessions (string hash randomization), which would give every
# word a different index on each fresh run of the notebook.
vocab = sorted(vocab)

## Build ixtoword and wordtoix

In [22]:
# Index <-> word lookup tables. Indices start at 1, so index 0 is never
# assigned to a word.
ixtoword = dict(enumerate(vocab, start=1))
wordtoix = {word: index for index, word in ixtoword.items()}

In [23]:
# Persist the word -> index table; "wb" overwrites any existing file.
with open("./wordtoix.pkl", "wb") as encoded_pickle:
    p.dump(wordtoix, encoded_pickle)

In [24]:
# Persist the index -> word table; "wb" overwrites any existing file.
with open("./ixtoword.pkl", "wb") as encoded_pickle:
    p.dump(ixtoword, encoded_pickle)

## Calculate max_descr_length and vocab_size

In [25]:
# Longest training caption, in whitespace-separated words.
# Computed inline rather than by calling the max_length() helper: this cell binds
# the name `max_length` to an int, so calling the helper here would fail with
# "'int' object is not callable" whenever the cell is run a second time.
max_length = max(len(d.split()) for d in to_lines(train_descriptions))
print('Description Length: %d' % max_length)

Description Length: 34


In [26]:
# Word indices start at 1, so the extra slot accounts for index 0
# (presumably the padding value - confirm against the sequence-padding code).
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

7579