In [None]:
import os
# Enumerate GPUs by PCI bus ID so that the index used below refers to a
# stable, hardware-defined ordering.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# Expose only GPU 0 to this process. These variables are set before the Keras
# imports below so the backend sees them when it initialises.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from tqdm import tqdm
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Merge, Activation, Flatten
from keras.layers.wrappers import Bidirectional
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
import nltk

import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter, OrderedDict
import json

## 1. Load Flickr8k dataset

In [None]:
caption_file = 'Flickr8k_text/Flickr8k.token.txt'

# Map each image file name to the list of its lower-cased captions.
# Every non-empty line is expected to hold two tab-separated fields: an image
# tag whose last two characters are a caption index suffix, and the caption.
img2captions = {}
with open(caption_file) as token_file:  # `with` guarantees the handle is closed
    for row in token_file:
        row = row.strip()
        if not row:
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        row = row.split('\t')
        img = row[0][:-2]  # drop the two-character caption-index suffix
        cap = row[1].lower()
        img2captions.setdefault(img, []).append(cap)

In [None]:
# Sanity check: display the captions collected for one known image.
img2captions['1000268201_693b08cb0e.jpg']

In [None]:
images_dir = 'Flickr8k_Dataset/'


def _read_image_list(path):
    """Return the whitespace-stripped lines of the split file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the list has been built instead of being left open.
    """
    with open(path) as split_file:
        return [line.strip() for line in split_file]


# Each split file lists one image file name per line.
train_images_file = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train_imgs = _read_image_list(train_images_file)
print(len(train_imgs), train_imgs[:3])

val_images_file = 'Flickr8k_text/Flickr_8k.devImages.txt'
val_imgs = _read_image_list(val_images_file)
print(len(val_imgs), val_imgs[:3])

test_images_file = 'Flickr8k_text/Flickr_8k.testImages.txt'
test_imgs = _read_image_list(test_images_file)
print(len(test_imgs), test_imgs[:3])

In [None]:
# Show the first training image together with all of its reference captions,
# one caption per line.
img = train_imgs[0]
sample_image = Image.open(images_dir + '/' + img)
plt.imshow(sample_image)
print(*img2captions[img], sep='\n')

## 2. Build vocabulary

In [None]:
# Worked example: how Counter accumulates token frequencies and how the
# (word, count) pairs become a frequency-sorted word list.
counter = Counter()
counter.update(["aaa", "bbb", "aaa"])
counter.update(["aaa", "ccc"])
counter.update(["ccc"])
print(len(counter))   # number of distinct words
print(counter)

counts = list(counter.items())
print(counts)         # first-seen order
counts.sort(key=lambda x: x[1], reverse=True)
print(counts)         # most frequent first; the sort is stable for ties
# Write through a context manager so the JSON is flushed and the handle is
# closed deterministically.
with open('counts.json', "w") as counts_file:
    json.dump(counts, counts_file, indent=2)
print(counts)         # unchanged by the dump
words = [w for w, c in counts if c >= 1]   # keep words seen at least once
print(words)

In [None]:
from collections import Counter, OrderedDict
import json

# Count every whitespace-separated token over all captions and record the
# length (in tokens) of the longest caption.
word_counter = Counter()
n_sample = 0
maxlen = 0
for img, captions in img2captions.items():
    for caption in captions:
        n_sample += 1
        tokens = caption.lower().split()
        maxlen = max(maxlen, len(tokens))
        word_counter.update(tokens)
print('number of sample = ' + str(n_sample))
print('max len = ' + str(maxlen))


# (word, count) pairs, most frequent first; ties keep first-seen order.
word_counts = sorted(word_counter.items(), key=lambda x: x[1], reverse=True)
# Context managers make sure the JSON files are flushed and closed.
with open('word_counts.json', "w") as counts_file:
    json.dump(word_counts, counts_file, indent=2)

# The threshold of 1 keeps every word; raise it to drop rare words.
vocab = [w for w, c in word_counts if c >= 1]
start_word = '<start>'
end_word = '<end>'
vocab = [start_word, end_word] + vocab
print('vocabulary size = %d (<start> and <end> included)'%len(vocab))

# NOTE(review): '<start>' receives index 0, which is also the value that
# pad_sequences fills with when no `value` is given (as in data_generator),
# so padded positions are indistinguishable from '<start>'.
word2idx = OrderedDict(zip(vocab, range(len(vocab))))
idx2word = OrderedDict(zip(range(len(vocab)), vocab))
with open('word2idx.json', 'w') as vocab_file:
    json.dump(word2idx, vocab_file, indent=2)

In [None]:
# Tokenisation demo: lower-case a sentence, then split it on whitespace.
caption = 'I am a student .'.lower()
tokens = caption.split()
print(caption, tokens, sep='\n')

## 3. Extract features for images

We will feed these images to InceptionV3 to get the encoded images, so they must be preprocessed the way the network expects: resized to 299×299 and scaled to the range [-1, 1]. The last layer of InceptionV3 is the softmax classifier (a fully connected layer with 1000 outputs), which returns the probability of each class. This layer is dropped so that we get a feature representation of the image instead: we use the output of the layer just before the classifier (2048 values). Hence the shape of the encoded image will be (1, 2048).

In [None]:
def preprocess_input(x):
    """Scale pixel values from the range [0, 255] to the range [-1, 1].

    Parameters
    ----------
    x : array-like
        Pixel data. Integer input (e.g. uint8) is accepted.

    Returns
    -------
    numpy.ndarray
        A new floating-point array holding ``(x / 255 - 0.5) * 2``. A
        floating-point input keeps its dtype; any other dtype is converted
        to float32. The argument itself is never modified.
    """
    x = np.asarray(x)
    # In-place true division raises on integer arrays, and mutating the
    # caller's array makes repeated calls compound the scaling, so all the
    # arithmetic is done on a private floating-point copy.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float32
    x = x.astype(out_dtype)  # astype returns a copy by default
    x /= 255.
    x -= 0.5
    x *= 2.
    return x

In [None]:
def preprocess(image_path):
    """Load the image at `image_path` and return it as a scaled batch of one.

    The image is resized to 299x299 while loading, converted to an array,
    given a leading batch axis and finally passed through
    `preprocess_input`.
    """
    pil_image = image.load_img(image_path, target_size=(299, 299))
    batch = image.img_to_array(pil_image)[np.newaxis, ...]
    return preprocess_input(batch)

In [None]:
# preprocess() scales pixels into [-1, 1]; (1 + x) / 2 maps them back into
# [0, 1] for display, and squeeze() removes the batch axis.
plt.imshow((1 + np.squeeze(preprocess(images_dir + '/' + train_imgs[0])))/2.0)

In [None]:
# Build InceptionV3 with weights read from a local .h5 file; the relative path
# means the file must be in the notebook's working directory.
model = InceptionV3(weights='inception_v3_weights_tf_dim_ordering_tf_kernels.h5')

In [None]:
# Print the layer-by-layer architecture of the full InceptionV3 model.
model.summary()

In [None]:
from keras.models import Model

# Reuse the InceptionV3 input tensor and take the output of the
# second-to-last layer as the image feature.
new_input = model.input
hidden_layer = model.layers[-2].output

# Feature extractor: same input as InceptionV3, but it stops one layer early.
model_new = Model(new_input, hidden_layer)

In [None]:
# Smoke test: run the first training image through the feature extractor.
tryi = model_new.predict(preprocess(images_dir + '/' + train_imgs[0]))

In [None]:
# NOTE(review): `tryi` comes from a batch of one image, so its first axis has
# length 1 and `[:10]` returns the whole array; `tryi[0, :10]` would show the
# first ten feature values instead.
tryi[:10]

In [None]:
def encode(image):
    """Return the feature vector of the image file at path `image`.

    The file is preprocessed, passed through `model_new`, and the resulting
    prediction is reshaped to a 1-D array whose length is the size of the
    prediction's second axis.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    return features.reshape(features.shape[1])

In [None]:
# Encode every training image (file name -> feature vector) and cache the
# mapping on disk so the CNN pass does not have to be repeated.
encoding_train = {
    img: encode(images_dir + '/' + img)
    for img in tqdm(train_imgs)
}
with open("encoded_images_train_inceptionV3.p", "wb") as out_file:
    pickle.dump(encoding_train, out_file)

In [None]:
# Encode every test image (file name -> feature vector) and cache it on disk.
encoding_test = {}
for img in tqdm(test_imgs):
    encoding_test[img] = encode(images_dir + '/' + img)
with open("encoded_images_test_inceptionV3.p", "wb") as encoded_pickle:
    pickle.dump(encoding_test, encoded_pickle)

# Do the same for the validation images: data_generator(split='val') loads
# 'encoded_images_val_inceptionV3.p', so that file has to be produced too.
encoding_val = {}
for img in tqdm(val_imgs):
    encoding_val[img] = encode(images_dir + '/' + img)
with open("encoded_images_val_inceptionV3.p", "wb") as encoded_pickle:
    pickle.dump(encoding_val, encoded_pickle)

In [None]:
# Reload the cached encodings, closing each file once it has been read.
# NOTE: unpickling can execute arbitrary code; only load files that were
# written by this notebook.
with open('encoded_images_train_inceptionV3.p', 'rb') as encoded_pickle:
    encoding_train = pickle.load(encoded_pickle)
with open('encoded_images_test_inceptionV3.p', 'rb') as encoded_pickle:
    encoding_test = pickle.load(encoded_pickle)

## 4. Preprocess the captions
Add the `<start>` and `<end>` tokens to every caption to mark the beginning and the end of the sentence.

In [None]:
# Write a header row followed by one "<image>\t<start> caption <end>" row per
# training caption. The `with` block closes the file even if a lookup in
# img2captions fails part-way through.
with open('flickr8k_train_dataset.txt', 'w') as f:
    f.write("image_id\tcaptions\n")
    for img in train_imgs:
        for cap in img2captions[img]:
            f.write(img + "\t" + "<start> " + cap + " <end>" + "\n")

In [None]:
# Write a header row followed by one "<image>\t<start> caption <end>" row per
# validation caption. The `with` block closes the file even if a lookup in
# img2captions fails part-way through.
with open('flickr8k_val_dataset.txt', 'w') as f:
    f.write("image_id\tcaptions\n")
    for img in val_imgs:
        for cap in img2captions[img]:
            f.write(img + "\t" + "<start> " + cap + " <end>" + "\n")

In [None]:
# Write a header row followed by one "<image>\t<start> caption <end>" row per
# test caption. The `with` block closes the file even if a lookup in
# img2captions fails part-way through.
with open('flickr8k_test_dataset.txt', 'w') as f:
    f.write("image_id\tcaptions\n")
    for img in test_imgs:
        for cap in img2captions[img]:
            f.write(img + "\t" + "<start> " + cap + " <end>" + "\n")

In [None]:
# Read the tab-separated training file back; its header row provides the
# column names 'image_id' and 'captions'.
df = pd.read_csv('flickr8k_train_dataset.txt', delimiter='\t')

In [None]:
# Number of rows, i.e. (image, caption) pairs in the training file.
len(df)

In [None]:
# All training captions as a plain Python list, one entry per row of `df`.
c = list(df['captions'])
len(c)

In [None]:
# Image file names in the same row order as the captions.
imgs = list(df['image_id'])

In [None]:
# Inspect the last caption together with the image it belongs to.
a = c[-1]
a, imgs[-1]

In [None]:
# Show the vocabulary index of every token in the sample caption.
for token in a.split():
    print(token, "=>", word2idx[token])

In [None]:
# A caption with n tokens yields n - 1 (partial caption -> next word)
# samples; add that up over every training caption.
samples_per_epoch = sum(len(cap.split()) - 1 for cap in df['captions'])
print(samples_per_epoch)

## 5. Data generator

In [None]:
# Length every partial caption is padded to by pad_sequences, and the
# input_length of the Embedding layer in the model.
# NOTE(review): this is independent of the `maxlen` measured while building
# the vocabulary — confirm 40 covers the longest caption plus <start>/<end>.
max_len = 40
# Size of the one-hot target vectors and of the softmax output layer.
vocab_size = len(word2idx)
def data_generator(batch_size = 128, split='train'):
    """Endlessly yield next-word prediction batches for one dataset split.

    Reads 'flickr8k_<split>_dataset.txt' and
    'encoded_images_<split>_inceptionV3.p', shuffles the caption rows once,
    and then cycles over them forever. A caption with n tokens contributes
    n - 1 samples: the first i tokens as input and token i + 1 as target.
    Uses the module-level `word2idx`, `vocab_size` and `max_len`.

    Parameters
    ----------
    batch_size : int
        Number of samples per yielded batch.
    split : str
        Split name used to build the two file names above.

    Yields
    ------
    list
        ``[[images, partial_caps], next_words]`` where `images` stacks the
        image feature vectors, `partial_caps` holds the word indices of the
        partial captions post-padded to `max_len`, and `next_words` holds the
        one-hot targets of width `vocab_size`.

    Raises
    ------
    ValueError
        If the split file contains no captions (raised on first iteration).
    """
    csv_file = 'flickr8k_%s_dataset.txt'%split
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    with open('encoded_images_%s_inceptionV3.p'%split, 'rb') as encoded_pickle:
        encoding_img_feat = pickle.load(encoded_pickle)

    df = pd.read_csv(csv_file, delimiter='\t')
    df = df.sample(frac=1)  # shuffle once; the same order is reused each pass
    # Select by the header names written with the dataset file rather than
    # by position inside each row.
    c = df['captions'].tolist()
    imgs = df['image_id'].tolist()
    if not c:
        # Without this guard the loop below would spin forever yielding nothing.
        raise ValueError('no captions found in %s' % csv_file)

    partial_caps = []
    next_words = []
    images = []
    while True:
        for text, img_name in zip(c, imgs):
            current_image = encoding_img_feat[img_name]
            # Tokenise and index the caption once instead of on every step.
            token_ids = [word2idx[txt] for txt in text.split()]
            for i in range(len(token_ids) - 1):
                partial_caps.append(token_ids[:i+1])

                # One-hot vector of length vocab_size marking the next word,
                # which is what the model has to predict.
                n = np.zeros(vocab_size)
                n[token_ids[i+1]] = 1
                next_words.append(n)

                images.append(current_image)

                # The three lists grow in lock-step, so any of them gives the
                # number of samples collected so far.
                if len(images) >= batch_size:
                    next_words = np.asarray(next_words)
                    images = np.asarray(images)
                    partial_caps = sequence.pad_sequences(partial_caps, maxlen=max_len, padding='post')
                    yield [[images, partial_caps], next_words]
                    partial_caps = []
                    next_words = []
                    images = []

                        
# One generator per split, each using data_generator's default batch_size.
# Nothing is read from disk here: a generator function's body only starts
# running on the first next() call.
# NOTE(review): the 'val' split reads 'encoded_images_val_inceptionV3.p';
# make sure that file has been generated before iterating `val_set`.
train_set = data_generator(split='train')
val_set = data_generator(split='val')
test_set = data_generator(split='test')

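Example: for an image with the caption "little girl running in field", the generator produces one training sample per next-word prediction: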

| X1 (image) | X2 (text sequence) | y (word) |
|---|---|---|
| image | `<start>` | little |
| image | `<start>`, little | girl |
| image | `<start>`, little, girl | running |
| image | `<start>`, little, girl, running | in |
| image | `<start>`, little, girl, running, in | field |
| image | `<start>`, little, girl, running, in, field | `<end>` |


In [None]:
# Pull one batch from the training generator and look at its parts.
x = next(train_set)
(batch_images, batch_partial_caps), batch_next_words = x
print(batch_images.shape)                        # stacked image features
print(batch_partial_caps[:3])                    # first three padded partial captions
print(np.argmax(batch_next_words[:10], axis=1))  # target word indices
print(batch_images[:10])                         # first ten image feature rows

## 6. Build the model

In [None]:
# Width shared by the image projection and the word embeddings, so the two
# branches can be concatenated into one sequence.
embedding_size = 300
# Image branch: project the 2048-value image feature to embedding_size and
# wrap it as a sequence of length 1.
image_model = Sequential([
        Dense(embedding_size, input_shape=(2048,), activation='relu'),
        RepeatVector(1)
    ])
# Text branch: embed each of the max_len word indices, then apply the same
# Dense layer to every time step.
word_embedding_model = Sequential([
        Embedding(vocab_size, embedding_size, input_length=max_len),
        TimeDistributed(Dense(embedding_size, activation='relu'))
    ])
# Concatenate along axis 1 so the image vector is followed by the word
# vectors, run the sequence through an LSTM that returns only its last
# output, and map that to a softmax over the vocabulary.
final_model = Sequential([
        Merge([image_model, word_embedding_model], mode='concat', concat_axis=1),
        LSTM(256, return_sequences=False),
        Dense(vocab_size),
        Activation('softmax')
    ])

from keras.optimizers import Adam, RMSprop
final_model.summary()
# Targets are one-hot vectors, hence categorical cross-entropy.
final_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

In [None]:
batch_size = 256
# Build the training generator with the same batch size that steps_per_epoch
# is derived from. The earlier `train_set` uses data_generator's default of
# 128 samples per batch, so `samples_per_epoch // 256` steps of it would cover
# only about half of the training samples in each epoch.
train_set = data_generator(batch_size=batch_size, split='train')
steps_per_epoch = samples_per_epoch // batch_size
final_model.fit_generator(train_set,
                          steps_per_epoch=steps_per_epoch,
                          epochs=30,
                          verbose=1)

In [None]:
# Record the installed Keras version for reproducibility.
# NOTE(review): the model above relies on the `Merge` layer — confirm the
# installed version still provides it.
import keras
keras.__version__

In [None]:
# Persist only the weights; the architecture has to be rebuilt in code before
# they can be loaded again.
final_model.save_weights('saved_model.h5')

## Test the captioning model

In [None]:
# Restore the trained weights into the already-built `final_model`.
final_model.load_weights('saved_model.h5')

In [None]:
def predict_captions(image, verbose=False):
    """Greedily decode a caption for the image file at path `image`.

    Starting from "<start>", the most probable next word according to
    `final_model` is appended repeatedly until "<end>" is produced or the
    sequence grows beyond `max_len` words.

    Parameters
    ----------
    image : str
        Path of the image file to caption.
    verbose : bool
        When True, print the partial caption before every prediction step
        and the final word list.

    Returns
    -------
    str
        The generated words joined by spaces, without "<start>" and without
        a trailing "<end>".
    """
    words = ["<start>"]
    # The image feature does not change between steps, so build it once.
    image_batch = np.array([encode(image)])
    while True:
        if verbose:
            print(words)
        par_caps = [word2idx[w] for w in words]
        par_caps = sequence.pad_sequences([par_caps], maxlen=max_len, padding='post')

        preds = final_model.predict([image_batch, np.array(par_caps)])
        word_pred = idx2word[int(np.argmax(preds[0]))]
        words.append(word_pred)

        if word_pred == "<end>" or len(words) > max_len:
            break
    if verbose:
        print(words)
    # Remove "<end>" only if it was actually generated: when decoding stops
    # because of the length limit, the last entry is a real word to keep.
    if words[-1] == "<end>":
        words = words[:-1]
    return ' '.join(words[1:])

In [None]:
# Caption the first test image: show the picture, then print the prediction.
try_image = images_dir + '/' + test_imgs[0]
plt.imshow(Image.open(try_image))
plt.show()
print (predict_captions(try_image)) 