In [154]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import re
import cv2
import json
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers.merge import add
plt.style.use('seaborn')

### Loading Captions

In [136]:
with open('../Flickr8k/Flickr_TextData/Flickr8k.token.txt') as f:
    data=f.read()
data=data.split('\n')[:-1] #last entry is empty

In [137]:
data[0]

'1000268201_693b08cb0e.jpg#0\tA child in a pink dress is climbing up a set of stairs in an entry way .'

### Data Cleaning
stemming ,lemmatising , stopword removal techniques have not been performed while cleaning the captions so as to teach model write captions with correct english otherwise - a caption like 'A child in a pink dress is climbing up a set of stairs in an entry way' model will predict caption of form 'child pink dress climb set of stairs entry way' which doesn't make sense.

In [138]:
def clean_text(sentence):
    sentence = sentence.lower()
    sentence = re.sub("[^a-z]+"," ",sentence)
    return sentence

In [139]:
captions={}
for i in range(len(data)):
    img_name,img_caption=data[i].split('\t')
    img_name=img_name.split('.jpg')[0]
    img_caption=clean_text(img_caption)
    if captions.get(img_name) is None:
        captions[img_name] = []
    captions[img_name].append(img_caption)

In [140]:
# print(captions)
with open("cleaned_caption_description.json", "w") as file:
    json.dump(captions, file)

In [141]:
with open("cleaned_caption_description.json", "r") as file:
    captions=json.load(file)

### Load train and test data

In [144]:
train=None
test=[]
with open('../Flickr8k/Flickr_TextData/Flickr_8k.trainImages.txt') as f:
    train=f.readlines()
with open('../Flickr8k/Flickr_TextData/Flickr_8k.testImages.txt') as f:
    test=f.readlines()

In [145]:
train=[c.split('.jpg')[0] for c in train]
test=[c.split('.jpg')[0] for c in test]

In [146]:
print(train[:5])

['2513260012_03d33305cf', '2903617548_d3e38d7f88', '3338291921_fe7ae0c8f8', '488416045_1c6d903fe0', '2644326817_8f45080b87']


<start\> (start-of-sequence) and <end\> (end-of-sequence) characters are added to every caption to help our model determine when to start and - more importantly - end sequences.
Because while generating text(captions) model should know when it has to end otherwise it will keep predicting words and add them to the captions.  That's why we teach our model to decide the length of a caption on that by itself. when model will predict <end\> as next word for the caption, model should stop predicting more words.

In [147]:
#prepare train descriptions
train_descriptions={}
for t in train:
    train_descriptions[t]=[]
    for sent in captions[t]:
        new_caption='<start>'+sent+'<end>'
        train_descriptions[t].append(new_caption)

In [148]:
with open('train_descriptions.json','w') as file:
    json.dump(train_descriptions,file)

In [149]:
with open('train_descriptions.json','r') as file:
    train_descriptions=json.load(file)

### Using Resnet model to extract features from image 
- Resnet is used as an encoder
- output of 3rd last layer of resnet will be used as an encoding containing information about different features captured by resnet. These encodings will be later fed as input to the main model to predict captions.

In [150]:
model = ResNet50(weights="imagenet",input_shape=(224,224,3))
# model.summary()

In [153]:
new_model = Model(model.input,model.layers[-2].output)
# model.summary()

In [159]:
def get_image_encodings(img):
    feature_vector = new_model.predict(img)
    feature_vector = feature_vector.reshape((-1,))
    return feature_vector

In [160]:
encoding_train = {}
for img_name in train:
    img_path='../Flickr8k/Images/'+img_name+'.jpg'
    img = image.load_img(img_path,target_size=(224,224))
    img = image.img_to_array(img)
    img = img.reshape((1,224,224,3)) 
    #resnet expects a batch of image so expanding dimensions
    # (224,224,3) -> (1,224,224,3)
    img = preprocess_input(img)
    encoding_train[img_name] = get_image_encodings(img)

In [165]:
encoding_test = {}
for img_name in test:
    img_path='../Flickr8k/Images/'+img_name+'.jpg'
    img = image.load_img(img_path,target_size=(224,224))
    img = image.img_to_array(img)
    img = img.reshape((1,224,224,3)) 
    img = preprocess_input(img)
    encoding_test[img_name] = get_image_encodings(img)

In [166]:
import pickle
with open("encoded_image_features.pkl","wb") as f:
    pickle.dump(encoding_train,f)
with open("encoded_image_features_testdata.pkl","wb") as f:
    pickle.dump(encoding_test,f)

### creating Vocab

In [167]:
total_words=[]
for img_name in captions.keys():
    [total_words.append(i) for sent in captions[img_name] for i in sent.split()]
print('total words= {}'.format(len(total_words)))
count=np.array(np.unique(total_words,return_counts=True))
count=[(count[0][i],count[1][i])for i in range(count.shape[1])]
print('no of unique words= {}'.format(len(count))) #no of unique words

total words= 437466
no of unique words= 8441


In [168]:
#reducing size of vocab by removing words with frequency less than 10
size=10
vocab=[w[0] for w in count if int(w[1])>size]
print('no of unique words= {}'.format(len(vocab)))

no of unique words= 1850


In [169]:
total_words.append('<start>')
total_words.append('<end>')

In [170]:
word_to_idx={}
idx_to_word={}
# 0th index will be used to store character that will be used for padding while making 
# all captions of equal length (length of maximum sentence)
# <start> dog (padding) (padding) (padding)
# <start> dog is (padding) (padding)
# <start> dog is playing 
for i,word in enumerate(total_words):
    word_to_idx[word]=i+1
    idx_to_word[i+1]=word

In [171]:
max_len = 0 
for key in train_descriptions.keys():
    for cap in train_descriptions[key]:
        max_len = max(max_len,len(cap.split()))
print(max_len)

37


### Data Generator
- Data is very big that loading a batch one by one in memory while training is adviced.

In [181]:
def DataGenerator(train_descriptions,batch_size,encoding_train,word_to_idx):
    X1,X2,Y=[],[],[]
    #x1- image features, x2=captions ,y = next word that should be predicted
    no=0
    while True:
        for img_name,caption_list in train_descriptions.items():
            no+=1
            img_encoding=encoding_train[img_name]
            for sent in caption_list:
                sent_=[word_to_idx[w] for w in sent if w in word_to_idx]
                for i in range(len(sent_)):
                    x=sent_[:i]
                    y=sent_[i]
                    x=pad_sequencesd([x],maxlen=max_len,value=0,padding='post')[0]
                    vocab_size=len(word_to_idx) + 1
                    y=to_categorical([y],num_classes=vocab_size)[0]
                    X1.append(img_encoding)
                    X2.append(x)
                    Y.append(y)
            if(no==batch_size):
                yield [[np.array(X1),np.array(X2)],np.array(Y)]
                X1,X2,Y = [],[],[]
                no = 0