# Understanding the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import re
import string
import cv2
import json
import pickle
import collections
from keras.applications.resnet import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers.merge import add


In [None]:
def readTxtFile(path):
    """Return the entire contents of the text file at ``path`` as one string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        path: Location of the text file to read.

    Returns:
        The file contents as a single ``str``.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()

In [None]:
# read from the input that contains captions of each image
captions = readTxtFile("../input/flickr8k/captions.txt")

In [None]:
# Drop the header row, then keep only non-blank rows. Filtering blanks
# (instead of always slicing off the last element) keeps the final caption
# when the file does not end with a newline.
captions = [row for row in captions.split('\n')[1:] if row.strip()]

In [None]:
print(len(captions))
print(captions[0])
print(captions[1])
print(captions[2])
captions[0].split(".jpg,")

In [None]:
# Map each image id (file name without the ".jpg" extension) to the list of
# its captions, in file order.
descriptions = {}

for row in captions:
    # Split only on the first ".jpg," so a caption that itself happens to
    # contain that text cannot break the two-way unpacking.
    img_name, img_desc = row.split(".jpg,", 1)
    descriptions.setdefault(img_name, []).append(img_desc)

In [None]:
# checking if we got our dictionary correctly
# we have 5 captions for each image in the dataset
descriptions["1000268201_693b08cb0e"]

In [None]:
IMG_PATH = "../input/flickr8k/Images/"

def readImg(imgId):
    """Display the image stored at ``IMG_PATH + imgId + ".jpg"``.

    Args:
        imgId: Image file name without the ".jpg" extension.

    Raises:
        FileNotFoundError: If OpenCV could not read the image file.
    """
    img_file = IMG_PATH + imgId + ".jpg"
    img = cv2.imread(img_file)
    # cv2.imread reports failure by returning None rather than raising, so
    # check here and fail with a message that names the offending path.
    if img is None:
        raise FileNotFoundError("Could not read image: " + img_file)
    # since cv2 reads our image in bgr format, we convert it to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.axis('off')
    plt.show()
readImg("1001773457_577c3a7d70")
descriptions["1001773457_577c3a7d70"]

**Key points from the data set:**
* Has /Images folder with 8000 images
* Has a captions.txt file which has 5 captions for each image
* The captions are mapped with the image id

# Data Cleaning

In [None]:
def clean(sentence):
    """Normalise a caption to lowercase words made only of the letters a-z.

    After lowercasing, every run of characters other than a-z is treated as
    a separator, one-character tokens are discarded, and the surviving
    tokens are joined with single spaces.
    """
    tokens = re.sub("[^a-z]+", " ", sentence.lower()).split()
    return " ".join(tok for tok in tokens if len(tok) > 1)

In [None]:
# Clean every caption in place: each image's caption list is updated
# element by element, so `descriptions` keeps the same list objects.
for caption_list in descriptions.values():
    for pos, raw_caption in enumerate(caption_list):
        caption_list[pos] = clean(raw_caption)

In [None]:
descriptions["1001773457_577c3a7d70"]

In [None]:
# Save the cleaned dictionary so that later runs can reload it instead of
# repeating the cleaning step. The data is written with json.dump so the
# file is real JSON (properly quoted and escaped) rather than the repr of a
# Python dict.
with open("descriptions.txt", "w", encoding="utf-8") as f:
    json.dump(descriptions, f)

# Vocabulary
The set of unique words the model can predict: each position in the model's output probability vector is mapped back to a word in this vocabulary.

In [None]:
# Reload the saved captions dictionary from disk.
# ast.literal_eval parses the text as a Python literal, so it accepts both a
# str(dict) dump (single-quoted strings) and plain JSON made only of
# strings, lists and dicts. Unlike replacing ' with " before json.loads, it
# does not corrupt values that contain an apostrophe.
import ast

with open("descriptions.txt", encoding="utf-8") as f:
    descriptions = ast.literal_eval(f.read())

In [None]:
print(type(descriptions))

In [None]:
# Vocabulary: every distinct word that appears in any caption.
vocab = set()
for sentences in descriptions.values():
    for sentence in sentences:
        # plain loop instead of a throwaway list comprehension used only
        # for its side effect
        vocab.update(sentence.split())
print("Vocab Size : %d"% len(vocab))

In [None]:
# Flat list of every word occurrence across all captions (duplicates kept),
# used to count word frequencies.
total_words = []
for sentences in descriptions.values():
    for sentence in sentences:
        total_words.extend(sentence.split())
print("Total Word %d"%len(total_words))

In [None]:
counter = collections.Counter(total_words)
freq_count = dict(counter)

In [None]:
# Sort the (word, count) pairs by count, most frequent first.
sorted_freq_count = sorted(freq_count.items(), reverse=True, key=lambda x : x[1])
threshold_freq = 10
# Keep only words that occur MORE than threshold_freq times. The comparison
# is strict, so a word with exactly 10 occurrences is rejected as well.
sorted_freq_count = [x for x in sorted_freq_count if x[1] > threshold_freq]
# NOTE: total_words is rebound here to the filtered vocabulary (one entry
# per kept word, in descending frequency order).
total_words = [x[0] for x in sorted_freq_count]

In [None]:
print(len(total_words)) # final vocab size

# Preparing Training Data

In [None]:
# Collect every image id (the keys of `descriptions`) into a list, in the
# dictionary's iteration order.
data = list(descriptions.keys())
print(len(data))

In [None]:
splitSize = int(0.8 * len(data)) # 80-20 partition for training and test data
train_data = data[ : splitSize]
test_data = data[splitSize : ]
print(len(train_data), len(test_data))

In [None]:
# Wrap every training caption in "startseq" / "endseq" marker tokens.
# These tell the model where a caption begins and where it should stop.
train_descriptions = {
    img_id: ["startseq " + cap + " endseq" for cap in descriptions[img_id]]
    for img_id in train_data
}

In [None]:
train_descriptions["1001773457_577c3a7d70"]
# each sentence has 'startseq' and 'endseq'

# Transfer Learning 
To extract features from images and text

## Features from Images

In [None]:
# using pretrained model RESNET50
model = ResNet50(weights="imagenet", input_shape=(224,224,3))
model.summary()

In [None]:
model.layers[-2]

In [None]:
# Reuse the pretrained network as a feature extractor: keep its input and
# stop at the second-to-last layer (model.layers[-2]), which per the original
# note is the global average pooling layer, so the final layer is dropped.
model_new = Model(model.input, model.layers[-2].output)

In [None]:
def preprocess_img(img):
    """Load an image file and turn it into a one-image batch for ResNet50.

    Args:
        img: Path of the image file, passed to ``image.load_img``.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and normalised with ResNet's ``preprocess_input``.
    """
    pixels = image.img_to_array(image.load_img(img, target_size=(224,224)))
    # add the batch axis in front: 3d tensor -> 4d tensor
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

In [None]:
img = preprocess_img(IMG_PATH+"1001773457_577c3a7d70.jpg")
plt.imshow(img[0])
plt.axis('off')
plt.show()

In [None]:
def encode_img(img):
    """Return the flattened ``model_new`` output for one image file.

    Args:
        img: Path of the image file.

    Returns:
        A 1-D array holding the network output for this image (the original
        note records the unflattened shape as (1, 2048)).
    """
    batch = preprocess_img(img)
    features = model_new.predict(batch)
    return features.reshape((-1,))

In [None]:
encode_img(IMG_PATH+"1001773457_577c3a7d70.jpg")

In [None]:
from time import time
# Run every training image through encode_img once and keep the result in
# memory, timing the whole pass.
encoding_train = {}
# img_id -> feature_vector
start_t = time()
for i, img_id in enumerate(train_data):
    img_path = IMG_PATH+img_id+".jpg"
    encoding_train[img_id] = encode_img(img_path)
    # Report progress every 100 images.
    if i%100==0:
        print("Encoding Progress %d"%i)
end_t = time()
print("Total Time Taken :", end_t-start_t)

In [None]:
# store the trained data from the resnet locally
encoding_train["1001773457_577c3a7d70"]

In [None]:
with open("encoded_train_features.pkl","wb") as f:
    pickle.dump(encoding_train, f)

In [None]:
# Run every test image through encode_img once and keep the result in
# memory, timing the whole pass.
encoding_test = {}
# img_id -> feature_vector
start_t = time()
for i, img_id in enumerate(test_data):
    img_path = IMG_PATH+img_id+".jpg"
    encoding_test[img_id] = encode_img(img_path)
    # Report progress every 100 images.
    if i%100==0:
        print("Encoding Progress %d"%i)
end_t = time()
print("Total Time Taken :", end_t-start_t)

In [None]:
with open("encoded_test_features.pkl", "wb") as f:
    pickle.dump(encoding_test, f)

# Preprocessing Captions

In [None]:
# vocab
len(total_words)

In [None]:
# Build both directions of the vocabulary lookup. Ids start at 1, so id 0
# is never assigned to a word.
word_to_index = {}
index_to_word = {}
for idx, word in enumerate(total_words, start=1):
    word_to_index[word] = idx
    index_to_word[idx] = word

In [None]:
print(word_to_index["on"], index_to_word[7])

In [None]:
# Append the sequence-boundary tokens right after the existing vocabulary.
# Their ids are derived from the current dictionary size instead of being
# hard-coded, so they stay correct if the vocabulary size changes (with 1845
# base words this yields the same ids, 1846 and 1847). The membership check
# keeps the ids stable when the cell is run more than once.
for token in ("startseq", "endseq"):
    if token not in word_to_index:
        token_id = len(word_to_index) + 1
        word_to_index[token] = token_id
        index_to_word[token_id] = token

# +1 because id 0 is never assigned to a word.
vocab_size = len(word_to_index) + 1
print(vocab_size)

In [None]:
# Length, in whitespace-separated tokens, of the longest training caption
# (0 if there are no training captions).
max_len = max(
    (len(cap.split()) for caps in train_descriptions.values() for cap in caps),
    default=0,
)
print(max_len)

# Supervised Learning Problem
Language modelling:
the probability of the next word depends on all the previous words,
$P(w_{t+1} \mid w_1, \dots, w_t)$


In [None]:
# data generator
def data_generator(train_descriptions, encoding_train, word_to_index, max_len, batch_size, vocab_size=None):
    """Yield training batches for the captioning model, forever.

    Every caption is expanded into (prefix -> next word) samples: for a
    caption encoded as ids [w0, w1, ..., wk], each prefix [w0..w(i-1)] is an
    input and wi is its target. Words missing from ``word_to_index`` are
    skipped.

    Args:
        train_descriptions: Mapping of image id to its list of captions.
        encoding_train: Mapping of image id to its image feature vector.
        word_to_index: Mapping of word to integer id.
        max_len: Length every caption prefix is padded to.
        batch_size: Number of IMAGES (not samples) whose captions make up
            one yielded batch.
        vocab_size: Number of classes for the one-hot targets. Defaults to
            ``len(word_to_index) + 1`` (ids start at 1; 0 is the padding
            value).

    Yields:
        ``([image_features, padded_prefixes], one_hot_targets)`` as numpy
        arrays with one row per sample.
    """
    if vocab_size is None:
        vocab_size = len(word_to_index) + 1

    X1, X2, y = [], [], []  # image vector X1, partial caption X2, target word y
    n = 0  # images accumulated in the current batch
    while True:
        for key, desc_list in train_descriptions.items():
            n += 1
            photo = encoding_train[key]
            for capt in desc_list:
                seq = [word_to_index[word] for word in capt.split() if word in word_to_index]
                for i in range(1, len(seq)):
                    xi = seq[0:i]
                    yi = seq[i]

                    # zero padding; pad_sequences returns a 2d matrix, so
                    # take its single row
                    xi = pad_sequences([xi], maxlen=max_len, value=0, padding='post')[0]
                    yi = to_categorical([yi], num_classes=vocab_size)[0]

                    X1.append(photo)
                    X2.append(xi)
                    y.append(yi)
            # Emit the batch only after ALL captions of the current image
            # have been added, so one image's samples are never split across
            # two batches.
            if n == batch_size:
                yield ([np.array(X1), np.array(X2)], np.array(y))
                X1, X2, y = [], [], []
                n = 0

# Word Embeddings

In [None]:
# transfer learning on text: open the pre-trained GloVe word vectors.
# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default encoding. The handle is left open on purpose: the following cell
# iterates over it and closes it.
f = open("../input/glove6b50dtxt/glove.6B.50d.txt", encoding="utf-8")

In [None]:
# Parse the embeddings file: each line is a word followed by its vector
# components, all separated by whitespace.
embedding_index = {}
# `with f:` closes the already-open handle when the block ends, including
# when a line fails to parse.
with f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        embedding_index[word] = np.array(values[1:], dtype='float')

In [None]:
embedding_index['boy']

In [None]:
def get_embedding_matrix(emb_dim=50):
    """Build the embedding matrix for the caption vocabulary.

    Row ``idx`` holds the pre-trained vector of the word whose id is ``idx``
    in ``word_to_index``. Rows for words that are missing from
    ``embedding_index`` (and any row no word maps to) stay all zeros.

    Args:
        emb_dim: Length of each embedding vector; must match the vectors
            stored in ``embedding_index``. Defaults to 50.

    Returns:
        A ``(vocab_size, emb_dim)`` numpy array.
    """
    matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word_to_index.items():
        vector = embedding_index.get(word)
        if vector is not None:
            matrix[idx] = vector
    return matrix

In [None]:
embedding_matrix = get_embedding_matrix()
embedding_matrix.shape

# Model Architecture

image features and partial sequence ----> |model| ---->  next word in seq


In [None]:
# Image branch: takes one 2048-value feature vector per image (the ResNet50
# output is fed in here), applies 30% dropout, then projects it to 256 units.
input_img_features = Input(shape=(2048,))
input_img1 = Dropout(0.3)(input_img_features)
input_img = Dense(256, activation='relu')(input_img1)


In [None]:
# Caption branch: a sequence of max_len word ids is embedded into
# 50-dimensional vectors (mask_zero=True, so id 0 is treated as padding),
# passed through 30% dropout and summarised by an LSTM.
input_captions = Input(shape=(max_len, ))
input_cap1 = Embedding(input_dim = vocab_size, output_dim = 50, mask_zero=True)(input_captions)
input_cap2 = Dropout(0.3)(input_cap1)
input_cap = LSTM(256)(input_cap2) # output size 256

In [None]:
# Decoder: combine the image branch and the caption branch (both 256 units)
# with `add`, pass the result through a dense layer, then output a softmax
# distribution over the vocab_size word ids.
decoder1 = add([input_img, input_cap])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

In [None]:
model = Model(inputs=[input_img_features, input_captions], outputs=outputs)

In [None]:
model.summary()

In [None]:
# Load the pre-trained word vectors into the Embedding layer and freeze it.
# The layer is located by type rather than by position, because the order of
# model.layers is an implementation detail and a hard-coded index could
# silently pick a different layer.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False # pre trained using transfer learning

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Training Model

In [None]:
import os

epochs = 20
batch_size = 3  # images per generator batch
steps = len(train_descriptions)//batch_size

# Make sure the checkpoint directory exists before the first save.
os.makedirs('./model_weights', exist_ok=True)

# Train one epoch at a time so that a checkpoint is written after every epoch.
for i in range(epochs):
    generator = data_generator(train_descriptions, encoding_train, word_to_index, max_len, batch_size)
    # NOTE(review): fit_generator is deprecated in newer Keras releases in
    # favour of model.fit; confirm against the installed version.
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./model_weights/model_'+str(i)+'.h5')

# Predictions

In [None]:
model = load_model('./model_weights/model_6.h5')

In [None]:
def predict_caption(photo):
    """Greedily decode a caption for one image.

    Starting from "startseq", the model is asked repeatedly for the most
    probable next word given the image features and the words produced so
    far. Decoding stops at "endseq", after ``max_len`` words, or when the
    predicted id has no entry in ``index_to_word``.

    Args:
        photo: Image feature input passed to ``model.predict`` together with
            the padded word-id sequence.

    Returns:
        The generated words joined by spaces, without the "startseq" and
        "endseq" markers.
    """
    words = ["startseq"]
    for _ in range(max_len):
        sequence = [word_to_index[w] for w in words if w in word_to_index]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
        yPred = model.predict([photo, sequence])
        # .get avoids a KeyError when the arg-max is an id no word maps to
        # (for example the padding id 0).
        word = index_to_word.get(int(yPred.argmax()))
        if word is None or word == 'endseq':
            break
        words.append(word)

    # "endseq" is never appended, so only the leading "startseq" has to be
    # removed. No real word is lost when decoding stops at max_len without
    # ever producing "endseq".
    return ' '.join(words[1:])

In [None]:
# Show 15 randomly chosen test images, each with the predicted caption and
# its first reference caption.
all_img = list(encoding_test.keys())  # built once, not on every iteration
for _ in range(15):
    # Sample over the whole test set instead of assuming it holds at least
    # 1000 images.
    img_name = all_img[np.random.randint(0, len(all_img))]
    photo = encoding_test[img_name].reshape((1,2048))
    picture = plt.imread(IMG_PATH+img_name+'.jpg')
    caption = predict_caption(photo)
    print(caption)
    print(descriptions[img_name][0])
    plt.imshow(picture)
    plt.axis('off')
    plt.show()
    