# **Version 1**
---
A simple image captioning model that represents caption words with pre-trained word embeddings. 
The image encoder is a pre-trained model that is used only to extract features from each image.


#### Importing the necessary libraries

In [None]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import numpy as np
import os
import string
from collections import Counter
from PIL import Image
 
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.resnet import preprocess_input
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM, Add, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.layers import Layer

In [None]:
!pip3 install pipreqsnb
!pipreqsnb --savepath 'requirements.txt' '/content/drive/My Drive/Colab Notebooks/ImageCaptioning_V1.ipynb'

In [None]:
# Base directory on the mounted drive; other paths in this notebook are built by appending to it.
main_dir = '/content/drive/My Drive/ImageCaptioning/'

#### Loading the annotations file, pre-processing it and saving it as a descriptions.txt file

In [None]:
def load_doc(filename):
    """
    Read the annotations file and return its whole contents.

    Parameters
    --------------
    filename: str
        full path of the annotations file

    Returns
    --------------
    str
        the complete text of the file
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# extract descriptions for images
def load_descriptions(doc):
    """
    Create a mapping from image id to its caption.

    Each usable line of `doc` is tab-separated: the image file name first,
    then the caption. The image id is the file name up to its first '.'.
    When an id appears on several lines, only the first caption is kept.

    Parameters
    --------------
    doc: str
        full text of the annotations file

    Returns
    --------------
    dict
        image id -> caption string
    """
    mapping = dict()
    for line in doc.split('\n'):
        stripped = line.strip()
        # Measure the stripped line: a whitespace-only line would otherwise
        # pass the length check and add an entry with an empty id.
        if len(stripped) < 2:
            continue
        tokens = stripped.split('\t')
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        # Extra tab-separated fields are folded into the caption.
        image_desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = image_desc
    return mapping

def clean_descriptions(descriptions):
    """
    Normalise every caption in `descriptions` in place.

    Each caption is split on whitespace; every word is lower-cased and has
    its punctuation removed, and words left with fewer than two characters
    are dropped. The surviving words are re-joined with single spaces.

    Parameters
    --------------
    descriptions: dict
        image id -> caption string; modified in place, nothing is returned
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for image_id in list(descriptions):
        normalised = (raw.lower().translate(strip_punct)
                      for raw in descriptions[image_id].split())
        descriptions[image_id] = ' '.join(w for w in normalised if len(w) > 1)

# save descriptions to file, one per line
def save_doc(descriptions, filename):
    """
    Write the captions to `filename`, one "<id>\\t<caption>" entry per line.

    Parameters
    --------------
    descriptions: dict
        image id -> caption string
    filename: str
        path of the file to (over)write
    """
    data = '\n'.join(key + '\t' + desc for key, desc in descriptions.items())
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

# Read the raw annotations, build the id -> caption mapping, clean the
# captions and report how many distinct tokens remain.
filename = main_dir + 'annotations_ajio_v3.txt'
doc = load_doc(filename)
print('Finished loading annotations.txt')
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean_descriptions rewrites the values of `descriptions` in place.
clean_descriptions(descriptions)
print("Finished cleaning descriptions")
# The vocabulary is the set of distinct whitespace-separated tokens over all cleaned captions.
all_tokens = ' '.join(descriptions.values()).split()
vocabulary = set(all_tokens)
print('Vocabulary Size: %d' % len(vocabulary))
# Writing the cleaned captions back to disk is currently disabled.
#save_doc(descriptions, main_dir+'descriptions_v2_m1.txt')

#### Splitting the dataset

Splitting the dataset as training, validation and test data.

In [None]:
### SPLITTING DATASET
import random
# Shuffle the ids (unseeded, so the split differs between runs) and cut them
# into contiguous 90% / 5% / 5% train / validation / test slices.
product_ids = list(descriptions.keys())
random.shuffle(product_ids)
train_end = int(0.9*len(product_ids))
val_end = int(0.95*len(product_ids))
# Each slice starts where the previous one ends, so every id belongs to
# exactly one split and none is left out.
train_product_ids = product_ids[:train_end]
val_product_ids = product_ids[train_end:val_end]
test_product_ids = product_ids[val_end:]
print(len(product_ids))
print(len(train_product_ids))
print(len(val_product_ids))
print(len(test_product_ids))

#### Some pre-processing to find the vocab_size, max_length_of_caption for the model

`vocab_size` is required for building the model and deciding the size of the final layer of the model

`max_length_of_caption` is required to understand the number of times the prediction loop has to be run.

In [None]:
### PREPROCESSING CAPTIONS FOR TRAINING
def load_captions(descriptions,train_product_ids):
    """
    Collect the captions of the selected images, wrapped in sequence markers.

    Parameters
    --------------
    descriptions: dict
        image id -> caption string
    train_product_ids: iterable
        ids of the images whose captions should be returned

    Returns
    --------------
    list of str
        'startseq <caption> endseq' for every selected id, in the iteration
        order of `descriptions`
    """
    # Build the lookup set once: testing membership against a list inside the
    # loop would make this quadratic in the number of images.
    wanted_ids = set(train_product_ids)
    return ['startseq ' + caption + ' endseq'
            for image_id, caption in descriptions.items()
            if image_id in wanted_ids]

# One list of 'startseq ... endseq' caption strings per split.
train_captions = load_captions(descriptions,train_product_ids)
val_captions = load_captions(descriptions, val_product_ids)
test_captions = load_captions(descriptions, test_product_ids)

In [None]:
# Every token of the validation and training captions, in order of appearance.
corpus = [token
          for caption in val_captions+train_captions
          for token in caption.split()]

hash_map = Counter(corpus)
# No frequency threshold is applied: every distinct token is kept, in first-seen order.
vocab = list(hash_map)

print('Number of original tokens',len(hash_map))
print('Number of tokens after threshold',len(vocab))

# Word ids start at 1, which leaves 0 free for padding.
word_to_index = {token: position for position, token in enumerate(vocab, start=1)}
index_to_word = {position: token for token, position in word_to_index.items()}

vocab_size = len(index_to_word) + 1 # one for appended 0's

## max length of train captions

def max_len_caption(all_train_captions):
    """
    Print and return the largest whitespace-token count among the captions.

    Parameters
    --------------
    all_train_captions: iterable of str
        the captions to measure

    Returns
    --------------
    int
        number of tokens in the longest caption, 0 when there are none
    """
    longest = max((len(text.split()) for text in all_train_captions), default=0)
    print('Maximum length of caption= ',longest)
    return longest

# Longest caption over all three splits; the count includes the startseq/endseq markers.
max_length_caption = max_len_caption(train_captions+val_captions+test_captions)

#### Loading the GloVe embeddings for the model

We are using pre-trained embeddings to embed the words in our captions in the model 

In [None]:
### LOADING GLOVE EMBEDDINGS
# Maps each word to its vector, stored as a float32 numpy array.
embeddings_index = {}

# The context manager closes the file even if a line fails to parse.
with open(main_dir+'glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line carries no word; skip it instead of failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
## Embedding matrix
embedding_dim = 50

# One row per vocabulary index, each row an embedding_dim-sized vector.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

embedding_matrix.shape

#### Extracting the zip folder of images

In [None]:
from zipfile import ZipFile

# Unpack the image archive into the current working directory.
# The context manager closes the archive when done, and the handle is not
# called `zip`, so the builtin zip() stays usable in later cells.
with ZipFile(main_dir+'/images_v3.zip',mode='r') as archive:
    archive.extractall()

#### Extracting the image features using VGG-16 and storing them in a dictionary

In [None]:

def load_img_features(product_ids):
    """
    Extract VGG16 convolutional features for each product image.

    Images are read from 'images_v3/<id>.jpg'. An image that raises OSError
    while being loaded is reported and skipped.

    Parameters
    --------------
    product_ids: iterable of str
        ids of the images to process

    Returns
    --------------
    features: dict
        id -> numpy array reshaped to (7, 7, 512)
    product_ids_new: list
        the ids whose image was processed successfully, in input order
    """
    # Imported here because the notebook's import cell does not bring VGG16 into scope.
    from tensorflow.keras.applications.vgg16 import VGG16

    features = dict()
    product_ids_new = []
    image_dir = 'images_v3/'
    in_layer = Input(shape=(224, 224, 3))
    model = VGG16(include_top=False, input_tensor=in_layer)
    for j, product_id in enumerate(product_ids):
        print(j)
        try:
            image_name = image_dir + product_id + '.jpg'
            # target_size is (height, width); the channel count is not part of it.
            image = load_img(image_name, target_size=(224, 224))
            image = img_to_array(image)
        except OSError:
            print("Error with file")
            continue
        # Add the leading batch axis the network expects.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # NOTE(review): preprocess_input is the ResNet variant imported at file
        # level — confirm it matches the preprocessing VGG16 expects.
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        product_ids_new.append(product_id)
        features[product_id] = feature.reshape(7, 7, 512)

    print("Loaded", len(features.keys()) ,"number of features" )
    # Report on the last image that actually loaded; the last id of the input
    # may have failed, and the input may be empty.
    if product_ids_new:
        sample = features[product_ids_new[-1]]
        print(sample.shape)
        print(type(sample))
    return features, product_ids_new

# Extract features for each split. The id lists are rebound to the ids returned
# by load_img_features, i.e. only those whose image was processed.
train_features, train_product_ids = load_img_features(train_product_ids)
val_features, val_product_ids = load_img_features(val_product_ids)
test_features, test_product_ids = load_img_features(test_product_ids)

# Shape of one training feature map after flattening it into a single column.
print(train_features[train_product_ids[0]].flatten().reshape(-1,1).shape)

#### Loading the captions dictionary

Creating separate dictionaries for the different splits of the data, with the `startseq` and `endseq` tokens added around each caption.

In [None]:
def load_captions_dict(descriptions,train_product_ids):
    """
    Build an id -> caption dictionary for the selected images.

    Parameters
    --------------
    descriptions: dict
        image id -> caption string
    train_product_ids: iterable
        ids of the images to keep

    Returns
    --------------
    dict
        image id -> 'startseq <caption> endseq', in the iteration order of
        `descriptions`
    """
    # Build the lookup set once: testing membership against a list inside the
    # loop would make this quadratic in the number of images.
    wanted_ids = set(train_product_ids)
    return {image_id: 'startseq ' + caption + ' endseq'
            for image_id, caption in descriptions.items()
            if image_id in wanted_ids}

# Rebinds the three caption variables from lists to dictionaries keyed by image id.
train_captions = load_captions_dict(descriptions,train_product_ids)
val_captions = load_captions_dict(descriptions, val_product_ids)
test_captions = load_captions_dict(descriptions, test_product_ids)

#### Defining the model

In [5]:
# Image branch: a (7, 7, 512) feature map per image, reduced by a small
# convolution and max-pooling, then regularised and projected to 256 units.
inputs1 = Input(shape=(7,7,512,))
conv1 = Conv2D(filters = 3, kernel_size=(3,3))(inputs1)
pool1 = MaxPooling2D(pool_size=(2,2))(conv1)
fe1 = Dropout(0.4)(pool1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: a sequence of max_length_caption word indices is embedded and
# summarised by an LSTM with 256 units.
inputs2 = Input(shape=(max_length_caption,))
# The embedding width is hard-coded to 50 here.
se1 = Embedding(vocab_size,50, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Merge both branches by addition.
# NOTE(review): fe2 still has spatial axes while se3 is a flat vector, so this
# addition appears to rely on broadcasting — confirm that is intended.
decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
flatten2 = Flatten()(decoder2)
# Softmax over the vocabulary: one score per possible next word.
outputs = Dense(vocab_size, activation='softmax')(flatten2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 7, 7, 512)]  0                                            
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 5, 5, 3)      13827       input_4[0][0]                    
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 13)]         0                                            
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)  (None, 2, 2, 3)      0           conv2d_2[0][0]                   
_______________________________________________________________________________________

#### Plotting the model

In [None]:
# Import from tensorflow.keras, the same package the model's layers come from,
# instead of the standalone keras package.
from tensorflow.keras.utils import plot_model
# Writes a diagram of the model graph to model.png under main_dir.
plot_model(model, to_file=main_dir+'model.png')

#### Setting the GloVe embeddings matrix as the embeddings layer

In [None]:
# Find the embedding layer by type rather than by a hard-coded position, so the
# lookup keeps working if layers are added to or reordered in the model.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
print(embedding_layer)

# Load the pre-trained vectors and freeze them so training leaves them unchanged.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

#### Defining the custom data generator

In [None]:
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """
    Endlessly yield training batches built from groups of captions.

    Every caption is encoded as word indices and expanded into one sample per
    prefix: the prefix (padded to `max_length`) is the text input and the
    word that follows it, one-hot encoded, is the target. A batch is emitted
    after every `num_photos_per_batch` captions; captions left over at the
    end of a pass are carried into the first batch of the next pass.

    The one-hot width is taken from the module-level `vocab_size`.

    Parameters
    --------------
    descriptions: dict
        image id -> caption string
    photos: dict
        image id -> feature array for that image
    wordtoix: dict
        word -> integer index; words missing from it are skipped
    max_length: int
        length every input sequence is padded to
    num_photos_per_batch: int
        number of captions consumed per yielded batch

    Yields
    --------------
    tuple
        ([image_features, padded_sequences], one_hot_targets), all float64
        numpy arrays with one row per sample
    """
    if not descriptions:
        # Without any caption the loop below would spin forever and never yield.
        return

    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while 1:
        for key, desc in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key]

            # encode the sequence
            seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]

            # split one sequence into multiple X, y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length, dtype='float64')[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                X1.append(photo)
                X2.append(in_seq)
                y.append(out_seq)

            # yield the batch data
            if n == num_photos_per_batch:
                # A tuple (inputs, targets) is the batch format Keras documents
                # for generators.
                yield ([np.array(X1, dtype='float64'),
                        np.array(X2, dtype='float64')],
                       np.array(y, dtype='float64'))
                X1, X2, y = list(), list(), list()
                n = 0

#### Finally! Training the model

In [None]:
from tensorflow.keras.optimizers import Adam
# `learning_rate` is the argument's current name; `lr` is only a deprecated alias.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=5e-4,decay=1e-5))
# NOTE(review): the training call passes its epoch count as a literal, so this
# value does not control training length — confirm which one is intended.
epochs = 10
number_pics_per_batch = 16
# Number of full batches in one pass over the training captions.
steps = len(train_captions)//number_pics_per_batch
temp = main_dir

# Endless batch generators over the training and validation captions.
generator = data_generator(train_captions, train_features, word_to_index, max_length_caption, number_pics_per_batch)
val_generator = data_generator(val_captions, val_features,  word_to_index, max_length_caption, number_pics_per_batch)

In [None]:
output_dir = main_dir + 'model1/'
# exist_ok removes the separate existence check and the gap between check and creation.
os.makedirs(output_dir, exist_ok=True)

# Import from tensorflow.keras, the same package the model is built with.
from tensorflow.keras.callbacks import ReduceLROnPlateau
# Halve the learning rate when val_loss has not improved for one epoch, down to 1e-7.
reduce_lr = ReduceLROnPlateau(min_lr = 1e-7, monitor='val_loss', patience = 1, factor = 0.5, verbose=1)

# Model.fit accepts generators directly; fit_generator is deprecated.
# shuffle is omitted because it is ignored when the input is a generator.
history = model.fit(generator, validation_data=val_generator, validation_steps=8,
                    epochs=5, steps_per_epoch=steps, verbose=1, callbacks=[reduce_lr])
model.save(output_dir+'ajio_v3_20.h5')

#### Loading the trained model.

In [None]:
# Import from tensorflow.keras, the same package the model was built and saved with.
from tensorflow.keras.models import load_model

output_dir = main_dir + 'model1/'
# NOTE(review): training saves 'ajio_v3_20.h5' while this loads 'ajio_v3.h5' —
# confirm the intended checkpoint exists under that name.
model = load_model(output_dir+'ajio_v3.h5')

print(test_captions)

#### Testing the model.

In [None]:
import matplotlib.pyplot as plt
import cv2

def greedySearch(photo):
    """
    Generate a caption for one image by picking the most likely word each step.

    Starting from 'startseq', the text produced so far is encoded, padded to
    `max_length_caption` and fed to the model together with the image
    features; the highest-scoring word is appended. Generation stops when
    'endseq' is predicted, when the predicted index has no word, or after
    `max_length_caption` steps.

    Uses the module-level `model`, `word_to_index`, `index_to_word` and
    `max_length_caption`.

    Parameters
    --------------
    photo: array
        image features shaped as a batch of one, as the model expects

    Returns
    --------------
    str
        the generated words joined by spaces, without the sequence markers
    """
    in_text = 'startseq'
    words = []
    for _ in range(max_length_caption):
        sequence = [word_to_index[w] for w in in_text.split() if w in word_to_index]
        sequence = pad_sequences([sequence], maxlen = max_length_caption)
        # verbose=0: a progress bar for every generated word only adds noise.
        yhat = model.predict([photo,sequence],verbose=0)
        # Index 0 is the padding slot and maps to no word, hence the .get().
        word = index_to_word.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        # Collect words explicitly so the last real word is kept when the loop
        # ends without ever predicting 'endseq'.
        words.append(word)
        in_text += ' ' + word
    return ' '.join(words)

print(len(test_product_ids))
# Position of the test image to caption.
z=54
pic= test_product_ids[z]
print(pic)
# Add the batch axis the model expects.
image = test_features[pic].reshape(1,7,7,512)
# cv2.imread returns channels in BGR order while matplotlib expects RGB;
# convert so the picture is shown in its real colours.
plt.imshow(cv2.cvtColor(cv2.imread('images_v3/'+pic+'.jpg'), cv2.COLOR_BGR2RGB))
plt.show()
result = greedySearch(image)
print("Actual output:",test_captions[pic])
print("Predicted output:",result)