**First**, what do we want from this project?
> We want to feed an arbitrary image into the model and have the model generate a suitable caption for it.

## Import Libraries

In [None]:
import os
import pickle  # to save the extracted features and reuse them later
import re

import numpy as np

from tqdm.notebook import tqdm#How much data is processed till now 

#For extract feature from image data
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input #vgg:- pre_traind model cnn for extract feature
 
from tensorflow.keras.preprocessing.image import load_img, img_to_array #For preprocessing on image 

from tensorflow.keras.preprocessing.text import Tokenizer #For preprocessing on text & Tokenizer:-Split text into Sentance 

from tensorflow.keras.preprocessing.sequence import pad_sequences #how word will take in patch and complete with zeros

#build Model
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model #to show architecture of our model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add #bulid Neural Network 

In [None]:
# Root of the Flickr8k dataset: the 'Images' folder and 'captions.txt' are read from here.
BASE_DIR = '/kaggle/input/flickr8k'
# Directory where generated artifacts (features.pkl, the trained model) are written.
WORKING_DIR = '/kaggle/working'

## Extract Image Features

In [None]:
# Build the CNN used as image feature extractor: a VGG16 network created
# with the library's default arguments.
model = VGG16()

# Re-wire the network so that it ends at its second-to-last layer: that
# layer's activations are used as image features, the final layer's output
# is not needed.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# summary() prints the architecture itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
model.summary()
#plot_model(model, show_shapes=True)

In [None]:
# Run every image through the feature extractor and collect the results in
# a dict: key = image id (file name up to the first '.'), value = features.
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory)):
    # Load the image from disk, resized to 224x224, as a numpy array.
    img_path = f"{directory}/{img_name}"
    pixels = img_to_array(load_img(img_path, target_size=(224, 224)))
    # Add a leading axis of size 1 (a batch holding this single image),
    # then apply the VGG16 input preprocessing.
    batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
    # verbose=0 keeps predict() from printing progress for every image.
    # The result is stored under the image id (file name without extension).
    features[img_name.split('.')[0]] = model.predict(batch, verbose=0)


## Store the Features So They Can Be Reused for Training

In [None]:
# Persist the extracted features with pickle.
# The context manager guarantees the file is flushed and closed as soon as
# the dump finishes (or fails), instead of leaving the handle open.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:  # 'wb': write, binary mode
    pickle.dump(features, f)

In [None]:
# Reload the cached features from the working directory, so the following
# cells can run without repeating the extraction loop.
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'rb') as features_file:  # 'rb': read, binary mode
    features = pickle.load(features_file)

## Load caption data

In [None]:
# Read the whole captions file into memory as one string.
# The encoding is given explicitly so the result does not depend on the
# platform default (assumes captions.txt is UTF-8 / plain ASCII - TODO confirm).
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r', encoding='utf-8') as f:
    next(f)  # skip the first line: it is not wanted in the caption data
    captions_doc = f.read()


Now we need to associate each image with its corresponding captions.

In [None]:
# Build a dict that maps each image id to the list of all its captions.
mapping = {}
for line in tqdm(captions_doc.split('\n')):  # one record per line
    # Skip empty (or single-character) lines, e.g. the one after the final newline.
    if len(line) < 2:
        continue
    # Split on the FIRST comma only: everything after it is the caption, so
    # commas that belong to the caption text are kept instead of being
    # silently replaced by spaces.
    image_name, _, caption = line.partition(',')
    # Drop the file extension to obtain the image id.
    image_id = image_name.split('.')[0]
    # The same image has several captions: collect them under one key.
    mapping.setdefault(image_id, []).append(caption)
len(mapping)  # number of distinct image ids that received captions

In [None]:
# Display every caption stored under one image id (an id can have several captions).
mapping['1000268201_693b08cb0e']

## Preprocessing Step for Text Data

Next, we define a preprocessing function that cleans the caption text stored in the mapping.

In [None]:

def clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of every character that is neither
    an ASCII letter nor whitespace, reduced to its words longer than one
    character, and wrapped in the start tag ``<s>`` and end tag ``<\\s>``.
    Words and tags are separated by exactly one space.

    Args:
        mapping: dict whose values are lists of caption strings; the lists
            are modified in place.

    Returns:
        None.
    """
    # str.replace() treats its argument as literal text, so a regular
    # expression has to go through the re module to have any effect.
    # Whitespace is excluded from the removal class; otherwise all the words
    # of a caption would be glued together.
    unwanted = re.compile(r'[^A-Za-z\s]')
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            caption = unwanted.sub('', caption.lower())
            # split() without arguments already collapses any run of
            # whitespace, so no separate "squeeze spaces" step is needed.
            words = [word for word in caption.split() if len(word) > 1]
            # The tags are joined as separate tokens, so the start tag is
            # never fused with the first word.
            captions[i] = ' '.join(['<s>', *words, '<\\s>'])


In [None]:
# Captions of one image as they look before text preprocessing.
mapping['1000268201_693b08cb0e']

In [None]:
# Preprocess the text: clean() rewrites every caption list in `mapping` in place.
clean(mapping)

In [None]:
# The same image's captions after text preprocessing.
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into one flat list; this is what the
# tokenizer is fitted on.
all_captions = [caption for captions in mapping.values() for caption in captions]
len(all_captions)

In [None]:
# Fit a tokenizer (created with its default settings) on all captions.
# NOTE(review): the default Tokenizer filters presumably strip '<', '>' and
# '\' - if so, the start/end tags are not kept verbatim in the vocabulary; confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of known words.
# NOTE(review): presumably because word indices start at 1 and index 0 is
# reserved for padding (the Embedding layer uses mask_zero=True) - confirm.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Display the vocabulary size.
vocab_size

In [None]:
# Length of the longest caption, counted in whitespace-separated tokens.
# It is used later as the fixed length that input sequences are padded to.
max_length = max(map(len, map(str.split, all_captions)))
max_length

## Split Data Train & Test

In [None]:

# Hold out the last 10% of the image ids for testing. The ids are taken in
# the order they appear in `mapping`; no shuffling is done.
image_ids = list(mapping)
split = int(len(image_ids) * 0.90)
train, test = image_ids[:split], image_ids[split:]


In [None]:
#create a data generator to get data in batch 
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches forever, one batch per ``batch_size`` images.

    Every caption of every image is expanded into (prefix -> next word)
    samples: for an encoded caption ``[w0, w1, w2, ...]`` the samples are
    ``[w0] -> w1``, ``[w0, w1] -> w2`` and so on. Prefixes are padded to
    ``max_length``; targets are one-hot vectors of size ``vocab_size``.

    Args:
        data_keys: image ids to iterate over (repeated endlessly).
        mapping: dict image id -> list of caption strings.
        features: dict image id -> feature array; element ``[0]`` of it is
            used as the image input of every sample of that image.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: length every input sequence is padded to.
        vocab_size: number of classes of the one-hot encoded target.
        batch_size: number of IMAGES (not samples) per yielded batch.

    Yields:
        ``([image_features, padded_prefixes], one_hot_targets)`` as numpy
        arrays.
    """
    image_inputs, text_inputs, targets = [], [], []
    images_in_batch = 0
    while True:
        for key in data_keys:
            images_in_batch += 1
            for caption in mapping[key]:
                # Encode the caption as a list of word indices.
                encoded = tokenizer.texts_to_sequences([caption])[0]
                # Each cut point gives one sample: the words before the cut
                # are the input, the word at the cut is the target.
                for cut in range(1, len(encoded)):
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
                    image_inputs.append(features[key][0])
                    text_inputs.append(prefix)
                    targets.append(target)
            if images_in_batch != batch_size:
                continue
            # Enough images collected: hand the batch over, then start a new one.
            yield [np.array(image_inputs), np.array(text_inputs)], np.array(targets)
            image_inputs, text_inputs, targets = [], [], []
            images_in_batch = 0

In [None]:
# Caption model: two input branches (image features, partial caption) that
# are merged and decoded into a probability distribution over the vocabulary.
# NOTE: `model` is rebound at the end of this cell; before this cell it
# referred to the VGG16 feature extractor.

# encoder model
# image feature layers: a 4096-dimensional vector per image, regularised
# with dropout and projected down to 256 units
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
#fe3 = Dense(256, activation='relu')(fe2)

# sequence feature layers: word indices padded to max_length, embedded into
# 256 dimensions (mask_zero=True: index 0 is treated as padding and masked),
# then summarised by an LSTM into a single 256-unit vector
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: element-wise sum of both 256-unit branches, one hidden
# layer, then a softmax over the vocabulary (the next-word prediction)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
#decoder3 = Dense(256, activation='relu')(decoder2)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# inputs are ordered [image features, word sequence]
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# plot the model architecture together with the layer shapes
plot_model(model, show_shapes=True)

In [None]:
# Training configuration.
epochs = 35
batch_size = 16
# The generator yields one batch per `batch_size` images, so this many steps
# walk through the training ids once (a final partial batch is not counted).
steps = len(train) // batch_size

for _ in tqdm(range(epochs)):
    # Every pass gets a fresh generator and trains for exactly one epoch.
    model.fit(
        data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size),
        epochs=1,
        steps_per_epoch=steps,
        verbose=2,
    )

In [None]:
# Save the model as it is after the last training epoch.
# NOTE(review): no "best" checkpoint is selected anywhere in this notebook;
# the file name is kept unchanged in case other code loads it.
# The path is built with os.path.join, like every other path in this file.
model.save(os.path.join(WORKING_DIR, 'best_model.h5'))


## Last but not least, we will generate a caption for an input image


In [None]:
# First we need to take index word and covert to word 
def idx_to_word(integer, tokenizer):
    """Look up the word that ``tokenizer.word_index`` maps to ``integer``.

    Args:
        integer: word index to look up.
        tokenizer: object with a ``word_index`` dict (word -> index).

    Returns:
        The first word whose index equals ``integer``, or ``None`` when no
        word has that index.
    """
    matching_words = (
        word for word, index in tokenizer.word_index.items() if index == integer
    )
    return next(matching_words, None)

## Generate a Caption for an Image


In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily generate a caption for one image.

    Starting from the start tag, the model is asked repeatedly for the most
    probable next word given the image features and the words produced so
    far. Generation stops at the end tag, at an index that has no word, or
    after ``max_length`` words.

    Args:
        model: object whose ``predict([image, sequence], verbose=0)``
            returns the scores of the next word.
        image: image features, passed to the model unchanged.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and the
            ``word_index`` used by ``idx_to_word``.
        max_length: length the word sequence is padded to; also the maximum
            number of words generated.

    Returns:
        The generated words joined by single spaces, without the start and
        end tags.
    """
    start_tag, end_tag = '<s>', '<\\s>'
    # A tokenizer may rewrite the tags (for example by filtering punctuation),
    # in which case no predicted word ever equals the raw end tag. Comparing
    # predicted indices with the tokenizer's own encoding of the end tag
    # works whether or not the tag is stored verbatim.
    end_indices = set(tokenizer.texts_to_sequences([end_tag])[0])
    words = []
    for _ in range(max_length):
        # The model input always begins with the start tag, followed by
        # every word generated so far.
        in_text = ' '.join([start_tag] + words)
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Greedy decoding: take the index with the highest score.
        yhat = model.predict([image, sequence], verbose=0)
        index = int(np.argmax(yhat))
        word = idx_to_word(index, tokenizer)
        # Stop on an unknown index or on the end tag (raw or tokenized form).
        if word is None or word == end_tag or index in end_indices:
            break
        words.append(word)
    # The tags were never added to `words`, so nothing needs stripping here.
    return ' '.join(words)

## Test Model on dataset

In [None]:
#from nltk.translate.bleu_score import corpus_bleu
# validate with test data
#actual, predicted = list(), list()

#for key in tqdm(test):
    # get actual caption
   # captions = mapping[key]
    # predict the caption for image
   # y_pred = predict_caption(model, features[key], tokenizer, max_length) 
    # split into words
   # actual_captions = [caption.split() for caption in captions]
   # y_pred = y_pred.split()
    # append to the list
   # actual.append(actual_captions)
   # predicted.append(y_pred)
    
# calcuate BLEU score like engram score 
#print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))) #like one garm , range from 0 to 1 if 0.4 or higher it is better 
#print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))) 

## Finally, we will visualize the output

In [None]:
#for loading image
from PIL import Image 
import matplotlib.pyplot as plt

def generate_caption(image_name):
    """Print the reference and predicted captions of an image and show it.

    Args:
        image_name: file name of an image inside ``BASE_DIR/Images``, e.g.
            "1001773457_577c3a7d70.jpg". The part before the first '.' is
            the key used to look up ``mapping`` and ``features``.
    """
    key = image_name.split('.')[0]
    picture = Image.open(os.path.join(BASE_DIR, "Images", image_name))
    references = mapping[key]
    # Reference captions from the dataset, one per line.
    print('Actual:-')
    for reference in references:
        print(reference)
    # Caption predicted by the model from the stored features of this image.
    predicted = predict_caption(model, features[key], tokenizer, max_length)
    print(" ")
    print('Predicted:-')
    print(predicted)
    plt.imshow(picture)


In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1007320043_627395c3d8.jpg")


In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1009434119_febe49276a.jpg")


In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1287475186_2dee85f1a5.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1287931016_fb015e2e10.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1285067106_2adc307240.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1277185009_06478dd457.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1273001772_1585562051.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
generate_caption("1262454669_f1caafec2d.jpg")

In [None]:
# Print the reference captions and the predicted caption, then show the image.
# NOTE(review): this is the same image as in the previous cell.
generate_caption("1262454669_f1caafec2d.jpg")