|<h1 align="center">**Name**</h1>|<h1 align="center">**ID**</h1>|
|---|---|
|<h4 align="center">**Abd Allah Mohamed Abd Allah Mohamed Taman**</h4>|<h4 align="center">**20010906**</h4>|
|<h4 align="center">**Karim Fathy Abd Alaziz Mohamed Mostafa**</h4>|<h4 align="center">**20011116**</h4>|
|<h4 align="center">**Mahmoud Ali Ahmed Ali**</h4>|<h4 align="center">**20011811**</h4>|

## **Important modules imports**

*   **<font color = "magenta" >os</font>**: used for working with files and paths.
*   **<font color = "magenta" >tqdm</font>**: provides a visual indication of loop progress.
*   **<font color = "magenta" >numpy</font>**: provides mathematical functions for working with vectors and matrices.
*   **<font color = "magenta" >img_to_array</font>**: converts a loaded image into a numeric array.
*   **<font color = "magenta" >load_img</font>**: loads an image from the given file path.
* **<font color = "magenta" >Tokenizer</font>**: this is used to convert each text into a sequence of integers.
* **<font color = "magenta" >pad_sequences</font>**: used to make all sequences in a list have the same length by padding to the maximum sequence length.
* **<font color = "magenta">Sequential</font>**: used to create sequential models.
* **<font color = "magenta" >ResNet50</font>**: it is a deep convolutional neural network used for image classification tasks.
* **<font color = "magenta" >preprocess_input</font>**: preprocesses input images to be compatible with the ResNet50 model.
* **<font color = "magenta" >layers</font>**: contains classes and functions for defining different layers in a neural network.
* **<font color = "magenta" >optimizers</font>**: contains classes and functions for defining optimization algorithms to train neural network models.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.applications.resnet import ResNet50,preprocess_input
from keras.callbacks import EarlyStopping,History,ModelCheckpoint,Callback
from keras.layers import *
from keras.models import Model,Sequential
from keras.preprocessing.image import img_to_array,load_img
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model
import pickle
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu,sentence_bleu,SmoothingFunction
from keras.saving import load_model

In [None]:
# Base output directory; model checkpoints and saved models are written under it.
WORKING_DIR = "/kaggle/working/"

# Using ResNet to extract image features from all the dataset images


In [None]:
# Build ResNet50 with its default arguments, then re-wrap it so the model
# outputs the activations of its second-to-last layer instead of the final layer.
resnet = ResNet50()
resnet = Model(resnet.input,resnet.layers[-2].output)


In [None]:
img_features = {}
def img_preprocessing(images_dir="/kaggle/input/flickr8k/Images"):
    """Fill the global `img_features` with one ResNet feature vector per image.

    Every distinct file name in `dataset["image"]` is loaded from `images_dir`,
    resized to 224x224, run through `preprocess_input` and the truncated
    `resnet` model, and stored as a flat vector of 2048 values keyed by the
    file name.

    Args:
        images_dir: Directory that contains the image files named in
            `dataset["image"]`.
    """
    import os  # local import: `os` is only imported by a much later cell

    for image in tqdm(dataset["image"].unique().tolist()):
        img_path = os.path.join(images_dir,image)
        img = load_img(img_path,target_size=(224,224))
        img = img_to_array(img)
        # The network expects a batch dimension, so wrap the single image.
        img = np.expand_dims(img,axis=0)
        img = preprocess_input(img)
        feature = resnet.predict(img,verbose=0)
        feature = feature.reshape(2048,)
        img_features[image] = feature

## Loading the previously saved image features for further use

In [None]:
# NOTE: unpickling can execute arbitrary code; only load feature files you produced yourself.
# The `with` block closes the file handle as soon as the features are loaded.
with open("/kaggle/input/my-files/features (1).pkl","rb") as features_file:
    img_features = pickle.load(features_file)

## Exploring dataset

In [None]:
# Load the captions table; the rest of the notebook reads its "image" and "caption" columns.
dataset = pd.read_csv("/kaggle/input/flickr8k/captions.txt")

In [None]:
# Preview the first rows of the captions table.
dataset.head()

In [None]:
def text_preprocess(dataset):
    """Normalise the "caption" column in place and wrap it in start/end tokens.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace, has runs of whitespace collapsed, loses its
    one-character words, and is finally wrapped as
    "ssttaarrtt <caption> eenndd".

    Args:
        dataset: DataFrame with a string "caption" column. It is modified
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    captions = dataset["caption"].str.lower()
    # Plain str.replace treats its pattern literally, so the regexes must go
    # through the regex-aware .str.replace. Whitespace is kept in this step,
    # otherwise all the words of a caption would be glued together.
    captions = captions.str.replace(r"[^a-z\s]","",regex=True)
    captions = captions.str.replace(r"\s+"," ",regex=True)
    captions = captions.apply(lambda text:" ".join(word for word in text.split() if len(word)>1))
    dataset["caption"] = "ssttaarrtt " + captions + " eenndd"
    return dataset
# Clean the captions; the function updates `dataset` in place and also returns it.
text_preprocess(dataset)

# Splitting the dataset into training, validation and test sets

In [None]:
# Distinct image file names, in order of first appearance in the captions table.
images = dataset["image"].unique().tolist()
images_number = len(images)

# Fixed positional split of the image list: 6000 / 1000 / 1000.
images_training_set, images_validation_set, images_test_set = (
    images[:6000],
    images[6000:7000],
    images[7000:8000],
)

# Keep every caption row of the images in each split and renumber the rows from 0.
training_set = dataset[dataset["image"].isin(images_training_set)].reset_index(drop=True)
validation_set = dataset[dataset["image"].isin(images_validation_set)].reset_index(drop=True)
test_set = dataset[dataset["image"].isin(images_test_set)].reset_index(drop=True)

# Extracting Vocab from dataset captions

In [None]:
# Build the word index from the training captions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(training_set["caption"]))
# One extra slot on top of the number of indexed words.
vocab_size = 1 + len(tokenizer.word_index)
# Longest training caption, counted in whitespace-separated words.
max_sentence_len = max(len(caption.split()) for caption in training_set["caption"])

print(f"""
vocab size = {vocab_size}
max sentence length = {max_sentence_len}
images number = {images_number}
""")

# Mapping each image to its 5 reference captions

In [None]:
# Map each training image to its reference captions, each caption split into a
# list of words. A single groupby replaces one full-table scan per image;
# sort=False keeps the images in order of first appearance, i.e. the same
# order the per-image loop over unique() produced.
train_ref = {
    image: [caption.split() for caption in captions]
    for image, captions in tqdm(training_set.groupby("image",sort=False)["caption"])
}


In [None]:
# Map each validation image to its reference captions, each caption split into
# a list of words. A single groupby replaces one full-table scan per image;
# sort=False keeps the images in order of first appearance, i.e. the same
# order the per-image loop over unique() produced.
validate_ref = {
    image: [caption.split() for caption in captions]
    for image, captions in tqdm(validation_set.groupby("image",sort=False)["caption"])
}


In [None]:
# Map each test image to its reference captions, each caption split into a
# list of words. A single groupby replaces one full-table scan per image;
# sort=False keeps the images in order of first appearance, i.e. the same
# order the per-image loop over unique() produced.
test_ref = {
    image: [caption.split() for caption in captions]
    for image, captions in tqdm(test_set.groupby("image",sort=False)["caption"])
}


### Splitting the extracted features into the different sets

In [None]:
all_feat = list(img_features.values())
# Look each feature up by image name instead of slicing all_feat by position.
# Positional slices only line up with the caption splits if the pickled dict
# happens to be in the same order as `images`; a keyed lookup is always
# aligned and fails loudly (KeyError) when an image has no stored feature.
training_features = [img_features[image] for image in images_training_set]
validation_features = [img_features[image] for image in images_validation_set]
test_features = [img_features[image] for image in images_test_set]

# Data generator
* used to feed the training data to the model in batches during training, to save memory

In [None]:
def data_generator(dataset,img_features,tokenizer,vocab_size,max_sentence_length,batch_size):
    """Yield `((image_features, padded_prefixes), next_word_one_hot)` batches forever.

    Every caption is turned into one sample per predicted word: the words seen
    so far (left-padded to `max_sentence_length`) together with the image
    feature vector form the input, and the following word, one-hot encoded
    over `vocab_size` classes, is the target. A batch is emitted after every
    `batch_size` captions, so the number of samples per batch varies with the
    caption lengths.

    Args:
        dataset: Table with "image" and "caption" columns.
        img_features: Mapping from image name to its feature vector.
        tokenizer: Fitted tokenizer providing `texts_to_sequences`.
        vocab_size: Number of output classes.
        max_sentence_length: Length every input sequence is padded to.
        batch_size: Number of captions consumed per yielded batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1 or `dataset` has no
            rows; either would otherwise make the generator loop forever
            without yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    pairs = list(zip(dataset["image"].tolist(),dataset["caption"].tolist()))
    if not pairs:
        raise ValueError("dataset contains no captions")

    image_features,learn_seq,word_seq=[],[],[]
    size=0
    while True:
        for image,caption in pairs:
            size+=1
            sequence = tokenizer.texts_to_sequences([caption])[0]
            # Start at 1: position 0 would pair an empty input with the start
            # token as target, a sample that can never occur at inference time.
            for i in range(1,len(sequence)):
                in_seq,out_seq = sequence[:i],sequence[i]
                in_seq = pad_sequences([in_seq],maxlen=max_sentence_length)[0]
                out_seq = to_categorical([out_seq],num_classes=vocab_size)[0]
                learn_seq.append(in_seq)
                word_seq.append(out_seq)
                image_features.append(img_features[image])
            if size==batch_size:
                yield (np.array(image_features),np.array(learn_seq)),np.array(word_seq)
                # Start collecting the next batch from scratch.
                image_features,learn_seq,word_seq=[],[],[]
                size=0

# Model 1 (partial_injection model)
* The feature vector of the input image is concatenated to each word vector in the sequence that is fed to the LSTM layer, so that the image influences the output of the sequence.
* The model consists of an LSTM that learns the sequence of the words.
* The output of that LSTM (the whole sequence) is concatenated with the image vector.
* The merged vector is fed to another LSTM to predict the next word.
* Finally, the predicted word vector is fed to a Dense layer whose size equals the vocabulary size (the available classes), with a softmax activation function to predict the most suitable word.


![image.png](attachment:a4434833-7a67-4997-bb30-93d29ceaad55.png)

In [None]:
# Partial-injection ("par_inject") model
# Image branch: project the 2048-value feature vector to 128 units and repeat
# it max_sentence_len times so it can be joined with the word sequence below.
image_input = Input(shape=(2048,))
image_embedder = Dense(128,activation='relu')(image_input)
encoder_output = RepeatVector(max_sentence_len)(image_embedder)

# Text branch: embed the padded word indices, run an LSTM that returns the
# whole sequence, then project every time step to 128 units.
text_input = Input(shape=(max_sentence_len,))
text_embedding = Embedding(vocab_size,128)(text_input)
sequence_encoder = LSTM(256,return_sequences=True)(text_embedding)
sequence_embedder = TimeDistributed(Dense(128))(sequence_encoder)

# Decoder: concatenate both branches, pass them through two stacked LSTMs and
# predict the next word with a softmax over the vocabulary.
concat = Concatenate()([encoder_output,sequence_embedder])
decoder = LSTM(128,return_sequences=True)(concat)
decoder2 = LSTM(512)(decoder)
output = Dense(vocab_size,activation='softmax')(decoder2)

model = Model([image_input,text_input],output,name="hamadabta3medium")
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=["accuracy"])
plot_model(model,show_shapes=True)

# implementing a CallBack for calculating BLEU score after each epoch

In [None]:
from tensorflow.keras.callbacks import Callback
from nltk.translate.bleu_score import sentence_bleu

class BLEUCallback(Callback):
    """Keras callback that logs the validation BLEU-4 score after every epoch.

    At the end of each epoch the current model greedily decodes one caption
    per validation image and the corpus-level BLEU-4 score against the
    reference captions is stored in the epoch logs under "val_bleu_4".

    Args:
        tokenizer: Fitted tokenizer (`texts_to_sequences` and `index_word`).
        img_features: Mapping from image name to feature vector (stored only).
        max_sentence_len: Padded input length and maximum number of decoding steps.
        training_img, training_references: Training features and references
            (stored only).
        validation_img: Sequence of validation feature vectors.
        validation_references: Iterable with, for every validation image and
            in the same order as `validation_img`, its list of tokenised
            reference captions.
        test_img, test_references: Test features and references (stored only).
        batch_size: Number of images decoded per `predict` call.
    """

    def __init__(self, tokenizer,img_features,max_sentence_len,training_img,training_references,validation_img,validation_references,test_img,test_references,batch_size):
        super().__init__()
        self.tokenizer = tokenizer
        self.img_features = img_features
        self.max_sentence_len = max_sentence_len
        self.training_img = training_img
        self.training_references = training_references

        self.validation_img = validation_img
        self.validation_references = validation_references

        self.test_img = test_img
        self.test_references = test_references

        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs=None):
        """Compute corpus BLEU-4 on the validation images and add it to `logs`."""
        hypotheses = [caption.split() for caption in self.get_caption(self.validation_img)]
        # The references may arrive as a dict view; corpus_bleu needs a sized sequence.
        references = list(self.validation_references)
        smoothie = SmoothingFunction().method4
        # One corpus-level score over all images. Halving a running sum on
        # every iteration is not an average: it lets the last few images
        # dominate the result.
        validation_bleu_4 = corpus_bleu(references,hypotheses,smoothing_function=smoothie)

        if logs is not None:
            logs["val_bleu_4"]=validation_bleu_4

    def get_caption(self,images):
        """Greedily decode one caption per feature vector in `images`.

        Returns:
            A list of strings, one per image and in input order. Each starts
            with the "ssttaarrtt" token; the "eenndd" token is never appended.
        """
        predictions = []
        for start in range(0,len(images),self.batch_size):
            batch = np.array(images[start:start+self.batch_size])
            sentences = ["ssttaarrtt"]*len(batch)
            for _ in range(self.max_sentence_len):
                sequence = self.tokenizer.texts_to_sequences(sentences)
                sequence = pad_sequences(sequence,maxlen=self.max_sentence_len)
                yhat = self.model.predict((batch,sequence),verbose=0)
                yhat = np.argmax(yhat,axis=-1)
                # .get(): index 0 (the padding value) has no entry in index_word.
                words = [self.tokenizer.index_word.get(int(pred)) for pred in yhat]
                if all(word is None or word == "eenndd" for word in words):
                    # No sentence would change any more, so further steps are redundant.
                    break
                sentences = [
                    sentence if word is None or word == "eenndd" else sentence+" "+word
                    for sentence,word in zip(sentences,words)
                ]
            # Collect every batch, not only the last one.
            predictions.extend(sentences)
        return predictions


# Model training stopped after 28 epochs out of 50 to avoid overfitting on the training data
The weights used for the model are those from the epoch with the lowest validation loss.

In [None]:
# Callbacks: keep only the best model on disk, stop after 10 epochs without
# improvement and restore the best weights (both use their default monitored
# quantity), and log the validation BLEU-4 score every epoch.
checkpoint = ModelCheckpoint(WORKING_DIR+"par_injectModel.keras",save_best_only=True)
early_stop = EarlyStopping(patience=10,restore_best_weights=True)
bleusda = BLEUCallback(tokenizer,img_features,max_sentence_len,training_features,train_ref.values(),validation_features,validate_ref.values(),test_features,test_ref.values(),100)

epochs = 40
batch_size = 100
# The generator emits one batch per `batch_size` captions, so these step
# counts correspond to roughly one pass over each set per epoch.
train_step = len(training_set)//batch_size
val_step = len(validation_set)//batch_size
train_gen = data_generator(training_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)
val_gen = data_generator(validation_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)

history = model.fit(train_gen,validation_data=val_gen,epochs=epochs,steps_per_epoch=train_step,validation_steps=val_step,verbose=1,callbacks=[checkpoint,early_stop,bleusda])

In [None]:
# Persist the per-epoch metrics; the `with` block makes sure the file is flushed and closed.
with open("par_injectModel","wb") as history_file:
    pickle.dump(history.history,history_file)

In [None]:
# Reload the saved per-epoch metrics (a plain dict); the `with` block closes the file afterwards.
with open("par_injectModel","rb") as history_file:
    history = pickle.load(history_file)

In [None]:
# `history` is either the object returned by fit() (metrics under `.history`)
# or the metrics dict reloaded from the pickle; accept both so this cell works
# whichever of the previous cells was run last.
history = getattr(history,"history",history)

In [None]:
# Plot training and validation loss per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(history["loss"])+1),history["loss"],color='g',label="training_loss")
plt.plot(range(1,len(history["val_loss"])+1),history["val_loss"],color='orange',label="validation_loss")
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(history["accuracy"])+1),history["accuracy"],color='g',label="training_accuracy")
plt.plot(range(1,len(history["val_accuracy"])+1),history["val_accuracy"],color='orange',label="validation_accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
# Plot the validation BLEU-4 value logged under "val_bleu_4" for every epoch.
# NOTE: a label is passed but plt.legend() is not called, so it is not displayed.
plt.plot(range(1,len(history["val_bleu_4"])+1),history["val_bleu_4"],color='r',label="validation_bleu_score")
plt.xlabel("epochs")
plt.ylabel("BLEU-4")
plt.title("BLEU-4 value for validation set")
plt.show()


In [None]:
# WORKING_DIR already ends with "/", so no extra separator is needed.
model.save(WORKING_DIR+"par_inject_model.keras")

# Merge Model(with Add layer)
This model does not take the image features into account while training the LSTM layer: the RNN is trained only on the text sequences and their order. The extracted image features are added only to the output of the RNN, to influence the probability of the next word chosen by the LSTM. The image feature vector is added to the RNN output, a Dense layer with a "relu" activation function decodes the merged vector, and a final Dense layer, whose size equals the vocabulary size of our captions, uses a "softmax" activation function to select the word class with the highest probability.

![image.png](attachment:41f6e0d1-2332-478c-bb32-286193681f8b.png)

In [None]:
# Merge model (Add variant)
# Image branch: dropout on the 2048-value feature vector, then a 256-unit projection.
img_input = Input(shape=(2048,))
dropout = Dropout(0.5)(img_input)
img_embedding = Dense(256,activation='relu')(dropout)

# Text branch: embed the padded word indices and summarise them with an LSTM
# (final output only); `dropout` is re-bound to this branch's dropout output.
text_input = Input(shape=(max_sentence_len,))
text_embedding = Embedding(vocab_size,256)(text_input)
lstm = LSTM(256)(text_embedding)
dropout = Dropout(0.5)(lstm)

# Both branches have 256 units, so they can be summed element-wise.
merging = Add()([img_embedding,dropout])

decoder = Dense(256,activation='relu')(merging)
output = Dense(vocab_size,activation='softmax')(decoder)

model2 = Model([img_input,text_input],output,name="Merging_Model_add")
model2.compile(loss = "categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
plot_model(model2,show_shapes=True)

# Model training stopped after 15 epochs out of 50 to avoid overfitting on the training data
The weights used for the model are those from the epoch with the lowest validation loss.

In [None]:
# Callbacks: keep only the best model on disk, stop after 10 epochs without
# improvement and restore the best weights (both use their default monitored
# quantity), and log the validation BLEU-4 score every epoch.
checkpoint = ModelCheckpoint(WORKING_DIR+"merging_model.keras",save_best_only=True)
early_stop = EarlyStopping(patience=10,restore_best_weights=True)
bleusCall= BLEUCallback(tokenizer,img_features,max_sentence_len,training_features,train_ref.values(),validation_features,validate_ref.values(),test_features,test_ref.values(),100)

epochs = 50
batch_size = 100
# The generator emits one batch per `batch_size` captions, so these step
# counts correspond to roughly one pass over each set per epoch.
train_step = len(training_set)//batch_size
val_step = len(validation_set)//batch_size
train_gen = data_generator(training_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)
val_gen = data_generator(validation_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)

history2 = model2.fit(train_gen,validation_data=val_gen,epochs=epochs,steps_per_epoch=train_step,validation_steps=val_step,verbose=1,callbacks=[checkpoint,early_stop,bleusCall])

In [None]:
# Per-epoch metrics recorded by fit() for the Add merge model.
historyModel_2 = history2.history
# Plot training and validation loss per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(historyModel_2["loss"])+1),historyModel_2["loss"],color='g',label="train_loss")
plt.plot(range(1,len(historyModel_2["val_loss"])+1),historyModel_2["val_loss"],color='r',label="val_loss")
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy per epoch for the Add merge model.
plt.plot(range(1,len(historyModel_2["accuracy"])+1),historyModel_2["accuracy"],color='g',label="train_accuracy")
plt.plot(range(1,len(historyModel_2["val_accuracy"])+1),historyModel_2["val_accuracy"],color='r',label="val_accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
# Plot the validation BLEU-4 value logged under "val_bleu_4" for every epoch.
# NOTE: a label is passed but plt.legend() is not called, so it is not displayed.
plt.plot(range(1,len(historyModel_2["val_bleu_4"])+1),historyModel_2["val_bleu_4"],color='g',label="val_bleu_4")
plt.xlabel("epochs")
plt.ylabel("score")
plt.title("BLEU-4 score for validations set")
plt.show()

In [None]:
# Persist the per-epoch metrics; the `with` block makes sure the file is flushed and closed.
with open(WORKING_DIR+"model_merging_history.pkl","wb") as history_file:
    pickle.dump(historyModel_2,history_file)

In [None]:
# Save the Add merge model under the working directory.
model2.save(WORKING_DIR+"merging_model_add.keras")

# Merge Model (Concatenate merging)
Similar to the previous merge model, this model does not use the image feature vector to influence the inner sequences of the RNN; rather, it uses the vector to affect the decoded outcome of the RNN layer.

This model uses a Concatenate layer rather than an Add layer in order to preserve the outcome of the RNN unchanged, and uses the Dense layer to decode the larger concatenated vector.

In [None]:
# Merge model (Concatenate variant)
# Image branch: project the 2048-value feature vector to 256 units.
inputs1 = Input(shape=(2048,), name="image")
fe2 = Dense(256, activation='relu')(inputs1)

# Text branch: embed the padded word indices (without masking the zero
# padding) and summarise them with an LSTM (final output only).
inputs2 = Input(shape=(max_sentence_len,), name="text")
se1 = Embedding(vocab_size, 256, mask_zero=False)(inputs2)
se3 = LSTM(256)(se1)

# Join both branches side by side.
concat = Concatenate()([fe2, se3])

# Vocabulary-sized projection, dropout on it, then softmax as a separate layer.
x = Dense (vocab_size)(concat)
x = Dropout(0.3)(x)
out = Activation('softmax')(x)

model3 = Model(inputs=[inputs1, inputs2], outputs=out,name="merging_model_concat")
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Draw the layer graph with tensor shapes.
plot_model(model3, show_shapes=True)

# Model training stopped after 19 epochs out of 50 to avoid overfitting on the training data
The weights used for the model are those from the epoch with the lowest validation loss.

In [None]:
# Callbacks: keep only the best model on disk, stop after 10 epochs without
# improvement and restore the best weights (both use their default monitored
# quantity), and log the validation BLEU-4 score every epoch.
checkpoint = ModelCheckpoint(WORKING_DIR+"merging_model_concat_chackpoint.keras",save_best_only=True)
early_stop = EarlyStopping(patience=10,restore_best_weights=True)
bleusCall= BLEUCallback(tokenizer,img_features,max_sentence_len,training_features,train_ref.values(),validation_features,validate_ref.values(),test_features,test_ref.values(),100)

epochs = 50
batch_size = 100
# The generator emits one batch per `batch_size` captions, so these step
# counts correspond to roughly one pass over each set per epoch.
train_step = len(training_set)//batch_size
val_step = len(validation_set)//batch_size
train_gen = data_generator(training_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)
val_gen = data_generator(validation_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)

history3 = model3.fit(train_gen,validation_data=val_gen,epochs=epochs,steps_per_epoch=train_step,validation_steps=val_step,verbose=1,callbacks=[checkpoint,early_stop,bleusCall])

In [None]:
# Per-epoch metrics recorded by fit() for the Concatenate merge model.
historyModel_3 = history3.history
# Plot training and validation loss per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(historyModel_3["loss"])+1),historyModel_3["loss"],color='g',label="train_loss")
plt.plot(range(1,len(historyModel_3["val_loss"])+1),historyModel_3["val_loss"],color='r',label="val_loss")
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Per-epoch metrics recorded by fit() for the Concatenate merge model.
historyModel_3 = history3.history
# Plot training and validation accuracy per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(historyModel_3["accuracy"])+1),historyModel_3["accuracy"],color='g',label="train_accuracy")
plt.plot(range(1,len(historyModel_3["val_accuracy"])+1),historyModel_3["val_accuracy"],color='r',label="val_accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
# Per-epoch metrics recorded by fit() for the Concatenate merge model.
historyModel_3 = history3.history
# Plot the validation BLEU-4 value logged under "val_bleu_4" for every epoch.
plt.plot(range(1,len(historyModel_3["val_bleu_4"])+1),historyModel_3["val_bleu_4"],color='g',label="val_bleu_4")
plt.xlabel("epochs")
# The metric is called BLEU; the axis label and title previously read "BLUE-4".
plt.ylabel("BLEU-4")
plt.title("BLEU-4 score for validation set")
plt.show()

In [None]:
def get_caption(model,image,img_features,tokenizer,max_sentence_len):
    """Greedily decode a caption for a single image.

    Starting from the "ssttaarrtt" token, the most probable next word is
    appended repeatedly until "eenndd" is produced, the model predicts an
    index with no word attached, or `max_sentence_len` words were added.

    Args:
        model: Trained captioning model taking (image features, padded sequence).
        image: Key of the image in `img_features`.
        img_features: Mapping from image name to feature vector.
        tokenizer: Fitted tokenizer (`texts_to_sequences` and `index_word`).
        max_sentence_len: Padded input length and maximum number of decoding steps.

    Returns:
        The decoded sentence, including the start token and, when it was
        predicted, the end token.
    """
    sentence = "ssttaarrtt"
    img_feature = np.array([img_features[image]])
    for _ in range(max_sentence_len):
        sequence = tokenizer.texts_to_sequences([sentence])
        sequence = pad_sequences(sequence,maxlen=max_sentence_len)
        yhat = model.predict((img_feature,sequence),verbose=0)
        word = tokenizer.index_word.get(int(np.argmax(yhat)))
        if word is None:
            # Index 0 (the padding value) has no entry in index_word; stop
            # here instead of raising KeyError.
            break
        sentence +=" "+word
        if word == "eenndd":
            break
    return sentence

In [None]:
# Reload the saved partial-injection model from the working directory.
first_model = tf.keras.models.load_model("/kaggle/working/par_inject_model.keras")

In [None]:
# Quick check: caption the image of the first training row with the reloaded model.
get_caption(first_model,training_set["image"][0],img_features,tokenizer,max_sentence_len)

In [None]:
# Report corpus-level BLEU-1..4 on the test set for the first model.
predictions = []
for start in tqdm(range(0,len(test_features),100)):
    batch = np.array(test_features[start:start+100])
    sentences = ["ssttaarrtt"]*len(batch)
    # Greedy decoding: extend every sentence of the batch by one word per step.
    for _ in range(max_sentence_len):
        sequence = tokenizer.texts_to_sequences(sentences)
        sequence = pad_sequences(sequence,maxlen=max_sentence_len)
        yhat = first_model.predict((batch,sequence),verbose=0)
        yhat = np.argmax(yhat,axis=-1)
        # .get(): index 0 (the padding value) has no entry in index_word.
        word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
        sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
    predictions.extend(sentences)

smoothi1 = SmoothingFunction().method1
smoothi2 = SmoothingFunction().method2
smoothi3 = SmoothingFunction().method3
smoothi4 = SmoothingFunction().method4

# Score the whole test set at once. Halving a running sum after every image is
# not an average: it lets the last few images dominate the result.
reference = list(test_ref.values())
predection = [x.split() for x in predictions]
bleu_1 = corpus_bleu(reference,predection,weights=(1.0,0.0,0.0,0.0),smoothing_function=smoothi1)
bleu_2 = corpus_bleu(reference,predection,weights=(0.5,0.5,0.0,0.0),smoothing_function=smoothi2)
# Cumulative BLEU-3 uses three equal weights that sum to 1.
bleu_3 = corpus_bleu(reference,predection,weights=(1/3,1/3,1/3,0.0),smoothing_function=smoothi3)
bleu_4 = corpus_bleu(reference,predection,weights=(0.25,0.25,0.25,0.25),smoothing_function=smoothi4)

print(f"test set cumulative BLEU-1: {bleu_1}")
print(f"test set cumulative BLEU-2: {bleu_2}")
print(f"test set cumulative BLEU-3: {bleu_3}")
print(f"test set cumulative BLEU-4: {bleu_4}")







In [None]:
# Report corpus-level BLEU-1..4 on the test set for the third model only.
predictions = []
for start in tqdm(range(0,len(test_features),100)):
    batch = np.array(test_features[start:start+100])
    sentences = ["ssttaarrtt"]*len(batch)
    # Greedy decoding: extend every sentence of the batch by one word per step.
    for _ in range(max_sentence_len):
        sequence = tokenizer.texts_to_sequences(sentences)
        sequence = pad_sequences(sequence,maxlen=max_sentence_len)
        yhat = model3.predict((batch,sequence),verbose=0)
        yhat = np.argmax(yhat,axis=-1)
        # .get(): index 0 (the padding value) has no entry in index_word.
        word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
        sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
    predictions.extend(sentences)

smoothi1 = SmoothingFunction().method1
smoothi2 = SmoothingFunction().method2
smoothi3 = SmoothingFunction().method3
smoothi4 = SmoothingFunction().method4

# Score the whole test set at once. Halving a running sum after every image is
# not an average: it lets the last few images dominate the result.
reference = list(test_ref.values())
predection = [x.split() for x in predictions]
bleu_1 = corpus_bleu(reference,predection,weights=(1.0,0.0,0.0,0.0),smoothing_function=smoothi1)
bleu_2 = corpus_bleu(reference,predection,weights=(0.5,0.5,0.0,0.0),smoothing_function=smoothi2)
# Cumulative BLEU-3 uses three equal weights that sum to 1.
bleu_3 = corpus_bleu(reference,predection,weights=(1/3,1/3,1/3,0.0),smoothing_function=smoothi3)
bleu_4 = corpus_bleu(reference,predection,weights=(0.25,0.25,0.25,0.25),smoothing_function=smoothi4)

print(f"test set cumulative BLEU-1: {bleu_1}")
print(f"test set cumulative BLEU-2: {bleu_2}")
print(f"test set cumulative BLEU-3: {bleu_3}")
print(f"test set cumulative BLEU-4: {bleu_4}")







In [None]:
# Decode example captions for the first 20 test images with the third model.
example_predictions = []
batch = np.array(test_features[:20])
sentences = ["ssttaarrtt"]*len(batch)
# Greedy decoding: extend every sentence by one word per step.
for _ in range(max_sentence_len):
    sequence = tokenizer.texts_to_sequences(sentences)
    sequence = pad_sequences(sequence,maxlen=max_sentence_len)
    yhat = model3.predict((batch,sequence),verbose=0)
    yhat = np.argmax(yhat,axis=-1)
    # .get(): index 0 (the padding value) has no entry in index_word, and a
    # plain lookup would raise KeyError before the None check could apply.
    word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
    sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
example_predictions.extend(sentences)

In [None]:
# Show the reference captions, the predicted caption and the picture for every
# image that has an example prediction. Limiting the loop to
# len(example_predictions) avoids indexing past the decoded examples.
for i,image in enumerate(test_set["image"].unique().tolist()[:len(example_predictions)]):
    # The directory and the file name need a separator between them.
    img = load_img("/kaggle/input/flickr8k/Images/"+image)
    print("actual captions:")
    captions=test_set.loc[test_set["image"]==image]
    for caption in captions["caption"].tolist():
        # Print the caption text itself, not the literal word "caption".
        print(caption)
    print("predicted caption:")
    print(example_predictions[i])
    plt.imshow(img)
    plt.show()



# modified inject model


In [None]:
# Modified inject model
# Image branch: dropout on the 2048-value feature vector, a 256-unit
# projection, then one copy of it per time step.
img_input = Input(shape=(2048,))
dropout = Dropout(0.5)(img_input)
img_embedding = Dense(256,activation='relu')(dropout)
encoder_output = RepeatVector(max_sentence_len)(img_embedding)

# Text branch: embed the padded word indices and apply dropout; `dropout` is
# re-bound to this branch's dropout output.
text_input = Input(shape=(max_sentence_len,))
text_embedding = Embedding(vocab_size,256)(text_input)
dropout = Dropout(0.5)(text_embedding)

# Join image and word vectors before the LSTM.
merge = Concatenate()([encoder_output,dropout])

lstm = LSTM(256)(merge)

decoder = Dense(256,activation='relu')(lstm)
output = Dense(vocab_size,activation="softmax")(decoder)

modelSpecial = Model([img_input,text_input],output)
modelSpecial.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
plot_model(modelSpecial,show_shapes=True)


In [None]:
# Callbacks: keep only the best model on disk, stop after 5 epochs without
# improvement and restore the best weights (both use their default monitored
# quantity), and log the validation BLEU-4 score every epoch.
checkpoint = ModelCheckpoint(WORKING_DIR+"injecting_model_withdropouts.keras",save_best_only=True)
early_stop = EarlyStopping(patience=5,restore_best_weights=True)
bleusCall= BLEUCallback(tokenizer,img_features,max_sentence_len,training_features,train_ref.values(),validation_features,validate_ref.values(),test_features,test_ref.values(),100)

epochs = 50
batch_size = 64
# The generator emits one batch per `batch_size` captions, so these step
# counts correspond to roughly one pass over each set per epoch.
train_step = len(training_set)//batch_size
val_step = len(validation_set)//batch_size
train_gen = data_generator(training_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)
val_gen = data_generator(validation_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)

historyLast = modelSpecial.fit(train_gen,validation_data=val_gen,epochs=epochs,steps_per_epoch=train_step,validation_steps=val_step,verbose=1,callbacks=[checkpoint,early_stop,bleusCall])

In [None]:
# Per-epoch metrics recorded by fit() for the modified inject model.
h = historyLast.history

In [None]:
# Plot training and validation loss per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(h["loss"])+1),h["loss"],color='g',label="train_loss")
plt.plot(range(1,len(h["val_loss"])+1),h["val_loss"],color='r',label="val_loss")
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy per epoch (the x axis starts at epoch 1).
plt.plot(range(1,len(h["accuracy"])+1),h["accuracy"],color='g',label="train_accuracy")
plt.plot(range(1,len(h["val_accuracy"])+1),h["val_accuracy"],color='r',label="val_accuracy")
plt.xlabel("epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

In [None]:
# Plot the validation BLEU-4 value logged under "val_bleu_4" for every epoch.
# NOTE: a label is passed but plt.legend() is not called, so it is not displayed.
plt.plot(range(1,len(h["val_bleu_4"])+1),h["val_bleu_4"],color='g',label="val_bleu_4")
plt.xlabel("epochs")
plt.ylabel("BLEU-4")
plt.title("accumulative BLEU-4 score for validation data")
plt.show()

In [None]:
# Report corpus-level BLEU-1..4 on the test set for the modified inject model.
predictions = []
for start in tqdm(range(0,len(test_features),100)):
    batch = np.array(test_features[start:start+100])
    sentences = ["ssttaarrtt"]*len(batch)
    # Greedy decoding: extend every sentence of the batch by one word per step.
    for _ in range(max_sentence_len):
        sequence = tokenizer.texts_to_sequences(sentences)
        sequence = pad_sequences(sequence,maxlen=max_sentence_len)
        yhat = modelSpecial.predict((batch,sequence),verbose=0)
        yhat = np.argmax(yhat,axis=-1)
        # .get(): index 0 (the padding value) has no entry in index_word.
        word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
        sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
    predictions.extend(sentences)

smoothi1 = SmoothingFunction().method1
smoothi2 = SmoothingFunction().method2
smoothi3 = SmoothingFunction().method3
smoothi4 = SmoothingFunction().method4

# One list of tokenised references per image, aligned with the tokenised predictions.
reference = list(test_ref.values())
predection = [x.split() for x in predictions]
bleu_1 = corpus_bleu(reference,predection,weights=(1.0,0.0,0.0,0.0),smoothing_function=smoothi1)
bleu_2 = corpus_bleu(reference,predection,weights=(0.5,0.5,0.0,0.0),smoothing_function=smoothi2)
# Cumulative BLEU-3 uses three equal weights that sum to 1.
bleu_3 = corpus_bleu(reference,predection,weights=(1/3,1/3,1/3,0.0),smoothing_function=smoothi3)
bleu_4 = corpus_bleu(reference,predection,weights=(0.25,0.25,0.25,0.25),smoothing_function=smoothi4)


print(f"test set cumulative BLEU-1: {bleu_1}")
print(f"test set cumulative BLEU-2: {bleu_2}")
print(f"test set cumulative BLEU-3: {bleu_3}")
print(f"test set cumulative BLEU-4: {bleu_4}")







In [None]:
# Decode example captions for the first 20 test images with the modified inject model.
example_predictions = []
batch = np.array(test_features[:20])
sentences = ["ssttaarrtt"]*len(batch)
# Greedy decoding: extend every sentence by one word per step.
for _ in range(max_sentence_len):
    sequence = tokenizer.texts_to_sequences(sentences)
    sequence = pad_sequences(sequence,maxlen=max_sentence_len)
    yhat = modelSpecial.predict((batch,sequence),verbose=0)
    yhat = np.argmax(yhat,axis=-1)
    # .get(): index 0 (the padding value) has no entry in index_word, and a
    # plain lookup would raise KeyError before the None check could apply.
    word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
    sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
example_predictions.extend(sentences)

In [None]:
# For each of the first 20 test images, print its reference captions and the
# predicted caption, then display the picture.
for i,image in enumerate(test_set["image"].unique().tolist()[:20]):
    img = load_img("/kaggle/input/flickr8k/Images/"+image)
    print("___________________actual captions___________________")
    # All caption rows that belong to this image.
    captions=test_set.loc[test_set["image"]==image]
    for caption in  captions["caption"].tolist():
        print(caption)
    print("___________________predicted captions___________________")
    # example_predictions is indexed by position in the same image order.
    print(example_predictions[i])
    print("___________________Image___________________")
    plt.imshow(img)
    plt.show()



In [None]:
# Continue training the modified inject model on purpose past the early-stopping point.
checkpoint = ModelCheckpoint(WORKING_DIR+"injecting_model_withdropouts.keras",save_best_only=True)
# NOTE: early_stop is created here but is not passed to fit() below.
early_stop = EarlyStopping(patience=5,restore_best_weights=True)
bleusCall= BLEUCallback(tokenizer,img_features,max_sentence_len,training_features,train_ref.values(),validation_features,validate_ref.values(),test_features,test_ref.values(),100)

epochs = 20
batch_size = 64
# The generator emits one batch per `batch_size` captions, so these step
# counts correspond to roughly one pass over each set per epoch.
train_step = len(training_set)//batch_size
val_step = len(validation_set)//batch_size
train_gen = data_generator(training_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)
val_gen = data_generator(validation_set,img_features,tokenizer,vocab_size,max_sentence_len,batch_size)

# Epoch numbering resumes at initial_epoch=7 and runs up to `epochs`.
historyLast = modelSpecial.fit(train_gen,validation_data=val_gen,epochs=epochs,steps_per_epoch=train_step,validation_steps=val_step,verbose=1,callbacks=[checkpoint,bleusCall],initial_epoch=7)

In [None]:
# Report corpus-level BLEU-1..4 on the test set for the overfitted model.

predictions = []
for start in tqdm(range(0,len(test_features),100)):
    batch = np.array(test_features[start:start+100])
    sentences = ["ssttaarrtt"]*len(batch)
    # Greedy decoding: extend every sentence of the batch by one word per step.
    for _ in range(max_sentence_len):
        sequence = tokenizer.texts_to_sequences(sentences)
        sequence = pad_sequences(sequence,maxlen=max_sentence_len)
        yhat = modelSpecial.predict((batch,sequence),verbose=0)
        yhat = np.argmax(yhat,axis=-1)
        # .get(): index 0 (the padding value) has no entry in index_word.
        word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
        sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
    predictions.extend(sentences)

smoothi1 = SmoothingFunction().method1
smoothi2 = SmoothingFunction().method2
smoothi3 = SmoothingFunction().method3
smoothi4 = SmoothingFunction().method4

# One list of tokenised references per image, aligned with the tokenised predictions.
reference = list(test_ref.values())
predection = [x.split() for x in predictions]
bleu_1 = corpus_bleu(reference,predection,weights=(1.0,0.0,0.0,0.0),smoothing_function=smoothi1)
bleu_2 = corpus_bleu(reference,predection,weights=(0.5,0.5,0.0,0.0),smoothing_function=smoothi2)
# Cumulative BLEU-3 uses three equal weights that sum to 1.
bleu_3 = corpus_bleu(reference,predection,weights=(1/3,1/3,1/3,0.0),smoothing_function=smoothi3)
bleu_4 = corpus_bleu(reference,predection,weights=(0.25,0.25,0.25,0.25),smoothing_function=smoothi4)


print(f"test set cumulative BLEU-1: {bleu_1}")
print(f"test set cumulative BLEU-2: {bleu_2}")
print(f"test set cumulative BLEU-3: {bleu_3}")
print(f"test set cumulative BLEU-4: {bleu_4}")


In [None]:
import os
# Using external examples: extract one ResNet feature vector per file found in
# a separate input folder, in the order os.listdir returns the files.
external_features = []
for image in tqdm(os.listdir("/kaggle/input/amsfbashb")):
    img = load_img(os.path.join("/kaggle/input/amsfbashb",image),target_size=(224,224))
    img = img_to_array(img)
    # Add a batch dimension before preprocessing and prediction.
    img = np.expand_dims(img,axis=0)
    img = preprocess_input(img)
    feature = resnet.predict(img,verbose=0)
    feature = feature.reshape(2048,)
    external_features.append(feature)

In [None]:
# Decode one caption per external image with the modified inject model.
external_predictions = []
batch = np.array(external_features)
sentences = ["ssttaarrtt"]*len(batch)
# Greedy decoding: extend every sentence by one word per step.
for _ in range(max_sentence_len):
    sequence = tokenizer.texts_to_sequences(sentences)
    sequence = pad_sequences(sequence,maxlen=max_sentence_len)
    yhat = modelSpecial.predict((batch,sequence),verbose=0)
    yhat = np.argmax(yhat,axis=-1)
    # .get(): index 0 (the padding value) has no entry in index_word, and a
    # plain lookup would raise KeyError before the None check could apply.
    word = [tokenizer.index_word.get(int(pred)) for pred in yhat]
    sentences = [s if w is None or w == "eenndd" else s+" "+w for s,w in zip(sentences,word)]
external_predictions.extend(sentences)

In [None]:
# Show every external image together with its predicted caption.
for image,caption in zip(os.listdir("/kaggle/input/amsfbashb"),external_predictions) :
    # Drop the leading "ssttaarrtt" token; slicing off a single character
    # would leave "sttaarrtt" at the start of the printed caption.
    print(f"Caption == {' '.join(caption.split()[1:])}")
    img = load_img(os.path.join("/kaggle/input/amsfbashb",image))
    plt.imshow(img)
    plt.show()
