In [None]:
import numpy as np
from glob import glob
import cv2
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.layers as lay
import os
import matplotlib.pyplot as plt
import pickle

In [None]:
# Locations of the Flickr8k images and of the caption file.
images_path = '../input/flickr8k/Images/'
# glob returns paths in arbitrary (filesystem-dependent) order; sorting makes the
# subset selected later by `samp` the same on every run.
images = sorted(glob(images_path+'*.jpg'))

caption_path = '../input/flickr8k/captions.txt'
# Read inside a context manager so the file handle is closed deterministically.
# The file is still read as bytes and decoded explicitly, so line endings are
# left exactly as they are stored on disk.
with open(caption_path, 'rb') as caption_file:
    captions = caption_file.read().decode('utf-8').split('\n')
# Drop the first line of the file (presumably the CSV header row).
captions = captions[1:]

In [None]:
# Number of images to process. Use the full dataset only if enough GPU memory and RAM are available.
samp = 1499

In [None]:
from tensorflow.keras.applications import ResNet50

# Build the full ResNet50 (classification head included) and expose the output
# of its second-to-last layer as the image feature extractor.
incept_model = ResNet50(include_top=True)
last = incept_model.layers[-2].output
resnet = Model(
    inputs=incept_model.input,
    outputs=last,
)
# The extractor is only used for inference, so its weights are frozen.
resnet.trainable = False
resnet.summary()

In [None]:
# Each entry holds an image file name and one of its text descriptions
# (an image can have multiple descriptions).
captions[:10]

In [None]:
# List of file paths, one per image found by glob.
images[:10]

## Read the images and extract their latent features with ResNet

In [None]:
# Map image file name -> flat 2048-element feature vector produced by `resnet`.
imgs = {}
# Slice instead of indexing with range(samp): a dataset holding fewer than
# `samp` images then simply yields fewer entries instead of raising IndexError.
for image_path in images[:samp]:
    k = os.path.basename(image_path)  # file name of the image, used as the dict key

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None.
        print(f'skipping unreadable image: {image_path}')
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img.reshape(1, 224, 224, 3).astype('float32')
    # Keras ResNet50 expects inputs passed through its own preprocess_input
    # (RGB in, channel re-ordering and mean-centering applied); feeding raw
    # 0-255 pixels gives features from out-of-distribution input.
    img = tf.keras.applications.resnet50.preprocess_input(img)
    # verbose=0 avoids printing one progress bar per image.
    pred = resnet.predict(img, verbose=0).reshape(2048,)
    imgs[k] = pred


In [None]:
# Map image file name -> list of '<sos> ... <eos>' caption strings, keeping
# only the images whose features are present in `imgs`.
caps = {}
for cap in captions:
    # Split on the FIRST comma only: captions may themselves contain commas,
    # and splitting on every comma would keep just the text up to the next one.
    path, sep, cape = cap.partition(',')
    if not sep:
        # No comma at all (e.g. the empty string after the final newline):
        # this line is not a "<file name>,<caption>" row.
        continue
    # Drop the last two characters of the caption text
    # (presumably the trailing " ." of each caption -- TODO confirm).
    cape = cape[:-2]

    if path in imgs:
        cape = '<sos> ' + cape.lower() + ' <eos>'
        caps.setdefault(path, []).append(cape)

In [None]:
# Build the vocabulary: each distinct token gets the next free integer id.
# Ids start at 1, so 0 is never assigned to a word.
vacob = {}
for sentences in caps.values():
    for sentence in sentences:
        for token in sentence.split():
            # setdefault only inserts when the token has not been seen yet.
            vacob.setdefault(token, len(vacob) + 1)

In [None]:
# Replace every caption string stored in `caps` with its list of vocabulary ids.
for sentences in caps.values():
    for idx, sentence in enumerate(sentences):
        if not isinstance(sentence, str):
            # Already encoded by an earlier run of this cell. Skipping keeps the
            # cell safe to re-run instead of failing on `.split()` of a list.
            continue
        sentences[idx] = [vacob[token] for token in sentence.split()]

In [None]:
# Fixed sentence length in words; used as the padding/truncation length.
# NOTE(review): this name shadows Python's built-in max() for the rest of the
# notebook; other cells read it, so it is kept as-is here.
max = 40

In [None]:
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Training samples, one per (caption, prefix length) pair:
#   xx   - feature vector of the image the caption belongs to
#   yin  - caption prefix, post-padded / post-truncated to `max` ids
#   yout - one-hot encoding of the id that follows the prefix
xx = []
yin = []
yout = []

n_classes = len(vacob) + 1
for img_name, encoded_caps in caps.items():
    for seq in encoded_caps:
        for cut in range(1, len(seq)):
            prefix = pad_sequences(
                [seq[:cut]], maxlen=max, padding='post', truncating='post'
            )[0]
            target = to_categorical([seq[cut]], num_classes=n_classes)[0]

            xx.append(imgs[img_name])
            yin.append(prefix)
            yout.append(target)

In [None]:
# Stack the per-sample lists into arrays; the two sequence arrays are float64.
xxx = np.array(xx)
y_in, y_out = (np.array(samples, dtype='float64') for samples in (yin, yout))

In [None]:
# Print one sample: its padded input sequence, then its one-hot target.
i = 5
print(y_in[i], y_out[i], sep='\n')

In [None]:
# Shapes of the image features, the input sequences and the one-hot targets.
print(*(arr.shape for arr in (xxx, y_in, y_out)))

In [None]:
# Image branch: dense projection of the 2048-element feature vector to 128
# units, repeated `max` times.
image_model = Sequential([
    lay.Dense(128, input_shape=(2048,), activation='relu'),
    lay.RepeatVector(max),
])

In [None]:
# Language branch: embed the `max`-long id sequence, run it through an LSTM that
# returns every timestep, then apply a 128-unit dense layer per timestep.
language_model = Sequential([
    lay.Embedding(input_dim=len(vacob)+1, output_dim=128, input_length=max),
    lay.LSTM(256, return_sequences=True),
    lay.TimeDistributed(lay.Dense(128)),
])

In [None]:
# Concatenate the two branch outputs, pass them through two stacked LSTMs and
# finish with a softmax over len(vacob)+1 classes.
merged_inputs = [image_model.input, language_model.input]
conca = lay.Concatenate()([image_model.output, language_model.output])
x = lay.LSTM(128, return_sequences=True)(conca)
x = lay.LSTM(512, return_sequences=False)(x)
out = lay.Dense(len(vacob)+1, activation='softmax')(x)
model = Model(inputs=merged_inputs, outputs=out)

# model.load_weights("../input/model_weights.h5")
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Checkpoint callback: writes the model weights to ./best_res.h5 whenever the
# monitored metric improves.
# monitor='loss' is set explicitly because the training call passes no
# validation data: the default monitor ('val_loss') would never be available,
# so with save_best_only=True no checkpoint would ever be saved.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath='./best_res.h5',
                                                 monitor='loss',
                                                 save_weights_only=True,save_best_only=True,
                                                 verbose=1)


In [None]:
# Train: inputs are (image features, padded caption prefix), targets are the
# one-hot next words; the checkpoint callback runs during training.
model.fit(
    [xxx, y_in],
    y_out,
    batch_size=256,
    epochs=100,
    callbacks=[cp_callback],
)