In [None]:
# Install the Kaggle CLI, TensorFlow and NLTK used by the cells below (versions are not pinned).
!pip install kaggle tensorflow nltk

In [None]:
!mkdir ~/.kaggle

In [None]:
# Copy the Kaggle API token from the working directory into ~/.kaggle/.
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the adityajn105/flickr8k dataset archive with the Kaggle CLI.
!kaggle datasets download -d adityajn105/flickr8k

In [None]:
!unzip flickr8k.zip

In [None]:
import os
import pickle
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import corpus_bleu
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# Directory whose files are fed to the feature extractor below.
DATASET_DIR = 'Images'

In [None]:
# Load VGG16 and re-wire it so the output is the second-to-last layer instead of the final layer.
# NOTE(review): the name `model` is rebound to the captioning network further down; re-running
# the feature-extraction cell after that point would use the wrong model.
model = VGG16()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

In [None]:
# Run every file in DATASET_DIR through `model` and store the prediction under the file's stem.
features = {}
for file_name in tqdm(os.listdir(DATASET_DIR)):
  # Load at 224x224, convert to an array and prepend a batch axis of size 1.
  pixels = img_to_array(load_img(DATASET_DIR + '/' + file_name, target_size = (224, 224)))
  batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
  # The key is everything before the first '.' in the file name.
  features[file_name.split('.')[0]] = model.predict(batch, verbose = 0)

In [None]:
# Persist the extracted features. The context manager guarantees the file is flushed and
# closed, which the bare open() call did not.
with open("features.pkl", "wb") as f:
  pickle.dump(features, f)

In [None]:
# Reload the features from the pickle written above, replacing the in-memory dict.
with open("features.pkl" ,"rb") as f:
  features = pickle.load(f)

In [None]:
# Read captions.txt into one string, discarding its first line (presumably a header row).
with open("captions.txt", "r") as f:
  next(f)
  captions = f.read()

In [None]:
# Build {image id: [caption, ...]} from the raw "image,caption" lines.
mapping = {}
for line in tqdm(captions.split("\n")):
  # Split on the first comma only: the caption text itself may contain commas, and taking
  # just the second field would silently truncate such captions.
  tokens = line.split(",", 1)
  # Skip blank/one-character lines and lines that have no caption field at all.
  if len(line) < 2 or len(tokens) < 2:
    continue
  # The image id is the file name up to its first '.', matching the keys of `features`.
  img_id, caption = tokens[0].split(".")[0], tokens[1]
  mapping.setdefault(img_id, []).append(caption)

In [None]:
# Number of distinct image ids that have at least one caption.
len(mapping)

In [None]:
# Display the raw (uncleaned) captions. NOTE(review): this renders the entire dict.
mapping

In [None]:
def clean(mapping):
  """Normalise every caption in `mapping` in place.

  Each caption is lower-cased, stripped of every character that is not a
  letter or whitespace, has runs of whitespace collapsed to a single space,
  loses all words of length 1 or less, and is wrapped as
  "start <words> end".

  Args:
    mapping: dict whose values are lists of caption strings; the lists are
      modified in place.

  Returns:
    None.
  """
  # Local import so the function does not depend on the notebook's import cell.
  import re

  for captions in mapping.values():
    for i, caption in enumerate(captions):
      caption = caption.lower()
      # str.replace() treats its argument as literal text, so the regex-looking patterns
      # never matched anything; re.sub() applies them as real patterns. Whitespace is
      # kept here so that word boundaries survive until the split below.
      caption = re.sub(r"[^a-z\s]", "", caption)
      caption = re.sub(r"\s+", " ", caption)
      words = [word for word in caption.split(" ") if len(word) > 1]
      captions[i] = "start " + " ".join(words) + " end"

In [None]:
# Clean all captions in place. NOTE(review): not idempotent; each run wraps the captions in
# another "start ... end" pair.
clean(mapping)

In [None]:
# Display the captions after cleaning. NOTE(review): this renders the entire dict.
mapping

In [None]:
# Flatten the per-image caption lists into a single list, following mapping order.
captions = [caption for key in mapping for caption in mapping[key]]

In [None]:
# Peek at the first five flattened captions.
captions[:5]

In [None]:
# Fit a tokenizer on every caption. vocab_size is the number of known words plus one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Persist the fitted tokenizer to tokenizer.pkl.
with open("tokenizer.pkl", "wb") as file:

    pickle.dump(tokenizer, file)

In [None]:
# Show the vocabulary size computed above.
vocab_size

In [None]:
# Longest caption, measured in whitespace-separated words; used as the padded sequence length.
max_length = max(len(caption.split()) for caption in captions)

In [None]:
# Show the maximum caption length computed above.
max_length

In [None]:
# Split the image ids 90/10 in mapping order (no shuffling): the first 90% train, the rest test.
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.9)
train = image_ids[: split]
test = image_ids[split :]
len(train), len(test)

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size) :
  """Endlessly yield training batches of (image feature, caption prefix) -> next word.

  For every caption of every image, each prefix seq[:i] becomes one sample
  whose target is the one-hot encoding of seq[i]. A batch is emitted after
  `batch_size` images have been processed, so the number of samples per
  batch varies with the caption lengths. The image counter carries over
  between passes through `data_keys`.

  Args:
    data_keys: sequence of image ids to iterate over, repeatedly.
    mapping: dict from image id to its list of caption strings.
    features: dict from image id to a feature array; element [0] is used.
    tokenizer: object providing texts_to_sequences().
    max_length: length the input sequences are padded to.
    vocab_size: number of classes for the one-hot targets.
    batch_size: number of images (not samples) per yielded batch.

  Yields:
    ((X1, X2), y): arrays of image features, padded prefixes and one-hot targets.

  Raises:
    ValueError: on first iteration, if `data_keys` is empty or `batch_size` < 1.
  """
  # Without these guards the `while True` loop below would spin forever without yielding.
  if len(data_keys) == 0 :
    raise ValueError("data_keys must not be empty")
  if batch_size < 1 :
    raise ValueError("batch_size must be at least 1")

  X1, X2, y = list(), list(), list()
  n = 0

  while True :
    for key in data_keys :
      n += 1
      captions = mapping[key]

      for caption in captions :
        seq = tokenizer.texts_to_sequences([caption])[0]
        # One sample per split point: words before position i predict the word at i.
        for i in range(1, len(seq)) :
          in_seq, out_seq = seq[: i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
          out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
          X1.append(features[key][0])
          X2.append(in_seq)
          y.append(out_seq)

      if n == batch_size :
        # The two inputs are yielded as a tuple rather than a list.
        # NOTE(review): newer Keras releases reject a list here when inferring the
        # generator's output signature; a tuple is accepted by old and new versions.
        yield (np.array(X1), np.array(X2)), np.array(y)

        X1, X2, y = list(), list(), list()
        n = 0

In [None]:
# Image branch: feature vector -> dropout -> 256-unit dense layer.
# `shape` must be a tuple; `(4096)` is just the int 4096, which newer Keras versions reject.
# 4096 presumably matches the length of the extracted image feature vectors (verify).
input_layer_1 = Input(shape = (4096,))
dropout_layer_1 = Dropout(0.4)(input_layer_1)
dense_layer_1 = Dense(256, activation = "relu")(dropout_layer_1)

# Text branch: padded word indices -> embedding -> dropout -> LSTM.
# mask_zero = True makes the embedding mask index 0, the value pad_sequences pads with.
input_layer_2 = Input(shape = (max_length,))
embedding_layer = Embedding(vocab_size, 256, mask_zero = True)(input_layer_2)
dropout_layer_2 = Dropout(0.4)(embedding_layer)
lstm_layer = LSTM(256)(dropout_layer_2)

In [None]:
# Decoder: sum the two 256-wide branch outputs element-wise, pass through a dense layer,
# then produce a softmax over the vocabulary.
decoder_layer_1 = add([dense_layer_1, lstm_layer])
decoder_layer_2 = Dense(256, activation = "relu")(decoder_layer_1)
output_layer = Dense(vocab_size, activation = "softmax")(decoder_layer_2)

In [None]:
# Assemble the two-input captioning model. This rebinds `model`, which previously held the
# VGG16 feature extractor.
model = Model(inputs = [input_layer_1, input_layer_2], outputs = output_layer)
model.compile(loss = "categorical_crossentropy", optimizer = "adam")

In [None]:
# Render the model graph with tensor shapes on each layer.
plot_model(model, show_shapes =  True)

In [None]:
# Train for 15 single-epoch rounds, creating a fresh generator for each round.
num_rounds, batch_size = 15, 64
steps = len(train) // batch_size
for i in range(num_rounds) :
  generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
  model.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1)

In [None]:
# Save the trained captioning model under the path "model.model".
# NOTE(review): newer Keras versions may require a ".keras" or ".h5" suffix here. Confirm
# against the installed version; the zip cell below depends on this name.
model.save("model.model")

In [None]:
# Recursively archive the saved model into model.zip.
!zip -r model.zip "model.model"

In [None]:
def index_to_word(y, tokenizer) :
  """Return the first word in `tokenizer.word_index` whose index equals `y`, else None."""
  matches = (word for word, index in tokenizer.word_index.items() if index == y)
  return next(matches, None)

In [None]:
def predict_caption(model, image, tokenizer, max_length) :
  """Greedily generate a caption for one image feature.

  Starting from "start", the model is asked for the next word up to
  `max_length` times; the highest-scoring word is appended each time.
  Generation stops early when the predicted index has no word or when
  the word "end" is produced.

  Args:
    model: captioning model taking [image feature, padded sequence].
    image: feature array for the image, passed as the first model input.
    tokenizer: tokenizer used to encode the text generated so far.
    max_length: padded sequence length and maximum number of steps.

  Returns:
    The generated text, beginning with "start" (and ending with "end" if
    the model produced it).
  """
  in_text = "start"
  for i in range(max_length) :
    sequence = tokenizer.texts_to_sequences([in_text])[0]
    sequence = pad_sequences([sequence], max_length)
    # verbose = 0: this runs once per generated word, so a progress bar per call floods
    # the output. Matches the verbose = 0 used for feature extraction.
    prediction = model.predict([image, sequence], verbose = 0)
    result = np.argmax(prediction)
    word = index_to_word(result, tokenizer)

    # No word has this index (e.g. index 0), so there is nothing to append.
    if word is None :
      break

    in_text += " " + word

    if word == "end" :
      break

  return in_text

In [None]:
# Score the model on the held-out images with corpus-level BLEU.
actual, predicted = list(), list()

for key in tqdm(test) :
  # Use a dedicated name: rebinding `captions` here would clobber the notebook-level list
  # of all captions, so re-running earlier cells such as max_length would give wrong results.
  references = mapping[key]
  y_pred = predict_caption(model, features[key], tokenizer, max_length)
  actual.append([reference.split() for reference in references])
  predicted.append(y_pred.split())

# Conventional four-element weights. The original passed a fifth, zero-weighted entry,
# which only made NLTK count 5-grams that never contribute to the score.
print(f"BLEU-1 : {corpus_bleu(actual, predicted, weights = (1.0, 0, 0, 0))}")
print(f"BLEU-2 : {corpus_bleu(actual, predicted, weights = (0.5, 0.5, 0, 0))}")

In [None]:
# Rebuild the VGG16 feature extractor (output taken from the second-to-last layer) under its
# own name, since `model` now refers to the captioning network.
img_model = VGG16()
img_model = Model(inputs = img_model.inputs, outputs = img_model.layers[-2].output)

In [None]:
def generate_caption(img_path, img_model, model, tokenizer, max_length) :
  """Generate a caption for the image file at `img_path`.

  The image is loaded at 224x224, preprocessed the same way as during
  feature extraction, run through `img_model`, and the resulting feature
  is passed to predict_caption().

  Args:
    img_path: path of the image file to caption.
    img_model: feature-extraction model applied to the preprocessed image.
    model: captioning model passed on to predict_caption().
    tokenizer: tokenizer passed on to predict_caption().
    max_length: maximum caption length passed on to predict_caption().

  Returns:
    The caption string returned by predict_caption().
  """
  # The original overwrote img_path with "" here, so the requested file was never loaded.
  img = load_img(img_path, target_size = (224, 224))
  img = img_to_array(img)
  # Prepend a batch axis of size 1.
  img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))
  img = preprocess_input(img)
  feature = img_model.predict(img, verbose = 0)
  y_pred = predict_caption(model, feature, tokenizer, max_length)
  return y_pred

In [None]:
# Caption a sample image; expects boat.jpg in the working directory.
caption = generate_caption("boat.jpg", img_model, model, tokenizer, max_length)

In [None]:
# Show the sample image and print the caption generated for it.
img = Image.open("boat.jpg")
plt.imshow(img)
print(caption)