In [None]:
import pandas as pd
import numpy as np
import os
import pickle # for storing the features we are going to extract from images
from tqdm.notebook import tqdm


from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import  load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model

from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
!pip install -q kaggle
! mkdir ~/.kaggle
! cp /content/kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


!kaggle datasets download -d adityajn105/flickr8k

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading flickr8k.zip to /content
100% 1.04G/1.04G [00:12<00:00, 149MB/s]
100% 1.04G/1.04G [00:12<00:00, 89.3MB/s]


In [None]:
!unzip "/content/flickr8k.zip" -d "/content/dataset/"

## **EXTRACT IMAGE FEATURES**

In [None]:
# load VGG16 model (constructor defaults)
vgg16_full = VGG16()

# restructure the model: keep the same inputs but stop at the second-to-last
# layer, so predict() returns that layer's activations instead of the output
# of the final layer
# NOTE(review): for stock VGG16 this is presumably the last hidden dense layer;
# confirm against the printed summary
model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

# summarize the model
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line to the output)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Run every image in the dataset folder through `model` and collect the
# resulting feature arrays in a dict keyed by image id.

features = {}
directory = "/content/dataset/Images"

for img_name in tqdm(os.listdir(directory)):

    # read the file resized to 224x224 and turn it into a numeric array
    pixels = img_to_array(load_img(f"{directory}/{img_name}", target_size=(224, 224)))

    # add a leading batch axis of size 1, then apply the VGG16 preprocessing
    batch = preprocess_input(pixels.reshape((1,) + pixels.shape))

    # image id = file name up to the first dot
    image_id = img_name.partition('.')[0]

    # predict quietly (verbose=0) and keep the output under the image id
    features[image_id] = model.predict(batch, verbose=0)




  0%|          | 0/8091 [00:00<?, ?it/s]

In [None]:
# store features in pickle
# The context manager closes the file as soon as the dump finishes, so the data
# is flushed to disk and no file handle is left open in the kernel.
with open("/content/drive/MyDrive/image-caption-generator/features.pkl", "wb") as f:
    pickle.dump(features, f)

In [None]:
# Read the pickled feature dictionary back into memory as `features2`.
features_path = "/content/drive/MyDrive/image-caption-generator/features.pkl"

with open(features_path, "rb") as pkl_file:
    features2 = pickle.load(pkl_file)

## **LOAD THE CAPTIONS DATA**

In [None]:
# Read the captions file into one string.
# An explicit encoding keeps decoding independent of the platform default.
with open("/content/dataset/captions.txt", "r", encoding="utf-8") as f:

    # we don't want the first line (presumably the column header);
    # the None default keeps an empty file from raising StopIteration
    next(f, None)

    # everything after the first line, newlines included
    captions_doc = f.read()


In [None]:
# Preview only the beginning of the text; echoing the whole document would
# flood the notebook output.
captions_doc[:500]

In [28]:
# Build a dict that maps each image id to the list of its captions

mapping = {}

# walk through the document one line at a time
for line in tqdm(captions_doc.split("\n")):

    # ignore empty and one-character lines
    if len(line) < 2:
        continue

    # text before the first comma is the file name, the remainder is the caption
    file_name, raw_caption = line.partition(",")[::2]

    # keep only the part of the file name before the first dot (drops .jpg)
    image_id = file_name.split('.')[0]

    # commas inside the caption text are turned into single spaces
    caption = raw_caption.replace(",", " ")

    # an image can have several captions: create its list the first time the
    # id is seen, then append to it
    mapping.setdefault(image_id, []).append(caption)


  0%|          | 0/40456 [00:00<?, ?it/s]

In [None]:
# Show a few entries only; displaying the whole dict would print the captions
# of every image in the dataset.
dict(list(mapping.items())[:3])

In [29]:
# preprocess the captions

def clean(mapping):

    for key, captions in mapping.items():

        for i in len(captions):
            caption = captions[i]

            # preprocessing steps
            caption = caption.lower()

            # remove special characters
            caption = caption.replace('[^A-Za-z]', '')

            #