Image Caption Generation

In [2]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm

from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

Base_Dir and working_dir is the folder where u are going to save all your project related files. 

In [3]:
BASE_DIR = r'D:\near_by_share\mlai\ImageC'      #folder path where zip file is being extracted
WORKING_DIR = r'D:\near_by_share\mlai\ImageC'   #folder path where zip file is being extracted

in your project folder download the zip file which is having our dataset on which we are going to train our model.

zip file    = flickr8k data set from kaggle website.

it contain arounnd 8000 images with around 40000 caption.


after downloading the zip file extract it in ur project folder.

using VGG16() model to train our model on it

In [4]:
# load vgg16 model
model = VGG16()
# restructure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

extracting the important feature from all the images and storing it in features ditionary from the future use

In [None]:
## extract features from image
features = {}
directory = os.path.join(BASE_DIR, 'Images')

for img_name in tqdm(os.listdir(directory),desc='feature are extracting'):
    # load the image from file
    img_path = directory + '/' + img_name
    try:
      image = load_img(img_path, target_size=(224, 224))
    except UnidentifiedImageError as e:
      continue
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID
    image_id = img_name.split('.')[0]
    # store feature
    features[image_id] = feature


we are dumping our features dictionary in pickel format. so that we dont have to extract features from all  the images each time we execute our code after closing our code. Dumping help us to directly excess the features ditionary without waiting for so long as extracting images is a time consuming process(depending upon your hardware as in my case it takes around 2 hours which do not have egpu). so prefer to go for google collab. if u dont have egpu in ur laptop. As google collab is a virtual platform  which provide us around 12gb of ram and 15gb of egpu which makes our extracting time from 2hr to 12 min.

In [None]:
# pickle.dump(features, open(os.path.join(WORKING_DIR, 'features (1).pkl'), 'wb'))

and after dumping u can load the features (1).pkl file in ur code easily

In [5]:
# load features from pickle
with open(os.path.join(WORKING_DIR, 'features (1).pkl'), 'rb') as f:
    features = pickle.load(f)

ur extracted zip file is going to contain the captions.txt file

In [6]:
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

In [None]:
# create mapping of image to captions
mapping = {}
# process lines
for line in tqdm(captions_doc.split('\n')):
    # split the line by comma(,)
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # remove extension from image ID
    image_id = image_id.split('.')[0]
    # convert caption list to string
    caption = " ".join(caption)
    # create list if needed
    if image_id not in mapping:
        mapping[image_id] = []
    # store the caption
    mapping[image_id].append(caption)

clean mapping function remove unwanted character from the captions which can create problem for our machine to learn.

In [8]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # preprocessing steps
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special chars, etc.,
            caption = caption.replace('[^A-Za-z]', '')
            # delete additional spaces
            caption = caption.replace('\s+', ' ')
            # add start and end tags to the caption
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [None]:
# before preprocess of text
mapping['17273391_55cfc7d3d4']

In [10]:
# preprocess the text
clean(mapping)

In [None]:
# after preprocess of text
mapping['17273391_55cfc7d3d4']

In [12]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

In [13]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

In [33]:
# Save the tokenizer and max length for later use during prediction
with open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

In [39]:
with open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'rb') as token_file:
        tokenizer = pickle.load(token_file)

In [17]:
#  get maximum length of the caption available
max_length = max(len(caption.split()) for caption in all_captions)
max_length

35

In [35]:
with open(os.path.join(WORKING_DIR, 'max_length.pkl'), 'wb') as maxlen_file:
    pickle.dump(max_length, maxlen_file)

In [None]:
with open(os.path.join(WORKING_DIR, 'max_length.pkl'), 'rb') as maxlen_file:
        max_length = pickle.load(maxlen_file)

use 90% of images in flickr8k dataset for training the model and rest for testing

can be commented after  u train ur model and dump ur model as after dumping the trained model u dont have to train it again and again 

In [18]:
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.90)
train = image_ids[:split]
test = image_ids[split:]

can be commented after dumping our trained model

In [37]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    # store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0

add various layer criteria for the model

In [38]:
# encoder model
# image feature layers
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# plot the model
plot_model(model, show_shapes=True)

this will also gonna take time according to ur hardware specification


here we are training our model over 90% of 8091 images

In [None]:
# train the model
epochs = 15
batch_size = 32
steps = len(train) // batch_size

for i in range(epochs):
    # create data generator
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [1]:
# model.save('D:\near_by_share\mlai\ImageC\modeltrain1.h5')

after saving our model we can also comment the training part as model has been trained and we can simply load it in any of our program  

In [19]:
from keras.models import load_model
model = load_model(r'D:\near_by_share\mlai\ImageC\modeltrain1.h5')

In [20]:
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [21]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break

    return in_text

accuracy of our program can be test on the remaining test data

In [None]:
# from nltk.translate.bleu_score import corpus_bleu
# # validate with test data
# actual, predicted = list(), list()

# for key in tqdm(test):
#     # get actual caption
#     captions = mapping[key]
#     # predict the caption for image
#     y_pred = predict_caption(model, features, tokenizer, max_length)
#     # split into words
#     actual_captions = [caption.split() for caption in captions]
#     y_pred = y_pred.split()
#     # append to the list
#     actual.append(actual_captions)
#     predicted.append(y_pred)

# # calcuate BLEU score
# print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
# print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

check your model over different images

In [44]:
from PIL import Image
import matplotlib.pyplot as plt


class CaptionGenerator:
    def __init__(self):
        # Initialize your class if needed
        pass

    def generate_caption(self, image_name):
        image_id = image_name.split('.')[0]
        img_path = os.path.join(BASE_DIR, "Images", image_name)
        image = Image.open(img_path)
        captions = mapping[image_id]
        print('---------------------Actual---------------------')
        for caption in captions:
            print(caption)
        # predict the caption
        y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
        print('--------------------Predicted--------------------')
        print(y_pred)
        plt.imshow(image)

# Create an instance of the CaptionGenerator class
models = CaptionGenerator()

In [None]:
models.generate_caption("230016181_0c52b95304.jpg")

run ur predict_caption func for any image of ur choice

In [33]:
vgg_model = VGG16()
# restructure the model
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)

In [35]:
image_path = 'D:/near_by_share/mlai/ImageC/Images/47870024_73a4481f7d.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict from the trained model
predict_caption(model, feature, tokenizer, max_length)

'startseq man in yellow shirt and black pants is performing trick on his bicycle endseq'

If u want to make ur model more interactive u can go for flask and html process repository in my repository section