In [1]:
import pandas as pd
import numpy as np
import os

import cv2
import matplotlib.pyplot as plt
from PIL import Image

from tqdm import tqdm

from pickle import dump
from pickle import load

from collections import Counter

import nltk
import spacy
import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input
from keras.utils.vis_utils import plot_model
from keras.layers import Input, Dropout, Dense, Embedding, LSTM, add
from keras.layers.merging import concatenate
from keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint

from tokenizers import ByteLevelBPETokenizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
base_model = Xception(include_top = True)
base_model.summary()

Model: "xception"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, 149, 149, 32  128        ['block1_conv1[0][0]']           
 ation)                         )                                                          

 block4_sepconv1_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block4_sepconv2_act (Activatio  (None, 37, 37, 728)  0          ['block4_sepconv1_bn[0][0]']     
 n)                                                                                               
                                                                                                  
 block4_sepconv2 (SeparableConv  (None, 37, 37, 728)  536536     ['block4_sepconv2_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block4_sepconv2_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv2[0][0]']        
 lization)

 n)                                                                                               
                                                                                                  
 block7_sepconv1 (SeparableConv  (None, 19, 19, 728)  536536     ['block7_sepconv1_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block7_sepconv1_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block7_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block7_sepconv2_act (Activatio  (None, 19, 19, 728)  0          ['block7_sepconv1_bn[0][0]']     
 n)                                                                                               
          

 block9_sepconv3_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block9_sepconv3[0][0]']        
 lization)                                                                                        
                                                                                                  
 add_7 (Add)                    (None, 19, 19, 728)  0           ['block9_sepconv3_bn[0][0]',     
                                                                  'add_6[0][0]']                  
                                                                                                  
 block10_sepconv1_act (Activati  (None, 19, 19, 728)  0          ['add_7[0][0]']                  
 on)                                                                                              
                                                                                                  
 block10_sepconv1 (SeparableCon  (None, 19, 19, 728)  536536     ['block10_sepconv1_act[0][0]']   
 v2D)     

                                                                                                  
 block12_sepconv3_act (Activati  (None, 19, 19, 728)  0          ['block12_sepconv2_bn[0][0]']    
 on)                                                                                              
                                                                                                  
 block12_sepconv3 (SeparableCon  (None, 19, 19, 728)  536536     ['block12_sepconv3_act[0][0]']   
 v2D)                                                                                             
                                                                                                  
 block12_sepconv3_bn (BatchNorm  (None, 19, 19, 728)  2912       ['block12_sepconv3[0][0]']       
 alization)                                                                                       
                                                                                                  
 add_10 (A

In [3]:
model = Model(inputs = base_model.input, outputs = base_model.get_layer('avg_pool').output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 299, 299, 3  0           []                               
                                )]                                                                
                                                                                                  
 block1_conv1 (Conv2D)          (None, 149, 149, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 block1_conv1_bn (BatchNormaliz  (None, 149, 149, 32  128        ['block1_conv1[0][0]']           
 ation)                         )                                                             

 block4_sepconv1_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block4_sepconv2_act (Activatio  (None, 37, 37, 728)  0          ['block4_sepconv1_bn[0][0]']     
 n)                                                                                               
                                                                                                  
 block4_sepconv2 (SeparableConv  (None, 37, 37, 728)  536536     ['block4_sepconv2_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block4_sepconv2_bn (BatchNorma  (None, 37, 37, 728)  2912       ['block4_sepconv2[0][0]']        
 lization)

 n)                                                                                               
                                                                                                  
 block7_sepconv1 (SeparableConv  (None, 19, 19, 728)  536536     ['block7_sepconv1_act[0][0]']    
 2D)                                                                                              
                                                                                                  
 block7_sepconv1_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block7_sepconv1[0][0]']        
 lization)                                                                                        
                                                                                                  
 block7_sepconv2_act (Activatio  (None, 19, 19, 728)  0          ['block7_sepconv1_bn[0][0]']     
 n)                                                                                               
          

 block9_sepconv3_bn (BatchNorma  (None, 19, 19, 728)  2912       ['block9_sepconv3[0][0]']        
 lization)                                                                                        
                                                                                                  
 add_7 (Add)                    (None, 19, 19, 728)  0           ['block9_sepconv3_bn[0][0]',     
                                                                  'add_6[0][0]']                  
                                                                                                  
 block10_sepconv1_act (Activati  (None, 19, 19, 728)  0          ['add_7[0][0]']                  
 on)                                                                                              
                                                                                                  
 block10_sepconv1 (SeparableCon  (None, 19, 19, 728)  536536     ['block10_sepconv1_act[0][0]']   
 v2D)     

                                                                                                  
 block12_sepconv3_act (Activati  (None, 19, 19, 728)  0          ['block12_sepconv2_bn[0][0]']    
 on)                                                                                              
                                                                                                  
 block12_sepconv3 (SeparableCon  (None, 19, 19, 728)  536536     ['block12_sepconv3_act[0][0]']   
 v2D)                                                                                             
                                                                                                  
 block12_sepconv3_bn (BatchNorm  (None, 19, 19, 728)  2912       ['block12_sepconv3[0][0]']       
 alization)                                                                                       
                                                                                                  
 add_10 (A

In [4]:
cap_df = pd.read_csv('D:\\DS\\Imarticus\\CAP 2\\data\\captions.csv')
cap_df = cap_df[['Image_name','Paragraph']]

In [5]:
cap_df = cap_df.iloc[:5000,:]
cap_df = cap_df.sort_values(by = 'Image_name')
cap_df.reset_index(drop = True, inplace = True)

In [6]:
cap_df.columns = [i.lower() for i in cap_df.columns]

In [None]:
def image_resize(width, height):
    
    images = []

    for image_name in tqdm(cap_df['image_name']):
    
        img = cv2.imread(f'images\\{image_name}.jpg')

        resized_img = cv2.resize(img, (width,height), interpolation = cv2.INTER_AREA)
        
        img_array = np.array(resized_img)
        
        images.append([str(image_name) + '.jpg', img_array])
        
    return images

In [8]:
images = image_resize(299,299)

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:30<00:00, 162.09it/s]


In [9]:
temp = pd.DataFrame(images)
temp.columns = ['name','img_array']

In [10]:
cap_df = cap_df.sort_values(by = 'image_name')
cap_df.reset_index(drop = True, inplace = True)

In [11]:
temp['name'] = [int(i.split('.')[0]) for i in temp['name']]

In [12]:
temp = temp.sort_values(by = 'name')
temp.reset_index(drop = True, inplace = True)

In [13]:
temp['captions'] = cap_df['paragraph']

In [14]:
img_df = temp.copy()

In [15]:
def feature_extractor(image_array, image_name):
    features = {}
    for pic,name in zip(image_array, image_name):
        
        # Preprocessing for the model
        image = pic
        image = np.expand_dims(image, axis = 0)
        image = preprocess_input(image)
        
        # Extract features from the model
        feature = model.predict(image)
        features[name] = feature
        
    return features

In [16]:
xc_features = feature_extractor(img_df['img_array'], cap_df['image_name'])



































In [19]:
dump(xc_features, open('xc_features.pkl', 'wb'))
xc_features = load(open('xc_features.pkl', 'rb'))

In [20]:
train,test = train_test_split(img_df, test_size = 0.3)

In [21]:
def caption_preprocessing(captions):
    
    # Load the spacy lemmatizer
    lemmatizer = spacy.load('en_core_web_trf')

    # Create an empty list to store lemmatized captions
    lemma_sent_list = []

    # Remove punctuations from ecah caption
    for sent in tqdm(captions):    
        sentence = re.sub(pattern = r'[^\w\s]', repl = '', string = sent)

        # Converting to lowercase
        sentence = sentence.lower()

        # Removing stopwords
        stop_words_removed = ''
        for word in sentence.split():
            if word in nltk.corpus.stopwords.words('english'):
                continue
            else:
                stop_words_removed = stop_words_removed + ' ' + word
        stop_words_removed = stop_words_removed.strip() 

        # Lemmatizing using en_core_web_trf
        doc = lemmatizer(stop_words_removed)
        lemma_sent = ''
        for token in doc:
            lemma_sent = lemma_sent + ' ' + token.lemma_
        lemma_sent = lemma_sent.strip()
        lemma_sent_list.append(lemma_sent)

    # Return lemmatized caption list
    return lemma_sent_list

In [22]:
train['lemma_caption'] = caption_preprocessing(train['captions'])
test['lemma_caption'] = caption_preprocessing(test['captions'])

100%|██████████████████████████████████████████████████████████████████████████████| 3500/3500 [03:53<00:00, 15.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1500/1500 [01:39<00:00, 15.04it/s]


In [23]:
corpus = []
corpus.extend(['<START>','<END>'])
for i in train['lemma_caption']:
    corpus.extend(i.split())

In [24]:
train['lemma_caption'] = ['<START> ' + i + ' <END>' for i in train['lemma_caption']]
test['lemma_caption'] = ['<START> ' + i + ' <END>' for i in test['lemma_caption']]

In [25]:
train_lemma_dict = {}
for name,caption in zip(train['name'],train['lemma_caption']):
    train_lemma_dict[name] = caption
    
test_lemma_dict = {}
for name,caption in zip(test['name'],test['lemma_caption']):
    test_lemma_dict[name] = caption

In [26]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
print('vocab_size : ', len(tokenizer.index_word))

max_cap_ln = 0
for i in train['lemma_caption']:
    if len(i.split()) > max_cap_ln:
        max_cap_ln = len(i.split())
print('max_caption_length : ', max_cap_ln)

vocab_size :  4206
max_caption_length :  106


In [27]:
embeddings_index = dict()
fid = open('glove.6B.50d.txt' ,encoding="utf8")
for line in fid:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
fid.close()

In [28]:
embedding_dim = 50
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word,idx in word_index.items():
    embeded_vector = embeddings_index.get(word)
    if embeded_vector is not None:
        embeddings_index[idx] = embeded_vector

dump(embedding_matrix, open('embedding_matrix.pkl','wb'))
embedding_matrix = load(open('embedding_matrix.pkl', 'rb'))

In [29]:
def data_generator(descriptions, features, tokenizer, max_length, vocab_size):
    temp_list = []
    for key, desc_list in descriptions.items():
        temp_list.append(desc_list)
    x1, x2, y = [], [], []
    max_seq_value = 0
    for desc in temp_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1,len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
            out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
            x1.append(features[key][0])
            x2.append(in_seq)
            y.append(out_seq)

    return [[np.array(x1), np.array(x2)], np.array(y)]

In [31]:
caption_max_length = max_cap_ln
vocab_size = len(tokenizer.index_word) + 1
generator = data_generator(train_lemma_dict, xc_features, tokenizer, caption_max_length, vocab_size)
inputs, outputs = generator
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(83629, 2048)
(83629, 106)
(83629, 4207)


In [42]:
def define_model(vocab_size, max_length, embedding_matrix):
    # feature extractor model
    inputs1 = Input(shape = (2048,))
    image_feature = Dropout(0.5)(inputs1)
    image_feature = Dense(256, activation = 'relu')(image_feature)
    # sequence model
    inputs2 = Input(shape = (max_length,))
    language_feature = Embedding(vocab_size, 256, mask_zero = True)(inputs2)
    language_feature = Dropout(0.5)(language_feature)
    language_feature = LSTM(256)(language_feature)
    # Concatination
    output = concatenate([image_feature, language_feature])
    output = Dense(256, activation = 'relu')(output)
    output = Dense(vocab_size, activation = 'softmax')(output)
    # Define final model
    model = Model(inputs = [inputs1, inputs2], outputs = output)
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', run_eagerly = True)
    print(model.summary())
    return model


caption_max_length = max_cap_ln
vocab_size = len(tokenizer.index_word) + 1
concat_model = define_model(vocab_size, caption_max_length, embedding_matrix)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 106)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 106, 256)     1076992     ['input_7[0][0]']                
                                                                                                  
 dropout_4 (Dropout)            (None, 2048)         0           ['input_6[0][0]']                
                                                                                            

In [59]:
batch_size = 16
epochs = 3
steps = len(train_lemma_dict)
steps_per_epoch = steps//batch_size
# os.mkdir('models')

In [35]:
[[x1_train, x2_train], y_train] = data_generator(train_lemma_dict, xc_features, tokenizer, caption_max_length, vocab_size)
[[x1_test, x2_test], y_test] = data_generator(test_lemma_dict, xc_features, tokenizer, caption_max_length, vocab_size)

In [63]:
filepath="models/model-{epoch:02d}.hdf5"
checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

concat_model.fit(x = [x1_train,x2_train], y = y_train, epochs = epochs, batch_size = batch_size, callbacks = callbacks_list)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1eb2a6d9520>

In [64]:
concat_model.save('models/model_1.h5')
pred_model = load_model('models/model_1.h5')

In [65]:
def generate_caption(pred_model, tokenizer, image, max_length):
    in_text = '<START>'
    caption_text = []
    for i in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        seq = pad_sequences([seq], maxlen = max_length)
        # predict next word
        out = pred_model.predict([image,seq], verbose = 0)
        # convert probability to integer
        word_index = np.argmax(out)
        # map integer to word
        word = tokenizer.index_word[word_index]
        # stop if cannot map the word
        if word is None:
            break
        # append as input for generating next word
        in_text = in_text + ' ' + word
        # stop if the prediction is end
        if word != 'end':
            caption_text.append(word)
        if word == 'end':
            break
    return caption_text

In [124]:
demo = train[train['name'] == min(train['name'])]
demo_img = feature_extractor(demo['img_array'], demo['name'])
caption = generate_caption(pred_model, tokenizer, demo_img, caption_max_length)
print(' '.join(caption))

man wear black shirt black shirt hold white umbrella man wear black shirt
