<a href="https://colab.research.google.com/github/kunwarsaaim/Image_Captioning/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from os import listdir
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Input
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import BatchNormalization
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import Concatenate
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.optimizers import Adam
import cv2


In [6]:
def load_photos(directory):      #directory:in which photos are stored
  image_arr = dict()
  for file in listdir(directory):
    filename = directory + '/' + file
    #load image from file
    #image = load_img(filename,target_size=(224,224))
    image =  cv2.imread(filename)
    image = cv2.resize(image,(224,224))
    #convert to numpy array
    #image = img_to_array(image)
    #reshape data for the model
    image = np.expand_dims(image,axis=0)
    image = preprocess_input(image)
    #get image id
    image_id = file.split(".")[0]
    #mapping image_id to numpy array converted image
    image_arr[image_id] = image
  return image_arr
                          

In [7]:
img_arr = load_photos("images")


In [10]:
img_arr

{'1000268201_693b08cb0e': array([[[[ -27.939003 ,  -35.779    ,  -31.68     ],
          [   2.060997 ,    6.2210007,   -6.6800003],
          [  21.060997 ,   27.221    ,   14.32     ],
          ...,
          [ -99.939    , -114.779    , -120.68     ],
          [ -98.939    , -111.779    , -119.68     ],
          [ -97.939    , -107.779    , -118.68     ]],
 
         [[ -30.939003 ,  -40.779    ,  -37.68     ],
          [   6.060997 ,    9.221001 ,    0.3199997],
          [  24.060997 ,   31.221    ,   19.32     ],
          ...,
          [ -94.939    , -106.779    , -115.68     ],
          [ -96.939    , -108.779    , -114.68     ],
          [ -89.939    ,  -96.779    , -114.68     ]],
 
         [[ -33.939003 ,  -45.779    ,  -36.68     ],
          [  10.060997 ,   13.221001 ,    6.3199997],
          [  23.060997 ,   31.221    ,   21.32     ],
          ...,
          [ -96.939    , -107.779    , -119.68     ],
          [-100.939    , -106.779    , -120.68     ],
      

In [11]:
def load_doc(filename):
  #open file
  file = open(filename,'r')
  text = file.read()
  file.close()
  return text


#discription for images
def load_discription(doc):
  mapping = dict()
  for line in doc.split("\n"):
    #split line by white spaces
    tokens = line.split()
    if len(line)<2:
      continue
    #first token as image id and rest as caption
    image_id,caption = tokens[0],tokens[1:]
    #remove filename from image
    image_id = image_id.split('.')[0]
    #tokens to string
    caption = " ".join(caption)
    #store first caption
    if image_id not in mapping:
      mapping[image_id] = caption
  return mapping

    

In [25]:
def text_prepross(filename):
    #load text from file
    raw_text = load_doc(filename)
    #seprate caption and image name
    text_dict = load_discription(raw_text)
    #convert dict to dataframe
    text_df = pd.DataFrame(list(text_dict.items()))
    #convert text to integers
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_df[1])
    vocab_size = len(tokenizer.word_index)+1
    seq = tokenizer.texts_to_sequences(text_df[1])
    #make all the tokenized caption of equal length
    padded_seq = (sequence.pad_sequences(seq))
    #make 2d array to list to assign to dataframe col
    padded_seq = padded_seq.tolist()
    for i in range(len(padded_seq)):
        padded_seq[i] = np.array(padded_seq[i])
        padded_seq[i] = padded_seq[i].reshape(33,1)
    text_df[2] = padded_seq
    return text_df,vocab_size
    
    

In [26]:
filename = "Flickr_TextData/Flickr8k.token.txt"

In [27]:
df,vocab = text_prepross(filename)

In [28]:
df.head()

Unnamed: 0,0,1,2
0,1000268201_693b08cb0e,A child in a pink dress is climbing up a set o...,"[[0], [0], [0], [0], [0], [0], [0], [0], [0], ..."
1,1001773457_577c3a7d70,A black dog and a spotted dog are fighting,"[[0], [0], [0], [0], [0], [0], [0], [0], [0], ..."
2,1002674143_1b742ab4b8,A little girl covered in paint sits in front o...,"[[0], [0], [0], [0], [0], [0], [0], [0], [0], ..."
3,1003163366_44323f5815,A man lays on a bench while his dog sits by him .,"[[0], [0], [0], [0], [0], [0], [0], [0], [0], ..."
4,1007129816_e794419615,A man in an orange hat starring at something .,"[[0], [0], [0], [0], [0], [0], [0], [0], [0], ..."


In [30]:
(df[2][1]).shape

(33, 1)

In [31]:
 image_df = pd.DataFrame(list(img_arr.items()))

In [32]:
image_df.head()

Unnamed: 0,0,1
0,1000268201_693b08cb0e,"[[[[-27.939003 -35.779 -31.68 ], [ 2.060..."
1,1001773457_577c3a7d70,"[[[[ -4.939003 -16.779 -18.68 ], [ -3.93..."
2,1002674143_1b742ab4b8,"[[[[105.061 99.221 87.32 ], [104.061 97.221..."
3,1003163366_44323f5815,"[[[[142.061 121.221 83.32 ], [146.061 124.221..."
4,1007129816_e794419615,"[[[[-72.939 -86.779 -95.68 ], [-71.939 -85.779..."


In [24]:
def model_arc(vocab_size=4421,learning_rate=0.00051,lstm_layers=3,embedding_size=300,dropout_rate=0.22):
    image_input = Input(shape=(224,224,3))
    model_vgg16 = VGG16(weights='imagenet',input_tensor=image_input)
    for layer in model_vgg16.layers:
        layer.trainable = False
    dense_input = BatchNormalization(axis=-1)(model_vgg16.layers[-2].output)
    image_dense = Dense(units=embedding_size)(dense_input)
    image_embedding = RepeatVector(1)(image_dense)
    sentence_input = Input(shape=(33,))
    word_embedding = Embedding(input_dim=vocab_size,output_dim=embedding_size)(sentence_input)
    sequence_input = Concatenate(axis=1)([image_embedding,word_embedding])
    input_ = sequence_input
    for _ in range(lstm_layers):
        input_ = BatchNormalization(axis=1)(input_)
        lstm_out = LSTM(units=embedding_size,
                       return_sequences=True,
                       dropout=dropout_rate,
                       recurrent_dropout=dropout_rate)(input_)
        input_ = lstm_out
    sequence_output = TimeDistributed(Dense(units=vocab_size))(lstm_out)
    
    model = Model(inputs=[image_input,sentence_input],outputs=sequence_output)
    model.compile(optimizer=Adam(lr=learning_rate),loss='categorical_crossentropy',metrics=['accuracy'])
    print(model.summary())
    return model
    

In [25]:
model_arc()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_3[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]          

<keras.engine.training.Model at 0xb32bcc3c8>