In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import cv2
from nltk.corpus import stopwords
import string
import json
from time import time
import pickle
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50,preprocess_input,decode_predictions,decode_predictions
from keras.preprocessing import image
from keras.models import Model,load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input,Dense,Dropout,Embedding,LSTM
from keras.layers.merge import add

In [13]:
def str_to_dict(path):
    """Load a dictionary that was saved as text in the file at ``path``.

    The file may contain strict JSON or a Python literal (for example the
    ``str()`` of a dict, which uses single quotes). Parsers are tried in
    this order:

    1. ``json.loads`` on the raw text (valid JSON, apostrophes allowed
       inside strings).
    2. ``ast.literal_eval`` on the raw text (Python reprs, including
       strings that contain apostrophes or double quotes).
    3. The legacy behaviour: replace every ``'`` with ``"`` and parse the
       result as JSON. This still handles single-quoted text that uses
       JSON-only tokens such as ``true``/``null``.

    Args:
        path: Path of the text file to read.

    Returns:
        The parsed object (a dict for the files this notebook loads).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If none of the parsers accept the text
            (same exception type the legacy implementation raised).
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import ast

    with open(path) as f:
        text = f.read()

    try:
        return json.loads(text)
    except ValueError:
        # Not strict JSON (json.JSONDecodeError is a ValueError); try the next parser.
        pass

    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError):
        # Not a plain Python literal either; fall back to the legacy path.
        pass

    # Legacy path: blanket quote replacement. Breaks on apostrophes inside
    # strings, which is why it is only the last resort.
    return json.loads(text.replace("'", "\""))

In [14]:
# Load the mapping whose keys identify training images and whose values are
# the lists of caption strings consumed by data_generator.
train_img_Cap_map = str_to_dict("train_img_cap_map.txt")

In [43]:
# word_to_idx = str_to_dict("word_to_idx.txt")
# len(train_img_Cap_map)
# idx_to_word = str_to_dict("idx_to_word.txt")

6000

In [16]:
# Read the two comma-separated word dumps as raw text.
with open("total_word.txt","r") as f:
    raw_total_words = f.read()
with open("vocab.txt","r") as f:
    raw_vocab = f.read()

# Every comma-separated chunk is trimmed by two leading characters and one
# trailing character.
# NOTE(review): presumably this strips the " '" / "'" decoration of a list
# repr; if so, the first and last chunks (which also carry a bracket) deserve
# a check - confirm against the actual files.
total_words = [chunk[2:-1] for chunk in raw_total_words.split(",")]
vocab = [chunk[2:-1] for chunk in raw_vocab.split(",")]

# Word indices start at 1; index 0 is never assigned here (the generator
# uses 0 as the padding value).
word_to_idx = {word: idx for idx, word in enumerate(total_words, start=1)}
idx_to_word = dict(enumerate(total_words, start=1))

# Two special words ('ss' / 'es' - presumably start/end-of-sequence markers).
# NOTE(review): the indices are hard-coded, which assumes total_words holds
# exactly 1845 entries; a different vocabulary would collide or leave gaps.
for special_idx, special_word in ((1846, 'ss'), (1847, 'es')):
    idx_to_word[special_idx] = special_word
    word_to_idx[special_word] = special_idx

# +1 accounts for the unused index 0.
vocab_size = len(word_to_idx) + 1
print("Vocab Size",vocab_size)

Vocab Size 1848


### Generator

In [53]:
def data_generator(train_img_Cap_map,encoding_train,word_to_idx,max_len,batch_size,vocab_size=None):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into one sample per predicted word: the
    input is the caption prefix (padded with 0 to ``max_len``) together
    with the image encoding, and the target is the one-hot next word.

    Args:
        train_img_Cap_map: Mapping of image key -> list of caption strings.
        encoding_train: Mapping of image key -> image feature vector.
        word_to_idx: Mapping of word -> integer index. Words missing from
            it are dropped from the caption.
        max_len: Length every caption prefix is padded (or truncated) to.
        batch_size: Number of *images* (not samples) per yielded batch;
            each image contributes all samples of all its captions.
        vocab_size: Width of the one-hot targets. Defaults to
            ``len(word_to_idx) + 1`` (index 0 is the padding word), so the
            function no longer depends on a notebook global.

    Yields:
        ``([image_features, caption_prefixes], next_word_one_hot)`` as a
        tuple, the form Keras documents for generator output.

    Raises:
        ValueError: On first iteration, if ``train_img_Cap_map`` is empty
            or ``batch_size`` is smaller than 1. Either condition would
            otherwise make the generator loop forever without yielding.
    """
    if not train_img_Cap_map:
        raise ValueError("train_img_Cap_map is empty; nothing to generate")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if vocab_size is None:
        vocab_size = len(word_to_idx) + 1

    X1,X2, y = [],[],[]

    # Number of images accumulated in the current batch.
    n = 0
    while True:
        for key,desc_list in train_img_Cap_map.items():
            n += 1

            photo = encoding_train[key]
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                for i in range(1,len(seq)):
                    #0 denote padding word
                    xi = pad_sequences([seq[0:i]],maxlen=max_len,value=0,padding='post')[0]
                    yi = to_categorical([seq[i]],num_classes=vocab_size)[0]

                    X1.append(photo)
                    X2.append(xi)
                    y.append(yi)

            # Checked once per image, after ALL of its captions were added.
            # Checking inside the caption loop cut the batch after the first
            # caption of the last image and pushed the rest into the next one.
            if n >= batch_size:
                yield ([np.array(X1),np.array(X2)],np.array(y))
                X1,X2,y = [],[],[]
                n = 0

In [18]:
# Load the pickled training encodings; data_generator indexes this object by
# the same keys as train_img_Cap_map.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load pickles produced by a trusted source.
with open("Encoding/encoded_train_features.pkl","rb") as f:
    encoding_train = pickle.load(f)

In [19]:
# Load the pickled encodings stored in the "test" features file (presumably
# the test-set counterpart of encoding_train - confirm against the encoder
# notebook).
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load pickles produced by a trusted source.
with open("Encoding/encoded_test_features.pkl","rb") as f:
    encoding_test = pickle.load(f)

In [42]:
# len(encoding_test)

1000

### Embedding Matrix

In [22]:
# Open the word-embedding text file (GloVe, 50-dimensional, judging by the
# file name). The handle stays open here; the parsing cell that follows
# reads it line by line and closes it.
f = open("glove.6B.50d.txt",encoding='utf8')

In [23]:
# word -> embedding vector, filled from the already-opened file handle `f`.
embedding_index = {}

# try/finally guarantees the handle is released even when a line fails to
# parse; previously an exception in the loop left the file open.
try:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by whitespace.
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue

        word = values[0]
        word_embedding = np.array(values[1:],dtype='float')
        embedding_index[word] = word_embedding
finally:
    f.close()

In [24]:
# embedding_index['appe']

array([ 0.063795 , -0.12926  ,  0.57822  ,  0.091202 , -0.49303  ,
       -1.1864   , -0.33771  ,  0.098739 ,  0.36918  , -0.081918 ,
       -0.38308  ,  0.46332  , -0.6848   , -0.082209 , -0.20556  ,
        0.28582  ,  0.72209  , -0.0068532,  0.61001  , -0.017818 ,
       -0.4877   , -0.22869  , -1.4502   ,  0.91573  ,  0.17179  ,
        1.0807   ,  0.61252  , -0.20502  , -0.37131  ,  0.30343  ,
       -1.0234   , -0.38941  ,  1.1749   ,  0.1327   , -0.22279  ,
        0.0060945, -0.28316  , -0.46707  ,  0.11977  , -0.19949  ,
        0.38805  ,  0.056067 , -0.36484  ,  0.61864  , -0.11445  ,
        0.17343  ,  0.52787  , -0.35576  , -0.41406  ,  0.70935  ])

In [25]:
def get_embedding_matrix():
    """Assemble the (vocab_size, 50) embedding matrix for the vocabulary.

    Reads the module-level ``vocab_size``, ``word_to_idx`` and
    ``embedding_index``. Row ``idx`` receives the pre-trained vector of the
    word mapped to ``idx``; rows of words that have no pre-trained vector
    (and any index that no word maps to) stay all-zero.

    Returns:
        A numpy array of shape ``(vocab_size, 50)``.
    """
    emb_dim = 50
    matrix = np.zeros((vocab_size, emb_dim))

    for token, row in word_to_idx.items():
        vector = embedding_index.get(token)
        if vector is None:
            # No pre-trained embedding for this word: keep the zero row.
            continue
        matrix[row] = vector

    return matrix

In [26]:
# Build the embedding matrix once and display its shape as the cell output
# (rows = vocab_size, columns = embedding dimension).
embedding_matrix = get_embedding_matrix()
embedding_matrix.shape

(1848, 50)

### Model Architecture

In [38]:
def model_():
    """Build the (uncompiled) two-input captioning model.

    One input is a 2048-value image feature vector, the other a sequence of
    35 word indices (a caption prefix). Both branches are reduced to 256
    values, summed, and decoded into a softmax over ``vocab_size`` words.
    Reads the module-level ``vocab_size``.

    NOTE(review): code elsewhere in the notebook may address layers by their
    position in ``model.layers``; adding or reordering layers here can shift
    those positions.

    Returns:
        A Keras ``Model`` with inputs ``[image_features, captions]``.
    """
    # Image branch: dropout, then project the 2048 features down to 256.
    input_img_features = Input(shape=(2048,))
    inp_img1 = Dropout(0.3)(input_img_features)
    inp_img2 = Dense(256,activation='relu')(inp_img1)
    # Captions as Input
    # 35 must equal the max_len used when padding captions for training.
    input_captions = Input(shape=(35,))
    # mask_zero=True makes index 0 (the padding value) a masked timestep;
    # output_dim=50 matches the width of the pre-trained embedding matrix.
    inp_cap1 = Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_captions)
    inp_cap2 = Dropout(0.3)(inp_cap1)
    inp_cap3 = LSTM(256)(inp_cap2)
    # Merge the two 256-wide branches by element-wise addition, then decode.
    decoder1 = add([inp_img2,inp_cap3])
    decoder2 = Dense(256,activation='relu')(decoder1)
    # One probability per vocabulary index: the predicted next word.
    outputs = Dense(vocab_size,activation='softmax')(decoder2)

    # Combined Model
    model = Model(inputs=[input_img_features,input_captions],outputs=outputs)
    return model

In [39]:
# Instantiate the captioning model and print its layer-by-layer summary.
model = model_()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 35, 50)       92400       input_7[0][0]                    
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 2048)         0           input_6[0][0]                    
__________________________________________________________________________________________________
dropout_7 

In [40]:
# Important Thing - Embedding Layer
# Load the pre-trained word vectors into the model's Embedding layer and
# freeze it so training does not modify them. The layer is located by type
# instead of by position (previously model.layers[2]), so this keeps working
# if layers are added to or reordered in the model.
embedding_layer = next(
    (layer for layer in model.layers if isinstance(layer, Embedding)), None
)
if embedding_layer is None:
    raise ValueError("model has no Embedding layer to initialise")
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [41]:
# Compile with categorical cross-entropy (the generator produces one-hot
# next-word targets) and the Adam optimizer.
model.compile(loss='categorical_crossentropy',optimizer="adam")


### Training of Model

In [56]:
# Training configuration.
epochs = 20      # number of training passes run by train()
max_len = 35     # padded caption length; must match Input(shape=(35,)) in model_()
batch_size = 50  # images per generator batch (each image yields many samples)

# Kept for backward compatibility. The generator emits one batch per
# `batch_size` images, so the "pictures per batch" figure has to be that same
# number. Dividing by a different value (previously 2000) made each epoch
# cover only a small fraction of the images.
number_pics_per_batch = batch_size

# steps_per_epoch counts generator batches, so one epoch walks the whole
# training set. max(1, ...) avoids a zero-step epoch on tiny datasets.
steps = max(1, len(train_img_Cap_map)//number_pics_per_batch)

In [57]:
def train(epoch = None):
    """Train the module-level ``model`` and save a checkpoint after each pass.

    Uses the module-level ``model``, ``data_generator``, ``train_img_Cap_map``,
    ``encoding_train``, ``word_to_idx``, ``max_len``, ``batch_size`` and
    ``steps``.

    Args:
        epoch: Number of passes to run. When omitted, the module-level
            ``epochs`` setting is used. Previously this argument was
            silently ignored and ``epochs`` was always used, so calling
            ``train()`` behaves exactly as before.

    Side effects:
        Writes ``./model_weights/model_<i>.h5`` after pass ``i`` (0-based),
        creating the directory if needed.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import os

    n_epochs = epochs if epoch is None else epoch

    # Make sure the checkpoint directory exists before training starts, so a
    # missing folder cannot fail the save after a full pass.
    os.makedirs('./model_weights', exist_ok=True)

    for i in range(n_epochs):
        # A fresh generator per pass restarts from the first image.
        generator = data_generator(train_img_Cap_map,encoding_train,word_to_idx,max_len,batch_size)
        model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)
        model.save('./model_weights/model_'+str(i)+'.h5')

In [58]:
# Run the training loop; with no argument the number of passes comes from
# the module-level `epochs` setting.
train()

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
