In [86]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [87]:
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
import glob
from PIL import Image
from time import time

from keras import Input, layers
from keras import optimizers
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.models import Model
from keras.utils import to_categorical

In [None]:
token_path = "/content/gdrive/My Drive/Flickr8k/Flickr8k_text/Flickr8k.lemma.token.txt"
train_images_path = '/content/gdrive/My Drive/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt'
test_images_path = '/content/gdrive/My Drive/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt'
images_path = '/content/gdrive/My Drive/Flickr8k/Flickr8k_images/'
glove_path = '/content/gdrive/My Drive/glove6b'

doc = open(token_path,'r').read()


In [89]:
descriptions = dict()
for line in doc.split('\n'):
        tokens = line.split()
        if len(line) > 2:
          image_id = tokens[0].split('.')[0]
          image_desc = ' '.join(tokens[1:])
          if image_id not in descriptions:
              descriptions[image_id] = list()
          descriptions[image_id].append(image_desc)

In [90]:
table = str.maketrans('', '', string.punctuation)
for key, desc_list in descriptions.items():
    for i in range(len(desc_list)):
        desc = desc_list[i]
        desc = desc.split()
        desc = [word.lower() for word in desc]
        desc = [w.translate(table) for w in desc]
        desc_list[i] =  ' '.join(desc)

In [92]:
vocabulary = set()
for key in descriptions.keys():
        [vocabulary.update(d.split()) for d in descriptions[key]]
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 6751


In [93]:
lines = list()
for key, desc_list in descriptions.items():
    for desc in desc_list:
        lines.append(key + ' ' + desc)
new_descriptions = '\n'.join(lines)

In [94]:
doc = open(train_images_path,'r').read()
dataset = list()
for line in doc.split('\n'):
    if len(line) > 1:
      identifier = line.split('.')[0]
      dataset.append(identifier)

train = set(dataset)

In [95]:
doc = open(test_images_path,'r').read()
dataset = list()
for line in doc.split('\n'):
    if len(line) > 1:
      identifier = line.split('.')[0]
      dataset.append(identifier)

test = set(dataset)

In [96]:
test

{'3433982387_3fa993cf5a',
 '3737539561_d1dc161040',
 '2077079696_03380d218b',
 '1472230829_803818a383',
 '2308271254_27fb466eb4',
 '293327462_20dee0de56',
 '2525270674_4ab536e7ec',
 '2075321027_c8fcbaf581',
 '3640422448_a0f42e4559',
 '1808370027_2088394eb4',
 '2084217208_7bd9bc85e5',
 '244571201_0339d8e8d1',
 '3317073508_7e13565c1b',
 '2495931537_9b8d4474b6',
 '2712787899_d85048eb6a',
 '498444334_a680d318a1',
 '2283966256_70317e1759',
 '3627011534_485f667b10',
 '463978865_c87c6ca84c',
 '3692593096_fbaea67476',
 '464251704_b0f0c4c87a',
 '3064383768_f6838f57da',
 '3016606751_0e8be20abd',
 '2559503010_84f20b3bc9',
 '1056338697_4f7d7ce270',
 '3439382048_d2e23b2b4c',
 '3135504530_0f4130d8f8',
 '3025549604_38b86198f5',
 '3432550415_e7b77232de',
 '566397227_a469e9e415',
 '3545652636_0746537307',
 '444481722_690d0cadcf',
 '2502905671_c6039804ab',
 '2103568100_5d018c495b',
 '3537400880_8f410d747d',
 '3584561689_b6eb24dd70',
 '2475162978_2c51048dca',
 '2112921744_92bf706805',
 '3263497678_8bb688

In [97]:
img = glob.glob(images_path + '*.jpg')
train_images = set(open(train_images_path, 'r').read().strip().split('\n'))
train_img = []
for i in img: 
    if i[len(images_path):] in train_images:
        train_img.append(i)

test_images = set(open(test_images_path, 'r').read().strip().split('\n'))
test_img = []
for i in img: 
    if i[len(images_path):] in test_images: 
        test_img.append(i)

In [98]:
train_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train:
        if image_id not in train_descriptions:
            train_descriptions[image_id] = list()
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        train_descriptions[image_id].append(desc)

In [99]:
test_descriptions = dict()
for line in new_descriptions.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in test:
        if image_id not in test_descriptions:
            test_descriptions[image_id] = list()
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        test_descriptions[image_id].append(desc)

In [101]:
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)

In [102]:
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]

print('Vocabulary = %d' % (len(vocab)))

Vocabulary = 1350


In [103]:
ixtoword = {}
wordtoix = {}
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

vocab_size = len(ixtoword) + 1

In [104]:
all_desc = list()
for key in train_descriptions.keys():
    [all_desc.append(d) for d in train_descriptions[key]]
lines = all_desc
max_length = max(len(d.split()) for d in lines)

print('Description Length: %d' % max_length)

Description Length: 38


In [105]:
embeddings_index = {} 
f = open(os.path.join(glove_path, 'glove.6B.200d.txt'), encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [106]:
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [107]:
model = InceptionV3(weights='imagenet')

In [108]:
model_new = Model(model.input, model.layers[-2].output)

In [109]:
def preprocess(image_path):
    img = image.load_img(image_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return x

In [110]:
def encode(image):
    image = preprocess(image) 
    fea_vec = model_new.predict(image) 
    fea_vec = np.reshape(fea_vec, fea_vec.shape[1])
    return fea_vec

#encoding_train = {}
#for img in train_img:
 #   encoding_train[img[len(images_path):]] = encode(img)
#train_features = encoding_train

#encoding_test = {}
#for img in test_img:
 #   encoding_test[img[len(images_path):]] = encode(img)

In [111]:
encoding_train = {}
encoding_test = {}


import pickle #credits to stack overflow user= blender

with open('/content/gdrive/My Drive/train_encoding.pkl', 'rb') as handle:
    encoding_train = pickle.load(handle)

with open('/content/gdrive/My Drive/test_encoding.pkl', 'rb') as handle:
    encoding_test = pickle.load(handle)

In [112]:
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 38)]         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 38, 200)      270200      input_9[0][0]                    
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 2048)         0           input_8[0][0]                    
____________________________________________________________________________________________

In [113]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [114]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [115]:
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    X1, X2, y = list(), list(), list()
    n=0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n+=1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)

            if n==num_photos_per_batch:
                yield ([array(X1), array(X2)], array(y))
                X1, X2, y = list(), list(), list()
                n=0

In [116]:
epochs = 30
batch_size = 3
steps = len(train_descriptions)//batch_size

#generator = data_generator(train_descriptions, train_features, wordtoix, max_length, batch_size)
#model.fit(generator, epochs=epochs, steps_per_epoch=steps, verbose=1)

model.load_weights('/content/gdrive/My Drive/mymodel.h5')

In [117]:
#model.save_weights('/content/mymodel.h5')

In [118]:
def greedySearch(photo):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = ixtoword[yhat]
        in_text += ' ' + word
        if word == 'endseq':
            break

    final = in_text.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final

In [119]:
def beam_search_predictions(image, beam_index = 3):
    start = [wordtoix["startseq"]]
    start_word = [[start, 0.0]]
    while len(start_word[0][0]) < max_length:
        temp = []
        for s in start_word:
            par_caps = sequence.pad_sequences([s[0]], maxlen=max_length, padding='post')
            preds = model.predict([image,par_caps], verbose=0)
            word_preds = np.argsort(preds[0])[-beam_index:]
            # Getting the top <beam_index>(n) predictions and creating a 
            # new list so as to put them via the model again
            for w in word_preds:
                next_cap, prob = s[0][:], s[1]
                next_cap.append(w)
                prob += preds[0][w]
                temp.append([next_cap, prob])
                    
        start_word = temp
        # Sorting according to the probabilities
        start_word = sorted(start_word, reverse=False, key=lambda l: l[1])
        # Getting the top words
        start_word = start_word[-beam_index:]
    
    start_word = start_word[-1][0]
    intermediate_caption = [ixtoword[i] for i in start_word]
    final_caption = []
    
    for i in intermediate_caption:
        if i != 'endseq':
            final_caption.append(i)
        else:
            break

    final_caption = ' '.join(final_caption[1:])
    return final_caption

In [None]:
pic = '2398605966_1d0c9e6a20.jpg'
image = encoding_test[pic].reshape((1,2048))
x=plt.imread(images_path+pic)
plt.imshow(x)
plt.show()

print("Greedy Search:",greedySearch(image))
print("Beam Search, K = 3:",beam_search_predictions(image, beam_index = 3))
print("Beam Search, K = 5:",beam_search_predictions(image, beam_index = 5))
print("Beam Search, K = 7:",beam_search_predictions(image, beam_index = 7))
print("Beam Search, K = 10:",beam_search_predictions(image, beam_index = 10))

from nltk.translate.bleu_score import sentence_bleu
bl = 0
for p in test:
  image = p+'.jpg'
  image = encoding_test[pic].reshape((1,2048))
  r = test_descriptions[p]
  d = beam_search_predictions(image, beam_index = 3)
  score = sentence_bleu(r, d)
  bl = bl+score
  print(score)


bleu = bl/1000
print(bleu)


In [121]:


from nltk.translate.bleu_score import sentence_bleu
bl = 0
i = 0
for p in test:
  i = i+1
  image = p+'.jpg'
  image = encoding_test[pic].reshape((1,2048))
  r = test_descriptions[p]
  d = beam_search_predictions(image, beam_index = 5)
  score = sentence_bleu(r, d)
  bl = bl+score
  print(i,score)


bleu = bl/1000
print(i,bleu)

1 0.5249126307719946
2 0.5305638914089146
3 0.28997844147152074
4 0.2866349951853739
5 0.4629068057316199
6 0.42038309467033413
7 0.32421646421108913
8 0.34864618107987183
9 0.3335278648107571
10 0.18784584025436754
11 0.2388488455322318
12 0.45901094012967086
13 0.5249686549711556
14 0.3159198427591763
15 0.4142214902898184
16 0.3733347364653099
17 0.2744656866480642
18 0.2372131540377057
19 0.4365623215127961
20 0.41128204129540774
21 0.23912896776277892
22 0.29082862656071573
23 0.39294672275271025
24 0.3080297600908789
25 0.3065460100108194
26 0.47905888955258785
27 0.37244265186746417
28 0.3507628026481858
29 0.3992886659015045
30 0.30266556395272204
31 0.6389876853059888
32 0.31359553122785805
33 0.42117371890534844
34 0.40852438723856205
35 0.39032168122678973
36 0.31234955390078434
37 0.34269612425037704
38 0.31359553122785805
39 0.37660466823780653
40 0.4759393282758227
41 0.314255729277696
42 0.22662309891912993
43 0.23578150803565898
44 0.3532255594134209
45 0.43268948516751

KeyboardInterrupt: ignored

In [None]:

from nltk.translate.bleu_score import sentence_bleu
bl = 0
for p in test:
  image = p+'.jpg'
  image = encoding_test[pic].reshape((1,2048))
  r = test_descriptions[p]
  d = beam_search_predictions(image, beam_index = 7)
  score = sentence_bleu(r, d)
  bl = bl+score
  print(score)


bleu = bl/1000
print('BLEU SCORE IS'+bleu)

In [125]:

from nltk.translate.bleu_score import sentence_bleu
bl = 0
for p in test:
  image = p+'.jpg'
  image = encoding_test[pic].reshape((1,2048))
  r = test_descriptions[p]
  d = greedySearch(image)
  score = sentence_bleu(r, d)
  bl = bl+score
  print(score)


bleu = bl/1000
print("Bleu score:",bleu)

0.4031249188485891
0.44002420888747285
0.3509726982243311
0.37902107521565515
0.291755222321701
0.373202308884518
0.2762921998439348
0.35884577330387607
0.3873562955977445
0.17642306150369455
0.2990695198259317
0.4375734868956942
0.45437856130026405
0.42644029364324865
0.4255158779768911
0.33483149841781423
0.37657917707148786
0.3267336674165555
0.466873534951577
0.5292031904718658
0.33959475318145005
0.37899820418055136
0.3973518452269017
0.47150402513047984
0.38327915426541587
0.46200370668831353
0.3617306369411334
0.4558009079797004
0.48155797687544905
0.4203129949288125
0.6107496286812021
0.4247752034930807
0.34505012541085295
0.450528871894257
0.34115708007679457
0.3093771121329176
0.40589301775404524
0.36879157378617644
0.41535444363522783
0.5026413055743576
0.49664052133650727
0.21098480158998953
0.3669834760563031
0.3445330145454746
0.5033724349608917
0.3655813126933284
0.4779285879258037
0.4411199375096363
0.2931950954898784
0.34982989490433664
0.37331747043513475
0.4036971437

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.4250795929370487
0.4590046060072806
0.4611582856250679
0.5146536269817716
0.3843578557457184
0.3354343787632011
0.5161481133270862
0.5124087319755878
0.48060607981442643
0.3728557287100086
0.4542747632261371
0.4457031151160146
0.3789112424879945
0.43923409690475806
0.3855160711342024
0.30028116996840226
0.39661389636690575
0.2627800303883764
0.33339689306341136
0.5771900053937047
0.4029443522770047
0.3962923363005083
0.3426683716939363
0.4413278458606416
0.4751292436423125
0.36369891186463815
0.4216170820652361
0.5349286705437757
0.6014033031399167
0.4733835843825693
0.26872458649773434
0.4780612905869109
0.2755543422531603
0.47545993013150367
0.3633533543136063
0.3942065654287719
0.25549018254123496
0.360661092398299
0.41840761042462565
0.4247752034930807
0.3427455557211075
0.3428201661000232
0.38622826550533074
0.2833453359024894
0.24123448477210344
0.46704821733891333
0.3862390586411139
0.5777183090225233
0.3404113770827923
0.4457031151160146
0.3667254332613537
0.48297373572829677