In [0]:
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import cv2
import pandas as pd
import functools
import ast
import pickle

In [0]:
image_data_dir = '/content/gdrive/My Drive/ImageCaptioning2/images'
label_data_dir = '/content/gdrive/My Drive/ImageCaptioning2/text'
model_save_dir = '/content/gdrive/My Drive/ImageCaptioning2/model'
weights_save_dir = '/content/gdrive/My Drive/ImageCaptioning2/weights'

# Read Label Data

In [0]:
label_data_file_path = label_data_dir + '/Flickr8k.token.csv'
label_df = pd.read_csv(label_data_file_path, header = None, delimiter = ',')
label_df.columns = ['Image_Name', 'Description']

In [0]:
print(len(label_df))

40460


In [0]:
label_df.head()

Unnamed: 0,Image_Name,Description
0,1000268201_693b08cb0e.jpg#0,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg#1,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg#2,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg#3,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg#4,A little girl in a pink dress going into a woo...


In [0]:
label_df.columns.tolist()

['Image_Name', 'Description']

In [0]:
label_df.dropna(inplace = True)

In [0]:
def process_image_name(image_name):
  image_name = image_name[:-2]
  return image_name

In [0]:
import re
import string

def process_description(description):
    
    regexes = [
        (r'\.+', '.'),
        (r'\s([?.!,"](?:\s|$))', r'\1'),
        (r'\s\s+', ' ')
        ]

    for regex, replacement in regexes:
        description = re.sub(regex, replacement, description)
     
    description = description.strip('.')
  
    # lower case entire string
    description = description.lower()

    # remove words with length == 1
    words = [w for w in description.split(' ') if len(w) > 1 ]
    
    table = str.maketrans('', '', string.punctuation)
    
    words = [w.translate(table) for w in words]
    
    description = ' '.join(words)
    
    description = '<SOS> ' + description + ' <EOS>'
    
    return description

In [0]:
label_df['Processed_Image_Name'] = label_df['Image_Name'].apply(lambda x: process_image_name(x))
label_df['Processed_Description'] = label_df['Description'].apply(lambda x: process_description(x))

In [0]:
label_df.head()

Unnamed: 0,Image_Name,Description,Processed_Image_Name,Processed_Description
0,1000268201_693b08cb0e.jpg#0,A child in a pink dress is climbing up a set o...,1000268201_693b08cb0e.jpg,<SOS> child in pink dress is climbing up set o...
1,1000268201_693b08cb0e.jpg#1,A girl going into a wooden building .,1000268201_693b08cb0e.jpg,<SOS> girl going into wooden building <EOS>
2,1000268201_693b08cb0e.jpg#2,A little girl climbing into a wooden playhouse .,1000268201_693b08cb0e.jpg,<SOS> little girl climbing into wooden playhou...
3,1000268201_693b08cb0e.jpg#3,A little girl climbing the stairs to her playh...,1000268201_693b08cb0e.jpg,<SOS> little girl climbing the stairs to her p...
4,1000268201_693b08cb0e.jpg#4,A little girl in a pink dress going into a woo...,1000268201_693b08cb0e.jpg,<SOS> little girl in pink dress going into woo...


In [0]:
label_df['Processed_Description'][999]

'<SOS> thin brown horse standing and small black horse sitting on sand <EOS>'

In [0]:
import os
files = set(os.listdir(image_data_dir))
image_names = set(list(label_df['Processed_Image_Name']))
print(len(files))
print(len(image_names))
diff = list(image_names.difference(files))

8091
8092


In [0]:
print(len(label_df))
for value in diff:
  label_df = label_df[label_df.Processed_Image_Name != value]
print(len(label_df))

40460
40455


In [0]:
print(len(set(list(label_df['Processed_Image_Name']))))

8091


In [0]:
# Build vocabulary
split_descriptions = list(map(lambda l: list(filter(lambda s: s != '', l)), label_df['Processed_Description'].map(lambda x: x.split(' '))))
label_df['Split_Description'] = split_descriptions

In [0]:
print(label_df.head())
print(label_df['Processed_Description'][10])
print(label_df['Split_Description'][10])

                    Image_Name  \
0  1000268201_693b08cb0e.jpg#0   
1  1000268201_693b08cb0e.jpg#1   
2  1000268201_693b08cb0e.jpg#2   
3  1000268201_693b08cb0e.jpg#3   
4  1000268201_693b08cb0e.jpg#4   

                                         Description  \
0  A child in a pink dress is climbing up a set o...   
1              A girl going into a wooden building .   
2   A little girl climbing into a wooden playhouse .   
3  A little girl climbing the stairs to her playh...   
4  A little girl in a pink dress going into a woo...   

        Processed_Image_Name  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                               Processed_Description  \
0  <SOS> child in pink dress is climbing up set o...   
1        <SOS> girl going into wooden building <EOS>   
2  <SOS> little girl climbing into wooden playhou...   
3  <SOS> little girl climbing the stairs 

In [0]:
print(len(split_descriptions))
print(split_descriptions[2])

40455
['<SOS>', 'little', 'girl', 'climbing', 'into', 'wooden', 'playhouse', '<EOS>']


In [0]:
flattened_descriptions = [item for sublist in split_descriptions for item in sublist]
vocab = set(flattened_descriptions)
vocab_size = len(vocab)

In [0]:
print(vocab_size)

8820


In [0]:
word_to_index = {}
begin = 1

for word in vocab:
    word_to_index[word] = begin
    begin += 1

In [0]:
print(word_to_index['<EOS>'])
print(word_to_index['<SOS>'])

1893
8140


In [0]:
def encode_description(split_description):
   encoded_description = [word_to_index[w] for w in split_description] 
   return encoded_description

In [0]:
label_df['Encoded_Description'] = label_df['Split_Description'].apply(lambda description: encode_description(description))

In [0]:
print(label_df['Encoded_Description'][0])
print(label_df['Encoded_Description'][1])
print(label_df['Encoded_Description'][2])
print(type(label_df['Encoded_Description'][2]))

[8140, 4872, 1729, 5702, 77, 1255, 8015, 1371, 1033, 6673, 7926, 1729, 4694, 7952, 6800, 1893]
[8140, 1625, 8402, 780, 7206, 1218, 1893]
[8140, 3301, 1625, 8015, 780, 7206, 7057, 1893]
<class 'list'>


In [0]:
import json
  
file_name = 'word_to_index.json'
json = json.dumps(word_to_index)
f = open(model_save_dir + '/' + file_name, 'w')
f.write(json)
f.close()

In [0]:
label_df.to_csv(label_data_dir + '/processed_label.csv')

In [0]:
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
import numpy as np

vgg16 = VGG16(weights='imagenet')
vgg16.layers.pop()
model = Model(inputs=vgg16.inputs, outputs=vgg16.layers[-1].output)

In [0]:
height = 224
width = 224
image_names = set(list(label_df['Processed_Image_Name']))

features_dict = {}
for i, img_name in enumerate(image_names):
    print('Running for iteration: {}'.format(i))
    img_path = image_data_dir + '/' + img_name
    image = load_img(img_path, target_size=(width, height))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    features = model.predict(image)
    features_dict[img_name] = features

Running for iteration: 0
Running for iteration: 1
Running for iteration: 2
Running for iteration: 3
Running for iteration: 4
Running for iteration: 5
Running for iteration: 6
Running for iteration: 7
Running for iteration: 8
Running for iteration: 9
Running for iteration: 10
Running for iteration: 11
Running for iteration: 12
Running for iteration: 13
Running for iteration: 14
Running for iteration: 15
Running for iteration: 16
Running for iteration: 17
Running for iteration: 18
Running for iteration: 19
Running for iteration: 20
Running for iteration: 21
Running for iteration: 22
Running for iteration: 23
Running for iteration: 24
Running for iteration: 25
Running for iteration: 26
Running for iteration: 27
Running for iteration: 28
Running for iteration: 29
Running for iteration: 30
Running for iteration: 31
Running for iteration: 32
Running for iteration: 33
Running for iteration: 34
Running for iteration: 35
Running for iteration: 36
Running for iteration: 37
Running for iteration:

In [0]:
pickle_out = open(model_save_dir + '/image_features', 'wb')
pickle.dump(features_dict, pickle_out)
pickle_out.close()

In [0]:
label_df = pd.read_csv(label_data_dir + '/processed_label.csv')

In [0]:
import json

file_name = 'word_to_index.json'
with open(model_save_dir + '/' + file_name) as f:
    word_to_index = json.loads(f.read())

In [0]:
print(label_df['Encoded_Description'][0])
print(type(label_df['Encoded_Description'][0]))
label_df['Encoded_Description'] = label_df['Encoded_Description'].apply(lambda x: ast.literal_eval(x))
print(label_df['Encoded_Description'][0])
print(type(label_df['Encoded_Description'][0]))
print(type(label_df['Encoded_Description'][0][0]))

[8140, 4872, 1729, 5702, 77, 1255, 8015, 1371, 1033, 6673, 7926, 1729, 4694, 7952, 6800, 1893]
<class 'str'>
[8140, 4872, 1729, 5702, 77, 1255, 8015, 1371, 1033, 6673, 7926, 1729, 4694, 7952, 6800, 1893]
<class 'list'>
<class 'int'>


In [0]:
with open(model_save_dir + '/image_features', 'rb') as f:
    features_dict = pickle.load(f)

In [0]:
image_names = set(list(label_df['Processed_Image_Name']))
path_dict = {image_name: image_data_dir + '/' + image_name for image_name in image_names}

In [0]:
f = open(label_data_dir + '/Flickr_8k.trainImages.txt', 'r')
train_image_names = f.readlines()
f.close()

In [0]:
max_description_length = max([len(desc) for desc in list(label_df['Encoded_Description'])])
vocab_size = len(word_to_index) + 1
print(max_description_length)
print(vocab_size)

34
8821


In [0]:
start_of_sentence_index = word_to_index['<SOS>']
print(start_of_sentence_index)

8140


In [0]:
latent_dim = 256
epochs = 200
embedding_size = 100
batch_size = 50

In [0]:
encoder_y = np.zeros((vocab_size, ))
encoder_y[start_of_sentence_index] = 1

train_image_names = [img.rstrip() for img in train_image_names]
print(len(train_image_names))

def get_next_batch():
  current_counter = 0
  
  num_steps = int(len(train_image_names) / batch_size)
  
  for i in range(num_steps):
    subset_images = train_image_names[current_counter: current_counter + batch_size]
    current_counter += batch_size
    encoder_x_train = []
    decoder_y_train = []
    decoder_x_train = []
    encoder_y_train = []
    for image_name in subset_images:
      features = features_dict[image_name]
      images_df = label_df.loc[label_df['Processed_Image_Name'] == image_name]
      for description in list(images_df['Encoded_Description']):
        decoder_x = description[:-1]
        diff = max_description_length - len(decoder_x)
        for _ in range(diff):
          decoder_x.append(0)
        decoder_y = np.zeros((max_description_length, vocab_size))
        for index, value in enumerate(description[1:]):
          decoder_y[index, value] = 1
        decoder_x_train.append(decoder_x)
        decoder_y_train.append(decoder_y)
        encoder_x_train.append(features)
        encoder_y_train.append(encoder_y)
    yield np.asarray(encoder_x_train), np.asarray(encoder_y_train), np.asarray(decoder_x_train), np.asarray(decoder_y_train)  

6000


In [0]:
import os
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from keras.models import load_model
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.models import load_model

Using TensorFlow backend.


In [0]:
def build_model():
  encoder_inputs = Input(shape = (1, 4096))
  encoder_lstm = LSTM(latent_dim, return_state = True)
  encoder_lstm_outputs, encoder_h, encoder_c = encoder_lstm(encoder_inputs)
  encoder_dense = Dense(vocab_size, activation = 'softmax')
  encoder_outputs = encoder_dense(encoder_lstm_outputs)
  encoder_states = [encoder_h, encoder_c]

  decoder_inputs = Input(shape = (34, ))

  decoder_embedding_layer = Embedding(vocab_size, embedding_size)
  decoder_embedding_layer_output = decoder_embedding_layer(decoder_inputs)

  decoder_lstm = LSTM(latent_dim, return_state = True, return_sequences = True)
  decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding_layer_output, initial_state = encoder_states)

  decoder_dense = TimeDistributed(Dense(vocab_size, activation = 'softmax'))
  decoder_outputs = decoder_dense(decoder_lstm_outputs)

  model = Model([encoder_inputs, decoder_inputs], [encoder_outputs, decoder_outputs])

  model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
  
  return model

In [0]:
import os.path
run_filename = model_save_dir + '/current_run'

if os.path.isfile(run_filename):
  with open(run_filename, 'rb') as f:
    run = pickle.load(f)
else:    
  run = 1
  with open(run_filename, 'wb') as f:
    pickle.dump(run, f)

In [0]:
epochs_per_run = 10
run_count = int(epochs / epochs_per_run)

def get_model_filename(index):
  file_name = 'model_' + str(index) + '.h5'
  return model_save_dir + '/' + file_name

def get_weights_filename(index):
  file_name = 'model_' + str(index) + '.h5'
  return weights_save_dir + '/' + file_name

In [0]:
print(run)
print(run_count)

19
20


In [0]:
while run <= run_count:

  if run != 1:
    print('Loading model for run:{}'.format(run - 1))
    model = load_model(get_model_filename(run - 1))
  else:  
    print('Building model for run:{}'.format(run))
    model = build_model()
  
  for epoch in np.arange((run - 1) * epochs_per_run + 1, run * epochs_per_run + 1):
      step_count = 0
      for encoder_x_train, encoder_y_train, decoder_x_train, decoder_y_train in get_next_batch():
          #print('Running step: ' + str(step_count))
          loss = model.train_on_batch([encoder_x_train, decoder_x_train], 
            [encoder_y_train, decoder_y_train])

          if step_count % 10 == 0:
              #print('Epoch: {}, step_count: {}, loss: {}, accuracy: {}'.format(epoch, step_count, loss, accuracy))
            print('Epoch: {}, step_count: {}, loss: {}'.format(epoch, step_count, loss))

          step_count += 1

  model_file_name = get_model_filename(run)
  model.save(model_file_name)
  weights_file_name = get_weights_filename(run)
  model.save_weights(weights_file_name) 
  run += 1
  with open(run_filename, 'wb') as f:
    pickle.dump(run, f)

Loading model for run:18
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch: 181, step_count: 0, loss: [0.50926256, 1.1920929e-07, 0.50926244, 1.0, 0.18517648]
Epoch: 181, step_count: 10, loss: [0.49315622, 1.1920929e-07, 0.4931561, 1.0, 0.17729412]
Epoch: 181, step_count: 20, loss: [0.47670504, 1.1920929e-07, 0.47670493, 1.0, 0.17952941]
Epoch: 181, step_count: 30, loss: [0.49555576, 1.1920929e-07, 0.49555564, 1.0, 0.17576471]
Epoch: 181, step_count: 40, loss: [0.4847694, 1.1920929e-07, 0.48476928, 1.0, 0.17588235]
Epoch: 181, step_count: 50, loss: [0.468751, 1.1920929e-07, 0.4687509, 1.0, 0.17682353]
Epoch: 181, step_count: 60, loss: [0.4647728, 1.1920929e-07, 0.46477267, 1.0, 0.17858824]
Epoch: 181, step_count: 70, loss: [0.49749303, 1.1920929e-07, 0.4974929, 1.0, 0.18]
Epoch: 181, step_count: 80, loss: [0.473088, 1.1920929e-07, 0.47308788, 1.0, 0.17894118]
Epoch: 181, step_count: 90, loss: [0.4896481, 1.1920

  '. They will not be included '


Loading model for run:19
Epoch: 191, step_count: 0, loss: [0.48260775, 1.1920929e-07, 0.48260763, 1.0, 0.18976471]
Epoch: 191, step_count: 10, loss: [0.46820387, 1.1920929e-07, 0.46820375, 1.0, 0.18411765]
Epoch: 191, step_count: 20, loss: [0.4544596, 1.1920929e-07, 0.4544595, 1.0, 0.18435293]
Epoch: 191, step_count: 30, loss: [0.47265813, 1.1920929e-07, 0.472658, 1.0, 0.18105882]
Epoch: 191, step_count: 40, loss: [0.46582198, 1.1920929e-07, 0.46582186, 1.0, 0.17929412]
Epoch: 191, step_count: 50, loss: [0.45234853, 1.1920929e-07, 0.4523484, 1.0, 0.17917646]
Epoch: 191, step_count: 60, loss: [0.4393575, 1.1920929e-07, 0.43935737, 1.0, 0.1877647]
Epoch: 191, step_count: 70, loss: [0.47270477, 1.1920929e-07, 0.47270465, 1.0, 0.18729411]
Epoch: 191, step_count: 80, loss: [0.4493616, 1.1920929e-07, 0.44936147, 1.0, 0.18505882]
Epoch: 191, step_count: 90, loss: [0.46143064, 1.1920929e-07, 0.46143052, 1.0, 0.18482353]
Epoch: 191, step_count: 100, loss: [0.45486617, 1.1920929e-07, 0.45486605,

  '. They will not be included '
