In [None]:
import tensorflow as tf
from pickle import load, dump
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets
import numpy as np
import random
import os

In [None]:
def load_doc(filename):
    doc = open(filename, 'r')
    data = doc.read()
    doc.close()
    return data

def load_imgs(filename):
    imgs = []
    data = load_doc(filename)
    for line in data.split('\n'):
        if len(line) < 1:
            continue
        img = line.split('#')[0]
        imgs.append(img)
    return set(imgs)

def load_descriptions(filename, photos):
    descriptions_dict = {}
    data = load_doc(filename)
    for line in data.split('\n'):
        words = line.split('\t')
        img, caption = words[0], words[1:]
        caption = ' '.join(caption)
        if img in photos:
            if img not in descriptions_dict:
                descriptions_dict[img] = []
            descriptions_dict[img].append(caption)
    return descriptions_dict

def load_features(filename, photos):
    all_features = load(open(filename, 'rb'))
    return all_features

def vocab(descriptions):
    vocab = []
    for captions in descriptions.values():
        for cap in captions:
            words = cap.split(' ')
            for word in words:
                if word not in vocab:
                    vocab.append(word)
    return vocab

def dict_to_list(descriptions):
    descriptions_list = []
    for key in descriptions.keys():
        [descriptions_list.append(captions) for captions in descriptions[key]]
    return descriptions_list

In [None]:
def tokenize(cap, vocab):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(vocab)

    seq = tokenizer.texts_to_sequences(cap)
    return tokenizer, seq

def load_images(photos):
    img = tf.io.read_file(photos)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.xception.preprocess_input(img)
    return img, photos

In [None]:
def img_paths(photos):
  paths = []
  for i in photos:
    thePath = 'Flicker8k_Dataset/' + i
    paths.append(thePath)
  return paths

In [None]:
def pre_tokenize(descriptions):
  new_descriptions = []
  for i in descriptions:
    for j in i:
      new_descriptions.append(j)
  return new_descriptions

In [None]:
photos = load_imgs('Flickr_8k.trainImages.txt')
print(len(photos))

descriptions_dict = load_descriptions('/content/drive/MyDrive/descriptions.txt', photos)
print(len(descriptions_dict))

features = tf.keras.applications.Xception(classes=3000, include_top=False, weights='imagenet')

new_input = features.input
hidden_layer = features.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

vocab = vocab(descriptions_dict)
print(len(vocab) + 1)

descriptions = dict_to_list(descriptions_dict)

imgs = img_paths(photos)
print(imgs[:5])
print(len(imgs))
photos_encode = sorted(set(imgs))
print(len(photos_encode))

image_dataset = tf.data.Dataset.from_tensor_slices(photos_encode)
#we load the images with the features from transfer learning
image_dataset = image_dataset.map(load_images, num_parallel_calls=tf.data.AUTOTUNE).batch(16)
len(image_dataset)

6000
6000
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
7564
['Flicker8k_Dataset/3463034205_e541313038.jpg', 'Flicker8k_Dataset/3656030945_fa003bd696.jpg', 'Flicker8k_Dataset/537758332_8beb9cf522.jpg', 'Flicker8k_Dataset/3551170666_01df31412d.jpg', 'Flicker8k_Dataset/119534510_d52b3781a3.jpg']
6000
6000


375

In [None]:
print(image_dataset)

<BatchDataset shapes: ((None, 299, 299, 3), (None,)), types: (tf.float32, tf.string)>


In [None]:
from tqdm import tqdm
for img, path in tqdm(image_dataset):
    batch_features = image_features_extract_model(img)
    batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))
    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())


100%|██████████| 375/375 [01:44<00:00,  3.58it/s]


In [None]:
#tokenize data
#pre_tokenizer = pre_tokenize(descriptions)
print(descriptions[:10])
tokenizer, train_seq = tokenize(descriptions, vocab)

['<start> child in a pink dress is climbing up a set of stairs in an entry way <end>', '<start> girl going into a wooden building <end>', '<start> little girl climbing into a wooden playhouse <end>', '<start> little girl climbing the stairs to her playhouse <end>', '<start> little girl in a pink dress going into a wooden cabin <end>', '<start> black dog and a spotted dog are fighting <end>', '<start> black dog and a tricolored dog playing with each other on the road <end>', '<start> black dog and a white dog with brown spots are staring at each other in the street <end>', '<start> dogs of different breeds looking at each other on the road <end>', '<start> dogs on pavement moving toward each other <end>']


In [None]:
print(tokenizer.index_word[1001])

cushion


In [None]:
print(train_seq)

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 4, 10, 11, 12, 3, 13, 14, 15, 16], [1, 17, 18, 19, 4, 20, 21, 16], [1, 22, 17, 8, 19, 4, 20, 23, 16], [1, 22, 17, 8, 24, 12, 25, 26, 23, 16], [1, 22, 17, 3, 4, 5, 6, 18, 19, 4, 20, 27, 16], [1, 28, 29, 30, 4, 31, 29, 32, 33, 16], [1, 28, 29, 30, 4, 34, 29, 35, 36, 37, 38, 39, 24, 40, 16], [1, 28, 29, 30, 4, 41, 29, 36, 42, 43, 32, 44, 45, 37, 38, 3, 24, 46, 16], [1, 47, 11, 48, 49, 50, 45, 37, 38, 39, 24, 40, 16], [1, 47, 39, 51, 52, 53, 37, 38, 16], [1, 22, 17, 54, 3, 55, 56, 3, 57, 11, 4, 58, 59, 36, 26, 60, 3, 4, 61, 16], [1, 22, 17, 7, 62, 3, 57, 11, 4, 63, 58, 59, 16], [1, 64, 17, 3, 24, 65, 66, 36, 67, 3, 57, 11, 4, 41, 68, 36, 4, 59, 39, 69, 16], [1, 7, 4, 17, 36, 70, 62, 3, 57, 11, 4, 59, 71, 16], [1, 17, 36, 70, 71, 72, 3, 24, 65, 16], [1, 73, 74, 39, 4, 75, 76, 77, 29, 56, 78, 79, 16], [1, 73, 74, 39, 24, 75, 25, 80, 4, 41, 29, 7, 81, 82, 16], [1, 73, 83, 39, 4, 75, 72, 36, 4, 41, 30, 28, 29, 62, 84, 25, 79, 16], [1, 85, 73, 86, 39, 4, 87, 75, 36,

In [None]:
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [None]:
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seq, padding='post')

In [None]:
print(len(cap_vector))

30000


In [None]:
def max_length_calc(tensor):
  return max(len(i) for i in tensor)

max_length = max_length_calc(train_seq)
print(max_length)

37


In [None]:
def repeat_imgs_5_times(photos_encode):
  new_photos = []
  for i in photos_encode:
    for j in range(5):
      new_photos.append(i)
  return new_photos



In [None]:
new_photos = repeat_imgs_5_times(photos_encode)
print(len(new_photos))

img_to_cap_vector = {}
for img, cap in zip(new_photos, cap_vector):
  #theImg = img.split('/')[1]
  if img not in img_to_cap_vector:
    img_to_cap_vector[img] = []
  img_to_cap_vector[img].append(cap)

img_keys = list(img_to_cap_vector.keys())
random.shuffle(img_keys)

slice_index = int(len(img_keys) * 0.8)
img_train_keys, img_val_keys = img_keys[:slice_index], img_keys[slice_index:]
print(len(img_train_keys))
print(len(img_val_keys))

img_name_train = []
cap_train = []
for img in img_train_keys:
  capt_len = len(img_to_cap_vector[img])
  img_name_train.extend([img] * capt_len)
  cap_train.extend(img_to_cap_vector[img])

img_name_val = []
cap_val = []
for img in img_val_keys:
  capt_len = len(img_to_cap_vector[img])
  img_name_val.extend([img] * capt_len)
  cap_val.extend(img_to_cap_vector[img])


len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

30000
4800
1200


(24000, 24000, 6000, 6000)

In [None]:
batch_size = 64
buffer_size = 1000
vocab_size = len(vocab) + 1
units = 1024
steps_per_epoch = len(img_name_train)
embedding_dims = 512
feature_shape = 2048
attention_features_shape = 64

In [None]:
def map_func(img_name, cap):
  img_tensor = np.load(img_name.decode('utf-8')+'.npy')
  return img_tensor, cap

In [None]:
#load all the dataset
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

#use map to load the numpy files in parallel
dataset = dataset.map(
    lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]),
    num_parallel_calls=tf.data.AUTOTUNE
)
dataset = dataset.shuffle(buffer_size).batch(batch_size)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


In [None]:
class Encoder(tf.keras.Model):
  def __init__(self, embedding_dim):
    super(Encoder, self).__init__()
    self.fc = tf.keras.layers.Dense(embedding_dim)

  def call(self, x):
    x = self.fc(x)
    x = tf.nn.relu(x)
    return x

class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.units = units
    self.W1 = tf.keras.layers.Dense(self.units)
    self.W2 = tf.keras.layers.Dense(self.units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden): #query is the hidden states and values are the outputs
    hidden_with_time_axis = tf.expand_dims(hidden, 1)
    score = self.V(tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum((attention_weights * features), axis=1)

    return attention_weights, context_vector

class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, dec_units, embedding_dims):
    super(Decoder, self).__init__()
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dims)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.dec_units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, dec_input, dec_hidden, features):
    attention_weights, context_vector = self.attention(features, dec_hidden)
    dec_input = self.embedding(dec_input)
    dec_input = tf.concat([tf.expand_dims(context_vector, axis=1), dec_input], axis=-1)
    output, state = self.gru(dec_input)
    #output = tf.reshape(output, (-1, output.shape[2]))
    dec_input = self.fc1(output)
    dec_input = tf.reshape(dec_input, (-1, dec_input.shape[2]))
    dec_input = self.fc2(dec_input)

    return dec_input, state

  def reset_state(self, batch_size):
    return tf.zeros((batch_size, self.dec_units))

encoder = Encoder(embedding_dims)
decoder = Decoder(vocab_size, units, embedding_dims)


In [None]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask
  return tf.reduce_mean(loss)

In [None]:
checkpoint_path = "/content/drive/MyDrive/img_cap/checkpoint"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [None]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  ckpt.restore(ckpt_manager.latest_checkpoint)

In [None]:
loss_plot = []

@tf.function
def train_step(img_tensor, target):
    loss = 0
    hidden = decoder.reset_state(batch_size)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)
    with tf.GradientTape() as tape:
        features = encoder(img_tensor)
        for i in range(1, target.shape[1]):
            predictions, hidden_state = decoder(dec_input, hidden, features)
            loss += loss_function(target[:, i], predictions)
            dec_input = tf.expand_dims(target[:, i], 1)
    total_loss = (loss / int(target.shape[1]))
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [None]:
epochs = 20
import time
for epoch in range(start_epoch, epochs):
  start = time.time()
  total_loss = 0

  for (batch, (img_tensor, target)) in enumerate(dataset):
    batch_loss, t_loss = train_step(img_tensor, target)
    total_loss += t_loss
    if batch % 100 == 0:
      print('Epoch: {}; Batch: {}; Loss: {:.4f}'.format(epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

  loss_plot.append(total_loss / steps_per_epoch)

  if epoch % 5 == 0:
    ckpt_manager.save()

  print('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss/steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))



Epoch: 2; Batch: 0; Loss: 2.5498
Epoch: 2; Batch: 100; Loss: 1.1426
Epoch: 2; Batch: 200; Loss: 1.0486
Epoch: 2; Batch: 300; Loss: 1.0554
Epoch 2 Loss 0.017147
Time taken for 1 epoch 186.3516345024109 sec

Epoch: 3; Batch: 0; Loss: 0.9750
Epoch: 3; Batch: 100; Loss: 0.9372
Epoch: 3; Batch: 200; Loss: 0.9044
Epoch: 3; Batch: 300; Loss: 0.9895
Epoch 3 Loss 0.014354
Time taken for 1 epoch 147.27396035194397 sec

Epoch: 4; Batch: 0; Loss: 0.9284
Epoch: 4; Batch: 100; Loss: 0.8854
Epoch: 4; Batch: 200; Loss: 0.8582
Epoch: 4; Batch: 300; Loss: 0.8350
Epoch 4 Loss 0.012944
Time taken for 1 epoch 147.43632292747498 sec

Epoch: 5; Batch: 0; Loss: 0.8635
Epoch: 5; Batch: 100; Loss: 0.6970
Epoch: 5; Batch: 200; Loss: 0.8460
Epoch: 5; Batch: 300; Loss: 0.7288
Epoch 5 Loss 0.011823
Time taken for 1 epoch 147.0920271873474 sec

Epoch: 6; Batch: 0; Loss: 0.7691
Epoch: 6; Batch: 100; Loss: 0.6935
Epoch: 6; Batch: 200; Loss: 0.6323
Epoch: 6; Batch: 300; Loss: 0.6425
Epoch 6 Loss 0.010864
Time taken for

In [1]:
def evaluate(image):
  temp_input = tf.expand_dims(load_images(image)[0], 0)
  img_tensor_val = image_features_extract_model(temp_input)
  img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
  hidden = decoder.reset_state(1)
  features = encoder(img_tensor_val)
  dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
  result = []
  for i in range(max_length):
    predictions, hidden_state = decoder(dec_input, hidden, features)
    predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
    result.append(tokenizer.index_word[predicted_id])

    if tokenizer.index_word[predicted_id] == '<end>':
      return result

    dec_input = tf.expand_dims([predicted_id], 0)


  return result


In [None]:
random_img = np.random.randint(0, len(img_name_val))
image = img_name_val[random_img]
real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[random_img] if i not in [0]])
result = evaluate(image)
print(image)
print('real: ' + str(real_caption))
print('predicted: ' + str(' '.join(result)))

Flicker8k_Dataset/2990563425_2f7246f458.jpg
real: <start> man on a motorbike riding down a steep rock face <end>
predicted: man on a rock <end>
