<a href="https://colab.research.google.com/github/himanshunaidu/cnn_image_caption/blob/master/ImageCaption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [2]:
from google.colab import files
uploaded = files.upload()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import tensorflow as tf
import numpy as np
import math
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Model
from tensorflow.keras.applications import InceptionResNetV2, inception_v3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as kb
from tqdm import tqdm
import re

In [5]:
print(tf.__version__)

2.5.0


In [6]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [7]:
!ls

create_model.py  gdrive		  prep_ds.py		preprocess_labels.py
drive		 load_dataset.py  preprocess_images.py	sample_data


# LOCAL IMPORTS AND VARIABLES

In [8]:
base_path = './gdrive/MyDrive/ImageCaptions/'
caption_file_path = './Flickr8k.token.txt'
ds_path = 'Flicker8k_Dataset/'
feature_path = 'Flicker8k_Dataset_Features/'
filler_path = 'Filler/'
train_images_path = 'Flickr_8k.trainImages.txt'

# LOAD IMAGES AND CAPTIONS

In [9]:
from load_dataset import loadCaptions, getImages

In [10]:
img_list = getImages(os.path.join(base_path, train_images_path))

In [11]:
capdict = loadCaptions(os.path.join(base_path, caption_file_path))

In [12]:
capdictset = {}
for i, key in enumerate(capdict.keys()):
  if i>3: break
  capdictset[key] = capdict.get(key)

# PREPROCESS IMAGES

In [13]:
from preprocess_images import load_image, get_images, preprocess_images

In [14]:
img_name_list, img_path_list, img_dataset = get_images(base_path, train_images_path, ds_path)

6001
./gdrive/MyDrive/ImageCaptions/Flicker8k_Dataset/2513260012_03d33305cf.jpg


In [15]:
# USE THIS FUNCTION TO SAVE ALL THE PRE-PROCESSED IMAGE FEATURES AS GENERATED BY INCEPTIONV3
# preprocess_images(img_name_list, img_path_list, img_dataset, ds_path, feature_path)

# PREPROCESS CAPTIONS

In [16]:
# USE THE FUNCTIONS TO PRE-PROCESS ALL THE CAPTIONS

In [17]:
from preprocess_labels import captions_clean, add_token, subset_data_dict, all_captions, max_caption_length, create_tokenizer

In [18]:
captions_clean(capdict)

In [19]:
capdict[list(capdict.keys())[0]]
# list(capdict.keys())[0]

['child inpink dress is climbing upset of stairs in an entry way',
 'girl going intowooden building',
 'little girl climbing intowooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl inpink dress going intowooden cabin']

In [20]:
training_dict = subset_data_dict (capdict, img_name_list) #startseq and enseq are being added

In [21]:
tokenizer, vocab_size, max_caption_words = create_tokenizer(training_dict)

# PREP DATASET

In [22]:
from prep_ds import data_prep, map_func, lazy_load_ds

In [23]:
train_X, train_y = data_prep(training_dict, tokenizer, max_caption_words, vocab_size, base_path, ds_path)

In [24]:
print(train_X[0], train_y[0])

./gdrive/MyDrive/ImageCaptions/Flicker8k_Dataset/997338199_7343367d7f.jpg [   1   38   62  105  371 2026    2    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]


In [25]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000

In [26]:
dataset = lazy_load_ds(train_X, train_y, BUFFER_SIZE, BATCH_SIZE, ds_path, feature_path)

In [27]:
dataset

<PrefetchDataset shapes: (<unknown>, <unknown>), types: (tf.float32, tf.int32)>

In [28]:
print(dataset.cardinality().numpy())

1


# CREATE MODEL

In [29]:
from create_model import Attention, CNN_Encoder, RNN_Decoder

# TRAIN

In [30]:
embedding_dim = 256
units = 512
vocab_size = vocab_size
num_steps = len(train_X) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [31]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [32]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [33]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [34]:
@tf.function
def train_step(img_tensor, target):
  loss = 0

  # initializing the hidden state for each batch
  # because the captions are not related from image to image
  hidden = decoder.reset_state(batch_size=target.shape[0])

  dec_input = tf.expand_dims([tokenizer.word_index['startseq']] * target.shape[0], 1)

  with tf.GradientTape() as tape:
      features = encoder(img_tensor)

      for i in range(1, target.shape[1]):
          # passing the features through the decoder
          predictions, hidden, _ = decoder(dec_input, features, hidden)

          loss += loss_function(target[:, i], predictions)

          # using teacher forcing
          dec_input = tf.expand_dims(target[:, i], 1)

  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))

  return loss, total_loss

In [35]:
import time
start_epoch = 0
EPOCHS = 20
loss_plot = []

In [36]:
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')
    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    print(f'Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 2.1635
Epoch 1 Loss inf
Time taken for 1 epoch 70.53 sec

Epoch 2 Batch 0 Loss 2.1138
Epoch 2 Loss inf
Time taken for 1 epoch 0.18 sec

Epoch 3 Batch 0 Loss 1.9427
Epoch 3 Loss inf
Time taken for 1 epoch 0.15 sec

Epoch 4 Batch 0 Loss 1.6425
Epoch 4 Loss inf
Time taken for 1 epoch 0.15 sec

Epoch 5 Batch 0 Loss 1.1900
Epoch 5 Loss inf
Time taken for 1 epoch 0.14 sec

Epoch 6 Batch 0 Loss 0.8115
Epoch 6 Loss inf
Time taken for 1 epoch 0.14 sec

Epoch 7 Batch 0 Loss 0.7320
Epoch 7 Loss inf
Time taken for 1 epoch 0.13 sec

Epoch 8 Batch 0 Loss 0.7017
Epoch 8 Loss inf
Time taken for 1 epoch 0.13 sec

Epoch 9 Batch 0 Loss 0.7267
Epoch 9 Loss inf
Time taken for 1 epoch 0.13 sec

Epoch 10 Batch 0 Loss 0.7453
Epoch 10 Loss inf
Time taken for 1 epoch 0.14 sec

Epoch 11 Batch 0 Loss 0.7221
Epoch 11 Loss inf
Time taken for 1 epoch 0.13 sec

Epoch 12 Batch 0 Loss 0.6912
Epoch 12 Loss inf
Time taken for 1 epoch 0.13 sec

Epoch 13 Batch 0 Loss 0.7112
Epoch 13 Loss inf
Time taken