In [None]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# your model focuses on during captioning
import matplotlib.pyplot as plt

import numpy as np
from PIL import Image
import configparser

# Parse the notebook settings from config.ini (resolved against the current
# working directory). ConfigParser.read() skips files it cannot find without
# raising, so a missing file only surfaces later as a KeyError on
# config['config'].
config = configparser.ConfigParser()
config.read(["config.ini"])

#importing local module 
from models.subclasses import *
from models.utilities import *
from models.train_utils import *
from models.predict import *

In [None]:
# Hyper-parameters read from the [config] section of config.ini.

# Train sample size (-1 for max); it can't exceed 118286 samples.
sample = int(config['config']['train_sample'])

# Train split value (the original note describes an 80-20 split).
percentage = float(config['config']['percentage'])

# Max word count for a caption.
max_length = int(config['config']['max_length'])
# Use the top words for a vocabulary.
vocabulary_size = int(config['config']['vocabulary_size'])

# Whether to use the pretrained GloVe embeddings.
# getboolean() parses 1/yes/true/on and 0/no/false/off (case-insensitive) and
# raises ValueError on anything else. The previous bool(<str>) was True for
# every non-empty string, so "use_glove = False" still enabled GloVe.
use_glove = config.getboolean('config', 'use_glove')

## Importing data

In [None]:
# Load the training split without shuffling. The helper's result is unpacked
# into the image paths and a mapping that is later indexed by image path to get
# that image's captions.
train_image_paths, image_path_to_caption = import_files(
    shuffle=False,
    method="train",
)

## Prepare data

In [None]:
# Flatten the per-image caption lists into two parallel lists:
# train_captions[k] is a caption and img_name_vector[k] is the path of the
# image it describes (the path is repeated once per caption).
train_captions, img_name_vector = [], []
for image_path in train_image_paths:
    caption_list = image_path_to_caption[image_path]
    train_captions += caption_list
    img_name_vector += [image_path] * len(caption_list)

# Sanity check: show the first caption and its image.
print(train_captions[0])
Image.open(img_name_vector[0])

## Prepare the models

In [None]:
# Create the image feature extractor model.
# NOTE(review): per the original comment the helper also freezes the model;
# that happens inside get_feature_extractor (star-imported from models.*) and is
# not visible in this notebook — confirm there.
image_features_extract_model = get_feature_extractor()

In [None]:
# Tokenize the training captions with the configured caption length and
# vocabulary size. The helper returns four values: the two lookups used for
# evaluation later, the tokenizer itself, and the caption vectors.
(
    word_to_index,
    index_to_word,
    tokenizer,
    cap_vector,
) = tokenization(train_captions, max_length, vocabulary_size)

In [None]:
# Location of the 100-dimensional GloVe vectors (one "<word> <coef> <coef> ..."
# entry per line).
glove_path = "./dataset/glove.6B/glove.6B.100d.txt"

# Maps each GloVe word to its coefficient vector (float32 numpy array).
embeddings_index = {}
# The GloVe files are UTF-8 text. Without an explicit encoding, open() falls
# back to the locale default (e.g. cp1252 on Windows) and can raise
# UnicodeDecodeError or mis-decode non-ASCII tokens.
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        # The first whitespace-separated field is the word; the rest of the
        # line holds the space-separated coefficients.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Map every vocabulary entry to its position in the tokenizer's vocabulary list.
vocabulary = tokenizer.get_vocabulary()
word_index = {word: position for position, word in enumerate(vocabulary)}

In [None]:
# Build the (num_tokens, embedding_dim) matrix of pretrained vectors, where row
# i holds the GloVe vector of the vocabulary word with index i.
embedding_dim = 100
num_tokens = len(vocabulary) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words not found in the embedding index keep their all-zeros row.
        # This includes the representation for "padding" and "OOV".
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Number of rows in the embedding matrix (equals num_tokens).
embedding_matrix.shape[0]

In [None]:
# Split the image paths and caption vectors using the configured percentage;
# the feature extractor is handed to the helper as well.
img_name_train, cap_train = split_data(
    img_name_vector,
    cap_vector,
    image_features_extract_model,
    percentage,
)


In [None]:
# Persist the training split together with the tokenizer vocabulary and the raw
# training captions.
save_dataset(
    img_name_train,
    cap_train,
    tokenizer.get_vocabulary(),
    train_captions,
)

## Training phase

In [None]:
# Training configuration, read from the [config] section of config.ini.
# Tune these values to match your system's capabilities.
BATCH_SIZE, BUFFER_SIZE = (
    int(config['config'][option]) for option in ('BATCH_SIZE', 'BUFFER_SIZE')
)

# The GloVe vectors loaded by this notebook are 100-dimensional, so the
# embedding size is fixed in that case; otherwise it comes from the config.
embedding_dim = 100 if use_glove else int(config['config']['embedding_dim'])

units = int(config['config']['units'])

# Shape of the vector extracted from InceptionV3 is (64, 2048).
# These two variables represent that vector shape.
features_shape, attention_features_shape = (
    int(config['config'][option])
    for option in ('features_shape', 'attention_features_shape')
)

# Training parameters: epoch count and whole batches per epoch.
epochs = int(config['config']['epochs'])
num_steps = len(img_name_train) // BATCH_SIZE

In [None]:
# Build the training dataset from the training image names and caption vectors.
dataset = make_dataset(
    img_name_train,
    cap_train,
)

## Making the models

In [None]:
# The encoder is built with the embedding size.
encoder = CNN_Encoder(embedding_dim)

# With GloVe the decoder is sized by num_tokens and receives the pretrained
# embedding matrix; otherwise it is sized by the tokenizer vocabulary and gets
# no pretrained matrix.
decoder = (
    RNN_Decoder(embedding_dim, units, num_tokens, embedding_matrix)
    if use_glove
    else RNN_Decoder(embedding_dim, units, tokenizer.vocabulary_size(), None)
)

In [None]:
# Directory in which training checkpoints are stored.
checkpoint_path = "./checkpoints/train"

# Track the encoder, decoder and optimizer in one checkpoint object.
# NOTE(review): `optimizer` is not assigned in any cell of this notebook —
# presumably it is provided by one of the `from models.* import *` lines; confirm.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)

# Keep only the five most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

In [None]:
# Resume from the newest checkpoint in checkpoint_path when one exists; the
# number after the last '-' in its name is used as the epoch to resume from.
# Without a checkpoint, training starts at epoch 0.
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.rsplit('-', 1)[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)
else:
    start_epoch = 0

In [None]:
# A fresh run starts with an empty loss history; a resumed run reloads the
# previously saved one.
loss_plot = [] if start_epoch == 0 else load_loss()

train(
    epochs, start_epoch, ckpt_manager,
    num_steps, dataset, decoder,
    encoder, word_to_index, loss_plot,
)

In [None]:
# Plot the recorded loss history and label the current axes in one call.
plt.plot(loss_plot)
plt.gca().set(xlabel='Epochs', ylabel='Loss', title='Loss Plot')
plt.show()

## Result

In [None]:
# Caption one randomly chosen validation image and compare the prediction with
# its reference caption.
# NOTE(review): `img_name_val` and `cap_val` are not assigned in any cell of this
# notebook (split_data above is unpacked into only img_name_train / cap_train).
# Unless a star import provides them, this cell raises NameError on a fresh
# kernel — confirm where the validation split is supposed to come from.
# No random seed is set, so a different sample is picked on every run.
rid = np.random.randint(0, len(img_name_val))
image = img_name_val[rid]
# Rebuild the reference caption from its token ids, skipping id 0
# (presumably the padding id — TODO confirm against the tokenizer).
real_caption = ' '.join([tf.compat.as_text(index_to_word(i).numpy())
                         for i in cap_val[rid] if i not in [0]])

# evaluate() returns two values: `result` is joined with spaces below, so it is
# an iterable of strings; `attention_plot` is passed on to plot_attention.
result, attention_plot = evaluate(image, encoder, decoder, image_features_extract_model,
                                    word_to_index, index_to_word)

print('Real Caption:', real_caption)
print('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot)