# Load and preprocess images for captioning

Relevant links (just in case):
- train images http://msvocds.blob.core.windows.net/coco2014/train2014.zip
- validation images http://msvocds.blob.core.windows.net/coco2014/val2014.zip
- captions for both train and validation http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import tensorflow as tf
tf.enable_eager_execution()

from tensorflow.contrib import keras
import numpy as np

import zipfile
import json
import os
from tqdm import tqdm

In [None]:
#from sklearn.utils import shuffle

## Load train data

In [None]:
# Resolve the path of the train captions JSON. When it is not already present
# under the current working directory, fetch and extract the COCO 2014
# annotation archive via Keras' download helper first.
if os.path.exists(os.path.abspath('.') + '/annotations/captions_train2014.json'):
    annotation_file = os.path.abspath('.') + '/annotations/captions_train2014.json'
else:
    annotation_zip = tf.keras.utils.get_file(
        'captions.zip',
        origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
        cache_subdir=os.path.abspath('.'),
        extract=True,
    )
    annotation_file = os.path.dirname(annotation_zip) + '/annotations/captions_train2014.json'

In [None]:
# Show the resolved path of the annotation file.
print(annotation_file)

In [None]:
# Resolve PATH, the directory prefix of the train images. The presence of the
# zip archive in the current working directory decides whether a download
# (with extraction) is needed.
name_of_zip = 'train2014.zip'
if os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
    PATH = os.path.abspath('.') + '/train2014/'
else:
    image_zip = tf.keras.utils.get_file(
        name_of_zip,
        origin='http://images.cocodataset.org/zips/train2014.zip',
        cache_subdir=os.path.abspath('.'),
        extract=True,
    )
    PATH = os.path.dirname(image_zip) + '/train2014/'

In [None]:
# Read the caption annotations (JSON) for the training split.
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

# Parallel lists with one entry per annotation: all_captions[i] is the caption
# text and all_img_name_vector[i] is the path of the image it describes.
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    #caption = '<start> ' + annot['caption'] + ' <end>'
    caption = annot['caption']
    image_id = annot['image_id']
    # Image file names embed the image id zero-padded to 12 digits.
    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

# Use the complete, unshuffled training set. Later cells read `train_captions`
# and `img_name_vector`; these were previously assigned only inside the
# commented-out shuffle code, so those cells failed with a NameError.
train_captions = all_captions
img_name_vector = all_img_name_vector

# Optional: shuffle captions and image names together (fixed random state) and
# keep only a subset. Requires `from sklearn.utils import shuffle`.
# train_captions, img_name_vector = shuffle(all_captions,
#                                           all_img_name_vector,
#                                           random_state=1)
# num_examples = 30000
# train_captions = train_captions[:num_examples]
# img_name_vector = img_name_vector[:num_examples]

In [None]:
# Notebook display: lengths of the caption list, the image-path list and the full caption list.
len(train_captions), len(img_name_vector), len(all_captions)

In [None]:
# Notebook display: first ten image paths.
img_name_vector[:10]

## Preprocess the images using InceptionV3

In [None]:
def load_image(image_path):
    """Load one JPEG file and prepare it as input for InceptionV3.

    The file is decoded to 3 channels, resized to 299x299 and passed through
    the InceptionV3 input preprocessing.

    Args:
        image_path: path of the JPEG file to read.

    Returns:
        A tuple ``(image, image_path)``. The path is returned unchanged so the
        caller can tell which file each image came from.
    """
    raw_bytes = tf.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize_images(decoded, (299, 299))
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

### Initialize InceptionV3 and load the pretrained Imagenet weights

In [None]:
### for the attention mechanism:
# image_model = tf.keras.applications.InceptionV3(include_top=False, 
#                                                 weights='imagenet')
# new_input = image_model.input
# hidden_layer = image_model.layers[-1].output

# image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

In [None]:
### without the attention mechanism
# InceptionV3 without its classification head, using ImageNet weights.
image_model = tf.keras.applications.InceptionV3(weights='imagenet',
                                                include_top=False)
new_input = image_model.input

# Global average pooling is applied on top of the InceptionV3 output, and the
# pooled tensor is the output of the feature-extraction model.
image_features_extract_model = keras.models.Model(
    inputs=new_input,
    outputs=keras.layers.GlobalAveragePooling2D()(image_model.output),
)

### Caching the features extracted from InceptionV3

In [None]:
# De-duplicate the image paths so each image is encoded only once; sorting
# gives a stable processing order.
encode_train = sorted(set(img_name_vector))

# feel free to change the batch_size according to your system configuration
batch_size = 16
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train).map(load_image).batch(batch_size)

# The context manager closes the progress bar when the loop finishes or raises.
with tqdm(total=len(encode_train)) as pbar:
    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        # uncomment for attention:
        # batch_features = tf.reshape(batch_features,
        #                             (batch_features.shape[0], -1, batch_features.shape[3]))

        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            # np.save appends ".npy", so the features are stored next to the
            # image as "<image file name>.npy".
            np.save(path_of_feature, bf.numpy())
            # Advance once per saved feature so the final, possibly smaller,
            # batch does not push the bar past its total.
            pbar.update(1)

In [None]:
# Shape of the most recently computed feature batch.
print(batch_features.shape)

## Load val data

In [None]:
# Resolve the path of the validation captions JSON. When it is not already
# present under the current working directory, fetch and extract the COCO 2014
# annotation archive via Keras' download helper first.
if os.path.exists(os.path.abspath('.') + '/annotations/captions_val2014.json'):
    annotation_file = os.path.abspath('.') + '/annotations/captions_val2014.json'
else:
    annotation_zip = tf.keras.utils.get_file(
        'captions.zip',
        origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
        cache_subdir=os.path.abspath('.'),
        extract=True,
    )
    annotation_file = os.path.dirname(annotation_zip) + '/annotations/captions_val2014.json'

In [None]:
# Show the resolved path of the annotation file.
print(annotation_file)

In [None]:
# Resolve PATH, the directory prefix of the validation images. The presence of
# the zip archive in the current working directory decides whether a download
# (with extraction) is needed.
name_of_zip = 'val2014.zip'
if os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
    PATH = os.path.abspath('.') + '/val2014/'
else:
    image_zip = tf.keras.utils.get_file(
        name_of_zip,
        origin='http://images.cocodataset.org/zips/val2014.zip',
        cache_subdir=os.path.abspath('.'),
        extract=True,
    )
    PATH = os.path.dirname(image_zip) + '/val2014/'

In [None]:
# Read the caption annotations (JSON) for the validation split.
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

# Parallel lists with one entry per annotation: all_captions[i] is the caption
# text and all_img_name_vector[i] is the path of the image it describes.
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    #caption = '<start> ' + annot['caption'] + ' <end>'
    caption = annot['caption']
    image_id = annot['image_id']
    # Image file names embed the image id zero-padded to 12 digits.
    full_coco_image_path = PATH + 'COCO_val2014_' + '%012d.jpg' % (image_id)

    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

# Use the complete, unshuffled validation set. Later cells read `val_captions`
# and `img_name_vector`. Rebinding `img_name_vector` here makes the feature
# extraction cell encode the validation images rather than a value left over
# from an earlier cell.
val_captions = all_captions
img_name_vector = all_img_name_vector

# Optional: shuffle captions and image names together (fixed random state) and
# keep only a subset. Requires `from sklearn.utils import shuffle`.
# val_captions, img_name_vector = shuffle(all_captions,
#                                         all_img_name_vector,
#                                         random_state=1)
# num_examples = 30000
# val_captions = val_captions[:num_examples]
# img_name_vector = img_name_vector[:num_examples]

In [None]:
# Notebook display: lengths of the caption list, the image-path list and the full caption list.
len(val_captions), len(img_name_vector), len(all_captions)

In [None]:
# Notebook display: first ten image paths.
img_name_vector[:10]

In [None]:
# De-duplicate the image paths so each image is encoded only once; sorting
# gives a stable processing order.
encode_val = sorted(set(img_name_vector))

# feel free to change the batch_size according to your system configuration
batch_size = 16
image_dataset = tf.data.Dataset.from_tensor_slices(encode_val).map(load_image).batch(batch_size)

# The context manager closes the progress bar when the loop finishes or raises.
with tqdm(total=len(encode_val)) as pbar:
    for img, path in image_dataset:
        batch_features = image_features_extract_model(img)
        # uncomment for attention:
        # batch_features = tf.reshape(batch_features,
        #                             (batch_features.shape[0], -1, batch_features.shape[3]))

        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            # np.save appends ".npy", so the features are stored next to the
            # image as "<image file name>.npy".
            np.save(path_of_feature, bf.numpy())
            # Advance once per saved feature so the final, possibly smaller,
            # batch does not push the bar past its total.
            pbar.update(1)

In [None]:
# Shape of the most recently computed feature batch.
print(batch_features.shape)