In [None]:
### MOUNT GOOGLE DRIVE
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
### IMPORT LIBRARIES
import os
import string
import glob
import json
from tensorflow.keras.applications import MobileNet
import tensorflow.keras.applications.mobilenet  
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow.keras.applications.inception_v3

import collections

from tqdm import tqdm
import tensorflow.keras.preprocessing.image
import pickle
from time import time
import numpy as np
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (LSTM, Embedding, 
    TimeDistributed, Dense, RepeatVector, 
    Activation, Flatten, Reshape, concatenate,  
    Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers

from tensorflow.keras.models import Model

from tensorflow.keras.layers import add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

START = "startseq"
STOP = "endseq"
EPOCHS = 50

In [None]:
### IMPORT MS-COCO ANNOTATIONS AND GLOVE6B

In [None]:
# Download the MS-COCO 2014 caption annotations unless they are already on disk.
annotation_folder = '/annotations/'
if not os.path.exists(os.path.abspath('.') + annotation_folder):
  annotation_zip = tf.keras.utils.get_file('captions.zip',
                                          cache_subdir=os.path.abspath('.'),
                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                          extract = True)
  annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'
  # The archive is not needed once it has been extracted.
  os.remove(annotation_zip)
else:
  # Folder already present (e.g. the cell was re-run): still define
  # annotation_file so this cell behaves the same on every execution.
  annotation_file = os.path.abspath('.') + annotation_folder + 'captions_train2014.json'

Downloading data from http://images.cocodataset.org/annotations/annotations_trainval2014.zip


In [None]:
# Download glove6B, skipping the transfer when the 200-d vectors that the
# notebook reads later are already present in the working directory.
glove_file = os.path.join(os.path.abspath('.'), 'glove.6B.200d.txt')
if not os.path.exists(glove_file):
  glove_zip = tf.keras.utils.get_file('glove6B.zip',
                                        cache_subdir=os.path.abspath('.'),
                                        origin = 'http://nlp.stanford.edu/data/glove.6B.zip',
                                        extract = True)
  # The archive is not needed once it has been extracted.
  os.remove(glove_zip)

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


### IMPORT MS-COCO IMAGES DATASET

In [None]:
# Fetch the MS-COCO train2014 images once; either way PATH ends up pointing
# at the folder that holds the extracted images.
image_folder = '/train2014/'
working_dir = os.path.abspath('.')
if os.path.exists(working_dir + image_folder):
  PATH = working_dir + image_folder
else:
  image_zip = tf.keras.utils.get_file('train2014.zip',
                                      cache_subdir=working_dir,
                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',
                                      extract = True)
  PATH = os.path.dirname(image_zip) + image_folder
  # Drop the archive after extraction.
  os.remove(image_zip)

Downloading data from http://images.cocodataset.org/zips/train2014.zip


### Create dictionary of images with associated captions, create vocab.txt out of captions

In [None]:
# Parse the caption annotations JSON into memory.
annotation_file = 'annotations/captions_train2014.json'
with open(annotation_file, 'r') as annotation_handle:
    annotations = json.load(annotation_handle)

In [None]:
# Collect every caption under its image path and tally token frequencies.
# NOTE(review): counts are taken on the raw, case-sensitive tokens (the
# printed vocab starts with 'A'), while the captions used for training are
# lower-cased and stripped of punctuation in a later cell — confirm this
# mismatch is intended.
image_path_to_caption = collections.defaultdict(list)
word_counts = {}
for annotation in annotations['annotations']:
  caption = f"startseq {annotation['caption']} endseq"
  for token in caption.split(' '):
    if token in word_counts:
      word_counts[token] += 1
    else:
      word_counts[token] = 1
  image_path = '%sCOCO_train2014_%012d.jpg' % (PATH, annotation['image_id'])
  image_path_to_caption[image_path].append(caption)

In [None]:
# Keep only the tokens seen at least `word_count_threshold` times across ALL captions.
word_count_threshold = 50
vocab = []
for token, occurrences in word_counts.items():
  if occurrences >= word_count_threshold:
    vocab.append(token)
print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))
print(vocab[:5])

preprocessed words 45188 ==> 3664
['startseq', 'A', 'very', 'clean', 'and']


In [None]:
### SAVING VOCAB FILE FOR DEPLOYMENT
# The file is a pickle of the vocab list despite its .txt extension.
with open('/content/gdrive/My Drive/vocab.txt', 'wb') as vocab_out:
     pickle.dump(vocab, vocab_out)

In [None]:
### HOLD OUT THE LAST 1000 IMAGE PATHS FOR TESTING, TRAIN ON THE REST
image_paths = list(image_path_to_caption)
train_image_paths, test_image_paths = image_paths[:-1000], image_paths[-1000:]
print(len(train_image_paths))
print(train_image_paths[0])

81783
/content/train2014/COCO_train2014_000000318556.jpg


In [None]:
# Flatten the training split into two parallel lists: one caption per entry
# and, at the same position, the path of the image it describes.
train_captions = []
img_name_vector = []

for path in train_image_paths:
  captions_for_image = image_path_to_caption[path]
  train_captions += captions_for_image
  img_name_vector += [path] * len(captions_for_image)
print(train_captions[0])
print(img_name_vector[0])

startseq A very clean and well decorated empty bathroom endseq
/content/train2014/COCO_train2014_000000318556.jpg


In [None]:
# One "<image path> <caption>" string per training caption.
img_caption_list = [f'{path} {caption}'
                    for path, caption in zip(img_name_vector, train_captions)]
print(img_caption_list[0])

/content/train2014/COCO_train2014_000000318556.jpg startseq A very clean and well decorated empty bathroom endseq


In [None]:
# Clean up text, creates dictionary with image names and captions list.
# Each input line is "<image path> <caption words...>"; the path is taken to
# be the first whitespace-separated token, so paths must not contain spaces.
null_punct = str.maketrans('', '', string.punctuation)
lookup = dict()

max_length = 0
for line in img_caption_list:
  tok = line.split()
  # Need a path token plus at least one caption token (the check is on the
  # token count, not on the number of characters in the line).
  if len(tok) >= 2:
    # Key = image path without its extension; later cells rebuild the path
    # as f'{key}.jpg'.
    image_key = os.path.splitext(tok[0])[0]

    # Cleanup description: lower-case, strip punctuation, then keep only
    # purely alphabetic words longer than one character.
    desc = [word.lower().translate(null_punct) for word in tok[1:]]
    desc = [word for word in desc if len(word) > 1 and word.isalpha()]
    # Longest cleaned caption seen so far (in words).
    max_length = max(max_length, len(desc))

    lookup.setdefault(image_key, []).append(' '.join(desc))

# Set of every distinct word that survives the cleanup.
lex = set()
for cleaned_captions in lookup.values():
  for d in cleaned_captions:
    lex.update(d.split())

In [None]:
# Cleaned captions restricted to the training images. Membership is tested
# against a set so the filter is linear instead of scanning the path list
# once per key.
train_image_path_set = set(train_image_paths)
train_descriptions = {k: v for k, v in lookup.items()
                      if f'{k}.jpg' in train_image_path_set}

In [None]:
# Cleaned captions restricted to the held-out images (set lookup keeps the
# filter linear).
# NOTE(review): `lookup` is built from img_caption_list, which only covers
# train_image_paths, so this dictionary comes out empty as the notebook
# stands — build `lookup` from all images if test captions are needed.
test_image_path_set = set(test_image_paths)
test_descriptions = {k: v for k, v in lookup.items()
                     if f'{k}.jpg' in test_image_path_set}

In [None]:

# Word <-> integer index maps. Numbering starts at 1, so index 0 is never
# assigned to a word (sequences are later zero-padded and the embedding
# layer is built with mask_zero=True).
idxtoword = {}
wordtoidx = {}

ix = 1
# NOTE(review): the [:-1] slice leaves the last vocab entry without an
# index; kept so existing index assignments do not change — confirm whether
# dropping that word is intentional.
for w in vocab[:-1]:
    wordtoidx[w] = ix
    idxtoword[ix] = w
    ix += 1

# Indices run 1..len(idxtoword); anything indexed by word id (embedding
# matrix rows, one-hot targets, softmax outputs) therefore needs
# len(idxtoword) + 1 slots, otherwise the highest index is out of range.
vocab_size = len(idxtoword) + 1
vocab_size

3663

### Tokenize words in vocab via GloVe6B and create the embedding layer for the model

In [None]:
# Read the 200-d GloVe vectors into a {word: float32 vector} dictionary.
glove_dir = os.path.join('')
embeddings_index = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in tqdm(f):
        # Each line is "<word> <v1> <v2> ...".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:22, 17404.49it/s]

Found 400000 word vectors.





In [None]:
embedding_dim = 200

# One row of `embedding_dim` values per word index. Rows are pre-filled with
# zeros and stay that way for words that have no vector in embeddings_index.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for vocab_word, row in wordtoidx.items():
    embedding_vector = embeddings_index.get(vocab_word)
    if embedding_vector is None:
        continue
    embedding_matrix[row] = embedding_vector

In [None]:
wordtoidx.items()



In [None]:
embedding_matrix.shape

(3663, 200)

### Pre-Encode Images with InceptionV3

In [None]:
  # Input size and feature length used by encodeImage.
  WIDTH, HEIGHT = 299, 299
  OUTPUT_DIM = 2048
  preprocess_input = tensorflow.keras.applications.inception_v3.preprocess_input
  # ImageNet-pretrained InceptionV3, truncated so the encoder returns the
  # output of its second-to-last layer instead of the final layer's output.
  inception_full = InceptionV3(weights='imagenet')
  encode_model = Model(inception_full.input, inception_full.layers[-2].output)

In [None]:
def encodeImage(img):
  """Encode a PIL image into a flat feature vector using `encode_model`.

  Args:
    img: PIL image to encode.

  Returns:
    NumPy array reshaped to OUTPUT_DIM values.
  """
  # Resize to the size expected by the encoding network. Image.LANCZOS is the
  # filter formerly also exposed as Image.ANTIALIAS; that alias has been
  # removed from recent Pillow releases, so the canonical name is used.
  img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)
  # Convert the PIL image to a numpy array
  x = tensorflow.keras.preprocessing.image.img_to_array(img)
  # Add a leading batch axis so the array is a batch of one image
  x = np.expand_dims(x, axis=0)
  # Apply the preprocessing that matches the encoding network
  x = preprocess_input(x)
  # Run the encoder to get the feature vector for the image
  x = encode_model.predict(x)
  # Flatten to the 1-D form consumed by the captioning network
  x = np.reshape(x, OUTPUT_DIM )
  return x

In [None]:
# Pre-compute the encoder features for every training image, keyed by path.
# The loop variable is deliberately not called `id`, which would replace the
# builtin for the rest of the notebook.
start = time()
encoding_train = {}
for train_path in tqdm(train_image_paths):
  img = tensorflow.keras.preprocessing.image.load_img(train_path, \
            target_size=(HEIGHT, WIDTH))
  encoding_train[train_path] = encodeImage(img)


100%|██████████| 81783/81783 [2:11:17<00:00, 10.38it/s]


In [None]:
# Pre-compute the encoder features for every held-out image, keyed by path.
# The loop variable is deliberately not called `id`, which would replace the
# builtin for the rest of the notebook.
encoding_test = {}
for test_path in tqdm(test_image_paths):
  img = tensorflow.keras.preprocessing.image.load_img(test_path, \
                target_size=(HEIGHT, WIDTH))
  encoding_test[test_path] = encodeImage(img)

100%|██████████| 1000/1000 [01:35<00:00, 10.43it/s]


In [None]:
len(encoding_train)

81783

### Train sample generator function

In [None]:
def data_generator(descriptions, photos, wordtoidx,
                   max_length, num_photos_per_batch):
  """Endlessly yield training batches for the captioning model.

  Args:
    descriptions: dict mapping an image key to its list of caption strings.
    photos: dict of image feature vectors, looked up as photos[key + '.jpg'].
    wordtoidx: dict mapping a word to its integer index; words missing from
      it are skipped.
    max_length: length every input word sequence is padded to.
    num_photos_per_batch: number of photos accumulated before a batch is
      yielded.

  Yields:
    ([photo_features, padded_prefixes], one_hot_next_words) where every
    caption contributes one sample per prefix/next-word split. The one-hot
    width comes from the module-level `vocab_size`.
  """
  photo_batch, prefix_batch, target_batch = [], [], []
  photos_in_batch = 0
  while True:
    for key, desc_list in descriptions.items():
      photos_in_batch += 1
      photo = photos[key+'.jpg']
      for desc in desc_list:
        # Caption as a list of word indices
        seq = [wordtoidx[word] for word in desc.split(' ')
               if word in wordtoidx]
        # Every prefix of the caption predicts the word that follows it
        for split_at in range(1, len(seq)):
          prefix = pad_sequences([seq[:split_at]], maxlen=max_length)[0]
          target = to_categorical([seq[split_at]], num_classes=vocab_size)[0]
          photo_batch.append(photo)
          prefix_batch.append(prefix)
          target_batch.append(target)
      if photos_in_batch == num_photos_per_batch:
        yield ([np.array(photo_batch), np.array(prefix_batch)],
               np.array(target_batch))
        photo_batch, prefix_batch, target_batch = [], [], []
        photos_in_batch = 0

### Define Model

In [None]:
# Captioning network with two inputs merged by element-wise addition.
# Statement order is left untouched: a later cell addresses the embedding
# layer by its position in caption_model.layers.

# Image branch: encoder feature vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(OUTPUT_DIM,))
fe1 = Dropout(0.1)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word-index sequence -> embedding (index 0 masked)
# -> dropout -> 256-unit LSTM
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.1)(se1)
se3 = LSTM(256)(se2)
# Decoder: add both 256-d branches, then dense + softmax over vocab_size outputs
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)
caption_model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 49)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 49, 200)      732600      input_6[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 256)          524544      input_5[0][0]                    
_______________________________________________________________________________________

In [None]:
# Load the pre-built embedding matrix into the model's embedding layer and
# freeze it. The layer is located by type instead of by a hard-coded position
# in caption_model.layers, which silently breaks if the architecture changes.
embedding_layer = next(layer for layer in caption_model.layers
                       if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
# Compile after changing `trainable` so the setting takes effect.
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
def generateCaption(photo):
    """Greedily decode a caption for one encoded image.

    Starting from START, the model is asked for the most likely next word
    until STOP is produced or max_length words have been generated.

    Args:
        photo: encoded image features, already shaped as a batch of one.

    Returns:
        The generated caption as a space-separated string, without the
        START and STOP markers.
    """
    in_text = START
    for _ in range(max_length):
        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo,sequence], verbose=0)
        yhat = int(np.argmax(yhat))
        # Some output indices have no word (index 0 is never assigned);
        # stop decoding instead of raising KeyError.
        word = idxtoword.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == STOP:
            break
    # Drop the leading START marker.
    final = in_text.split()[1:]
    # Only strip the trailing token when it really is STOP; if decoding ended
    # for another reason the last token is a genuine caption word.
    if final and final[-1] == STOP:
        final = final[:-1]
    return ' '.join(final)

In [None]:
number_pics_per_batch = 16
steps = len(train_descriptions)//number_pics_per_batch

# Captions generated after each epoch for a fixed sample of held-out images
# (positions 20-49 of encoding_test). The key list is built once instead of
# once per sampled image.
pics2 = collections.defaultdict(list)
for pic in list(encoding_test.keys())[20:50]:
  pics2[pic] = list()

EPOCHS = 30
model_path = os.path.join('caption-model-final-ndo.hdf5')
for i in range(EPOCHS):
  # Fresh generator per epoch so every epoch starts from the first image.
  generator = data_generator(train_descriptions, encoding_train,
                    wordtoidx, max_length, number_pics_per_batch)
  caption_model.fit(generator, epochs=1,
                            steps_per_epoch=steps, verbose=1)
  # Checkpoint the weights and record sample captions after every epoch.
  caption_model.save_weights(model_path)
  for pic in pics2.keys():
    image = encoding_test[pic].reshape((1,OUTPUT_DIM))
    pics2[pic].append(generateCaption(image))


 796/5111 [===>..........................] - ETA: 18:12 - loss: 2.6133

KeyboardInterrupt: ignored

In [None]:
from google.colab import files
files.upload()

Saving pics_epochs.pkl to pics_epochs.pkl


{'pics_epochs.pkl': b'\x80\x03ccollections\ndefaultdict\nq\x00)Rq\x01(X2\x00\x00\x00/content/train2014/COCO_train2014_000000018395.jpgq\x02]q\x03(X!\x00\x00\x00vase filled with flowers on tableq\x04X \x00\x00\x00vase with flowers in it on tableq\x05X \x00\x00\x00vase with flowers in it on tableq\x06X0\x00\x00\x00vase filled with flowers sitting on top of tableq\x07X(\x00\x00\x00vase filled with flowers on top of tableq\x08X(\x00\x00\x00vase filled with flowers on top of tableq\tX(\x00\x00\x00vase filled with flowers on top of tableq\nX(\x00\x00\x00vase filled with flowers on top of tableq\x0bX \x00\x00\x00vase with flowers in it on tableq\x0cX \x00\x00\x00vase with flowers in it on tableq\reX2\x00\x00\x00/content/train2014/COCO_train2014_000000560691.jpgq\x0e]q\x0f(X/\x00\x00\x00group of people standing around table with foodq\x10X/\x00\x00\x00group of people standing around table with foodq\x11X/\x00\x00\x00group of people standing around table with foodq\x12X%\x00\x00\x00group of peo

In [None]:
# Load the pickled per-epoch caption samples from Drive.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load files produced by this project.
with open('/content/gdrive/My Drive/pics_epochs.txt', 'rb') as pickled_samples:
        pics = pickle.load(pickled_samples)

In [None]:

# Show each sampled image path followed by its list of captions.
for pic, captions in pics.items():
  print(pic)
  print(captions)

/content/train2014/COCO_train2014_000000018395.jpg
['vase filled with flowers on table', 'vase with flowers in it on table', 'vase with flowers in it on table', 'vase filled with flowers sitting on top of table', 'vase filled with flowers on top of table', 'vase filled with flowers on top of table', 'vase filled with flowers on top of table', 'vase filled with flowers on top of table', 'vase with flowers in it on table', 'vase with flowers in it on table']
/content/train2014/COCO_train2014_000000560691.jpg
['group of people standing around table with food', 'group of people standing around table with food', 'group of people standing around table with food', 'group of people standing around table', 'group of people sitting around table with food', 'group of people sitting around table with food', 'group of people standing around table with food', 'group of people standing around room with teddy bears', 'group of people standing around table with food', 'group of people standing around t

In [None]:
# For the first 100 training images: record the first caption (without its
# first and last word) under the image file stem, and write a copy of the
# image into img_gen2/.
pics_pkl = collections.defaultdict()
# Image.save does not create missing directories.
os.makedirs('img_gen2', exist_ok=True)
for img,cap in list(train_descriptions.items())[0:100]:
  image_stem = f'{img}'.split('/')[-1]
  pics_pkl[image_stem] = ' '.join(cap[0].split()[1:-1])
  # Context manager closes the source file handle after saving.
  with Image.open(img+'.jpg') as source_image:
    source_image.save('img_gen2/'+image_stem+'.jpg')
  print(img,cap)


/content/train2014/COCO_train2014_000000318556 ['startseq very clean and well decorated empty bathroom endseq', 'startseq blue and white bathroom with butterfly themed wall tiles endseq', 'startseq bathroom with border of butterflies and blue paint on the walls above it endseq', 'startseq an angled view of beautifully decorated bathroom endseq', 'startseq clock that blends in with the wall hangs in bathroom endseq']
/content/train2014/COCO_train2014_000000116100 ['startseq panoramic view of kitchen and all of its appliances endseq', 'startseq panoramic photo of kitchen and dining room endseq', 'startseq wide angle view of the kitchen work area endseq', 'startseq multiple photos of brown and white kitchen endseq', 'startseq kitchen that has checkered patterned floor and white cabinets endseq']
/content/train2014/COCO_train2014_000000379340 ['startseq graffitied stop sign across the street from red car endseq', 'startseq vandalized stop sign and red beetle on the road endseq', 'startseq 

In [None]:
pics2

defaultdict(None, {})

In [None]:
pics_pkl

defaultdict(None,
            {'COCO_train2014_000000002448': 'two kitchen stools sitting in front of an island in kitchen',
             'COCO_train2014_000000004377': 'group of people playing game of croquet',
             'COCO_train2014_000000009469': 'teenage boy is in field looking at wire',
             'COCO_train2014_000000014750': 'bicycle with basket is parked at the side of building',
             'COCO_train2014_000000018691': 'few people sit on dim transportation system',
             'COCO_train2014_000000024091': 'compact car with bicycles mounted on the roof',
             'COCO_train2014_000000025470': 'kitchen with brown cabinets tile backsplash and grey counters',
             'COCO_train2014_000000028149': 'bathroom with tv near the mirror',
             'COCO_train2014_000000028231': 'black car is near someone riding bike',
             'COCO_train2014_000000031813': 'surfer riding his bike to the beach',
             'COCO_train2014_000000032275': 'the vanity con

In [None]:
# Persist the image-stem -> caption mapping as a pickle.
with open('/content/pics_data2.pkl', 'wb') as pickle_out:
     pickle.dump(pics_pkl, pickle_out)

In [None]:
!zip -r /content/img_epochs.zip /content/img_epochs

  adding: content/img_epochs/ (stored 0%)
  adding: content/img_epochs/COCO_train2014_000000508723.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000477150.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000046893.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000549575.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000018395.jpg.jpg (deflated 10%)
  adding: content/img_epochs/COCO_train2014_000000026944.jpg.jpg (deflated 1%)
  adding: content/img_epochs/COCO_train2014_000000413970.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000305600.jpg.jpg (deflated 1%)
  adding: content/img_epochs/COCO_train2014_000000562217.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000365659.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000560691.jpg.jpg (deflated 0%)
  adding: content/img_epochs/COCO_train2014_000000558771.jpg.jpg (deflated 0%)
  adding: con

In [None]:
from google.colab import files
files.download('/content/img_epochs.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>