<a href="https://colab.research.google.com/github/hyeonjungko/PhotoCaptionGenerator/blob/main/Photo_Caaption_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import scipy
print('scipy: %s' % scipy.__version__)
# numpy
import numpy
print('numpy: %s' % numpy.__version__)
# matplotlib
import matplotlib
print('matplotlib: %s' % matplotlib.__version__)
# pandas
import pandas
print('pandas: %s' % pandas.__version__)
# statsmodels
import statsmodels
print('statsmodels: %s' % statsmodels.__version__)
# scikit-learn
import sklearn
print('sklearn: %s' % sklearn.__version__)

scipy: 1.7.3
numpy: 1.21.6
matplotlib: 3.2.2
pandas: 1.3.5
statsmodels: 0.12.2
sklearn: 1.0.2


In [3]:
!pip install theano

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# theano
import theano
print('theano: %s' % theano.__version__)
# tensorflow
import tensorflow as tf
print('tensorflow: %s' % tf.__version__)
# keras
import keras
print('keras: %s' % keras.__version__)

theano: 1.0.5
tensorflow: 2.9.2
keras: 2.9.0


In [5]:
from google.colab import drive

# drive.mount('/content/drive/')

In [6]:
PROJ_PATH = '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator'

In [7]:
# !unzip '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/Flickr8k_Dataset.zip' -d '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset'
# !unzip '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/Flickr8k_text.zip' -d '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/Flickr8k_text'

In [8]:
from os import listdir
import pathlib
from pickle import dump
from keras.applications.vgg16 import VGG16
# from keras.preprocessing.image import load_img
from keras.utils import load_img
from keras.utils import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [9]:
def extract_features(directory):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		print('>%s' % name)
	return features

In [10]:
# extract features from all images
# directory = "/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/Flicker8k_Dataset"
# features = extract_features(directory)
# print('Extracted Features: %d' % len(features))
# save to file
# dump(features, open('/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/features.pkl', 'wb'))

In [11]:
# utility function to load document
def load_doc(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

In [12]:
# load token text doc
token_text_filename = '/content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/Dataset/Flickr8k_text/Flickr8k.token.txt'
token_doc = load_doc(token_text_filename)
print(token_doc[:100])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .


In [13]:
# utility function to load descriptions for images
def load_descriptions(doc):
  mapping = dict()
  # process doc line by line
  for line in doc.split('\n'):
    # split by whitespace
    tokens = line.split()
    if len(line) < 2:
      continue
    # take first token as the image id, the rest as the description
    image_id, image_desc = tokens[0], tokens[1:]
    # remove .jpg and other unnecessary info from image id
    image_id = image_id.split('.')[0]
    # convert description tokens back to a single string
    image_desc = ' '.join(image_desc)
    # create empty list for key if not already created
    if image_id not in mapping:
      mapping[image_id] = list()
    # add description
    mapping[image_id].append(image_desc)
  return mapping

In [14]:
# parse image descriptions
descriptions = load_descriptions(token_doc)
print('Loaded: %d ' % len(descriptions))
print(list(descriptions.keys())[0])

Loaded: 8092 
1000268201_693b08cb0e


The image descriptions are rather clean, so there's not much to do. 
But let's still 
1. convert all words to lowercase
2. remove all punctuation
3. remove all words that are one character or less in length
4. remove all words with numbers in them





In [15]:
import string

def clean_descriptions(descriptions):
  # prepare translation table to remove punctuation
  table = str.maketrans('','', string.punctuation)
  for _, desc_list in descriptions.items():
    for i in range(len(desc_list)):
      desc = desc_list[i]
      # tokenize
      desc = desc.split()
      # 1. convert to lowercase
      desc = [word.lower() for word in desc]
      # 2. remove punctuation from each token
      desc = [w.translate(table) for w in desc]
      # 3. remove hanging 's' and 'a'
      desc = [word for word in desc if len(word)>1]
      # 4. remove tokens with numbers in them
      desc = [word for word in desc if word.isalpha()]
      # store as string
      desc_list[i] = ' '.join(desc)

In [16]:
# clean image descriptions
clean_descriptions(descriptions)
print(list(descriptions.keys())[0])

1000268201_693b08cb0e


In [17]:
# Let's see how the cleaned descriptions fair in terms of the vocab size
def to_vocabulary(descriptions):
  all_desc = set()
  for key in descriptions.keys():
    [all_desc.update(d.split()) for d in descriptions[key]]
  return all_desc

vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 8763


Change of vocab size from cleaning: 8092 to 8763.

In [18]:
# utility function to save dictionary of image identifier and image descriptions to a file
def save_descriptions(descriptions, filename):
  lines = list()
  for key, desc_list in descriptions.items():
    for desc in desc_list:
      lines.append(key + ' ' + desc)
  data = '\n'.join(lines)
  file = open(filename, 'w')
  file.write(data)
  file.close()

# save descriptions to a file
save_descriptions(descriptions, f"{PROJ_PATH}/descriptions.txt")

# Loading Data

In [19]:
# utility function to load a pre-defined list of photo identifiers
def load_set(filename):
  doc = load_doc(filename)
  dataset = list()
  for line in doc.split('\n'):
    if len(line) < 1:
      continue
    # fetch image identifier
    identifier = line.split('.')[0]
    dataset.append(identifier)
  return set(dataset)

In [20]:
# utility function to load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
  # load document
  doc = load_doc(filename)
  descriptions = dict()
  for line in doc.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in dataset:
      if image_id not in descriptions:
        descriptions[image_id] = list()
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        # store
        descriptions[image_id].append(desc)
  return descriptions

In [21]:
from pickle import load

# function to load photo features
def load_photo_features(filename, dataset):
  all_features = load(open(filename, 'rb'))
  features = {k: all_features[k] for k in dataset}
  return features

In [22]:
# load training dataset (6k)
filename = f"{PROJ_PATH}/Dataset/Flickr8k_text/Flickr_8k.trainImages.txt"
train = load_set(filename)
print(f"Dataset: {len(train)}")

# load training descriptions
train_descriptions = load_clean_descriptions(f"{PROJ_PATH}/descriptions.txt", train)
print(f"Descriptions: train={len(train_descriptions)}")

# load photo features
train_features = load_photo_features(f"{PROJ_PATH}/Dataset/features.pkl", train)
print(f"Photos: train={len(train_features)}")

Dataset: 6000
Descriptions: train=6000
Photos: train=6000


The description text needs to be encoded to numbers before it can be presented ot the model as inputs.

In [23]:
from keras.preprocessing.text import Tokenizer

In [24]:
# function to convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
  all_desc = list()
  for key in descriptions.keys():
    [all_desc.append(d) for d in descriptions[key]]
  return all_desc

# function to fit tokenizer given caption descriptions
def create_tokenizer(descriptions):
  lines = to_lines(descriptions)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [25]:
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 3848


In [26]:
# function to create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, descriptions, photos, vocab_size):
  X1, X2, y = list(), list(), list()
  # for each image identifier
  for key, desc_list in descriptions.items():
    for desc in desc_list:
      # encode sequence
      seq = tokenizer.texts_to_sequences([desc])[0]
      # split one sequence into multiple X,y pairs
      for i in range(1, len(seq)):
        # split into input and output pair
        in_seq, out_seq = seq[:i], seq[i]
        # pad input sequence
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        # encode output sequence
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # store
        X1.append(photos[key][0])
        X2.append(in_seq)
        y.append(out_seq)
  return array(X1), array(X2), array(y)

In [27]:
# utlity function to calculate the length of the description with the most words
def max_length(descriptions):
  lines = to_lines(descriptions)
  return max(len(d.split()) for d in lines)

In [28]:
max_length = max_length(train_descriptions)
print(f"Description Length:  {max_length}")

Description Length:  30


In [29]:
from numpy import array
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features, vocab_size)

Input of 4096 elements(image) is processed by VGG-16 to output a 256 element feature vector of the input image.

The Word Embedding layer takes in a pre-defined length sequence (34 words) that is fed into an LSTM that outputs a 256 element vector. 

The final model merges the two 256 element vectors. It then makes a softmax prediction over the entire output vocabulary for the next word in the sequence.

# Defining the Captioning Model

In [30]:
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add

def define_model(vocab_size, max_length):
  # feature extractor model
  inputs1 = Input(shape=(4096,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)
  
  # sequence model
  inputs2 = Input(shape=(max_length,))
  se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)

  # decoder model
  decoder1 = Add()([fe2, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)

  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')

  # summarize model
  print(model.summary())
  plot_model(model, to_file=f"{PROJ_PATH}/model.png", show_shapes=True)
  return model

In [31]:
# load test set
test_set_filename = f"{PROJ_PATH}/Dataset/Flickr8k_text/Flickr_8k.devImages.txt"
test = load_set(test_set_filename)
print(f"Dataset: {len(test)}")

Dataset: 1000


In [32]:
# descriptions
test_descriptions = load_clean_descriptions(f"{PROJ_PATH}/descriptions.txt", test)
print(f"Descriptions: test={len(test_descriptions)}")

# photo features
test_features = load_photo_features(f"{PROJ_PATH}/Dataset/features.pkl", test)
print(f"Photos: test={len(test_features)}")

# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features, vocab_size)

Descriptions: test=1000
Photos: test=1000


In [33]:
from keras.utils import plot_model

model = define_model(vocab_size, max_length)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 30, 256)      985088      ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_1[0][0]']                
                                                                                              

In [34]:
from tensorflow.keras.callbacks import ModelCheckpoint

# define a checkpoint callback
checkpoint_file_path = f"{PROJ_PATH}" + '/model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(checkpoint_file_path, mointor='val_loss', verbose=1, save_best_only=True, mode='min')

In [35]:
# fit model
model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

Epoch 1/20

Epoch 1: val_loss improved from inf to 4.45043, saving model to /content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/model-ep001-loss5.095-val_loss4.450.h5
1916/1916 - 171s - loss: 5.0949 - val_loss: 4.4504 - 171s/epoch - 89ms/step
Epoch 2/20

Epoch 2: val_loss improved from 4.45043 to 4.15294, saving model to /content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/model-ep002-loss4.245-val_loss4.153.h5
1916/1916 - 158s - loss: 4.2455 - val_loss: 4.1529 - 158s/epoch - 83ms/step
Epoch 3/20

Epoch 3: val_loss improved from 4.15294 to 4.05662, saving model to /content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/model-ep003-loss3.897-val_loss4.057.h5
1916/1916 - 158s - loss: 3.8973 - val_loss: 4.0566 - 158s/epoch - 82ms/step
Epoch 4/20

Epoch 4: val_loss improved from 4.05662 to 4.05265, saving model to /content/drive/MyDrive/Colab Notebooks/Photo Caption Generator/model-ep004-loss3.656-val_loss4.053.h5
1916/1916 - 166s - loss: 3.6555 - val_loss: 4.05

KeyboardInterrupt: ignored

# Evaluating the Model
Generate a description for an example photo using the trained model, then compare the prediction to actual description.

In [37]:
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# map an integer to a word
def word_for_id(integer, tokenizer):
  for word, idx in tokenizer.word_index.items():
    if idx == integer:
      return word
  return None

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
  # seed the generation process
  in_text = 'startseq'
  # iterate over the whole length of the sequence
  for i in range(max_length):
    # integer encode input sequence
    sequence = tokenizer.texts_to_sequences([in_text])[0]
    # pad input
    sequence = pad_sequences([sequence], maxlen=max_length)
    # predict next word
    yhat = model.predict([photo,sequence], verbose=0)
    # convert probability to integer
    yhat = argmax(yhat)
    # map integer to word
    word = word_for_id(yhat, tokenizer)
    # stop if we cannot map the word
    if word == None:
      break
    # append as input for generating the next word
    in_text += ' ' + word
    # stop if we predict the end of the sequence
    if word == 'endseq':
      break
  return in_text

In [38]:
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
  actual, predicted = list(), list()
  # step over the whole set
  for key, desc_list in descriptions.items():
    # generate description
    yhat = generate_desc(model, tokenizer, photos[key], max_length)
    # store actual and predicted
    references = [d.split() for d in desc_list]
    actual.append(references)
    predicted.append(yhat.split())
  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [39]:
# load model
best_model_so_far = f"{PROJ_PATH}/model-ep004-loss3.665-val_loss4.049.h5"
model = load_model(best_model_so_far)

# evaluate
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

BLEU-1: 0.346666
BLEU-2: 0.161968
BLEU-3: 0.116074
BLEU-4: 0.049976


In [40]:
# save tokenizer
dump(tokenizer, open(f"{PROJ_PATH}/tokenizer.pkl", 'wb'))

In [41]:
# load the tokenizer
tokenizer = load(open(f"{PROJ_PATH}/tokenizer.pkl", 'rb'))
max_length = 34

In [45]:
# extract features from each photo in the directory
def extract_features_single(filename):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# load the photo
	image = load_img(filename, target_size=(224, 224))
	# convert the image pixels to a numpy array
	image = img_to_array(image)
	# reshape data for the model
	image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
	# prepare the image for the VGG model
	image = preprocess_input(image)
	# get features
	feature = model.predict(image, verbose=0)
	return feature

example_photo = extract_features_single(f"{PROJ_PATH}/example.png")

In [46]:
example_descrip = generate_desc(model, tokenizer, example_photo, max_length)
print(example_descrip)

startseq brown dog is running through the snow endseq
