In [1]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
import os
# Root folder that holds every image and text file of the dataset
path = os.path.join(os.getcwd(), 'flickr8k', 'Flickr_Data')
# Caption/split text files and the images live in two sub-folders of the root
txtpath = os.path.join(path, "Flickr_TextData")
imgpath = os.path.join(path, "Images")

In [3]:
def load_doc(filename):
    """Return the full contents of the text file at ``filename`` as one string."""
    with open(filename, 'r') as handle:
        return handle.read()

# Raw caption file: each line starts with an image file name plus a "#n" suffix,
# followed by one caption
filename = os.path.join(txtpath, "Flickr8k.token.txt")
# read the whole caption file into memory as a single string
doc = load_doc(filename)
# preview the first 300 characters to check the file format
print(doc[:300])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the s


In [4]:
def load_descriptions(doc):
    """Parse the raw caption document into a mapping of image id -> captions.

    Every usable line is split on whitespace. The first token is the image
    identifier; everything from its first '.' onwards is dropped, which removes
    the ``.jpg#n`` suffix. The remaining tokens are re-joined with single
    spaces to form the caption.

    Args:
        doc: full text of the caption file, one caption per line.

    Returns:
        dict mapping each image id to the list of its captions, in file order.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip blank lines. Testing the tokens as well as the raw length avoids
        # an IndexError on lines that consist only of whitespace.
        if len(line) < 2 or not tokens:
            continue
        # first token is the image id, the rest is the description
        image_id, image_desc = tokens[0], tokens[1:]
        # keep only the part of the id before the first '.'
        image_id = image_id.split('.')[0]
        # create the caption list on first sight of the id, then store the caption
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

# parse the raw caption text into a dict of image id -> list of captions
descriptions = load_descriptions(doc)
# report how many distinct image ids were found
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [5]:
# number of distinct image ids
len(descriptions)

8092

In [8]:
# spot-check that one particular image id was parsed
if '3711826708_bba64fb1e1' in descriptions:
    print('yes')

yes


In [7]:
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is split on whitespace; every token is lower-cased and has
    its punctuation removed. Tokens that end up one character long or shorter,
    or that are not purely alphabetic, are discarded. The surviving tokens are
    re-joined with single spaces and written back into the same list slot.

    Args:
        descriptions: dict mapping an image id to its list of caption strings.

    Returns:
        None. The caption lists are modified in place.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for token in caption.split():
                token = token.lower().translate(strip_punctuation)
                # drop one-letter leftovers (hanging 's' and 'a') and tokens with digits
                if len(token) > 1 and token.isalpha():
                    kept.append(token)
            captions[idx] = ' '.join(kept)

# clean descriptions (modifies the caption lists inside `descriptions` in place)
clean_descriptions(descriptions)

In [7]:
# inspect the cleaned captions of one image
descriptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

In [9]:
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all captions.

    Args:
        descriptions: dict mapping an image id to its list of caption strings.

    Returns:
        set of every word that appears in at least one caption.
    """
    words = set()
    for captions in descriptions.values():
        for caption in captions:
            words.update(caption.split())
    return words

# summarize vocabulary: distinct words across all cleaned captions
vocabulary = to_vocabulary(descriptions)
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 8763


In [None]:
# 1

In [10]:
def save_descriptions(descriptions, filename):
    """Write all captions to ``filename``, one "<image id> <caption>" per line.

    Lines are joined with '\\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping an image id to its list of caption strings.
        filename: path of the text file to create or overwrite.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # the with-block closes the file even if the write fails
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# persist the cleaned captions; this file is read back later by load_clean_descriptions
save_descriptions(descriptions, 'descriptions.txt')

In [23]:
# load a pre-defined list of photo identifiers
def load_set(filename):
    """Read a split file and return the set of image identifiers it lists.

    Every non-empty line contributes the part of the line that precedes its
    first '.'.

    Args:
        filename: path of a text file with one image file name per line.

    Returns:
        set of identifier strings.
    """
    contents = load_doc(filename)
    return {
        entry.split('.')[0]
        for entry in contents.split('\n')
        if len(entry) >= 1  # skip empty lines
    }

# load the image-id sets of the three predefined splits: train, dev and test
filename = os.path.join(txtpath, 'Flickr_8k.trainImages.txt')
train = load_set(filename)
filename = os.path.join(txtpath, 'Flickr_8k.devImages.txt')
dev = load_set(filename)
filename = os.path.join(txtpath, 'Flickr_8k.testImages.txt')
test = load_set(filename)
# report the size of each split
print('train : %d' % len(train))
print('dev : %d' % len(dev))
print('test : %d' % len(test))

train : 6000
dev: 1000
test: 1000


In [12]:
# NOTE(review): this displays the entire training id set; consider previewing only a few ids
train

{'3014169370_fc4059352e',
 '3243233886_235a80e8c7',
 '2805101709_1c8916f63a',
 '2086532897_b8714f2237',
 '3613667665_1881c689ea',
 '2842609837_b3a0b383f7',
 '535249787_0fcaa613a0',
 '2797438951_88a3ed7541',
 '3738685861_8dfff28760',
 '2090327868_9f99e2740d',
 '3121482932_f77ca12c01',
 '2084103826_ffd76b1e3e',
 '247778426_fd59734130',
 '260850192_fd03ea26f1',
 '1187435567_18173c148b',
 '505062117_a70b4e10ab',
 '2765029348_667111fc30',
 '930748509_8ca5cf5c24',
 '918886676_3323fb2a01',
 '3220009216_10f088185e',
 '2480832276_fa55480ecb',
 '3603301825_5817727be2',
 '3551003620_0b02d76f65',
 '3229519418_040f05ced1',
 '2169951750_495820a215',
 '3599568766_9e96def0ef',
 '3312779887_7682db7827',
 '3446762868_06e9d9d899',
 '251056963_c8b67f0107',
 '1159574340_99ba8c3c59',
 '1160034462_16b38174fe',
 '442918418_0f29c97fa9',
 '3208188198_2b271d2a2e',
 '2935703360_4f794f7f09',
 '2831314869_5025300133',
 '3706356018_28f62290e8',
 '3371567346_b6522efdb8',
 '733964952_69f011a6c4',
 '3501386648_e11e3f31

In [13]:
# Collect the names of all .jpg files found in the image folder
img = [entry for entry in os.listdir(imgpath) if entry.endswith(".jpg")]

In [14]:
def img_filename(txtfilename):
    """Return the image file names from ``img`` that are listed in a split file.

    The split file is looked up inside ``txtpath`` and is expected to hold one
    image file name per line. The result keeps the order of the module-level
    ``img`` list.

    Args:
        txtfilename: name of the split file, relative to ``txtpath``.

    Returns:
        list of file names present both in ``img`` and in the split file.
    """
    images_file = os.path.join(txtpath, txtfilename)
    # read inside a with-block so the file handle is closed deterministically
    with open(images_file, 'r') as handle:
        images = set(handle.read().strip().split('\n'))
    return [name for name in img if name in images]

In [15]:
# image file names (with extension) belonging to each split
train_img = img_filename('Flickr_8k.trainImages.txt')
dev_img = img_filename('Flickr_8k.devImages.txt')
test_img = img_filename('Flickr_8k.testImages.txt')

# report how many image files were matched for each split
print(f'Train --> {len(train_img)} files')
print(f'Dev --> {len(dev_img)} files')
print(f'Test --> {len(test_img)} files')

Train --> 6000 files
Dev --> 1000 files
Test --> 1000 files


In [19]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in ``dataset`` and wrap them in markers.

    Each line of the file is split on whitespace: the first token is the image
    id, the rest is the caption. Captions whose id is in ``dataset`` are stored
    as ``'startseq <caption> endseq'``.

    Args:
        filename: path of the cleaned caption file.
        dataset: collection of image ids to keep (membership-tested with ``in``).

    Returns:
        dict mapping each kept image id to its list of wrapped captions.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # a blank line (e.g. from a trailing newline) has no id; skip it
        # instead of failing on tokens[0]
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id not in dataset:
            continue
        # wrap description in start/end tokens and store it
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        descriptions.setdefault(image_id, []).append(desc)
    return descriptions

# captions of the training images, each wrapped in 'startseq' / 'endseq'
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


In [20]:
# captions of the dev and test images, each count reported under its own split name
dev_descriptions = load_clean_descriptions('descriptions.txt', dev)
print('Descriptions: dev=%d' % len(dev_descriptions))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))

NameError: name 'dev' is not defined

In [58]:
def preprocess(image_file):
    """Load one image from ``imgpath`` and prepare it as InceptionV3 input.

    Args:
        image_file: file name of the image, relative to ``imgpath``.

    Returns:
        the array produced by ``preprocess_input`` for a batch holding this
        single image.
    """
    full_path = os.path.join(imgpath, image_file)
    # resize to 299x299, the size expected by the inception v3 model
    pil_image = image.load_img(full_path, target_size=(299, 299))
    pixels = image.img_to_array(pil_image)
    # add a leading axis so the single image forms a batch of one
    batch = np.expand_dims(pixels, axis=0)
    # apply the inception-specific input preprocessing
    return preprocess_input(batch)

In [53]:
# Load the InceptionV3 model with ImageNet weights
# (the recorded output shows the weights file being downloaded)
model = InceptionV3(weights='imagenet')

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [54]:
# Feature extractor: same input as InceptionV3, but the output is taken from the
# second-to-last layer, i.e. the final (output) layer is dropped
model_new = Model(model.input, model.layers[-2].output)

In [59]:
# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Return the flattened ``model_new`` prediction for one image file.

    Args:
        image: file name of the image, passed on to ``preprocess``.

    Returns:
        1-D array obtained by reshaping the (1, N) prediction to (N,).
    """
    batch = preprocess(image)
    prediction = model_new.predict(batch)
    # drop the batch axis: (1, 2048) -> (2048, )
    return np.reshape(prediction, prediction.shape[1])

In [61]:
# Call the function to encode all the train images
# This will take a while on CPU - Execute this only once
start = time()
encoding_train = {}
# The loop variable is deliberately not called `img`: that name holds the list of
# all image file names read by img_filename() and must not be overwritten.
for img_name in train_img:
    encoding_train[img_name] = encode(img_name)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 1426.9686558246613


In [64]:
# Pickle the encoded training images into the Models folder
train_pickle_path = os.path.join(path, 'Models', 'encoded_train_images.pkl')
with open(train_pickle_path, "wb") as encoded_pickle:
    dump(encoding_train, encoded_pickle)

In [68]:
# Call the function to encode all the test images - Execute this only once
start = time()
encoding_test = {}
# The loop variable is deliberately not called `img`: that name holds the list of
# all image file names read by img_filename() and must not be overwritten.
for img_name in test_img:
    encoding_test[img_name] = encode(img_name)
print("Time taken in seconds =", time()-start)

# Save the bottleneck test features to disk
with open(os.path.join(path, os.path.join('Models', 'encoded_test_images.pkl')), "wb") as encoded_pickle:
    dump(encoding_test, encoded_pickle)

Time taken in seconds = 213.6143295764923


In [70]:
# Call the function to encode all the dev images - Execute this only once
start = time()
encoding_dev = {}
# The loop variable is deliberately not called `img`: that name holds the list of
# all image file names read by img_filename() and must not be overwritten.
for img_name in dev_img:
    encoding_dev[img_name] = encode(img_name)
print("Time taken in seconds =", time()-start)

# Save the bottleneck dev features to disk
with open(os.path.join(path, os.path.join('Models', 'encoded_dev_images.pkl')), "wb") as encoded_pickle:
    dump(encoding_dev, encoded_pickle)

Time taken in seconds = 213.92125344276428


In [73]:
# Reload the encoded training images written earlier; the with-block closes the file handle
with open(os.path.join(path, os.path.join('Models', 'encoded_train_images.pkl')), "rb") as encoded_pickle:
    train_features = load(encoded_pickle)
print('Photos: train=%d' % len(train_features))

Photos: train=6000


In [74]:
# Flatten the per-image caption lists into one list of training captions
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]
len(all_train_captions)

30000

In [75]:
# Keep only the words that appear at least `word_count_threshold` times in the training captions
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        if w in word_counts:
            word_counts[w] += 1
        else:
            word_counts[w] = 1

# words are kept in the order in which they were first seen
vocab = [word for word, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

preprocessed words 7578 -> 1651


In [76]:
# Two lookup tables between vocabulary words and integer indices
ixtoword = {}
wordtoix = {}

# indices start at 1; index 0 is left free for the padding zeros
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1
    
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

1652

In [77]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten the per-image caption lists into a single list.

    Args:
        descriptions: dict mapping an image id to its list of caption strings.

    Returns:
        list of all captions, in dictionary order.
    """
    return [caption for captions in descriptions.values() for caption in captions]

# calculate the length of the description with the most words
def max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Words are counted by splitting each caption on whitespace.
    """
    word_counts = (len(caption.split()) for caption in to_lines(descriptions))
    return max(word_counts)

# determine the maximum sequence length
# NOTE: this rebinds the name `max_length` from the function to its integer result,
# so the function can only be called again after its definition is re-run
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 34


In [78]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, vocab_size=None):
    """Yield training batches of (photo feature, partial caption) -> next word.

    Each caption is turned into a sequence of word indices (words missing from
    ``wordtoix`` are dropped). For every position ``i`` in that sequence, one
    sample is produced: the first ``i`` indices, padded to ``max_length``, as
    input and the one-hot encoding of index ``i`` as target. Samples accumulate
    until ``num_photos_per_batch`` images have been processed, then one batch
    is yielded. The generator loops over ``descriptions`` forever.

    Args:
        descriptions: dict mapping an image id to its list of caption strings.
        photos: mapping from ``image id + '.jpg'`` to the image feature vector.
        wordtoix: dict mapping a word to its integer index.
        max_length: length to which every input sequence is padded.
        num_photos_per_batch: number of images whose samples form one batch.
        vocab_size: width of the one-hot target. Defaults to
            ``len(wordtoix) + 1``, so the generator no longer depends on a
            module-level ``vocab_size`` variable.

    Yields:
        ``[[array(X1), array(X2)], array(y)]`` where X1 holds photo features,
        X2 padded input sequences and y one-hot targets.
    """
    if vocab_size is None:
        # one extra slot for index 0, which is used for padding
        vocab_size = len(wordtoix) + 1
    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data and start collecting the next batch
            if n == num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0


In [79]:
# Load the GloVe vectors into a dict of word -> coefficient array
embeddings_index = {}

glove_file = os.path.join(path, 'glove', 'glove.6B.200d.txt')
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        # first field is the word, the remaining fields are its coefficients
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [80]:
# width of one word vector; matches the 200d GloVe file loaded above
embedding_dim = 200

# Build a (vocab_size, embedding_dim) matrix whose row i holds the GloVe vector
# of the vocabulary word with index i
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    # look up the pre-trained vector; None when the word is not in the GloVe index
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words missing from the embedding index are skipped, so their rows stay all zeros
        embedding_matrix[i] = embedding_vector

In [81]:
# sanity check: expect (vocab_size, embedding_dim)
embedding_matrix.shape

(1652, 200)

In [82]:
# Image branch: 2048-length feature vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(2048,))

fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: word-index sequence of length max_length -> embedding -> dropout -> 256-unit LSTM
inputs2 = Input(shape=(max_length,))

se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Merge the two 256-wide branches with add(), then pass through another dense layer
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)

# Softmax over the vocabulary
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which earlier held the InceptionV3 network
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [83]:
# print the layer-by-layer overview of the captioning model
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 200)      330400      input_3[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 2048)         0           input_2[0][0]                    
____________________________________________________________________________________________

In [85]:
# Load the pre-trained GloVe matrix into the embedding layer and freeze it.
# The layer is looked up by type rather than by position, so this does not
# depend on the order in which the model lists its layers.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [86]:
# categorical cross-entropy matches the one-hot next-word targets built by data_generator
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [87]:
# number of passes over the training images
epochs = 10
# NOTE(review): "bath" in this name looks like a typo for "batch"; the training loop uses the same spelling
number_pics_per_bath = 3
# generator batches per epoch: number of training images divided by images per batch
steps = len(train_descriptions)//number_pics_per_bath

In [89]:
# (leftover `data_generator??` source introspection removed - it was an interactive debugging aid)

In [88]:
# Make sure the checkpoint folder exists before training starts; otherwise the
# first save would fail only after a whole epoch has been trained.
os.makedirs('./model_weights', exist_ok=True)
# Train one epoch at a time so that a checkpoint is written after every epoch
for i in range(epochs):
    # a fresh generator per epoch restarts the pass over the training captions
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./model_weights/model_' + str(i) + '.h5')

Epoch 1/1
  55/2000 [..............................] - ETA: 12:26 - loss: 5.7828

KeyboardInterrupt: 