In [1]:
from utils import model_utils
import os
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import Model
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.text import Tokenizer
from pickle import dump, load
import string

# Decoder model imports
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from numpy import array

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def initiate_encoder(arch='capsnet'):
    """
        Description: Build the encoder network used for image feature extraction
        :arch: 'capsnet' loads the DeepCapsNet encoder with its saved weights;
               any other value loads the VGG encoder
        :return: The encoder model returned by model_utils
    """
    if arch != 'capsnet':
        return model_utils.load_VGG()
    return model_utils.load_DeepCapsNet(
        input_shape=(64,64,3),
        n_class=10,
        routings=3,
        weights=r'D:\CapsuleNetwork_ImageCaptioning\Flickr8k_image_captioning_using_CapsNet\weights\deep_caps_best_weights.h5',
    )

In [4]:
def extract_features_using_caps(model, directory, arch, path):
    """
        Description: Function to extract features through the model and pickle
                     them as a dict {image_id: feature}
        :model: The model object; it is re-wrapped so that the output of its
                last layer is used as the feature
        :directory: Path of the directory of images
        :arch: 'capsnet' (64x64 input, prediction reshaped to (-1, 320)) or any
               other value (224x224 input, VGG16 preprocess_input applied)
        :path: Directory in which 'features_<arch>.pkl' is saved
    """
    features = dict()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # The input size depends only on the architecture, so compute it once
    target_size = (64,64) if arch=='capsnet' else (224,224)
    print('Feature extraction started')
    for name in os.listdir(directory):
        image_path = directory + '/' + name
        try:
            image = load_img(image_path, target_size=target_size)
        except Exception:
            # Actually skip the unreadable file: without `continue` the loop
            # would reuse the previous image (or hit a NameError on the first
            # file) and store a wrong feature under this image id.
            print('{} could not be opened. Skipping'.format(image_path))
            continue
        image = img_to_array(image)
        # Add the leading batch dimension expected by model.predict
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # Extract the features from the last layer
        if arch=='capsnet':
            feature = model.predict(image, verbose=0).reshape(-1, 10*32)
        else:
            image = preprocess_input(image)
            feature = model.predict(image, verbose=0)
        # The image id is the file name up to its first dot
        image_id = name.split('.')[0]
        # Populate the dictionary
        features[image_id] = feature
    path = os.path.join(path, 'features_{}.pkl'.format(arch))
    # Context manager guarantees the pickle is flushed and the handle closed
    with open(path, 'wb') as fh:
        dump(features, fh)
    print('Features extracted and stored at {}'.format(path))
    return

In [5]:
# Directory holding the Flickr8k images
img_dir = r'D:\CapsuleNetwork_ImageCaptioning\Flickr8k\Flicker8k_Dataset'
encoder_model = initiate_encoder(arch='VGG')
# The encoder must be passed as the first argument (it was missing, which is a
# SyntaxError); the output directory is a raw string so its backslashes are
# never treated as escape sequences.
extract_features_using_caps(encoder_model, img_dir, 'VGG', r'D:\CapsuleNetwork_ImageCaptioning\Flickr8k_image_captioning_using_CapsNet')

Instructions for updating:
Colocations handled automatically by placer.
Complete Capsule Architecture
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
________________________________________

In [4]:
def read_files(filename):
    """
        Description: Read a text file and return its whole content as one string
        :filename: Path of the file to read
        :return: The file content, line endings included
    """
    with open(filename, 'r') as source:
        return source.read()

In [2]:
# Load Descriptions of the images
def map_descriptions(desc_content):
    """
        Description: Map the descriptions <image>:[description_list]
        :desc_content: Full text of the description file, one
                       '<image_name>#<n> sentence' entry per line
        :return: dict mapping image id (file name up to its first dot) to the
                 list of its cleaned descriptions
        Side effect: writes the cleaned '<image_id> <description>' lines to
                     'descriptions.txt' in the current working directory
    """
    # Each image contains 5 descriptions in the format
    # <image_name>#<1-5> sentence
    mapping = dict()
    lines = list()
    # Loop-invariant translation table that deletes every punctuation character
    table = str.maketrans('', '', string.punctuation)
    for line in desc_content.split('\n'):
        if len(line) < 2:
            continue
        tokens = line.split()
        if not tokens:
            # Whitespace-only line: nothing to parse (previously an IndexError)
            continue
        image_id = tokens[0].split('.')[0]
        # Lower-case, strip punctuation, then keep alphabetic words longer
        # than one character
        cleaned = (word.lower().translate(table) for word in tokens[1:])
        image_desc = ' '.join(w for w in cleaned if len(w) > 1 and w.isalpha())
        # Append to the list of descriptions of this image
        mapping.setdefault(image_id, list()).append(image_desc)
        lines.append(image_id + ' ' + image_desc)
    # Write the files to a clean description file
    with open('descriptions.txt', 'w') as fh:
        fh.write('\n'.join(lines))
    return mapping

def to_vocabulary(descriptions):
    """
        Description: Collect every distinct word used across all descriptions
        :descriptions: dict mapping an image id to its list of description strings
        :return: set of the whitespace-separated words found
    """
    vocab = set()
    for desc_list in descriptions.values():
        for sentence in desc_list:
            vocab.update(sentence.split())
    return vocab

In [5]:
# Caption file of the dataset, parsed by map_descriptions below
filename = r'D:\CapsuleNetwork_ImageCaptioning\Flickr8k\Flickr8k_text\Flickr8k.token.txt'
doc = read_files(filename)
# Build the image -> descriptions mapping, then the word set derived from it
descriptions = map_descriptions(doc)
vocabulary = to_vocabulary(descriptions)
print('Total Desciptions: %d ' % len(descriptions))
print('Total Vocabulary: %d' % len(vocabulary))

Total Desciptions: 8092 
Total Vocabulary: 8763


### Create Training Set

In [6]:
# File listing the training image names, one per line
train_path = r'D:\CapsuleNetwork_ImageCaptioning\Flickr8k\Flickr8k_text\Flickr_8k.trainImages.txt'
content = read_files(train_path)
# Keep the part of each non-empty line before its first dot as the image id
train_set = [entry.split('.')[0] for entry in content.split('\n') if len(entry) >= 1]
print("Size of Training dataset: {}".format(len(set(train_set))))

Size of Training dataset: 6000


In [90]:
# load photo features
def load_photo_features(filename, dataset):
    """
        Description: Load pickled photo features and keep only the requested ids
        :filename: Path of the pickle file holding a dict {image_id: feature}
        :dataset: Iterable of image ids to select
        :return: dict {image_id: feature} restricted to the ids in dataset
        :raises KeyError: if an id in dataset is missing from the pickled features
    """
    # NOTE: pickle can execute arbitrary code while loading; only open feature
    # files that this project produced itself.
    # The context manager closes the handle, which load(open(...)) never did.
    with open(filename, 'rb') as fh:
        all_features = load(fh)
    features = {k: all_features[k] for k in dataset}
    return features

In [9]:
def create_tokenizer(descriptions):
    """
        Description: Fit a Keras Tokenizer on every description and save it
        :descriptions: dict mapping an image id to its list of description strings
        :return: (tokenizer, max_length), where max_length is the word count of
                 the longest description
        :raises ValueError: if descriptions holds no description at all
                            (max() of an empty sequence)
        Side effect: pickles the fitted tokenizer to 'tokenizer.pkl' in the
                     current working directory
    """
    # Flatten {image_id: [desc, ...]} into one list of description strings
    all_desc = [d for desc in descriptions.values() for d in desc]
    tokenizer = Tokenizer()
    max_length = max(len(desc.split()) for desc in all_desc)
    tokenizer.fit_on_texts(all_desc)
    # Context manager so the pickle is flushed and the handle closed
    with open('tokenizer.pkl', 'wb') as fh:
        dump(tokenizer, fh)
    return tokenizer, max_length

In [10]:
# Set of training ids: membership tests below are O(1) instead of scanning the
# train_set list once per image
train_ids = set(train_set)
# Load the training descriptions
train_desc = {image_id:desc for image_id, desc in descriptions.items() if image_id in train_ids}
# Tokenize the train descriptions
train_tokenizer, max_length = create_tokenizer(train_desc)
# Get the features of training dataset
# (context manager closes the pickle file, which load(open(...)) never did)
with open("features_VGG.pkl", 'rb') as fh:
    all_features = load(fh)
train_features = {image_id:feat for image_id, feat in all_features.items() if image_id in train_ids}

In [101]:
# Use the tokenizer fitted on the training descriptions; the bare name
# `tokenizer` is never defined in this notebook and fails on a fresh kernel.
# +1 because Keras word indices start at 1 (index 0 is reserved for padding).
vocab_size = len(train_tokenizer.word_index) + 1
print('Vocabulary Size: {}\nMaximum Legth: {}\nloaded photo features: {}'\
      .format(vocab_size, max_length, len(train_features)))

Vocabulary Size: 7577
Maximum Legth: 32
loaded photo features: 6000


In [102]:
def define_model(encoder_shape, vocab_size, max_length):
    """
        Description: Build and compile the caption decoder, which merges an
                     image-feature branch with a word-sequence branch
        :encoder_shape: Length of the image feature vector
        :vocab_size: Size of the embedding input and of the softmax output
        :max_length: Length of the word-index input sequence
        :return: The compiled Keras model (its summary is printed)
    """
    # image feature branch
    image_input = Input(shape=(encoder_shape,))
    image_dropped = Dropout(0.2)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)
    # sequence branch
    caption_input = Input(shape=(max_length,))
    caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_dropped = Dropout(0.2)(caption_embedded)
    caption_encoded = LSTM(256)(caption_dropped)
    # decoder: element-wise sum of both branches, then dense layers
    merged = add([image_dense, caption_encoded])
    hidden = Dense(256, activation='relu')(merged)
    next_word = Dense(vocab_size, activation='softmax')(hidden)
    # [image, seq] -> [word]
    captioner = Model(inputs=[image_input, caption_input], outputs=next_word)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    captioner.summary()
    return captioner

In [115]:
# Size the decoder's image input from the second dimension of the encoder's
# last-layer output shape
encoder_output_shape = encoder_model.layers[-1].output.shape.as_list()
model = define_model(encoder_output_shape[1], vocab_size, max_length)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 32, 256)      1939712     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           input_3[0][0]               

In [118]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    """
        Description: Expand the descriptions of one image into training samples;
                     every prefix of a description predicts its next word
        :tokenizer: Fitted tokenizer used to encode each description
        :max_length: Length every input prefix is padded to
        :desc_list: List of description strings of the image
        :photo: Feature of the image, repeated for each sample
        :vocab_size: Number of classes of the one-hot encoded next word
        :return: Arrays (photos, padded prefixes, one-hot next words)
    """
    photos, prefixes, targets = [], [], []
    for caption in desc_list:
        encoded = tokenizer.texts_to_sequences([caption])[0]
        # Split after each position: words before it are the input,
        # the word at it is the target
        for split_at in range(1, len(encoded)):
            prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
            target = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
            photos.append(photo)
            prefixes.append(prefix)
            targets.append(target)
    return array(photos), array(prefixes), array(targets)

# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    """
        Description: Endless generator yielding the training samples of one
                     image at a time
        :descriptions: dict mapping an image id to its list of descriptions
        :photos: dict mapping an image id to its feature; element [0] is used
        :tokenizer: Fitted tokenizer passed on to create_sequences
        :max_length: Padding length passed on to create_sequences
        :vocab_size: Number of output classes passed on to create_sequences
        :yield: [[image_features, input_sequences], output_words]
    """
    while True:
        for image_id in descriptions:
            captions = descriptions[image_id]
            photo = photos[image_id][0]
            in_img, in_seq, out_word = create_sequences(
                tokenizer, max_length, captions, photo, vocab_size)
            yield [[in_img, in_seq], out_word]

In [124]:
epochs = 10
# One generator step per training image
steps = len(train_desc)
for i in range(epochs):
    # create the data generator
    generator = data_generator(train_desc, train_features, train_tokenizer, max_length, vocab_size)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # save model after every epoch; the save call used to sit outside the
    # loop, so only the final epoch was ever written to disk
    model.save('model_arch_' + str(i) + '.h5')

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
