In [3]:
from numpy import array
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import *
from keras.layers.merge import add
from keras.layers.merge import concatenate
from keras.layers.pooling import GlobalMaxPooling2D
from keras.callbacks import ModelCheckpoint
import pydot
import numpy as np
from keras.models import load_model
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu
from keras.utils import CustomObjectScope

In [4]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)
 
# load clean descriptions into memory
def load_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions
 
# load photo features
def load_video_features(filename, dataset):
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features
 
# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc
 
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
 
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)
 
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, c2d, c3d, semantic):
    X1, X2, X3, X4, y = list(), list(), list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(c2d)
            X2.append(c3d)
            X3.append(semantic)
            X4.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(X3), array(X4), array(y)

# load a word embedding
def load_embedding(tokenizer, vocab_size, max_length):
	# load the tokenizer
	embedding = load(open('/home/mh/mywork/dataset/MSVD/descriptions/word2vec_embedding.pkl', 'rb'))
	dimensions = 100
	trainable = False
	# create a weight matrix for words in training docs
	weights = np.zeros((vocab_size, dimensions))
	# walk words in order of tokenizer vocab to ensure vectors are in the right index
	for word, i in tokenizer.word_index.items():
		if word not in embedding:
			continue
		weights[i] = embedding[word]
	layer = Embedding(vocab_size, dimensions, weights=[weights], input_length=max_length, trainable=trainable, mask_zero=True)
	return layer
 


In [5]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, train_c2d, train_c3d, train_sem, tokenizer, max_length):
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            c2d= train_c2d[key]
            c3d= train_c3d[key]
            sem=train_sem[key]
            #print(len(photo))
            in_c2d, in_c3d, in_sem, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, c2d, c3d, sem)
            yield [[in_c2d, in_c3d, in_sem, in_seq], out_word]

In [6]:
class Attention(Layer):
    def __init__(self, step_dim=49,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [15]:
#load training dataset (6K)
filename = '/home/mh/mywork/dataset/MSVD/class_ids_new/trainID/actions_train_ID.txt'
train_new = load_set(filename)
print('Dataset: %d' % len(train_new))
train_new=list(train_new)
train_new.sort()
# print(train_new)

train=train_new
train_test=train[:100]
#train=train_new

# descriptions
train_descriptions = load_descriptions('/home/mh/mywork/dataset/MSVD/descriptions/descriptions_processed.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))

#load c2d features
train_c2d_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_resnet152_vlad_features_k_100.pkl', train)
print('C2D: train=%d' % len(train_c2d_features))

#load c3d features
train_c3d_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_c3d_vlad_features_k_100.pkl', train)
print('C3D: train=%d' % len(train_c3d_features))

#load semantic features
train_semantic_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/semantic/msvd_sem_scn_300.pkl', train)
print('Semantic: train=%d' % len(train_semantic_features))


# descriptions
train_descriptions_test = load_descriptions('/home/mh/mywork/dataset/MSVD/descriptions/descriptions_processed.txt', train_test)
print('Descriptions: train=%d' % len(train_descriptions_test))

#load c2d features
train_c2d_features_test = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_resnet152_vlad_features_k_100.pkl', train_test)
print('C2D: train=%d' % len(train_c2d_features_test))

#load c3d features
train_c3d_features_test = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_c3d_vlad_features_k_100.pkl', train_test)
print('C3D: train=%d' % len(train_c3d_features_test))

#load semantic features
train_semantic_features_test = load_video_features('/home/mh/mywork/dataset/MSVD/features/semantic/msvd_sem_scn_300.pkl', train_test)
print('Semantic: train=%d' % len(train_semantic_features_test))


Dataset: 121
Descriptions: train=121
C2D: train=121
C3D: train=121
Semantic: train=121
Descriptions: train=100
C2D: train=100
C3D: train=100
Semantic: train=100


In [14]:
# load test set
filename = '/home/mh/mywork/dataset/MSVD/class_ids_new/testID/actions_test_ID.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))

# descriptions
test_descriptions = load_descriptions('/home/mh/mywork/dataset/MSVD/descriptions/descriptions_processed.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))

#load c2d features
test_c2d_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_resnet152_vlad_features_k_100.pkl', test)
print('C2D: test=%d' % len(test_c2d_features))

#load c3d features
test_c3d_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/vlad/msvd_c3d_vlad_features_k_100.pkl', test)
print('C3D: test=%d' % len(test_c3d_features))

#load semantic features
test_semantic_features = load_video_features('/home/mh/mywork/dataset/MSVD/features/semantic/msvd_sem_scn_300.pkl', test)
print('Semantic: test=%d' % len(test_semantic_features))


Dataset: 33
Descriptions: test=33
C2D: test=33
C3D: test=33
Semantic: test=33


In [9]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate a description for an image
def generate_desc(model, tokenizer,c2d ,c3d, semantic, max_length):
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)    
        c2d1=np.array([c2d])
        c3d1=np.array([c3d])
        semantic1=np.array([semantic])
        # print(photo)
        yhat = model.predict([c2d1, c3d1, semantic1, sequence], verbose=0)
        # print(yhat)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

# evaluate the skill of the model
def evaluate_model(model, descriptions, test_c2d, test_c3d, test_semantic, tokenizer, max_length, filename):
    actual, predicted = list(), list()
    # step over the whole set
    lines = list()
    for key, desc_list in descriptions.items():
        yhat = generate_desc(model, tokenizer, test_c2d[key], test_c3d[key], test_semantic[key], max_length)
        ex=yhat
        a=yhat.split('startseq')
        b=a[1].split('endseq')
        lines.append('beam_size_1'+'\t'+key + '\t' + b[0])
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
        #
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
    bleu=np.zeros(4)
    # calculate BLEU score
    bleu[0]=corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
    bleu[1]=corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
    bleu[2]=corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
    bleu[3]=corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))
    print('BLEU-1: %f' % bleu[0])
    print('BLEU-2: %f' % bleu[1])
    print('BLEU-3: %f' % bleu[2])
    print('BLEU-4: %f' % bleu[3])
    return bleu


In [16]:
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
# max_length = max_length(train_descriptions)
# print('Description Length: %d' % max_length)
max_length=49
#Load the good model already built for captioning
model_old = load_model('/home/mh/mywork/result/evaluation_results/meteor_33_3/model_10.h5')
model_old.summary()

Vocabulary Size: 2729
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          262272      input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 128)          524416      input_2[0][0]                    
_______________________________________________________________________________________

In [17]:
#load ouput from concatenate layer
layer_name = 'concatenate_3'
model= Model(inputs=model_old.input, outputs=model_old.get_layer(layer_name).output)
# Get rid of the dense layer
# make the previous layers not trainable
for layer in model.layers[:13]:
    layer.trainable = False
#take the output from the previous layers
x = model.output
#Add BLSTM layer
blstm = Bidirectional(LSTM(64))(x)
#Add an attention layer
# att=Attention(max_length)(blstm)
#add dense layer
outputs = Dense(vocab_size, activation='softmax',name='final_dense')(blstm)
# tie it together [image, seq] [word]
model = Model(inputs=model_old.input, outputs=outputs)
    # compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summarize model
model.summary()
plot_model(model, to_file='/home/mh/mywork/video_caption_domain/best_models/reformed/model_transfer_simpleactions/model_sports.png', show_shapes=True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          262272      input_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 128)          524416      input_2[0][0]                    
__________________________________________________________________________________________________
input_4 (I

In [18]:
import pandas as pd
# train the model, run epochs manually and save after each epoch
# model = load_model('/home/mh/mywork/video_caption_domain/best_models/model_transfer_animal/model_12.h5')
epochs = 50
steps = len(train_descriptions)
mylist = list(range(50))
df=pd.DataFrame(index=mylist, columns=['model_no','train_Bleu_1','train_Bleu_2','train_Bleu_3','train_Bleu_4', 'test_Bleu_1','test_Bleu_2','test_Bleu_3','test_Bleu_4'])
for i in range(epochs):
    print(i)
    # create the data generator
    generator = data_generator(train_descriptions, train_c2d_features, train_c3d_features, train_semantic_features, tokenizer, max_length)
    # fit for one epoch
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # save model
    model.save('/home/mh/mywork/video_caption_domain/best_models/reformed/model_transfer_actions/model_' + str(i) + '.h5')
    # evaluate model on training data
    txt_fname='/home/mh/mywork/video_caption_domain/best_models/reformed/model_transfer_actions/result/test_predicted_'+str(i)+'.txt'
    train_bleu = evaluate_model(model, train_descriptions_test, train_c2d_features_test, train_c3d_features_test, train_semantic_features_test, tokenizer, max_length,txt_fname)
    test_bleu = evaluate_model(model, test_descriptions, test_c2d_features, test_c3d_features, test_semantic_features, tokenizer, max_length,txt_fname)
    # store
    df.at[i,'model_no']=i
    df.at[i,'train_Bleu_1']=train_bleu[0]
    df.at[i,'train_Bleu_2']=train_bleu[1]
    df.at[i,'train_Bleu_3']=train_bleu[2]
    df.at[i,'train_Bleu_4']=train_bleu[3]
    df.at[i,'test_Bleu_1']=test_bleu[0]
    df.at[i,'test_Bleu_2']=test_bleu[1]
    df.at[i,'test_Bleu_3']=test_bleu[2]
    df.at[i,'test_Bleu_4']=test_bleu[3]
    df.to_csv('Resnet152_vlad_k_100.csv', index=False)


0
Epoch 1/1


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.093200
BLEU-2: 0.045113
BLEU-3: 0.000000
BLEU-4: 0.000000
BLEU-1: 0.098182
BLEU-2: 0.047398
BLEU-3: 0.000000
BLEU-4: 0.000000
1
Epoch 1/1
BLEU-1: 0.652221
BLEU-2: 0.453051
BLEU-3: 0.376711
BLEU-4: 0.248972
BLEU-1: 0.711191
BLEU-2: 0.517836
BLEU-3: 0.445157
BLEU-4: 0.305437
2
Epoch 1/1
BLEU-1: 0.733333
BLEU-2: 0.519208
BLEU-3: 0.429246
BLEU-4: 0.294154
BLEU-1: 0.706960
BLEU-2: 0.517741
BLEU-3: 0.445120
BLEU-4: 0.307156
3
Epoch 1/1
BLEU-1: 0.748731
BLEU-2: 0.544068
BLEU-3: 0.454129
BLEU-4: 0.315304
BLEU-1: 0.746154
BLEU-2: 0.552895
BLEU-3: 0.474819
BLEU-4: 0.330495
4
Epoch 1/1
BLEU-1: 0.753866
BLEU-2: 0.552776
BLEU-3: 0.463247
BLEU-4: 0.322573
BLEU-1: 0.752935
BLEU-2: 0.564796
BLEU-3: 0.484715
BLEU-4: 0.339211
5
Epoch 1/1
BLEU-1: 0.758530
BLEU-2: 0.561337
BLEU-3: 0.471003
BLEU-4: 0.329517
BLEU-1: 0.758887
BLEU-2: 0.569598
BLEU-3: 0.488743
BLEU-4: 0.342667
6
Epoch 1/1
BLEU-1: 0.770378
BLEU-2: 0.579369
BLEU-3: 0.489910
BLEU-4: 0.350759
BLEU-1: 0.771422
BLEU-2: 0.588135
BLEU-3: 0.