# Video Captioning

In [None]:
#Import important libraries
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import math
from preprocess_videos import load_df, preprocess_df, get_final_list, extract_frames, select_videos, load_video_frames, extract_features, extract_features_resnet50, extract_features_inception_v3, view_frames
from enc_dec_models import basic_enc_dec

Preprocessing

In [None]:
#Import captions
df = load_df("dataset/msvd_videos/video_corpus.csv")
df.head()

In [None]:
data = preprocess_df(df)

In [None]:
data.head()

In [None]:
videos_final = get_final_list("dataset/msvd_videos/msvd_videos", data)

In [None]:
len(videos_final)

In [None]:
#Select a single caption (the first one encountered) for each available video
# Build the lookup set once so every membership test is O(1) instead of
# rescanning videos_final for each row of the DataFrame.
available_videos = set(videos_final)
captions = {}
# Only the two columns are needed, so iterate them directly rather than
# materialising a Series per row with iterrows().
for name, description in zip(data['Name'], data['Description']):
    if name in available_videos and name not in captions:
        captions[name] = description

In [None]:
#Not needed
#df = pd.DataFrame(captions.items(), columns = ['Name', 'Description'])
#df.head()

In [None]:
#Perform once
#extract_frames(videos_final, 'dataset/msvd_videos/msvd_videos/', 'dataset/msvd_videos/img/')

In [None]:
videos_selected = select_videos(videos_final, 'dataset/msvd_videos/frames/', 15)
len(videos_selected)

In [None]:
# One caption per selected video, in the same order as videos_selected.
descriptions = [captions[video_name] for video_name in videos_selected]

len(descriptions)

Extracting features

In [None]:
# Root folder of the extracted frames.
frames_path = 'dataset/msvd_videos/frames/'
# NOTE(review): this rebinds `data`, which held the preprocessed captions
# DataFrame until now; re-running the caption-selection cell after this one
# would operate on the feature array instead.
data = extract_features_resnet50(frames_path, videos_selected) #Features for the selected videos. NOTE(review): an earlier note gave the shape as (1652, 15, 4096), but the model below is built with an input size of 2048 - confirm with data.shape

In [None]:
data.shape

In [None]:
#Save array
#from numpy import save
#save('video_features_vgg16.npy', X)
# load array
#from numpy import load
#data = load('video_features_vgg16.npy')

Coding

In [None]:
data.shape

In [None]:
view_frames('dataset/msvd_videos/frames/mv89psg6zh4_33_46')

In [None]:
#Let's use first 1200 videos for training.
train = data[:1200]
train.shape

In [None]:
#`data` contains the features extracted from the videos.
#`videos_selected` contains the video names and `descriptions` contains the corresponding caption for each of those videos.

In [None]:
#Adding 'ssss' and 'eeee' to the descriptions (a single trailing '.' is dropped first).
# str.endswith is used instead of indexing [-1] so that an empty caption does
# not raise IndexError; slice assignment updates the existing list in place.
descriptions[:] = [
    'ssss ' + (text[:-1] if text.endswith('.') else text) + ' eeee'
    for text in descriptions
]

In [None]:
# Number of space-separated tokens in every description.
desc_len = [len(s.split(' ')) for s in descriptions]
max(desc_len) #Length of the longest caption. NOTE(review): max_length is hard-coded to 20 in the next cell rather than taken from this value - confirm they agree.

In [None]:
# Text-preprocessing settings shared by the cells below.
vocab_size = 2400  # passed as Tokenizer num_words, to_categorical num_classes and to basic_enc_dec
embedding_dim = 16  # NOTE(review): not referenced anywhere in the visible notebook
max_length = 20  # pad_sequences maxlen; also passed to basic_enc_dec and used as the number of decoding steps
trunc_type = 'post'  # pad_sequences truncating argument
padding_type = 'post'  # pad_sequences padding argument
oov_tok = "<oov>"  # Tokenizer oov_token

In [None]:
#Using Tokenizer to preprocess the descriptions.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(descriptions)

In [None]:
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(descriptions)
padded = pad_sequences(sequences, maxlen = max_length, truncating = trunc_type, padding = padding_type)

In [None]:
#Let's look at padded sequences.
padded[:10]

In [None]:
##Updated##
"""
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# returns train, inference_encoder and inference_decoder models
def define_updated(n_input, n_output, n_units):
    # define training encoder
    encoder_inputs = Input(shape=(None, n_input))
    encoder = LSTM(n_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]
    
    # define training decoder
    decoder_inputs = Input(shape=(None, n_output))
    embedding = Embedding(10000, 64)
    decoder_lstm1 = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_lstm2 = LSTM(n_units, return_sequences=True, return_state=True)
    
    temp = embedding(decoder_inputs)
    temp, _, _ = decoder_lstm1(temp, initial_state=encoder_states)
    decoder_outputs, _, _ = decoder_lstm2(temp, initial_state=encoder_states)
    decoder_dense = Dense(n_output, activation='softmax')
    
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    # define inference decoder
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    temp = embedding(decoder_inputs)
    temp, _, _ = decoder_lstm1(temp, initial_state=decoder_states_inputs)
    decoder_outputs, state_h, state_c = decoder_lstm2(temp, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model
"""

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# returns train, inference_encoder and inference_decoder models
def define_updated(n_input, n_output, n_units):
    """Build an LSTM encoder-decoder plus its two inference sub-models.

    The decoder chains an Embedding layer, two LSTM layers and a softmax
    Dense layer. The same layer objects are wired into both the training
    model and the inference decoder.

    Args:
        n_input: Size of the last axis of the encoder input.
        n_output: Size of the last axis of the decoder input, and the number
            of units of the softmax output layer.
        n_units: Number of units of every LSTM layer.

    Returns:
        Tuple ``(model, encoder_model, decoder_model)``. ``model`` maps
        ``[encoder_inputs, decoder_inputs]`` to the softmax output,
        ``encoder_model`` maps the encoder input to ``[state_h, state_c]``,
        and ``decoder_model`` maps ``[decoder_inputs, state_h, state_c]`` to
        ``[softmax output, state_h, state_c]``.

    NOTE(review): nothing in the visible notebook calls this function; the
    model that is trained is built with ``basic_enc_dec``.
    """
    # define training encoder
    encoder_inputs = Input(shape=(None, n_input))
    # Only the final hidden and cell states are kept; encoder_outputs is unused.
    encoder = LSTM(n_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]
    
    # define training decoder
    # NOTE(review): decoder_inputs is declared with a trailing axis of size
    # n_output and is then passed through an Embedding layer, which presumably
    # appends another axis before the LSTM sees it - confirm this graph builds.
    decoder_inputs = Input(shape=(None, n_output))
    # NOTE(review): the Embedding arguments (10000, 64) are hard-coded and not
    # derived from n_output - confirm they match the tokenizer vocabulary.
    embedding = Embedding(10000, 64)
    decoder_lstm1 = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_lstm2 = LSTM(n_units, return_sequences=True, return_state=True)
    
    temp = embedding(decoder_inputs)
    # Both decoder LSTMs start from the encoder's final states.
    temp, _, _ = decoder_lstm1(temp, initial_state=encoder_states)
    decoder_outputs, _, _ = decoder_lstm2(temp, initial_state=encoder_states)
    decoder_dense = Dense(n_output, activation='softmax')
    
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    
    # define inference decoder
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    temp = embedding(decoder_inputs)
    # NOTE(review): both LSTMs receive the same pair of state inputs, but only
    # the second LSTM's states are returned, so a step-by-step decoding loop
    # would feed the second layer's state into the first layer - confirm this
    # is intended.
    temp, _, _ = decoder_lstm1(temp, initial_state=decoder_states_inputs)
    decoder_outputs, state_h, state_c = decoder_lstm2(temp, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    
    # return all models
    return model, encoder_model, decoder_model

In [None]:
model, enc, dec = basic_enc_dec(2048, vocab_size, max_length)

In [None]:
model.summary()

In [None]:
# Decoder input for training: the padded target sequences shifted right by one
# position, with a leading 0 column. The row count is taken from `padded`
# rather than hard-coded, so the cell works for any number of selected videos.
x2 = np.hstack([np.zeros((len(padded), 1)), np.array(padded)])
# Drop the last column so x2 has the same length as `padded`.
x2 = x2[:, :-1]

In [None]:
#This is the output to be predicted.
padded[0]

In [None]:
#This is the secondary input for decoder during training.
x2[0]

In [None]:
x2.shape

In [None]:
#Convert to 1652x42x1
#x2 = x2.reshape(x2.shape + (1, ))
#out = padded.reshape(padded.shape + (1, ))

In [None]:
#One-hot encode both arrays to shape (num_videos, max_length, vocab_size)
# Imported from tensorflow.keras like the other Keras imports in this notebook;
# the standalone keras.utils.np_utils module is not available in recent Keras
# releases.
from tensorflow.keras.utils import to_categorical

x2_in = to_categorical(x2, num_classes = vocab_size)
outputs = to_categorical(padded, num_classes = vocab_size)
print(x2_in.shape, outputs.shape)

In [None]:
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
# Learning-rate sweep: start at 1e-5 and multiply the rate by 10 every 20 epochs.
lr_schedule = callbacks.LearningRateScheduler(lambda epoch: 1e-5 * 10**(epoch / 20))
# `learning_rate` is the supported keyword (`lr` is a deprecated alias) and is
# the spelling used for the optimizer of the final training run.
opt = optimizers.RMSprop(learning_rate=1e-5)

In [None]:
#Approximating best lr

model.compile(optimizer=opt, loss='categorical_crossentropy')
history = model.fit([train, x2_in[:1200]], outputs[:1200], validation_split=0.1, epochs = 100, callbacks=[lr_schedule])

In [None]:
#Plotting loss against learning rate to select the best lr

import matplotlib.pyplot as plt
plt.semilogx(history.history["lr"], history.history["loss"])
plt.axis([1e-5, 1, 1, 10])
# plt.plot() without arguments draws nothing; display the figure explicitly,
# as the loss-curve cell does.
plt.show()

In [None]:
#fixed learning rate
# Rebuild the networks before the real training run: the learning-rate sweep
# above trains this same model while raising the rate towards 1, so continuing
# from those weights would start from a degraded state. enc and dec are
# rebound as well so that inference uses the freshly trained model.
model, enc, dec = basic_enc_dec(2048, vocab_size, max_length)
opt = optimizers.RMSprop(learning_rate=1e-3)
model.compile(optimizer=opt, loss='categorical_crossentropy')
history = model.fit([train, x2_in[:1200]], outputs[:1200], validation_split=0.1, epochs = 100)

In [None]:
print(history.history.keys())
# "Loss"
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# generate target given source sequence
# Inverse of the tokenizer vocabulary: token index -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(word) for word in list_of_indices if word]
    return(words)
def predict_sequence(infenc, infdec, source, n_steps, cardinality):
    """Decode a caption for ``source`` one step at a time.

    Args:
        infenc: Inference encoder; ``predict(source)`` must return the list
            of initial decoder states.
        infdec: Inference decoder; ``predict([target_seq] + state)`` must
            return ``(yhat, h, c)``.
        source: Encoder input for a single sample.
        n_steps: Number of decoding steps to run.
        cardinality: Length of the decoder input vector at each step.

    Returns:
        The decoded words joined by single spaces.
    """
    # The encoder provides the initial decoder state.
    decoder_state = infenc.predict(source)
    # The first decoder input is an all-zero vector of shape (1, 1, cardinality).
    decoder_input = np.zeros(cardinality).reshape(1, 1, cardinality)
    step_outputs = []
    for _ in range(n_steps):
        probs, state_h, state_c = infdec.predict([decoder_input] + decoder_state)
        step_outputs.append(probs[0, 0, :])
        # Carry the new state forward and feed the raw prediction back in as
        # the next decoder input.
        decoder_state = [state_h, state_c]
        decoder_input = probs

    # Pick the highest-scoring token index at every step.
    token_ids = np.array(step_outputs).argmax(axis=1)
    return ' '.join(sequence_to_text(token_ids))

In [None]:
train[0:1].shape

In [None]:
for i in range(20):
    print("Predicted:", predict_sequence(enc, dec, train[i:i+1], max_length, vocab_size))
    print("Actual:", descriptions[i])
    print()

In [None]:
idx = 100
view_frames('dataset/msvd_videos/frames/'+videos_selected[idx])
print("Predicted:", predict_sequence(enc, dec, train[idx:idx+1], max_length, vocab_size))
print("Actual:", descriptions[idx])
print()