In [0]:
!kill -9 -1

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!pip install tensorflow
!pip install ipdb

Collecting ipdb
  Downloading https://files.pythonhosted.org/packages/6d/43/c3c2e866a8803e196d6209595020a4a6db1a3c5d07c01455669497ae23d0/ipdb-0.12.tar.gz
Building wheels for collected packages: ipdb
  Building wheel for ipdb (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/59/24/91/695211bd228d40fb22dff0ce3f05ba41ab724ab771736233f3
Successfully built ipdb
Installing collected packages: ipdb
Successfully installed ipdb-0.12


In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os


import ipdb
import time
import cv2
import matplotlib.pyplot as plt

from tensorflow.contrib import rnn 
from keras.preprocessing import sequence

Using TensorFlow backend.


In [2]:
tf.VERSION


'1.13.1'

In [0]:
class Video_Caption_Generator():
    def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_lstm_steps, n_video_lstm_step, n_caption_lstm_step, bias_init_vector=None):
        self.dim_image = dim_image
        self.n_words = n_words
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_video_lstm_step=n_video_lstm_step
        self.n_caption_lstm_step=n_caption_lstm_step

        with tf.device("/device:GPU:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')
        #self.bemb = tf.Variable(tf.zeros([dim_hidden]), name='bemb')

        self.lstm1 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False)
        self.lstm2 = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden, state_is_tuple=False)

        self.encode_image_W = tf.Variable( tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_image_W')
        self.encode_image_b = tf.Variable( tf.zeros([dim_hidden]), name='encode_image_b')

        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1,0.1), name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')

    def build_model(self):
        video = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_video_lstm_step])

        caption = tf.placeholder(tf.int32, [self.batch_size, self.n_caption_lstm_step+1])
        caption_mask = tf.placeholder(tf.float32, [self.batch_size, self.n_caption_lstm_step+1])

        video_flat = tf.reshape(video, [-1, self.dim_image])
        image_emb = tf.nn.xw_plus_b( video_flat, self.encode_image_W, self.encode_image_b ) # (batch_size*n_lstm_steps, dim_hidden)
        image_emb = tf.reshape(image_emb, [self.batch_size, self.n_lstm_steps, self.dim_hidden])

        state1 = self.lstm1.zero_state(self.batch_size, tf.float32)
        state2 = self.lstm2.zero_state(self.batch_size, tf.float32)
        #state1 = tf.zeros([self.batch_size, self.lstm1.state_size])
        #state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
        padding = tf.zeros([self.batch_size, self.dim_hidden])

        probs = []
        loss = 0.0

        ##############################  Encoding Stage ##################################
        for i in range(0, self.n_video_lstm_step):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(image_emb[:,i,:], state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat([padding, output1], 1), state2)

        ############################# Decoding Stage ######################################
        for i in range(0, self.n_caption_lstm_step): ## Phase 2 => only generate captions
            #if i == 0:
            #    current_embed = tf.zeros([self.batch_size, self.dim_hidden])
            #else:
            with tf.device("/device:GPU:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb, caption[:, i])

            tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(padding, state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat([current_embed, output1], 1), state2)

            labels = tf.expand_dims(caption[:, i+1], 1)
            indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
            concated = tf.concat( [indices, labels], 1)
            onehot_labels = tf.sparse_to_dense(concated, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)

            logit_words = tf.nn.xw_plus_b(output2, self.embed_word_W, self.embed_word_b)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=onehot_labels)
            cross_entropy = cross_entropy * caption_mask[:,i]
            probs.append(logit_words)

            current_loss = tf.reduce_sum(cross_entropy)/self.batch_size
            loss = loss + current_loss

        return loss, video, video_mask, caption, caption_mask, probs


    def build_generator(self):
        video = tf.placeholder(tf.float32, [1, self.n_video_lstm_step, self.dim_image])
        video_mask = tf.placeholder(tf.float32, [1, self.n_video_lstm_step])

        video_flat = tf.reshape(video, [-1, self.dim_image])
        image_emb = tf.nn.xw_plus_b(video_flat, self.encode_image_W, self.encode_image_b)
        image_emb = tf.reshape(image_emb, [1, self.n_video_lstm_step, self.dim_hidden])

        state1 = tf.zeros([1, self.lstm1.state_size])
        state2 = tf.zeros([1, self.lstm2.state_size])
        padding = tf.zeros([1, self.dim_hidden])

        generated_words = []

        probs = []
        embeds = []

        for i in range(0, self.n_video_lstm_step):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(image_emb[:, i, :], state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat( [padding, output1], 1), state2)

        for i in range(0, self.n_caption_lstm_step):
            tf.get_variable_scope().reuse_variables()

            if i == 0:
                with tf.device('/device:GPU:0'):
                    current_embed = tf.nn.embedding_lookup(self.Wemb, tf.ones([1], dtype=tf.int64))

            with tf.variable_scope("LSTM1"):
                output1, state1 = self.lstm1(padding, state1)

            with tf.variable_scope("LSTM2"):
                output2, state2 = self.lstm2(tf.concat( [current_embed, output1], 1), state2)

            logit_words = tf.nn.xw_plus_b( output2, self.embed_word_W, self.embed_word_b)
            max_prob_index = tf.argmax(logit_words, 1)[0]
            generated_words.append(max_prob_index)
            probs.append(logit_words)

            with tf.device("/device:GPU:0"):
                current_embed = tf.nn.embedding_lookup(self.Wemb, max_prob_index)
                current_embed = tf.expand_dims(current_embed, 0)

            embeds.append(current_embed)

        return video, video_mask, generated_words, probs, embeds


In [0]:
#=====================================================================================
# Global Parameters
#=====================================================================================


video_path = '/content/gdrive/My Drive/videoToTexts2vt/youtube_videos'
video_data_path='/content/gdrive/My Drive/s2vt/video_corpus.csv'
video_feat_path = '/content/gdrive/My Drive/videoToTexts2vt/youtube_feats'

vgg16_path = './tensorflow_vgg16/vgg16.tfmodel'


model_path = './models/'

In [0]:
#=======================================================================================
# Train Parameters
#=======================================================================================
dim_image = 4096
dim_hidden= 1000

n_video_lstm_step = 80
n_caption_lstm_step = 20
n_frame_step = 80

n_epochs = 1010
batch_size = 50
learning_rate = 0.0001

In [0]:
def get_video_data(video_data_path, video_feat_path, train_ratio=0.9):
    video_data = pd.read_csv(video_data_path, sep=',')
    video_data = video_data[video_data['Language'] == 'English']
    video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(row['Start'])+'_'+str(row['End'])+'.avi.npy', axis=1)
    video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
    video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
    video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

    unique_filenames = video_data['video_path'].unique()
    train_len = int(len(unique_filenames)*train_ratio)

    train_vids = unique_filenames[:train_len]
    test_vids = unique_filenames[train_len:]

    train_data = video_data[video_data['video_path'].map(lambda x: x in train_vids)]
    test_data = video_data[video_data['video_path'].map(lambda x: x in test_vids)]

    return train_data, test_data


In [0]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # borrowed this function from NeuralTalk
    print ('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold))
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
           word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print ('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<pad>'
    ixtoword[1] = '<bos>'
    ixtoword[2] = '<eos>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<pad>'] = 0
    wordtoix['<bos>'] = 1
    wordtoix['<eos>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx+4
        ixtoword[idx+4] = w

    word_counts['<pad>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<eos>'] = nsents
    word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector

In [0]:
def train():
  
    train_data,test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
    train_captions = train_data['Description'].values
    #test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    test_captions = test_data['Description'].values
    

    captions_list = list(train_captions) + list(test_captions)
    captions = np.asarray(captions_list, dtype=np.object)

    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    captions = map(lambda x: x.replace('"', ''), captions)
    captions = map(lambda x: x.replace('\n', ''), captions)
    captions = map(lambda x: x.replace('?', ''), captions)
    captions = map(lambda x: x.replace('!', ''), captions)
    captions = map(lambda x: x.replace('\\', ''), captions)
    captions = map(lambda x: x.replace('/', ''), captions)

    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=0)
    
    np.save("/content/gdrive/My Drive/s2vt/wordtoix", wordtoix)
    np.save('/content/gdrive/My Drive/s2vt/ixtoword', ixtoword)
    np.save("/content/gdrive/My Drive/s2vt/bias_init_vector", bias_init_vector)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
    sess = tf.InteractiveSession()
    
    
    saver = tf.train.Saver(max_to_keep=100)
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE ) as vscope:
      train_op =  tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.global_variables_initializer().run()

    #new_saver = tf.train.Saver()
    #new_saver = tf.train.import_meta_graph('./rgb_models/model-1000.meta')
    #new_saver.restore(sess, tf.train.latest_checkpoint('./models/'))

    loss_fd = open('/content/gdrive/My Drive/s2vt/loss.txt', 'w')
    loss_to_draw = []
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(0, n_epochs):
        loss_to_draw_epoch = []

        index = list(train_data.index)
        np.random.shuffle(index)
        train_data = train_data.loc[index]

        current_train_data = train_data.groupby('video_path').apply(lambda x: x.iloc[np.random.choice(len(x))])
        current_train_data = current_train_data.reset_index(drop=True)
        
        for start, end in zip(
                range(0, len(current_train_data), batch_size),
                range(batch_size, len(current_train_data), batch_size)):

            start_time = time.time()

            current_batch = current_train_data[start:end]
            current_videos = current_batch['video_path'].values

            current_feats = np.zeros((batch_size, n_video_lstm_step, dim_image))
            current_feats_vals = list(map(lambda vid: np.load(vid), current_videos))

            current_video_masks = np.zeros((batch_size, n_video_lstm_step))

            for ind,feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat
                current_video_masks[ind][:len(current_feats_vals[ind])] = 1

            current_captions = current_batch['Description'].values
            current_captions = map(lambda x: '<bos> ' + x, current_captions)
            current_captions = map(lambda x: x.replace('.', ''), current_captions)
            current_captions = map(lambda x: x.replace(',', ''), current_captions)
            current_captions = map(lambda x: x.replace('"', ''), current_captions)
            current_captions = map(lambda x: x.replace('\n', ''), current_captions)
            current_captions = map(lambda x: x.replace('?', ''), current_captions)
            current_captions = map(lambda x: x.replace('!', ''), current_captions)
            current_captions = map(lambda x: x.replace('\\', ''), current_captions)
            current_captions = list(map(lambda x: x.replace('/', ''), current_captions))

            for idx, each_cap in enumerate(current_captions):
                word = each_cap.lower().split(' ')
                if len(word) < n_caption_lstm_step:
                    current_captions[idx] = current_captions[idx] + '<eos>'
                else:
                    new_word = ''
                    for i in range(n_caption_lstm_step-1):
                        new_word = new_word + word[i] + ' '
                    current_captions[idx] = new_word + '<eos>'
                #print(current_captions[idx])    

            current_caption_ind = []
            for cap in current_captions:
                current_word_ind = []
                for word in cap.lower().split(' '):
                    if word in wordtoix:
                        current_word_ind.append(wordtoix[word])
                    else:
                        current_word_ind.append(wordtoix['<unk>'])
                current_caption_ind.append(current_word_ind)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=n_caption_lstm_step)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix), 1] ) ] ).astype(int)
            current_caption_masks = np.zeros( (current_caption_matrix.shape[0], current_caption_matrix.shape[1]) )
            nonzeros = np.array(list( map(lambda x: (x != 0).sum() + 1, current_caption_matrix ) ))
            #lambda returns list, sum+1 on that list

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            probs_val = sess.run(tf_probs, feed_dict={
                tf_video:current_feats,
                tf_caption: current_caption_matrix
                })

            _, loss_val = sess.run(
                    [train_op, tf_loss],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask : current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks
                        })
            loss_to_draw_epoch.append(loss_val)

            print( 'idx: ', start, " Epoch: ", epoch, " loss: ", loss_val, ' Elapsed time: ', str((time.time() - start_time)))
            loss_fd.write('epoch ' + str(epoch) + ' loss ' + str(loss_val) + '\n')

       
        if np.mod(epoch, 50) == 0:
            print( "Epoch ", epoch, " is done. Saving the model ...")
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
             # draw loss curve every epoch
            loss_to_draw.append(np.mean(loss_to_draw_epoch))
            plt_save_dir = "/content/gdrive/My Drive/s2vt/loss_imgs"
            plt_save_img_name = str(epoch) + '.png'
            plt.plot(range(len(loss_to_draw)), loss_to_draw, color='g')
            plt.grid(True)
            plt.xlabel('predicted probability') 
            # naming the y axis 
            plt.ylabel('loss') 
            plt.savefig(os.path.join(plt_save_dir, plt_save_img_name))

            

    sess.close()



In [0]:

def main():
  train()

In [0]:
if __name__=="__main__":
    main()

preprocessing word counts and creating vocab based on word count threshold 0
filtered words from 11286 to 11286
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.
idx:  0  Epoch:  0  loss:  40.161854  Elapsed time:  3.942831516265869
idx:  50  Epoch:  0  loss:  37.26174  Elapsed time:  1.030397653579712
idx:  100  Epoch:  0  loss:  35.249405  Elapsed time:  0.9538838863372803
idx:  150  Epoch:  0  loss:  48.15485  Elapsed time:  0.9924674034118652
idx:  200  Epoch:  0  loss:  37.52187  Elapsed time:  0.9812

In [0]:
tf.InteractiveSession._active_session_count

0

In [0]:
tf.global_variables()

[<tf.Variable 'Wemb:0' shape=(2901, 1000) dtype=float32_ref>,
 <tf.Variable 'encode_image_W:0' shape=(4096, 1000) dtype=float32_ref>,
 <tf.Variable 'encode_image_b:0' shape=(1000,) dtype=float32_ref>,
 <tf.Variable 'embed_word_W:0' shape=(1000, 2901) dtype=float32_ref>,
 <tf.Variable 'embed_word_b:0' shape=(2901,) dtype=float32_ref>,
 <tf.Variable 'LSTM1/basic_lstm_cell/kernel:0' shape=(2000, 4000) dtype=float32_ref>,
 <tf.Variable 'LSTM1/basic_lstm_cell/bias:0' shape=(4000,) dtype=float32_ref>,
 <tf.Variable 'LSTM2/basic_lstm_cell/kernel:0' shape=(3000, 4000) dtype=float32_ref>,
 <tf.Variable 'LSTM2/basic_lstm_cell/bias:0' shape=(4000,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'Wemb/Adam:0' shape=(2901, 1000) dtype=float32_ref>,
 <tf.Variable 'Wemb/Adam_1:0' shape=(2901, 1000) dtype=float32_ref>,
 <tf.Variable 'encode_image_W/Adam:0' shape=(4096, 1000) dtype=float32_ref>,
 <t

In [0]:

def test(model_path='/content/gdrive/My Drive/s2vt/model-600'):
    #OF = open('/content/gdrive/My Drive/s2vt/out.txt', 'w')
  
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
    test_videos = test_data['video_path'].unique()

    ixtoword = pd.Series(np.load('/content/gdrive/My Drive/s2vt/ixtoword.npy',allow_pickle=True).tolist())

    bias_init_vector = np.load('/content/gdrive/My Drive/s2vt/bias_init_vector.npy')

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
     
    filename='/content/gdrive/My Drive/s2vt/out.txt'
    f = open(filename,'w') 
    
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    #test_output_txt_fd = open('S2VT_results.txt', 'w')
    for idx, videos_feat_path in enumerate(test_videos):
        print (idx, videos_feat_path)
        #OF.write(idx, videos_feat_path + "\n")
              
        video_feat = np.load(videos_feat_path)[None,...]
        #video_feat = np.load(videos_feat_path)
        #video_mask = np.ones((videos_feat.shape[0], video_feat.shape[1]))
        if video_feat.shape[1] == n_frame_step:
            video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
        else:
            continue
            #shape_templete = np.zeros(shape=(1, n_frame_step, 4096), dtype=float )
            #shape_templete[:video_feat.shape[0], :video_feat.shape[1], :video_feat.shape[2]] = video_feat
            #video_feat = shape_templete
            #video_mask = np.ones((video_feat.shape[0], n_frame_step))

        generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
        generated_words = ixtoword[generated_word_index]
        #print(generated_words)

        punctuation = np.argmax(np.array(generated_words) == '<unk>') 
        #print(punctuation)
        generated_words = generated_words[:punctuation]

        generated_sentence = ' '.join(generated_words)
        generated_sentence = generated_sentence.replace('<bos>', '')
        generated_sentence = generated_sentence.replace('<eos>', '')
        print("Generated this sentence")
        print (generated_sentence,'\n')
        #OF.write(generated_sentence )
        #OF.write("\n")
        #test_output_txt_fd.write(video_feat_path + '\n')
        #test_output_txt_fd.write(generated_sentence + '\n\n')

In [9]:
test()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/s2vt/model-600
0 /content/gdrive/My Drive/videoToTexts2vt/youtube_feats/HYbEf-JKZnA_13_26.avi.npy
Generated this sentence
a man is taking the 

1 /content/gdrive/My Drive/videoToTexts2vt/youtube_feats/hxZ-5wELSJM_0_12.avi.npy
Generated this sentence
a man is 

2 /content/gdrive/My Drive/videoToTexts2vt/youtube_feats/GnwKcpfr_ng_47_57.avi.npy
Generated this sentence
a man is riding a 

3 /content/gdrive/My Drive/videoToTexts2vt/youtube_feats/OCcy9TDRGKo_41_51.avi.npy
Generated this sentence
a person is cutting the 

4 /content/gdrive/My Drive/videoToTexts2vt/youtube_feats/klFyrnrUSck_42_46.avi.npy
Generated this sentence
a woman is slicin

In [0]:
ixtoword = pd.Series(np.load('/content/gdrive/My Drive/s2vt/ixtoword.npy',allow_pickle=True).tolist()) 
for words in ixtoword:
   print(words)

<pad>
<bos>
<eos>
<unk>
a
man
is
arranging
sushi
person
food
on
dish
making
hot
dog
piece
look
like
an
octopus
woman
arranges
cuts
squid
adding
eyes
to
squash
cut
into
ghost-like
pieces
some
and
decorating
it
preparing
plate
of
places
sesame
seeds
as
octopus-shaped
sausage
placed
small
white
tray
prepares
seed
are
added
someone
with
made
the
lady
garnishing
vegetables
garnish
bento
in
serving
putting
vegetable
showing
how
make
box
puts
placing
black
thing
her
put
two
sausages
platter
coking
somebody
video
plates
asian

adds
full
fruits
displayed
japanese
boxed
lunch
dazzle
zebras
grazing
grassland
flock
group
zebra
herd
standing
field
graze
savannah
stand
grassy
plain
pack
many
several
grazed
grasses
eating
grass
grassing
peacefully
gracing
jungle
were
found
together
lots
all
forest
walking
having
they
running
savanna
gazing
fields
very
cute
boy
does
flips
trampoline
doing
flipping
guy
back
different
children
jump
trampolines
kids
tricks
people
jumping
do
peoples
groups
shown
up
down
b