In [None]:
# https://www.kaggle.com/code/debakshii/a-deep-learning-based-approach-video-captioning/notebook


In [1]:
!pip install pycocoevalcap

Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [2]:
# !pip install pycocoevalcap
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

In [3]:
import os
import cv2
import numpy as np
import sys
import glob
import json
import h5py
from tqdm import tqdm
import pickle
import random
import itertools
from PIL import Image
import warnings
import re
import unicodedata
import math
import time
import matplotlib.pyplot as plt
import copy

import torch
import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
from torch.nn import functional as F
from torch import optim

In [4]:
# Colab-only step: mount Google Drive at /content/drive. The MSVD paths used
# further down in this notebook all live under /content/drive/MyDrive/MSVD.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [48]:
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Output locations on the mounted drive; both names are read as globals by
# later cells (Path and Vocabulary.save).
saved_path = '/content/drive/MyDrive/MSVD/Saved'
prediction_path = '/content/drive/MyDrive/MSVD/results'

# Checkpoint / vocabulary directory.
if os.path.exists(saved_path):
    print("Directory 'Saved' already exists.")
else:
    os.makedirs(saved_path)
    print("Directory 'Saved' created successfully.")

# Prediction output directory.
if os.path.exists(prediction_path):
    print("Directory 'results' already exists.")
else:
    os.makedirs(prediction_path)
    print("Directory 'results' created successfully.")


Directory 'Saved' already exists.
Directory 'results' already exists.


In [49]:
class Path:
    '''
    Bundle of file-system locations used by the pipeline.

    Arguments:
      cfg : configuration object; `cfg.dataset` selects the dataset layout and
            `cfg.appearance_feature_extractor` selects the feature file.
      working_path : base directory joined with 'MSVD' to form `local_path`.

    `prediction_path` and `saved_models_path` are always set and are copied from
    the module-level globals `prediction_path` and `saved_path`. All remaining
    attributes are only set when `cfg.dataset == 'msvd'`.
    '''
    def __init__(self,cfg,working_path):
        # Dataset-independent output locations (taken from notebook globals).
        self.prediction_path = prediction_path
        self.saved_models_path = saved_path

        # Only the MSVD layout is known here; other datasets get no extra attributes.
        if cfg.dataset != 'msvd':
            return

        caption_dir = '/content/drive/MyDrive/MSVD/video_caption'
        feature_dir = '/content/drive/MyDrive/MSVD/feature'

        self.local_path = os.path.join(working_path,'MSVD')
        self.video_path = '/content/drive/MyDrive/MSVD/YouTubeClips' # For future use
        self.caption_path = caption_dir
        self.feature_path = feature_dir

        # Caption-side files all live in the caption directory.
        self.name_mapping_file = os.path.join(caption_dir,'youtube_mapping.txt')
        self.train_annotation_file = os.path.join(caption_dir,'sents_train_lc_nopunc.txt')
        self.val_annotation_file = os.path.join(caption_dir,'sents_val_lc_nopunc.txt')
        self.test_annotation_file = os.path.join(caption_dir,'sents_test_lc_nopunc.txt')

        # The appearance feature file is only defined for this one extractor.
        if cfg.appearance_feature_extractor == 'inceptionresnetv2':
            self.appearance_feature_file = os.path.join(feature_dir,'MSVD_APPEARANCE_INCEPTIONRESNETV2_28.hdf5')


In [50]:
class ConfigSALSTM:
    '''
    Hyperparameter container for the Soft Attention based LSTM (SA-LSTM) model.

    Arguments:
      model_name : stored as `self.model_name`.
      opt_encoder : when True the decoder feature size is the projected
                    appearance size, otherwise the raw appearance input size.
    '''
    def __init__(self,model_name='sa-lstm',opt_encoder=False):
        self.model_name = model_name

        # ---- Device: CUDA device 0 when available, CPU otherwise ----
        self.cuda_device_id = 0
        device_name = 'cuda:'+str(self.cuda_device_id) if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)

        # ---- Dataloader ----
        self.dataset = 'msvd'
        self.batch_size = 100 # suitable
        self.val_batch_size = 10
        self.opt_truncate_caption = True
        self.max_caption_length = 30

        # ---- Encoder ----
        self.appearance_feature_extractor = 'inceptionresnetv2'
        self.appearance_input_size = 1536
        self.appearance_projected_size = 512
        self.frame_len = 28
        self.opt_encoder = opt_encoder

        # ---- Decoder ----
        self.decoder_type = 'lstm'
        self.embedding_size = 468 # word embedding size
        # Feature width seen by the decoder depends on whether features are projected first.
        self.feat_size = (self.appearance_projected_size if self.opt_encoder
                          else self.appearance_input_size)
        # The decoder RNN input is the attended feature concatenated with the word embedding.
        self.decoder_input_size = self.feat_size + self.embedding_size
        self.decoder_hidden_size = 512  # Hidden size of decoder LSTM
        self.attn_size = 128  # attention bottleneck
        self.n_layers = 1
        self.embed_dropout = 0.5
        self.rnn_dropout = 0.4
        self.opt_param_init = False
        self.beam_length = 5

        # ---- Training ----
        self.encoder_lr = 1e-4
        self.decoder_lr = 1e-4
        self.teacher_forcing_ratio = 1.0
        self.clip = 5 # clip the gradient to counter exploding gradient problem
        self.print_every = 40

        self.lr_decay_start_from = 20
        self.lr_decay_gamma = 0.5
        self.lr_decay_patience = 5
        self.weight_decay = 1e-5
        self.reg_lambda = 0.

        # ---- Vocabulary: indices of the special tokens and the rare-word cutoff ----
        self.PAD_token = 0
        self.SOS_token = 1
        self.EOS_token = 2
        self.UNK_token = 3
        self.vocabulary_min_count = 5


In [51]:
def collate_fn(batch):
    '''
    Custom collate function for supporting batching during training and inference.

    Each element of `batch` is indexed as
    (appearance_tensor, caption_indices, video_id, motion_tensor, object_tensor).

    Returns a 7-tuple:
      images         : appearance tensors stacked along a new first dimension
      padVar         : LongTensor (max_target_len, batch) of caption indices,
                       time-major, padded with 0
      m              : int64 tensor shaped like padVar, 1 where padVar != 0 else 0
      max_target_len : length of the longest caption in the batch
      ides           : list of video ids, in batch order
      motion_batch   : stacked motion tensors
      object_batch   : stacked object tensors
    '''
    images = torch.stack([item[0] for item in batch], 0)
    label = [item[1] for item in batch]
    ides = [item[2] for item in batch]
    motion_batch = torch.stack([item[3] for item in batch], 0)
    object_batch = torch.stack([item[4] for item in batch], 0)

    max_target_len = max(len(indexes) for indexes in label)

    # zip_longest transposes the captions to time-major order and pads the
    # shorter ones with 0 in a single pass.
    padVar = torch.LongTensor(list(itertools.zip_longest(*label, fillvalue=0)))

    # Vectorised mask: same values and dtype (int64) as building it element by
    # element, without a Python loop over every token.
    m = (padVar != 0).long()

    return images, padVar, m, max_target_len, ides, motion_batch, object_batch


In [52]:
class CustomDataset(Dataset):
    '''
    Map-style dataset yielding one (features, caption) sample per video.

    Arguments:
      cfg : configuration; `max_caption_length` and `opt_truncate_caption` are read.
      appearance_feature_dict : video name -> appearance feature array.
      annotation_dict : video name -> list of caption strings.
      video_name_list : ordered list of video names; defines length and indexing.
      voc : vocabulary exposing `word2index` and `cfg.EOS_token`.
      motion_feature_dict / object_feature_dict : optional video name -> feature
          array; when None a zero tensor shaped like the appearance tensor is used.
    '''
    def __init__(self, cfg, appearance_feature_dict, annotation_dict, video_name_list, voc, motion_feature_dict=None, object_feature_dict=None):
        self.annotation_dict = annotation_dict
        self.appearance_feature_dict = appearance_feature_dict
        self.v_name_list = video_name_list
        self.voc = voc
        self.max_caption_length = cfg.max_caption_length
        self.motion_feature_dict = motion_feature_dict
        self.object_feature_dict = object_feature_dict
        self.opt_truncate_caption = cfg.opt_truncate_caption

    def __len__(self):
        return len(self.v_name_list)

    def __getitem__(self, idx):
        '''
        Returns (appearance_tensor, caption_indices, video_name, motion_tensor,
        object_tensor). One reference caption is drawn at random on every call.
        '''
        video_name = self.v_name_list[idx]
        anno = random.choice(self.annotation_dict[video_name])

        # Words missing from the vocabulary are dropped. An explicit membership
        # test replaces the former bare `except: pass`, which would also have
        # hidden unrelated errors.
        word2index = self.voc.word2index
        anno_index = [word2index[word] for word in anno.split(' ') if word in word2index]

        # Truncation happens before EOS is appended, so a caption can be at most
        # max_caption_length + 1 tokens long.
        if self.opt_truncate_caption and len(anno_index) > self.max_caption_length:
            anno_index = anno_index[:self.max_caption_length]
        anno_index = anno_index + [self.voc.cfg.EOS_token]

        appearance_tensor = torch.tensor(self.appearance_feature_dict[video_name]).float()

        if self.motion_feature_dict is None:
            motion_tensor = torch.zeros_like(appearance_tensor)
        else:
            motion_tensor = torch.tensor(self.motion_feature_dict[video_name]).float()
        if self.object_feature_dict is None:
            object_tensor = torch.zeros_like(appearance_tensor)
        else:
            object_tensor = torch.tensor(self.object_feature_dict[video_name]).float()

        return appearance_tensor, anno_index, video_name, motion_tensor, object_tensor


In [53]:
class DataHandler:
    '''
    Loads reference captions and pre-extracted features and builds the
    datasets / dataloaders for the train, val and test splits.

    Arguments:
      cfg : configuration (dataset, model_name, frame_len, batch sizes are read).
      path : object exposing the annotation and feature file locations.
      voc : vocabulary forwarded to every CustomDataset.

    Raises:
      ValueError : if `cfg.dataset` is not 'msvd' (the only layout handled here).
    '''
    def __init__(self, cfg, path, voc):
        self.voc = voc
        self.cfg = cfg
        self.path = path
        self.appearance_feature_dict = {}

        if cfg.dataset == 'msvd':  # For MSVD dataset
            self._msvd_create_dict() # Reference caption dictionaries
            self.appearance_feature_dict = self._read_feature_file(feature_type='appearance')
        else:
            # Without this the constructor failed a few lines later with an
            # AttributeError on the missing caption dictionaries.
            raise ValueError("Unsupported dataset: {!r} (only 'msvd' is handled)".format(cfg.dataset))

        self.train_name_list = list(self.train_dict.keys())
        self.val_name_list = list(self.val_dict.keys())
        self.test_name_list = list(self.test_dict.keys())

    def _read_feature_file(self, feature_type='appearance'):
        '''
        Read one HDF5 feature file into a dict keyed by the file's dataset names.

        feature_type : 'appearance', 'motion', or anything else (treated as object).

        For 'sa-lstm' / 'recnet' each array is zero-padded along its first axis up
        to cfg.frame_len; for 'mean_pooling' each array is averaged over its first
        axis. Any other model name yields an empty dict.
        '''
        if feature_type == 'appearance':
            feature_file = self.path.appearance_feature_file
        elif feature_type == 'motion':
            feature_file = self.path.motion_feature_file
        else:
            feature_file = self.path.object_feature_file

        feature_dict = {}
        # Read-only access is sufficient, and the context manager guarantees the
        # handle is closed (it used to be left open for the life of the kernel).
        with h5py.File(feature_file, 'r') as f1:
            if self.cfg.model_name == 'sa-lstm' or self.cfg.model_name == 'recnet':
                for key in f1.keys():
                    arr = f1[key][()]
                    if arr.shape[0] < self.cfg.frame_len:
                        pad = self.cfg.frame_len - arr.shape[0]
                        arr = np.concatenate((arr, np.zeros((pad, arr.shape[1]))), axis=0)
                    feature_dict[key] = arr

            if self.cfg.model_name == 'mean_pooling':
                for key in f1.keys():
                    # `Dataset.value` no longer exists in h5py 3.x; `[()]` reads the
                    # whole dataset in every version.
                    feature_dict[key] = f1[key][()].mean(axis=0)

        return feature_dict

    def _file_to_dict(self, path):
        '''
        Parse an annotation file whose lines are "<video_name> <caption words...>".

        Returns a dict mapping video name -> list of caption strings, in file
        order. Blank lines are skipped.
        '''
        dic = dict()
        with open(path, 'r') as fil:
            for line in fil:
                tokens = line.split()
                if not tokens:
                    # A blank line used to raise IndexError on tokens[0].
                    continue
                dic.setdefault(tokens[0], []).append(' '.join(tokens[1:]))
        return dic

    def _msvd_create_dict(self):
        '''Load the train / val / test reference-caption dictionaries.'''
        self.train_dict = self._file_to_dict(self.path.train_annotation_file)
        self.val_dict = self._file_to_dict(self.path.val_annotation_file)
        self.test_dict = self._file_to_dict(self.path.test_annotation_file)

    def getDatasets(self):
        '''
        Build the (train, val, test) CustomDataset triple.

        Raises:
          ValueError : if cfg.model_name is not one of the supported models.
        '''
        if self.cfg.model_name not in ['mean_pooling', 'sa-lstm', 'recnet']:
            # Previously this fell through to an UnboundLocalError at the return.
            raise ValueError('Unsupported model_name: {!r}'.format(self.cfg.model_name))

        train_dset = CustomDataset(self.cfg, self.appearance_feature_dict, self.train_dict, self.train_name_list, self.voc)
        val_dset = CustomDataset(self.cfg, self.appearance_feature_dict, self.val_dict, self.val_name_list, self.voc)
        test_dset = CustomDataset(self.cfg, self.appearance_feature_dict, self.test_dict, self.test_name_list, self.voc)

        return train_dset, val_dset, test_dset

    def getDataloader(self, train_dset, val_dset, test_dset):
        '''
        Wrap the three datasets in DataLoaders.

        The train loader shuffles and drops the last incomplete batch; the val and
        test loaders keep order and keep every sample.
        '''
        # Honour the configured evaluation batch size; 10 is the value that was
        # previously hard-coded and remains the fallback.
        eval_batch_size = getattr(self.cfg, 'val_batch_size', 10)

        train_loader = DataLoader(train_dset, batch_size=self.cfg.batch_size, num_workers=8, shuffle=True, collate_fn=collate_fn, drop_last=True)
        val_loader = DataLoader(val_dset, batch_size=eval_batch_size, num_workers=8, shuffle=False, collate_fn=collate_fn, drop_last=False)
        test_loader = DataLoader(test_dset, batch_size=eval_batch_size, num_workers=8, shuffle=False, collate_fn=collate_fn, drop_last=False)

        return train_loader, val_loader, test_loader


In [54]:
class Utils:
    '''
    Generic utility functions that our model and dataloader would require

    '''

    @staticmethod
    def set_seed(seed):
        '''
        Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) random
        number generators for reproducibility.
        '''
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    @staticmethod
    def unicodeToAscii(s):
        '''
        Strip combining marks (Unicode category 'Mn') from the NFD-normalised
        string, e.g. accented letters lose their accents.
        Thanks to https://stackoverflow.com/a/518232/2809427
        '''
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    @staticmethod
    def normalizeString(s):
        '''
        Lowercase and trim, strip accents, put a space before '.', '!' and '?',
        and collapse every run of other non-letter characters into one space.
        '''
        # Must be qualified with the class: a bare `unicodeToAscii` is not in
        # scope inside a static method and raised NameError.
        s = Utils.unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

    @staticmethod
    def target_tensor_to_caption(voc,target):
        '''
        Convert a 2-D tensor of word indices into caption strings, one per
        column, by looking every index up in `voc.index2word`.
        '''
        gnd_trh = []
        lend = target.size()[1]
        for i in range(lend):
            tmp = ' '.join(voc.index2word[x.item()] for x in target[:,i])
            gnd_trh.append(tmp)
        return gnd_trh

    @staticmethod
    def maskNLLLoss(inp, target, mask, device):
        '''
        Masked negative log-likelihood for one decoding step;
        refers: https://pytorch.org/tutorials/beginner/chatbot_tutorial.html

        inp    : probabilities (not logits), shaped (batch, vocab) or (1, batch, vocab)
        target : (batch,) indices of the correct words
        mask   : (batch,) non-zero where the position contributes to the loss
        device : device the returned loss is moved to

        Returns (mean loss over the unmasked positions, number of unmasked positions).
        '''
        # Flatten to (batch, vocab). Squeezing dim 0 (as was done before, twice)
        # also removed the batch dimension when batch == 1 and broke the gather.
        inp = inp.reshape(-1, inp.size(-1))
        nTotal = mask.sum()
        mask = mask.bool() #Updated
        crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1).float())
        loss = crossEntropy.masked_select(mask).mean()
        loss = loss.to(device)
        return loss, nTotal.item()

    @staticmethod
    def score(ref, hypo):
        """
        ref, dictionary of reference sentences (id, sentence)
        hypo, dictionary of hypothesis sentences (id, sentence)
        score, dictionary of scores
        refers: https://github.com/zhegan27/SCN_for_video_captioning/blob/master/SCN_evaluation.py
        """
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        final_scores = {}
        for scorer, method in scorers:
            score, _ = scorer.compute_score(ref, hypo)
            # A scorer that returns a list yields one value per name in `method`.
            if isinstance(score, list):
                for m, s in zip(method, score):
                    final_scores[m] = s
            else:
                final_scores[method] = score
        return final_scores

    @staticmethod
    def FrameCapture(video_path, video_name):
        '''
        Decode a video and keep every 10th frame (converted to RGB).

        video_path and video_name are concatenated as plain strings, so
        video_path has to end with a path separator.

        Returns (frames, count - 1, fail): the sampled frames stacked with the
        last sampled frame left out, the number of successfully converted frames
        minus one, and the number of failed conversions. When more than 80 frames
        were converted, the sampled list is capped at 81 entries before stacking.
        '''

        #video_path = video_path_dict[video_name]
        # Path to video file
        video_path = video_path+video_name  #Changes
        vidObj = cv2.VideoCapture(video_path)
        count = 0
        fail = 0
        # checks whether frames were extracted
        success = 1
        frames = []
        while success:
            # OpenCV Uses BGR Colormap
            success, image = vidObj.read()
            try:
                RGBimage = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                if count%10 == 0:            #Sample 1 frame per 10 frames
                    frames.append(RGBimage)
                count += 1
            except Exception:
                # The end of the stream shows up here: read() hands back no image
                # and the conversion raises. `Exception` keeps Ctrl-C working,
                # which a bare `except` did not.
                fail += 1
        vidObj.release()
        if count > 80:
            frames = frames[:81]
        return np.stack(frames[:-1]),count-1, fail

    @staticmethod
    def imshow(img):
        '''
        Show an image tensor laid out as (channels, height, width).
        '''
        #img = img / 2 + 0.5     # unnormalize
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))

In [55]:
class Vocabulary:
    '''
    Word <-> index mapping with word counts.

    Indices of the four special tokens (PAD, EOS, SOS, UNK) are taken from `cfg`;
    regular words are numbered from 4 upwards in insertion order.
    '''

    def __init__(self, cfg):
        self.name = cfg.dataset
        self.cfg = cfg
        self.trimmed = False
        self.word2index = {"PAD":cfg.PAD_token,"EOS":cfg.EOS_token,"SOS":cfg.SOS_token, "UNK":cfg.UNK_token}
        self.word2count = {}
        self.index2word = {cfg.PAD_token:"PAD",cfg.EOS_token:"EOS",cfg.SOS_token:"SOS", cfg.UNK_token:"UNK"}
        self.num_words = 4

    def addSentence(self,sentence): #Add Sentence to vocabulary
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):  # Add words to vocabulary
        # Counts are only maintained until trim() has run.
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            if not self.trimmed:
                self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        elif not self.trimmed:
            self.word2count[word] += 1

    def save(self,word2index_dic = 'word2index_dic.p', index2word_dic = 'index2word_dic.p',
         word2count_dic = 'word2count_dic.p'):
        '''
        Pickle the three dictionaries into the directory named by the global
        `saved_path`, each file name prefixed with "<dataset name>_".
        Failures are reported with a printed message instead of being raised.
        '''

        # Create the "Saved" directory if it doesn't exist
        saved_dir = saved_path
        os.makedirs(saved_dir, exist_ok=True)

        w2i = os.path.join(saved_dir, self.name + '_' + word2index_dic)
        i2w = os.path.join(saved_dir, self.name + '_' + index2word_dic)
        w2c = os.path.join(saved_dir, self.name + '_' + word2count_dic)
        try:
            for target, payload in ((w2i, self.word2index), (i2w, self.index2word), (w2c, self.word2count)):
                with open(target, 'wb') as fp:
                    pickle.dump(payload, fp, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
            print('Path Error, Verify the path of the filename is correct')

    def load(self, word2index_dic = 'word2index_dic.p', index2word_dic = 'index2word_dic.p',
             word2count_dic = 'word2count_dic.p'):
        '''
        Restore the three dictionaries written by save().

        Files are looked up in `saved_path` (the directory save() writes to) and,
        if not present there, in the relative 'Saved' directory that this method
        used exclusively before. Failures are reported with a printed message and
        leave the current vocabulary untouched.
        '''
        # save() writes under `saved_path`; reading only from relative 'Saved'
        # meant a freshly saved vocabulary could not be loaded back.
        load_dir = saved_path
        if not os.path.exists(os.path.join(load_dir, self.name+'_'+word2index_dic)):
            load_dir = 'Saved'

        w2i = os.path.join(load_dir,self.name+'_'+word2index_dic)
        i2w = os.path.join(load_dir,self.name+'_'+index2word_dic)
        w2c = os.path.join(load_dir,self.name+'_'+word2count_dic)

        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only load vocabulary files that this notebook produced.
            with open(w2i, 'rb') as fp:
                word2index = pickle.load(fp)

            with open(i2w, 'rb') as fp:
                index2word = pickle.load(fp)

            with open(w2c, 'rb') as fp:
                word2count = pickle.load(fp)

        except Exception:
            print('File loading error.. check the path or filename is correct')
            return

        # Assign only after all three files were read, so a failure half-way
        # cannot leave the dictionaries out of sync with each other.
        self.word2index = word2index
        self.index2word = index2word
        self.word2count = word2count
        self.num_words = len(self.word2index)


    def trim(self, min_count):  # Trim Rare words with frequency less than min_count
        '''
        Rebuild the index dictionaries keeping only words seen at least
        `min_count` times. Can only be done once; a second call prints a message
        and returns 0. `word2count` is left as it was.
        '''
        if self.trimmed:
            print('Already trimmed before')
            return 0
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {"PAD":self.cfg.PAD_token,"EOS":self.cfg.EOS_token,"SOS":self.cfg.SOS_token, "UNK":self.cfg.UNK_token}
        #self.word2count = {}
        self.index2word = {self.cfg.PAD_token:"PAD",self.cfg.EOS_token:"EOS",self.cfg.SOS_token:"SOS", self.cfg.UNK_token:"UNK"}
        self.num_words = 4

        # Every kept word comes from word2count, so the former
        # `if word not in self.word2count: del ...` branch could never run (and
        # would have raised KeyError if it had); it has been removed.
        for word in keep_words:
            self.addWord(word)


In [56]:
class Encoder(nn.Module):
    '''
    Encoder module: a single linear layer that projects the video feature into
    a different space before it is sent to the decoder.

    Arguments (read from cfg):
      appearance_input_size : size of the incoming feature vectors.
      appearance_projected_size : dimension of the projected space.
    '''

    def __init__(self,cfg):
        super().__init__()
        in_features = cfg.appearance_input_size
        out_features = cfg.appearance_projected_size
        self.appearance_projection_layer = nn.Linear(in_features, out_features)

    def forward(self,x):
        # nn.Linear acts on the last dimension; leading dimensions pass through.
        return self.appearance_projection_layer(x)


class TemporalAttention(nn.Module):
    '''
    Additive ("concat") attention over the positions of an encoder feature
    sequence, conditioned on the decoder hidden state.

        score(h, v_i) = w^T tanh(W h + U v_i)
        alpha         = softmax_i(score(h, v_i))
        context       = sum_i alpha_i * v_i

    Arguments (read from cfg):
      decoder_hidden_size : size of the decoder hidden state h.
      feat_size : size of each encoder feature vector v_i.
      attn_size : bottleneck size the two projections map into.
    '''
    def __init__(self,cfg):
        super().__init__()

        self.hidden_size = cfg.decoder_hidden_size
        self.feat_size = cfg.feat_size
        self.bottleneck_size = cfg.attn_size

        # Layers are created in this order so parameter initialisation consumes
        # the random stream in the same sequence.
        self.decoder_projection = nn.Linear(self.hidden_size,self.bottleneck_size)
        self.encoder_projection = nn.Linear(self.feat_size, self.bottleneck_size)
        self.final_projection = nn.Linear(self.bottleneck_size,1)

    def forward(self,hidden,feats):
        '''
        hidden : (batch, hidden_size)
        feats  : (batch, positions, feat_size)

        Returns (attn_feats, weights): the attention-weighted sum of feats,
        shaped (batch, feat_size), and the weights, shaped (batch, positions, 1),
        which are normalised over the positions dimension.
        '''
        query = self.decoder_projection(hidden).unsqueeze(1)   # (batch, 1, bottleneck)
        keys = self.encoder_projection(feats)                  # (batch, positions, bottleneck)
        # The query broadcasts across positions.
        energies = self.final_projection(torch.tanh(query + keys))
        weights = F.softmax(energies, dim=1)
        # The trailing singleton of weights broadcasts across the feature dimension.
        attn_feats = (feats * weights).sum(dim=1)
        return attn_feats,weights


class DecoderRNN(nn.Module):
    '''
    Decoder, basically a language model with attention over the video features.

    Arguments:
      cfg : configuration. Read here: frame_len, attn_size, rnn_dropout,
            n_layers, decoder_type, embedding_size, decoder_input_size,
            decoder_hidden_size and the embedding dropout rate (`cfg.dropout`
            if present, otherwise `cfg.embed_dropout`).
      voc : vocabulary; `voc.num_words` is both the embedding table size and
            the output size.
    '''

    def __init__(self,cfg,voc):
        super(DecoderRNN, self).__init__()

        # The configuration class names the embedding dropout `embed_dropout`;
        # reading `cfg.dropout` unconditionally raised AttributeError. A
        # `dropout` attribute, when one exists, still takes precedence.
        embed_dropout = getattr(cfg, 'dropout', None)
        if embed_dropout is None:
            embed_dropout = cfg.embed_dropout

        # Keep for reference
        self.dropout = embed_dropout
        self.feat_len = cfg.frame_len
        self.attn_size = cfg.attn_size
        self.output_size = voc.num_words
        self.rnn_dropout = cfg.rnn_dropout
        self.n_layers = cfg.n_layers
        self.decoder_type = cfg.decoder_type

        # PyTorch only applies RNN dropout between stacked layers and warns when a
        # non-zero value is given to a single-layer RNN, so pass 0 in that case.
        inter_layer_dropout = self.rnn_dropout if self.n_layers > 1 else 0

        # Define layers
        self.embedding = nn.Embedding(voc.num_words, cfg.embedding_size)
        self.attention = TemporalAttention(cfg)
        # NOTE(review): this layer is created but forward() never applies it —
        # confirm whether the embeddings are meant to go through dropout.
        self.embedding_dropout = nn.Dropout(embed_dropout)
        if self.decoder_type == 'gru':
            self.rnn = nn.GRU(input_size=cfg.decoder_input_size, hidden_size=cfg.decoder_hidden_size,
                              num_layers=self.n_layers, dropout=inter_layer_dropout)
        else:
            self.rnn = nn.LSTM(input_size=cfg.decoder_input_size, hidden_size=cfg.decoder_hidden_size,
                           num_layers=self.n_layers, dropout=inter_layer_dropout)
        self.out = nn.Linear(cfg.decoder_hidden_size, self.output_size)


    def forward(self, inputs, hidden, feats):
        '''
        we run this one step (word) at a time

        inputs -  (1, batch)
        hidden - h_n/c_n :(num_layers * num_directions, batch, hidden_size)    # GRU:h_n   LSTM:(h_n,c_n)
        feats - (batch,attention_length,annotation_vector_size)

        Returns (output, hidden, attn_weights): word probabilities of shape
        (batch, num_words) — softmax has already been applied, these are not
        logits — the updated recurrent state, and the attention weights.
        '''
        embedded = self.embedding(inputs) # [i/p:(1,batch)  o/p:(1,batch,embedding_size)]
        # The attention query is the hidden state of the top RNN layer.
        last_hidden = hidden[0] if self.decoder_type=='lstm' else hidden
        last_hidden = last_hidden[-1]
        feats, attn_weights = self.attention(last_hidden,feats)
        input_combined = torch.cat((embedded,feats.unsqueeze(0)),dim=2)
        output, hidden = self.rnn(input_combined, hidden) # (1,batch,hidden_size)
        output = output.squeeze(0) # (batch,hidden_size)
        output = self.out(output) # (batch,num_words)
        output = F.softmax(output, dim = 1) # (batch,num_words)
        return output, hidden, attn_weights

class SALSTM(nn.Module):
    '''
    Soft-attention LSTM captioner: an optional linear Encoder plus an attention
    DecoderRNN, each with its own Adam optimizer, together with the training
    and greedy-decoding loops.

    Arguments:
      voc : vocabulary (`index2word` is used when decoding).
      cfg : configuration object with the hyperparameters.
      path : path bundle; stored but not used inside this class.
    '''

    def __init__(self,voc,cfg,path):
        super(SALSTM,self).__init__()

        self.voc = voc
        self.path = path
        self.cfg = cfg

        # The encoder (and its optimizer) only exist when cfg.opt_encoder is set.
        if cfg.opt_encoder:
            self.encoder = Encoder(cfg).to(cfg.device)
            self.enc_optimizer = optim.Adam(self.encoder.parameters(),lr=cfg.encoder_lr)

        self.decoder = DecoderRNN(cfg,voc).to(cfg.device)
        self.dec_optimizer = optim.Adam(self.decoder.parameters(),lr=cfg.decoder_lr,weight_decay=cfg.weight_decay,amsgrad=True)

        self.teacher_forcing_ratio = cfg.teacher_forcing_ratio
        self.print_every = cfg.print_every
        self.clip = cfg.clip
        self.device = cfg.device
        if cfg.opt_param_init:
            self.init_params()


    def init_params(self):
        '''Orthogonally initialise every decoder parameter whose name contains "weight".'''
        for name, param in self.decoder.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param)
                #nn.init.kaiming_normal_(param, mode='fan_out', nonlinearity='relu')


    def update_hyperparameters(self,cfg):
        '''
        Recreate the optimizers with the learning rates from `cfg` and refresh
        the teacher-forcing ratio. Optimizer state is reset.
        '''
        if self.cfg.opt_encoder:
            self.enc_optimizer = optim.Adam(self.encoder.parameters(),lr=cfg.encoder_lr)

        # NOTE(review): unlike __init__, no weight_decay is passed here — confirm
        # whether dropping it after a hyperparameter update is intended.
        self.dec_optimizer = optim.Adam(self.decoder.parameters(),lr=cfg.decoder_lr,amsgrad=True)
        self.teacher_forcing_ratio = cfg.teacher_forcing_ratio


    def load(self,encoder_path = 'Save/Meanpool_10.pt',decoder_path='Saved/SALSTM_10.pt'):
        '''
        Load state dicts saved by save(). The encoder file is only required and
        read when the model was built with an encoder. Prints a message and
        leaves the weights untouched if a required file is missing.

        NOTE(review): the default encoder path points into 'Save/' while the
        decoder default uses 'Saved/' — looks like a typo; defaults are left as
        they were so existing calls behave the same.
        '''
        required = [decoder_path]
        if self.cfg.opt_encoder:
            required.append(encoder_path)
        if not all(os.path.exists(p) for p in required):
            print('File not found Error..')
            return

        # map_location lets a checkpoint written on GPU be restored on CPU.
        if self.cfg.opt_encoder:
            self.encoder.load_state_dict(torch.load(encoder_path, map_location=self.device))
        self.decoder.load_state_dict(torch.load(decoder_path, map_location=self.device))

    def save(self,encoder_path,decoder_path):
        '''
        Write the encoder (if present) and decoder state dicts to the given files.
        Prints a message and writes nothing if a target directory does not exist.
        '''
        targets = [(self.decoder, decoder_path)]
        if self.cfg.opt_encoder:
            targets.insert(0, (self.encoder, encoder_path))

        # Validate the *directories*: requiring the files themselves to exist
        # already (the previous check) made it impossible to save a new checkpoint.
        for _, target in targets:
            parent = os.path.dirname(target)
            if parent and not os.path.isdir(parent):
                print('Invalid path address given.')
                return

        # Save this instance's modules (the old code reached for a global `model`).
        for module, target in targets:
            torch.save(module.state_dict(), target)

    def train_epoch(self,dataloader,utils):
        '''
        Function to train the model for a single epoch.
        Args:
         Input:
            dataloader : the dataloader object.basically train dataloader object.
            utils : helper object providing maskNLLLoss.
         Return:
             epoch_loss : Average single time step loss for an epoch
        '''
        total_loss = 0
        print_loss = 0
        iteration = 1
        if self.cfg.opt_encoder:
            self.encoder.train()
        self.decoder.train()
        for data in dataloader:
            features, targets, mask, max_length, _,_,_ = data
            # With probability teacher_forcing_ratio feed the ground truth back in.
            use_teacher_forcing = random.random() < self.teacher_forcing_ratio
            loss = self.train_iter(utils,features,targets,mask,max_length,use_teacher_forcing)
            print_loss += loss
            total_loss += loss
            # Print progress
            if iteration % self.print_every == 0:
                print_loss_avg = print_loss / self.print_every
                print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".
                format(iteration, iteration / len(dataloader) * 100, print_loss_avg))
                print_loss = 0

            iteration += 1
        return total_loss/len(dataloader)


    def train_iter(self,utils,input_variable, target_variable, mask,max_target_len,use_teacher_forcing):
        '''
        Forward propagate input signal and update model for a single iteration.

        Args:
        Inputs:
            utils : helper object providing maskNLLLoss.
            input_variable : video mini-batch tensor; size = (B,T,F)
            target_variable : Ground Truth Captions;  size = (T,B); T will be different for different mini-batches
            mask : Masked tensor for Ground Truth;    size = (T,B)
            max_target_len : maximum lengh of the mini-batch; size = T
            use_teacher_forcing : binary variable. If True training uses teacher forcing else sampling.
        Returns:
            iteration_loss : average loss per time step.
        '''
        if self.cfg.opt_encoder:
            self.enc_optimizer.zero_grad()
        self.dec_optimizer.zero_grad()

        loss = 0
        print_losses = []
        n_totals = 0

        input_variable = input_variable.to(self.device)
        # Use the real size of this batch: relying on cfg.batch_size only worked
        # while every batch was full (drop_last=True on the train loader).
        batch_size = input_variable.size(0)

        if self.cfg.opt_encoder:
            input_variable = self.encoder(input_variable)
        input_variable = input_variable.float()
        target_variable = target_variable.to(self.device)
        mask = mask.byte().to(self.device)

        # Initial decoder input (SOS for every sequence) and zero recurrent state
        decoder_input = torch.full((1, batch_size), self.cfg.SOS_token,
                                   dtype=torch.long, device=self.device)
        decoder_hidden = torch.zeros(self.cfg.n_layers, batch_size,
                                      self.cfg.decoder_hidden_size).to(self.device)
        if self.cfg.decoder_type == 'lstm':
            decoder_hidden = (decoder_hidden,decoder_hidden)

        # Forward batch of sequences through decoder one time step at a time
        for t in range(max_target_len):
            decoder_output, decoder_hidden,_ = self.decoder(decoder_input, decoder_hidden,input_variable)
            if use_teacher_forcing:
                # Teacher forcing: next input comes from ground truth(data distribution)
                decoder_input = target_variable[t].view(1, -1)
            else:
                # No teacher forcing: next input is decoder's own current output(model distribution)
                _, topi = decoder_output.topk(1)
                decoder_input = topi.view(1, -1).detach()
            # Calculate and accumulate loss
            mask_loss, nTotal = utils.maskNLLLoss(decoder_output, target_variable[t], mask[t],self.device)
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

        # Perform backpropatation
        loss.backward()

        if self.cfg.opt_encoder:
            _ = nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
            self.enc_optimizer.step()

        _ = nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)
        self.dec_optimizer.step()

        return sum(print_losses) / n_totals


    @torch.no_grad()
    def GreedyDecoding(self,features,max_length=15):
        '''
        Decode `max_length` steps, always feeding the most probable word back in.

        features : video feature batch; the first dimension is the batch size.

        Returns (caption, caps_text, attention):
          caption   : (batch, max_length) tensor of predicted word indices
          caps_text : list of strings built from `caption`, with EOS tokens and
                      indices missing from the vocabulary left out
          attention : numpy array of the per-step attention weights, stacked
                      along a new first axis
        '''
        batch_size = features.size()[0]
        features = features.to(self.device)

        if self.cfg.opt_encoder:
            features = self.encoder(features) #need to make optional
        features = features.float()
        decoder_input = torch.full((1, batch_size), self.cfg.SOS_token,
                                   dtype=torch.long, device=self.device)
        decoder_hidden = torch.zeros(self.cfg.n_layers, batch_size,
                                      self.cfg.decoder_hidden_size).to(self.device)
        if self.cfg.decoder_type == 'lstm':
            decoder_hidden = (decoder_hidden,decoder_hidden)
        caption = []
        attention_values = []
        for _ in range(max_length):
            decoder_output, decoder_hidden,attn_values = self.decoder(decoder_input,
                                                            decoder_hidden,features)
            _, topi = decoder_output.topk(1)
            # (batch, 1) -> (1, batch): the argmax becomes the next decoder input
            decoder_input = topi.view(1, -1)
            caption.append(topi.squeeze(1).cpu())
            attention_values.append(attn_values.squeeze(2))
        caption = torch.stack(caption,0).permute(1,0)

        # NOTE(review): decoding always runs for max_length steps and only the EOS
        # tokens themselves are dropped, so words produced after the first EOS
        # stay in the text — confirm whether captions should be cut at EOS.
        eos_token = self.cfg.EOS_token
        index2word = self.voc.index2word
        caps_text = []
        for dta in caption:
            words = []
            for token in dta:
                token_id = token.item()
                if token_id in index2word and token_id != eos_token: # Remove EOS and bypass OOV
                    words.append(index2word[token_id])
            caps_text.append(' '.join(words))
        return caption,caps_text, torch.stack(attention_values,0).cpu().numpy()

In [57]:
class Evaluator:
    """Generates captions for every batch of a dataloader and scores them.

    Attributes set in ``__init__``:
        path, cfg, dataloader: objects supplied by the caller.
        reference_dict: ground-truth captions handed to the scorer.
        prediction_dict: ``{str(video_id): [caption]}``, rebuilt on every
            ``prediction_list`` call.
        scores, losses: per-epoch history filled by ``evaluate``.
        meteor, meteor_sota, best_model: bookkeeping values; they are only
            stored here and never read by this class.
        decoding_type: ``'greedy'`` or ``'beam'``.
    """

    SUPPORTED_DECODING = ('greedy', 'beam')

    def __init__(self,model,dataloader,path,cfg,reference_dict,decoding_type = 'greedy'):
        self.path = path
        self.cfg = cfg
        self.dataloader = dataloader
        self.reference_dict = reference_dict
        self.prediction_dict = {}
        self.scores = {}
        self.meteor = 0.32 # save best model based on METEOR score
        self.losses = {}
        self.best_model = model
        self.meteor_sota = 0.34
        self.decoding_type = decoding_type

    def prediction_list(self,model):
        """Decode every batch and store the captions in ``self.prediction_dict``.

        The model is switched to eval mode. Batches are only decoded when
        ``cfg.model_name == 'sa-lstm'``; for any other model name the
        prediction dictionary is left empty.

        Raises:
            ValueError: if ``self.decoding_type`` is not ``'greedy'`` or
                ``'beam'`` (this used to surface as an unbound ``cap_txt``).
        """
        self.prediction_dict = {}
        model.eval()
        with torch.no_grad():
            for data in self.dataloader:
                # Each batch is a 7-tuple; only the features and ids are used.
                features, _targets, _mask, _max_length, ides, _motion_feat, _object_feat = data
                if self.cfg.model_name != 'sa-lstm':
                    continue
                features = features.to(self.cfg.device)
                if self.decoding_type == 'greedy':
                    _, cap_txt, _ = model.GreedyDecoding(features)
                elif self.decoding_type == 'beam':
                    cap_txt = model.BeamDecoding(features, self.cfg.beam_length)
                else:
                    raise ValueError(
                        "Unsupported decoding_type %r; expected one of %s"
                        % (self.decoding_type, self.SUPPORTED_DECODING))
                # Pair ids with captions per batch, so a length mismatch in
                # one batch cannot shift the pairing of the following ones.
                for ide, text in zip(ides, cap_txt):
                    self.prediction_dict[str(ide)] = [text.strip()]

    def evaluate(self,scorer,model,epoch,loss=9999):
        """Predict with ``model``, score against the references and log the result.

        Args:
            scorer: object exposing ``score(references, predictions)``.
            model: captioning model handed to ``prediction_list``.
            epoch: key under which the scores and loss are recorded.
            loss: training loss to record for this epoch.

        Returns:
            Whatever ``scorer.score`` returns.
        """
        self.prediction_list(model)
        scores = scorer.score(self.reference_dict,self.prediction_dict)
        self.scores[epoch] = scores
        self.losses[epoch] = loss
        return scores

    def save_model(self,model,epoch):
        """Serialize the whole model to ``<saved_models_path>/<model_name><epoch>.pt``."""
        print('Saving models....')
        filename = os.path.join(self.path.saved_models_path, self.cfg.model_name+str(epoch)+'.pt')
        torch.save(model,filename)

In [58]:
# Set the seed for reproducibility before any data or model object is created.
# `Utils` is defined earlier in the notebook; presumably set_seed seeds the
# Python / NumPy / PyTorch RNGs - confirm in Utils.set_seed.
# The same `utils` object is reused below: it is passed to model.train_epoch
# and serves as the scorer given to Evaluator.evaluate.
utils = Utils()
utils.set_seed(1)

In [59]:
# Create the SA-LSTM configuration object (not the model itself).
# opt_encoder=True: the model code checks cfg.opt_encoder before running the
# encoder and before stepping the encoder optimizer.
cfg = ConfigSALSTM(opt_encoder=True)
# Dataset selected in the configuration object, one of {'msvd','msrvtt'}
cfg.dataset = 'msvd'

# Hyperparameter overrides on the configuration object
cfg.batch_size = 100  # training batch size
cfg.n_layers = 1    # number of layers in the decoder RNN
cfg.decoder_type = 'lstm'  # one of {'lstm','gru'}
cfg.dropout = 0.5
cfg.opt_param_init = False

# Path object built from the configuration and the current working directory;
# later cells read path.appearance_feature_file and path.saved_models_path from it.
path = Path(cfg,os.getcwd())

In [60]:
# Vocabulary object, built once. (It used to be constructed twice, with the
# first instance immediately discarded.)
# If a vocabulary was already saved or downloaded, call voc.load() here instead
# of rebuilding it from the captions below.
voc = Vocabulary(cfg)
data_handler = DataHandler(cfg, path, voc)

# h5py is imported in the notebook's import cell, so it is not re-imported here.
print(path.appearance_feature_file)

# Open the appearance-feature HDF5 file and list its top-level keys.
# dataset_keys is not used further in this cell; opening the file here makes a
# missing or unreadable feature file fail early.
with h5py.File(path.appearance_feature_file, 'r') as f:
    dataset_keys = list(f.keys())

# Merge the train, val and test caption dictionaries from data_handler.
# NOTE(review): this puts val/test captions into the vocabulary as well -
# confirm that this is intended.
text_dict = {}
for split_dict in (data_handler.train_dict, data_handler.val_dict, data_handler.test_dict):
    text_dict.update(split_dict)

# Add every caption of every video to the vocabulary
for annotations in text_dict.values():
    for anno in annotations:
        voc.addSentence(anno)

# Save the vocabulary
voc.save()

# Optional, currently disabled: remove rare words with voc.trim(min_count=2).

# Print the vocabulary size
print('Vocabulary Size:', voc.num_words)

/content/drive/MyDrive/MSVD/feature/MSVD_APPEARANCE_INCEPTIONRESNETV2_28.hdf5
Vocabulary Size: 12596


In [67]:
# Datasets and dataloaders. The DataHandler is rebuilt here, after the
# vocabulary has been filled in the previous cell.
data_handler = DataHandler(cfg,path,voc)
train_dset,val_dset,test_dset = data_handler.getDatasets()
train_loader,val_loader,test_loader = data_handler.getDataloader(train_dset,val_dset,test_dset)

# Model object. To train from scratch use: model = SALSTM(voc,cfg,path)
# Here a previously trained model is restored from a whole-model pickle.
MODEL_CHECKPOINT = '/content/drive/MyDrive/MSVD/sa-lstm600/sa-lstm600.pt'
# SECURITY: unpickling a full model executes arbitrary code from the file -
# only load checkpoints you created or trust.
#  * map_location moves the stored tensors to cfg.device, so a checkpoint saved
#    on a GPU also loads on a CPU-only runtime.
#  * weights_only=False is stated explicitly because the file holds a pickled
#    module rather than a state_dict; recent PyTorch releases refuse such
#    files under their default setting.
model = torch.load(MODEL_CHECKPOINT, map_location=cfg.device, weights_only=False)
# Evaluator object on test data (greedy decoding is the Evaluator default)
test_evaluator_greedy = Evaluator(model,test_loader,path,cfg,data_handler.test_dict)


In [68]:
# Fine-tune the loaded, pre-trained model for one epoch and score it on the test set.
cfg.encoder_lr = 1e-4
cfg.decoder_lr = 1e-4
cfg.teacher_forcing_ratio = 0.2
# Hand the modified configuration to the model; presumably this re-reads the
# learning rates and teacher-forcing ratio - confirm in update_hyperparameters.
model.update_hyperparameters(cfg)
e = 1  # epoch key under which the evaluator records the scores and the loss
loss_train = model.train_epoch(train_loader,utils)
# `utils` is passed as the scorer: Evaluator.evaluate calls scorer.score(references, predictions).
print('greedy :',test_evaluator_greedy.evaluate(utils,model,e,loss_train))

{'testlen': 4459, 'reflen': 4435, 'guess': [4459, 3789, 3119, 2449], 'correct': [3329, 1833, 1026, 450]}
ratio: 1.0054114994360754
greedy : {'Bleu_1': 0.7465799506614158, 'Bleu_2': 0.6009759480592286, 'Bleu_3': 0.49160397915129306, 'Bleu_4': 0.3843859227846618, 'METEOR': 0.3146426931250789, 'ROUGE_L': 0.6726702597824165, 'CIDEr': 0.6204282551856276}


In [None]:
# model_save_path = '/content/drive/MyDrive/MSVD/sa-lstm600/sa-lstm600_updated.pt'
# torch.save(model.state_dict(), model_save_path)

In [None]:
# Notebook-level aliases read by get_video_info() below.
# predictions: {str(video_id): [caption]} as filled by Evaluator.prediction_list
# during the evaluate() call above.
# references: the reference dictionary the evaluator was created with
# (data_handler.test_dict).
predictions = test_evaluator_greedy.prediction_dict
references = test_evaluator_greedy.reference_dict


In [None]:
!pip install ffmpeg-python




In [None]:
import ffmpeg
from IPython.display import HTML
from base64 import b64encode

In [None]:
def play(filename):
    """Return an ``IPython.display.HTML`` video player for the file at ``filename``.

    The whole file is read and embedded in the page as a base64 ``data:`` URI
    (declared as ``video/mp4``), so the notebook output grows with the size of
    the video.

    Args:
        filename: path of the video file to embed.

    Returns:
        An ``HTML`` object wrapping a looping, auto-playing ``<video>`` element.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    src = 'data:video/mp4;base64,' + b64encode(video).decode()
    html = '<video width=750 height=375 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src
    return HTML(html)

In [None]:
def convert_to_mp4(input_path, output_path):
    """Convert the video at ``input_path`` to ``output_path`` with ffmpeg.

    An existing output file is overwritten. The directory of ``output_path``
    is created first when it is missing, because ffmpeg does not create it.

    Args:
        input_path: path of the source video.
        output_path: path of the file to write.

    Returns:
        True when the conversion succeeded, False when the output directory
        could not be created or ffmpeg reported an error (details are printed).
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Error converting file: {e}")
            return False
    try:
        ffmpeg.input(input_path).output(output_path).run(quiet=True, overwrite_output=True)
        return True
    except ffmpeg.Error as e:
        print(f"Error converting file: {e}")
        # quiet=True captures ffmpeg's stderr; the actual diagnostic lives
        # there rather than in the exception message.
        stderr = getattr(e, 'stderr', None)
        if stderr:
            if isinstance(stderr, bytes):
                stderr = stderr.decode('utf-8', errors='replace')
            print(stderr)
        return False

In [None]:
def get_video_info(video_name, data_root='/content/drive/MyDrive/MSVD'):
    """Play a video and print its predicted caption next to a reference caption.

    ``video_name`` (for example ``'vid1465'``) is looked up in
    ``<data_root>/other_ann/youtube_mapping.txt``, whose lines hold a clip id
    followed by a video name. The clip ``<data_root>/YouTubeClips/<id>.avi``
    is converted to ``<data_root>/mp4/<id>.mp4`` on first use and displayed
    inline. Captions are read from the notebook-level ``predictions`` and
    ``references`` dictionaries; only the first entry of each is printed.

    Args:
        video_name: video name as used for the keys of the caption dictionaries.
        data_root: root folder of the MSVD data. The default is the Drive
            location this notebook was written for.

    Returns:
        None. Nothing is displayed and no captions are printed when the name
        is not in the mapping file or the conversion fails; when only the
        ``.avi`` file is missing, the captions are still printed.
    """
    mapping_file = f'{data_root}/other_ann/youtube_mapping.txt'
    video_id = None

    # Find the clip id that belongs to this video name
    with open(mapping_file, 'r') as f:
        for line in f:
            parts = line.split()
            # Blank or malformed lines have fewer than two fields; skip them
            # instead of failing with an IndexError.
            if len(parts) >= 2 and parts[1] == video_name:
                video_id = parts[0]
                break

    if video_id is None:
        print(f"No video id found for {video_name}")
        return

    avi_video_path = f'{data_root}/YouTubeClips/{video_id}.avi'
    mp4_video_path = f'{data_root}/mp4/{video_id}.mp4'

    if os.path.exists(avi_video_path):
        # Convert only once; an existing .mp4 is reused.
        if not os.path.exists(mp4_video_path):
            if not convert_to_mp4(avi_video_path, mp4_video_path):
                print(f"Failed to convert {avi_video_path} to .mp4.")
                return
        # Display the video using the play function
        display(play(mp4_video_path))
    else:
        print(f"Video file for {video_id} not found.")

    # Display the captions
    prediction_caption = predictions.get(video_name, ["No prediction available"])[0]
    reference_caption = references.get(video_name, ["No reference available"])[0]

    print(f"Prediction: {prediction_caption}")
    print(f"Reference: {reference_caption}")

In [None]:
# Qualitative check: play the clip and print its predicted caption next to the first reference caption.
get_video_info('vid1465')


Prediction: a man is playing a guitar on a stage
Reference: a man is performing on the electric guitar


In [None]:
# Qualitative check: play the clip and print its predicted caption next to the first reference caption.
get_video_info('vid1463')


Prediction: a man is riding a bike on a motorcycle
Reference: a man is doing motorcycle tricks


In [None]:
# Qualitative check: play the clip and print its predicted caption next to the first reference caption.
get_video_info('vid1534')


Prediction: a man is pouring a can of water into a pot
Reference: a man pours something from a can into a white bowl
