In [1]:
import os
import random
import json
import h5py
import itertools

from PIL import Image
import numpy as np
import torch
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.utils.data import DataLoader,Dataset
from torch.nn import functional as F

from config import Path
from dictionary import Vocabulary
from utils import Utils
from evaluate import Evaluator
from data import DataHandler

# Shared project helper object; it is passed to the training and evaluation calls later in this notebook.
utils = Utils()
# Seed through the project helper so repeated runs are comparable.
# NOTE(review): which RNGs (random / numpy / torch) get seeded is decided inside utils.Utils — confirm there.
utils.set_seed(1)

In [2]:
# Location of the MSR-VTT train/val annotation JSON, relative to the working directory.
# Built with os.path.join so the same notebook runs on Windows and POSIX systems.
train_val_msrvtt_path = os.path.join("MSRVTT", "captions", "train_val_videodatainfo.json")

# CAPTIONS

* train_dict = {}
* val_dict = {}
* test_dict = {}

In [3]:
# Load the annotation file; the context manager closes the handle as soon as parsing is done.
with open(train_val_msrvtt_path, encoding="utf-8") as caption_file:
    train_val_file = json.load(caption_file)

# Numeric video indices assigned to each split: 0-79 train, 80-89 validation, 90-99 test.
train_id_list = list(range(0, 80))
val_id_list = list(range(80, 90))
test_id_list = list(range(90, 100))

# video_id -> list of captions, one dict per split.
train_dict = {}
val_dict = {}
test_dict = {}

# Pair every split's id set (constant-time membership) with the dict that collects its captions.
split_targets = (
    (set(train_id_list), train_dict),
    (set(val_id_list), val_dict),
    (set(test_id_list), test_dict),
)

for datap in train_val_file['sentences']:
    video_id = datap['video_id']
    # Drop the first five characters of the id (presumably the "video" prefix) to get the numeric index.
    video_index = int(video_id[5:])

    for split_ids, caption_dict in split_targets:
        if video_index in split_ids:
            # setdefault creates the caption list on first sight of a video, then we append to it.
            caption_dict.setdefault(video_id, []).append(datap['caption'])

In [4]:
# Import the SA-LSTM configuration class and model class.
from config import ConfigSALSTM
from models.SA_LSTM.model import SALSTM

# Create the SA-LSTM configuration object.
cfg = ConfigSALSTM(opt_encoder=True)

# Select the dataset on the configuration object; one of {'msvd','msrvtt'}.
cfg.dataset = 'msrvtt'

# Override hyperparameters on the configuration object.
cfg.batch_size = 32 # training batch size
cfg.n_layers = 1    # number of layers in decoder rnn
cfg.decoder_type = 'lstm'  # from {'lstm','gru'}
cfg.dropout = 0.5
cfg.opt_param_init = False

# Path object, built from the configuration and the current working directory.
path = Path(cfg, os.getcwd())

# Vocabulary object.
voc = Vocabulary(cfg)

# Load a previously saved (or downloaded) vocabulary file.
voc.load() # comment this out when using the vocabulary for the first time or with no saved file

# If the vocabulary has not been built yet, uncomment the block below to build it
# from the caption dicts and save it:
# text_dict = {}
# voc = Vocabulary(cfg)

# text_dict.update(train_dict)
# text_dict.update(val_dict)
# text_dict.update(test_dict)

# for k,v in text_dict.items():
#     for anno in v:
#         voc.addSentence(anno)
        
# voc.save()

min_count = 2 # threshold passed to voc.trim: words with a count below it are removed
voc.trim(min_count=min_count)

print('Vocabulary Size : ',voc.num_words)

keep_words 1108 / 2191 = 0.5057
Vocabulary Size :  1112


# FEATURES

* Appearance Features
* Motion Features

In [5]:
# Location of the HDF5 file holding the per-video appearance features, relative to the working directory.
# Built with os.path.join so the same notebook runs on Windows and POSIX systems.
af_path = os.path.join("MSRVTT", "features", "image_inceptionresnetv2_imagenet_fps_max60_100.hdf5")

In [6]:
# video key -> 2-D feature array (frames x feature_dim), zero-padded along the frame axis
# up to cfg.frame_len when the clip has fewer frames than that.
appearance_feature_dict = {}

# The file is only read here, so open it read-only and let the context manager close it.
with h5py.File(af_path, 'r') as f1:
    for key in f1.keys():
        # Copy the dataset into memory; a lazy h5py dataset would be unusable once the file is closed.
        arr = f1[key][:]

        # Use the configured frame length for both the check and the pad size so they cannot disagree.
        if arr.shape[0] < cfg.frame_len:
            pad = cfg.frame_len - arr.shape[0]
            # Pad with zeros of the same dtype so padded and unpadded entries stay consistent.
            padding = np.zeros((pad, arr.shape[1]), dtype=arr.dtype)
            arr = np.concatenate((arr, padding), axis=0)

        appearance_feature_dict[key] = arr

In [7]:
# Video ids of each caption split, in the dicts' insertion order.
train_name_list, val_name_list, test_name_list = (
    list(split_captions) for split_captions in (train_dict, val_dict, test_dict)
)

# TRAINING

In [8]:
# Datasets and dataloaders.
# The handler is constructed from the configuration, the path object and the vocabulary only.
# NOTE(review): the caption dicts and appearance_feature_dict built in earlier cells are not passed in
# here — presumably DataHandler loads its own copies from disk; confirm in data.py.
data_handler = DataHandler(cfg, path, voc)
# One dataset object per split, returned in train / val / test order.
train_dset, val_dset, test_dset = data_handler.getDatasets()
# One loader per split, built from the corresponding dataset.
train_loader, val_loader, test_loader = data_handler.getDataloader(train_dset, val_dset, test_dset)

In [9]:
# f1 = h5py.File(af_path,'r+')

# for key in f1.keys():
#     arr = f1[key][:]
#     print(arr)
#     break

In [10]:
# Captioning model built from the vocabulary, configuration and path objects.
model = SALSTM(voc, cfg, path)

# Both test-set evaluators share the same positional arguments; only the decoding type differs.
evaluator_args = (model, test_loader, path, cfg, data_handler.test_dict)

# Evaluator using the Evaluator's default decoding type.
test_evaluator_greedy = Evaluator(*evaluator_args)
# Evaluator using beam decoding.
test_evaluator_beam = Evaluator(*evaluator_args, decoding_type='beam')



In [11]:
# Training loop.
# Learning rates and teacher forcing are set on the config and then pushed into the model.
cfg.encoder_lr = 1e-4
cfg.decoder_lr = 1e-3
cfg.teacher_forcing_ratio = 1.0
model.update_hyperparameters(cfg)

# One validation-loss entry is appended per evaluation round.
val_loss = []

N_EPOCHS = 3000    # total number of training epochs
EVAL_EVERY = 50    # run evaluation every this many epochs

for e in range(1, N_EPOCHS + 1):
    loss = model.train_epoch(train_loader, utils)

    if e % EVAL_EVERY == 0:
        print('Epoch -- >',e,'Loss -->',loss)
        print('greedy :',test_evaluator_greedy.evaluate(utils,model,e,loss))
        val_loss.append(model.loss_calculate(val_loader,utils))
        print('beam :',test_evaluator_beam.evaluate(utils,model,e,loss))
        # The 'semibeam' evaluation was dropped: no test_evaluator_semibeam is ever created in this
        # notebook, so calling it raised NameError at the first evaluation round. Construct such an
        # evaluator alongside the greedy/beam ones before adding the call back.

  loss = crossEntropy.masked_select(mask).mean()
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


KeyboardInterrupt: 