<a href="https://colab.research.google.com/github/ledduy610/b2dl-vsum/blob/main/VSUM_VASNet_Revision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VASNet Run in Notebook

* Chỉnh lại VASNet để chạy cho các feature mới như InceptionV3 (2048-d) (GoogleNet là 1024-d)

## Helper .py **files**

In [None]:
import torch
from torchvision import transforms
import numpy as np
import time
import glob
import random
import argparse
import h5py
import json
import torch.nn.init as init

In [None]:
!pip install ortools
!pip install knapsack


Collecting ortools
[?25l  Downloading https://files.pythonhosted.org/packages/6a/bd/75277072925d687aa35a6ea9e23e81a7f6b7c980b2a80949c5b9a3f98c79/ortools-9.0.9048-cp37-cp37m-manylinux1_x86_64.whl (14.4MB)
[K     |████████████████████████████████| 14.4MB 213kB/s 
Collecting protobuf>=3.15.8
[?25l  Downloading https://files.pythonhosted.org/packages/48/72/05ec80a16a85d9c0e69020ab731b1dafdf2fee591b30811c6b63ec447afe/protobuf-3.17.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 42.3MB/s 
Installing collected packages: protobuf, ortools
  Found existing installation: protobuf 3.12.4
    Uninstalling protobuf-3.12.4:
      Successfully uninstalled protobuf-3.12.4
Successfully installed ortools-9.0.9048 protobuf-3.17.2


Collecting knapsack
  Downloading https://files.pythonhosted.org/packages/f7/12/0ab665722f73befb7a6daca98d79026480bbfa11c49b326cfa0d8f4f5951/knapsack-0.0.7-py3-none-any.whl
Installing collected packages: knapsack
Successfully installed knapsack-0.0.7


### vsum_tools.py
Pasted verbatim, together with knapsack.py.

In [None]:
''''
Courtesy of KaiyangZhou
https://github.com/KaiyangZhou/pytorch-vsumm-reinforce

@article{zhou2017reinforcevsumm,
   title={Deep Reinforcement Learning for Unsupervised Video Summarization with Diversity-Representativeness Reward},
   author={Zhou, Kaiyang and Qiao, Yu and Xiang, Tao},
   journal={arXiv:1801.00054},
   year={2017}
}

Modifications by Jiri Fajtl
- knapsack replaced with knapsack_ortools
- added evaluate_user_summaries() for user summaries ground truth evaluation
'''

import numpy as np

# A Dynamic Programming based Python Program for 0-1 Knapsack problem
# Returns the maximum value that can be put in a knapsack of capacity W
import numpy as np
from ortools.algorithms import pywrapknapsack_solver


def knapsack(W, wt, val, n):
    """Solve the 0/1 knapsack problem with a bottom-up dynamic-programming table.

    Args:
        W: knapsack capacity (used as a table dimension, so it must be an int).
        wt: item weights (used as column offsets into the table).
        val: item values.
        n: number of items to consider.

    Returns:
        numpy array of length ``n`` holding 1 for each selected item, 0 otherwise.
    """
    # table[i][c] = best total value using only the first i items at capacity c.
    # Row 0 and column 0 stay at their initial value of 0.
    table = [[0] * (W + 1) for _ in range(n + 1)]

    for item in range(1, n + 1):
        prev_row = table[item - 1]
        row = table[item]
        for cap in range(1, W + 1):
            if wt[item - 1] <= cap:
                row[cap] = max(val[item - 1] + prev_row[cap - wt[item - 1]], prev_row[cap])
            else:
                row[cap] = prev_row[cap]

    chosen = np.zeros(n)
    remaining = table[n][W]
    row_idx = n
    cap = W

    # Walk back through the table: climb while the value is unchanged; the row
    # where it changes identifies the item that contributed the difference.
    while remaining > 0:
        while table[row_idx][cap] == remaining:
            row_idx -= 1

        chosen[row_idx] = 1
        cap -= wt[row_idx]
        remaining = table[row_idx][cap]

    return chosen


def test_knapsack():
    """Print the selection vector knapsack() produces for a 7-item example."""
    item_weights = [1 ,1 ,1, 1 ,2 ,2 ,3]
    item_values  = [1 ,1 ,2 ,3, 1, 3 ,5]
    capacity = n_items = 7
    expected_best = 13  # total value of the optimal selection; reference only, not asserted
    print(knapsack(capacity, item_weights, item_values, n_items))

#===========================================
'''
------------------------------------------------
Use dynamic programming (DP) to solve 0/1 knapsack problem
Time complexity: O(nW), where n is number of items and W is capacity

Author: Kaiyang Zhou
Website: https://kaiyangzhou.github.io/
------------------------------------------------
knapsack_dp(values,weights,n_items,capacity,return_all=False)

Input arguments:
  1. values: a list of numbers in either int or float, specifying the values of items
  2. weights: a list of int numbers specifying weights of items
  3. n_items: an int number indicating number of items
  4. capacity: an int number indicating the knapsack capacity
  5. return_all: whether return all info, defaulty is False (optional)

Return:
  1. picks: a list of numbers storing the positions of selected items
  2. max_val: maximum value (optional)
------------------------------------------------
'''
def knapsack_dp(values,weights,n_items,capacity,return_all=False):
    """Solve the 0/1 knapsack problem by dynamic programming.

    Args:
        values: list of item values (int or float).
        weights: list of int item weights.
        n_items: number of items.
        capacity: int knapsack capacity.
        return_all: when True, also return the best total value.

    Returns:
        Sorted list of 0-based indices of the selected items, or the tuple
        ``(picks, max_val)`` when ``return_all`` is True.
    """
    check_inputs(values, weights, n_items, capacity)

    shape = (n_items + 1, capacity + 1)
    best = np.zeros(shape, dtype=np.float32)
    taken = np.zeros(shape, dtype=np.float32)  # 1 where taking the item improved the value

    for item in range(1, n_items + 1):
        for cap in range(capacity + 1):
            item_weight = weights[item - 1]
            item_value = values[item - 1]
            skip_value = best[item - 1, cap]
            if item_weight <= cap:
                take_value = item_value + best[item - 1, cap - item_weight]
                if take_value > skip_value:
                    best[item, cap] = take_value
                    taken[item, cap] = 1
                    continue
            best[item, cap] = skip_value

    # Trace the decisions back from the last item at full capacity.
    picks = []
    remaining = capacity
    for item in reversed(range(1, n_items + 1)):
        if taken[item, remaining] == 1:
            picks.append(item - 1)  # store as 0-based index
            remaining -= weights[item - 1]
    picks.sort()

    if not return_all:
        return picks
    return picks, best[n_items, capacity]

def check_inputs(values,weights,n_items,capacity):
    """Assert that the knapsack_dp arguments have the expected types and ranges.

    Raises:
        AssertionError: if any argument has the wrong type, a weight is
            negative or not an int, or n_items / capacity is not positive.
    """
    # container and scalar types
    for sequence in (values, weights):
        assert isinstance(sequence, list)
    for scalar in (n_items, capacity):
        assert isinstance(scalar, int)
    # element types
    assert all(isinstance(v, (int, float)) for v in values)
    assert all(isinstance(w, int) for w in weights)
    # value ranges
    assert not any(w < 0 for w in weights)
    assert n_items > 0
    assert capacity > 0

def test_knapsack_dp():
    """Print the items knapsack_dp() selects for a 3-item example with capacity 3."""
    item_values = [2,3,4]
    item_weights = [1,2,3]
    selected = knapsack_dp(item_values, item_weights, len(item_values), 3)
    print (selected)



# Shared OR-Tools knapsack solver instance, reused by knapsack_ortools().
# The dynamic-programming solver is selected here; the previously tried
# alternative was KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER.
osolver = pywrapknapsack_solver.KnapsackSolver(
    pywrapknapsack_solver.KnapsackSolver.KNAPSACK_DYNAMIC_PROGRAMMING_SOLVER, 'test'
)

def knapsack_ortools(values, weights, items, capacity ):
    """Select items with the module-level OR-Tools knapsack solver ``osolver``.

    Args:
        values: item values (numbers). They are multiplied by 1000 and
            truncated to integers before being handed to the solver.
        weights: item weights; cast to integers.
        items: unused; kept so existing callers keep working.
        capacity: knapsack capacity, passed to the solver unchanged.

    Returns:
        List of 0-based indices of the items contained in the best solution.
    """
    scale = 1000
    # np.int was only an alias for the builtin int; it was deprecated and is
    # no longer available in newer NumPy, so use an explicit fixed-width type.
    int_values = (np.asarray(values) * scale).astype(np.int64)
    int_weights = np.asarray(weights).astype(np.int64)

    osolver.Init(int_values.tolist(), [int_weights.tolist()], [capacity])
    osolver.Solve()

    return [idx for idx in range(len(int_weights))
            if osolver.BestSolutionContains(idx)]


# Runs both knapsack self-tests whenever this code executes as the main module
# (the recorded cell output shows both results being printed).
if __name__ == "__main__":
    test_knapsack_dp()
    test_knapsack()


import math


def generate_summary(ypred, cps, n_frames, nfps, positions, proportion=0.15, method='knapsack'):
    """Generate keyshot-based video summary i.e. a binary vector.

    Args:
        ypred: predicted importance scores, one per subsampled frame.
        cps: change points, 2D matrix, each row holds the first and last
            frame index (inclusive) of a segment.
        n_frames: original number of frames.
        nfps: number of frames per segment.
        positions: positions of subsampled frames in the original video
            (numpy array).
        proportion: length of video summary (compared to original video length).
        method: defines how shots are selected, ['knapsack', 'rank'].

    Returns:
        float32 numpy vector with one entry per frame of every segment
        (``sum(nfps)`` entries): 1 for frames of selected segments, 0 otherwise.

    Raises:
        KeyError: if ``method`` is not 'knapsack' or 'rank'.
    """
    n_segs = cps.shape[0]

    # Spread the subsampled scores over all original frames.
    frame_scores = np.zeros((n_frames), dtype=np.float32)
    if positions.dtype != int:
        positions = positions.astype(np.int32)
    if positions[-1] != n_frames:
        positions = np.concatenate([positions, [n_frames]])
    for i in range(len(positions) - 1):
        pos_left, pos_right = positions[i], positions[i+1]
        if i == len(ypred):
            # More positions than predictions: the trailing frames score 0.
            frame_scores[pos_left:pos_right] = 0
        else:
            frame_scores[pos_left:pos_right] = ypred[i]

    # Segment score = mean frame score over the (inclusive) change-point range.
    seg_score = []
    for seg_idx in range(n_segs):
        start, end = int(cps[seg_idx,0]), int(cps[seg_idx,1]+1)
        seg_score.append(float(frame_scores[start:end].mean()))

    limits = int(math.floor(n_frames * proportion))

    if method == 'knapsack':
        picks = knapsack_ortools(seg_score, nfps, n_segs, limits)
    elif method == 'rank':
        # Greedy: take segments by descending score while the running length
        # stays strictly below the limit.
        order = np.argsort(seg_score)[::-1].tolist()
        picks = []
        total_len = 0
        for i in order:
            if total_len + nfps[i] < limits:
                picks.append(i)
                total_len += nfps[i]
    else:
        raise KeyError("Unknown method {}".format(method))

    # Expand the per-segment decision to per-frame flags in a single step
    # (previously done by concatenating onto a dummy one-element array
    # segment by segment).
    picked = set(picks)
    seg_flags = np.array([1.0 if seg_idx in picked else 0.0 for seg_idx in range(n_segs)],
                         dtype=np.float32)
    seg_lengths = np.asarray(nfps[:n_segs], dtype=np.int64)
    return np.repeat(seg_flags, seg_lengths)


def evaluate_summary(machine_summary, user_summary, eval_metric='avg'):
    """Compare machine summary with user summary (keyshot-based).

    Args:
        machine_summary: 1D ndarray; values > 0 mark selected frames.
        user_summary: 2D ndarray (n_users x n_frames); values > 0 mark
            selected frames.
        eval_metric: 'avg' averages the results over all user summaries,
            'max' reports the user summary with the best F-score.

    Returns:
        Tuple ``(f_score, precision, recall)``.

    Raises:
        ValueError: if ``eval_metric`` is neither 'avg' nor 'max'.
    """
    # Validate first: an unknown metric used to surface only at the very end
    # as an UnboundLocalError.
    if eval_metric not in ('avg', 'max'):
        raise ValueError("Unknown eval_metric {!r}, expected 'avg' or 'max'".format(eval_metric))

    machine_summary = machine_summary.astype(np.float32)
    user_summary = user_summary.astype(np.float32)
    n_users,n_frames = user_summary.shape

    # binarization
    machine_summary[machine_summary > 0] = 1
    user_summary[user_summary > 0] = 1

    # Truncate or zero-pad the machine summary to the user summary length.
    if len(machine_summary) > n_frames:
        machine_summary = machine_summary[:n_frames]
    elif len(machine_summary) < n_frames:
        zero_padding = np.zeros((n_frames - len(machine_summary)))
        machine_summary = np.concatenate([machine_summary, zero_padding])

    f_scores = []
    prec_arr = []
    rec_arr = []

    for user_idx in range(n_users):
        gt_summary = user_summary[user_idx,:]
        overlap_duration = (machine_summary * gt_summary).sum()
        # The 1e-8 keeps the divisions defined for empty summaries.
        precision = overlap_duration / (machine_summary.sum() + 1e-8)
        recall = overlap_duration / (gt_summary.sum() + 1e-8)
        if precision == 0 and recall == 0:
            f_score = 0.
        else:
            f_score = (2 * precision * recall) / (precision + recall)
        f_scores.append(f_score)
        prec_arr.append(precision)
        rec_arr.append(recall)

    if eval_metric == 'avg':
        return np.mean(f_scores), np.mean(prec_arr), np.mean(rec_arr)

    # eval_metric == 'max'
    max_idx = np.argmax(f_scores)
    return np.max(f_scores), prec_arr[max_idx], rec_arr[max_idx]


def evaluate_user_summaries(user_summary, eval_metric='avg'):
    """Compare the user summaries with each other (keyshot-based).

    Every unordered pair of users is scored once: the earlier user of the pair
    is treated as ground truth and the later one as the prediction.

    Args:
        user_summary: 2D ndarray (n_users x n_frames); values > 0 mark
            selected frames.
        eval_metric: 'avg' averages the results over all pairs, 'max' reports
            the pair with the best F-score.

    Returns:
        Tuple ``(f_score, precision, recall)``.

    Raises:
        ValueError: if ``eval_metric`` is neither 'avg' nor 'max'.
    """
    # Validate first: an unknown metric used to surface only at the very end
    # as an UnboundLocalError.
    if eval_metric not in ('avg', 'max'):
        raise ValueError("Unknown eval_metric {!r}, expected 'avg' or 'max'".format(eval_metric))

    user_summary = user_summary.astype(np.float32)
    n_users, n_frames = user_summary.shape

    # binarization
    user_summary[user_summary > 0] = 1

    f_scores = []
    prec_arr = []
    rec_arr = []

    for user_idx in range(n_users):
        gt_summary = user_summary[user_idx, :]
        for other_user_idx in range(user_idx+1, n_users):
            other_gt_summary = user_summary[other_user_idx, :]
            overlap_duration = (other_gt_summary * gt_summary).sum()
            # The 1e-8 keeps the divisions defined for empty summaries.
            precision = overlap_duration / (other_gt_summary.sum() + 1e-8)
            recall = overlap_duration / (gt_summary.sum() + 1e-8)
            if precision == 0 and recall == 0:
                f_score = 0.
            else:
                f_score = (2 * precision * recall) / (precision + recall)
            f_scores.append(f_score)
            prec_arr.append(precision)
            rec_arr.append(recall)

    # NOTE(review): with fewer than two users there are no pairs, so the
    # lists are empty here — confirm callers always pass >= 2 users.
    if eval_metric == 'avg':
        return np.mean(f_scores), np.mean(prec_arr), np.mean(rec_arr)

    # eval_metric == 'max'
    max_idx = np.argmax(f_scores)
    return np.max(f_scores), prec_arr[max_idx], rec_arr[max_idx]


[0, 1]
[0. 0. 1. 1. 0. 1. 1.]


### Config.py

Pasted verbatim.




In [None]:
from torch.autograd import Variable


class HParameters:
    """Container for the training/evaluation hyper-parameters and file lists."""

    def __init__(self):
        self.verbose = False
        self.use_cuda = True
        self.cuda_device = 0
        self.max_summary_length = 0.15

        self.l2_req = 0.00001
        self.lr_epochs = [0]
        self.lr = [0.00005]

        self.epochs_max = 300
        self.train_batch_size = 1

        self.output_dir = 'ex-10'

        self.root = ''
        self.datasets=['datasets/eccv16_dataset_summe_google_pool5.h5',
                       'datasets/eccv16_dataset_tvsum_google_pool5.h5',
                       'datasets/eccv16_dataset_ovp_google_pool5.h5',
                       'datasets/eccv16_dataset_youtube_google_pool5.h5']

        self.splits = ['splits/tvsum_splits.json',
                        'splits/summe_splits.json']

        self.splits += ['splits/tvsum_aug_splits.json',
                        'splits/summe_aug_splits.json']

    def get_dataset_by_name(self, dataset_name):
        """Return a one-element list with the first dataset path that contains
        ``dataset_name`` as a substring, or None when no path matches."""
        for d in self.datasets:
            if dataset_name in d:
                return [d]
        return None

    def load_from_args(self, args):
        """Override attributes from a mapping of name -> value.

        Entries whose value is None are ignored. When the target attribute is
        currently a list and the new value is a string, the string is split on
        commas and/or whitespace into a list of strings (elements are not
        converted to the type of the old list's elements). Keys that do not
        exist yet are added as new attributes.
        """
        for key in args:
            val = args[key]
            if val is None:
                continue

            targets_list = hasattr(self, key) and isinstance(getattr(self, key), list)
            if targets_list and isinstance(val, str):
                # Accept "a,b", "a, b" and "a b": callers pass comma separated
                # lists, which a plain split() would keep as a single element.
                val = val.replace(',', ' ').split()

            setattr(self, key, val)

    def __str__(self):
        """List every public, non-callable attribute as ``[index] name: value``."""
        attr_names = [attr for attr in dir(self)
                      if not callable(getattr(self, attr))
                      and not (attr.startswith("__") or attr.startswith("_"))]

        info_str = ''
        for i, name in enumerate(attr_names):
            val = getattr(self, name)
            if isinstance(val, Variable):
                val = val.data.cpu().numpy().tolist()[0]
            info_str += '['+str(i)+'] '+name+': '+str(val)+'\n'

        return info_str


if __name__ == "__main__":

    # Smoke test: print the defaults, override some fields from a dict, print again.
    hps = HParameters()
    print(hps)

    args = dict(
        root='root_dir',
        datasets='set1,set2,set3',
        splits='split1, split2',
        new_param_float=1.23456,
    )

    hps.load_from_args(args)
    print(hps)


## Vasnet_model.py
Pasted together with layer_norm.py.

Edited to fit the new feature size.

In [None]:
__author__ = 'Jiri Fajtl'
__email__ = 'ok1zjf@gmail.com'
__version__= '3.6'
__status__ = "Research"
__date__ = "1/12/2018"
__license__= "MIT License"


import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))   # gain
        self.beta = nn.Parameter(torch.zeros(features))   # bias
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gain and bias."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta

class SelfAttention(nn.Module):
    """Single-head self-attention over a sequence of frame features.

    Args:
        apperture: when > 0, frame i only attends to frames j with
            ``|i - j| < apperture``; other values disable the restriction.
        ignore_itself: when True, a frame does not attend to itself.
        input_size: size of the input (and output) feature vectors.
        output_size: size of the internal key/query/value projections.
    """

    def __init__(self, apperture=-1, ignore_itself=False, input_size=1024, output_size=1024):
        super(SelfAttention, self).__init__()

        self.apperture = apperture
        self.ignore_itself = ignore_itself

        self.m = input_size
        self.output_size = output_size

        self.K = nn.Linear(in_features=self.m, out_features=self.output_size, bias=False)
        self.Q = nn.Linear(in_features=self.m, out_features=self.output_size, bias=False)
        self.V = nn.Linear(in_features=self.m, out_features=self.output_size, bias=False)
        self.output_linear = nn.Linear(in_features=self.output_size, out_features=self.m, bias=False)

        self.drop50 = nn.Dropout(0.5)

    def forward(self, x):
        """Apply attention to ``x`` of shape (n, input_size).

        Returns:
            Tuple ``(y, att_weights)``: ``y`` has shape (n, input_size) and
            ``att_weights`` is the (n, n) softmax output before dropout.
        """
        n = x.shape[0]  # sequence length

        K = self.K(x)  # (n x m) => (n x H), H = output_size
        Q = self.Q(x)  # (n x m) => (n x H)
        V = self.V(x)

        Q = Q * 0.06  # fixed scaling of the queries, hard-coded in the original
        logits = torch.matmul(Q, K.transpose(1, 0))

        neg_inf = -float("Inf")

        if self.ignore_itself:
            # Mask the diagonal (each frame with itself). A bool mask on the
            # logits' device replaces the deprecated uint8 (.byte()) indexing.
            diagonal = torch.eye(n, dtype=torch.bool, device=logits.device)
            logits = logits.masked_fill(diagonal, neg_inf)

        if self.apperture > 0:
            # Mask frames at distance >= apperture from the current one.
            idx = torch.arange(n, device=logits.device)
            too_far = (idx.unsqueeze(0) - idx.unsqueeze(1)).abs() >= self.apperture
            logits = logits.masked_fill(too_far, neg_inf)

        att_weights_ = nn.functional.softmax(logits, dim=-1)
        weights = self.drop50(att_weights_)
        y = torch.matmul(V.transpose(1, 0), weights).transpose(1, 0)
        y = self.output_linear(y)

        return y, att_weights_



class VASNet(nn.Module):
    """Self-attention followed by a small regressor producing one score per frame.

    Args:
        m: size of the input CNN features (the notebook uses 2048 for
            InceptionV3; GoogLeNet features are 1024-d).
    """

    def __init__(self, m = 2048):
        super().__init__()

        self.m = m  # cnn features size

        self.att = SelfAttention(input_size=self.m, output_size=self.m)

        # kb and kc are created (so they appear in the state dict) but are not
        # used by forward().
        regressor_width = 1024
        self.ka = nn.Linear(self.m, regressor_width)
        self.kb = nn.Linear(self.ka.out_features, regressor_width)
        self.kc = nn.Linear(self.kb.out_features, regressor_width)
        self.kd = nn.Linear(self.ka.out_features, 1)

        self.sig = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.drop50 = nn.Dropout(0.5)
        self.softmax = nn.Softmax(dim=0)
        self.layer_norm_y = LayerNorm(self.m)
        self.layer_norm_ka = LayerNorm(self.ka.out_features)

    def forward(self, x, seq_len):
        """Score every frame of ``x``.

        Args:
            x: 3D tensor whose last dimension is the feature size; the leading
                dimensions are flattened into the frame axis (assumes input
                batch size = 1).
            seq_len: not used by this method.

        Returns:
            Tuple ``(scores, att_weights)`` where ``scores`` has shape (1, n_frames).
        """
        feature_size = x.shape[2]

        # Put the frames on the batch dimension to allow batched arithmetic.
        frames = x.view(-1, feature_size)
        attended, att_weights_ = self.att(frames)

        # Residual connection, dropout, layer norm.
        hidden = self.layer_norm_y(self.drop50(attended + frames))

        # Frame level importance score regression (two layer NN).
        hidden = self.layer_norm_ka(self.drop50(self.relu(self.ka(hidden))))
        scores = self.sig(self.kd(hidden))

        return scores.view(1, -1), att_weights_



# No-op placeholder: nothing is executed when this code runs as the main module.
if __name__ == "__main__":
    pass

## Main.py

## AONet class

Lightly edited for debugging.

In [None]:
class AONet:
    """Driver that loads datasets/splits and trains or evaluates a VASNet model."""

    def __init__(self, hps: "HParameters"):
        self.hps = hps
        self.model = None
        self.log_file = None
        self.verbose = hps.verbose
        # Filled by load_datasets(): dataset base filename -> open h5py.File.
        self.datasets = {}


    def fix_keys(self, keys, dataset_name = None):
        """Prefix keys that lack a dataset name with ``<dataset_name>/``.

        When exactly one dataset is loaded, its name is used regardless of the
        ``dataset_name`` argument.

        :param keys: iterable of ``video`` or ``dataset/video`` keys
        :param dataset_name: dataset name to prepend to keys without one
        :return: list of keys in ``dataset/video`` form
        """
        if len(self.datasets) == 1:
            dataset_name = next(iter(self.datasets))

        keys_out = []
        for key in keys:
            t = key.split('/')
            if len(t) != 2:
                assert dataset_name is not None, "ERROR dataset name in some keys is missing but there are multiple dataset {} to choose from".format(len(self.datasets))

                key_name = dataset_name+'/'+key
                keys_out.append(key_name)
            else:
                keys_out.append(key)

        return keys_out


    def load_datasets(self, datasets = None):
        """
        Loads all h5 datasets from the datasets list into a dictionary self.datasets
        referenced by their base filename (without directory and extension).
        :param datasets:  List of dataset filenames; defaults to hps.datasets
        :return: the dictionary of opened h5py files
        """
        if datasets is None:
            datasets = self.hps.datasets

        datasets_dict = {}
        for dataset in datasets:
            _, base_filename = os.path.split(dataset)
            base_filename, _ = os.path.splitext(base_filename)
            print("Loading:", dataset)
            datasets_dict[base_filename] = h5py.File(dataset, 'r')

        self.datasets = datasets_dict
        return datasets_dict


    def load_split_file(self, splits_file):
        """Parse ``splits_file`` and remember its dataset name, type and splits.

        :return: number of splits (folds) in the file
        """
        self.dataset_name, self.dataset_type, self.splits = parse_splits_filename(splits_file)
        n_folds = len(self.splits)
        self.split_file = splits_file
        print("Loading splits from: ",splits_file)

        return n_folds


    def select_split(self, split_id):
        """Select fold ``split_id`` and store its train/test keys in ``dataset/video`` form.

        Raises ValueError when hps has no dataset matching the split's dataset name.
        """
        print("Selecting split: ",split_id)

        self.split_id = split_id
        n_folds = len(self.splits)
        assert self.split_id < n_folds, "split_id (got {}) exceeds {}".format(self.split_id, n_folds)

        split = self.splits[self.split_id]
        self.train_keys = split['train_keys']
        self.test_keys = split['test_keys']

        dataset_files = self.hps.get_dataset_by_name(self.dataset_name)
        if not dataset_files:
            # Previously failed with "'NoneType' object is not subscriptable".
            raise ValueError("No dataset matching '{}' in hps.datasets".format(self.dataset_name))

        _, dataset_filename = os.path.split(dataset_files[0])
        dataset_filename, _ = os.path.splitext(dataset_filename)
        self.train_keys = self.fix_keys(self.train_keys, dataset_filename)
        self.test_keys = self.fix_keys(self.test_keys, dataset_filename)
        return


    def load_model(self, model_filename):
        """Load model weights from ``model_filename`` (tensors are mapped to CPU storage).

        NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        """
        self.model.load_state_dict(torch.load(model_filename, map_location=lambda storage, loc: storage))
        return


    def initialize(self, cuda_device=None, f_len = 2048):
        """Seed the RNGs, build a VASNet for ``f_len``-d features and init its weights.

        :param cuda_device: CUDA device index; hps.cuda_device is used when None
        :param f_len: input feature size passed to VASNet
        """
        rnd_seed = 12345
        random.seed(rnd_seed)
        np.random.seed(rnd_seed)
        torch.manual_seed(rnd_seed)

        self.model = VASNet(m = f_len)
        self.model.eval()
        self.model.apply(weights_init)

        # `cuda_device or hps.cuda_device` silently discarded an explicit device 0.
        if cuda_device is None:
            cuda_device = self.hps.cuda_device

        if self.hps.use_cuda:
            print("Setting CUDA device: ",cuda_device)
            torch.cuda.set_device(cuda_device)
            torch.cuda.manual_seed(rnd_seed)
            self.model.cuda()

        return


    def get_data(self, key):
        """Return the entry for a ``dataset/video`` key from the loaded datasets."""
        key_parts = key.split('/')
        assert len(key_parts) == 2, "ERROR. Wrong key name: "+key
        dataset, key = key_parts
        return self.datasets[dataset][key]

    def lookup_weights_file(self, data_path):
        """Find the stored model weights for the current dataset and split.

        :param data_path: directory containing ``models/`` and ``splits/``
        :return: ``(weights_filename, splits_file)``, or ``''`` when no weights
                 file matches (kept for backward compatibility)
        """
        dataset_type_str = '' if self.dataset_type == '' else self.dataset_type + '_'
        weights_pattern = data_path + '/models/{}_{}splits_{}_*.tar.pth'.format(self.dataset_name, dataset_type_str, self.split_id)
        # Sorted so the choice among several matches is deterministic.
        matches = sorted(glob.glob(weights_pattern))
        if len(matches) == 0:
            # Report the pattern that was searched (the old message printed the empty result list).
            print("Couldn't find model weights: ", weights_pattern)
            return ''

        # Get the first weights filename in the dir
        weights_filename = matches[0]
        splits_file = data_path + '/splits/{}_{}splits.json'.format(self.dataset_name, dataset_type_str)

        return weights_filename, splits_file


    def train(self, output_dir='EX-0'):
        """Train on the selected split, evaluating on the test keys after every epoch.

        The weights of every epoch are saved under
        ``<output_dir>/models_temp/<split file name>_<split id>/``.

        :return: ``(best test F-score, epoch at which it was reached)``
        """
        print("Initializing VASNet model and optimizer...")
        self.model.train()

        criterion = nn.MSELoss()

        if self.hps.use_cuda:
            criterion = criterion.cuda()

        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        self.optimizer = torch.optim.Adam(parameters, lr=self.hps.lr[0], weight_decay=self.hps.l2_req)

        print("Starting training...")

        max_val_fscore = 0
        max_val_fscore_epoch = 0
        train_keys = self.train_keys[:]
        report_every = 50  # only report every 50th epoch

        for epoch in range(self.hps.epochs_max):
            report = epoch % report_every == 0

            if report: print("Epoch: {0:6}".format(str(epoch)+"/"+str(self.hps.epochs_max)), end='')
            self.model.train()
            avg_loss = []

            random.shuffle(train_keys)

            for i, key in enumerate(train_keys):
                dataset = self.get_data(key)
                seq = torch.from_numpy(dataset['features'][...]).unsqueeze(0)
                target = torch.from_numpy(dataset['gtscore'][...]).unsqueeze(0)

                # Normalize frame scores to [0, 1]; constant scores would
                # otherwise divide by zero and yield NaN targets.
                target -= target.min()
                target_max = target.max()
                if target_max > 0:
                    target /= target_max

                # Cast on every device: the cast used to happen only on the
                # CUDA path, leaving CPU runs with mismatched dtypes.
                seq, target = seq.float(), target.float()
                if self.hps.use_cuda:
                    seq, target = seq.cuda(), target.cuda()

                seq_len = seq.shape[1]
                y, _ = self.model(seq,seq_len)
                loss_att = 0

                loss = criterion(y, target)
                loss = loss + loss_att
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                avg_loss.append([float(loss), float(loss_att)])

            # Evaluate test dataset
            val_fscore, video_scores = self.eval(self.test_keys)
            if max_val_fscore < val_fscore:
                max_val_fscore = val_fscore
                max_val_fscore_epoch = epoch

            avg_loss = np.array(avg_loss)
            if report:
                print("   Train loss: {0:.05f}".format(np.mean(avg_loss[:, 0])), end='')
                print('   Test F-score avg/max: {0:0.5}/{1:0.5}'.format(val_fscore, max_val_fscore))

            if self.verbose:
                video_scores = [["No", "Video", "F-score"]] + video_scores
                print_table(video_scores, cell_width=[3,40,8])

            # Save model weights
            path, filename = os.path.split(self.split_file)
            base_filename, _ = os.path.splitext(filename)
            path = os.path.join(output_dir, 'models_temp', base_filename+'_'+str(self.split_id))
            os.makedirs(path, exist_ok=True)
            filename = str(epoch)+'_'+str(round(val_fscore*100,3))+'.pth.tar'
            torch.save(self.model.state_dict(), os.path.join(path, filename))

        return max_val_fscore, max_val_fscore_epoch


    def eval(self, keys, results_filename=None):
        """Run the model on ``keys`` and score the resulting summaries.

        :param keys: ``dataset/video`` keys to evaluate
        :param results_filename: optional h5 file to write per-video results to
        :return: ``(mean F-score, per-video score rows)``
        """
        self.model.eval()
        summary = {}
        att_vecs = {}
        with torch.no_grad():
            for i, key in enumerate(keys):
                data = self.get_data(key)
                seq = torch.from_numpy(data['features'][...]).unsqueeze(0).float()

                if self.hps.use_cuda:
                    seq = seq.cuda()

                y, att_vec = self.model(seq, seq.shape[1])
                summary[key] = y[0].detach().cpu().numpy()
                att_vecs[key] = att_vec.detach().cpu().numpy()

        f_score, video_scores = self.eval_summary(summary, keys, metric=self.dataset_name,
                    results_filename=results_filename, att_vecs=att_vecs)

        return f_score, video_scores


    def eval_summary(self, machine_summary_activations, test_keys, results_filename=None, metric='tvsum', att_vecs=None):
        """Turn frame scores into summaries and compare them with the user summaries.

        :param machine_summary_activations: dict key -> predicted frame scores
        :param test_keys: ``dataset/video`` keys to evaluate
        :param results_filename: optional h5 file to write per-video results to
        :param metric: 'tvsum' averages over users, anything else takes the max
        :param att_vecs: optional dict key -> attention weights to store
        :return: ``(mean F-score, rows of [index, key, formatted F-score])``
        """
        eval_metric = 'avg' if metric == 'tvsum' else 'max'

        h5_res = None
        if results_filename is not None:
            h5_res = h5py.File(results_filename, 'w')

        fms = []
        video_scores = []
        try:
            for key_idx, key in enumerate(test_keys):
                d = self.get_data(key)
                probs = machine_summary_activations[key]

                if 'change_points' not in d:
                    print("ERROR: No change points in dataset/video ",key)

                cps = d['change_points'][...]
                num_frames = d['n_frames'][()]
                nfps = d['n_frame_per_seg'][...].tolist()
                positions = d['picks'][...]
                user_summary = d['user_summary'][...]

                machine_summary = generate_summary(probs, cps, num_frames, nfps, positions)
                fm, _, _ = evaluate_summary(machine_summary, user_summary, eval_metric)
                fms.append(fm)

                # Reporting & logging
                video_scores.append([key_idx + 1, key, "{:.1%}".format(fm)])

                if h5_res is not None:
                    gt = d['gtscore'][...]
                    h5_res.create_dataset(key + '/score', data=probs)
                    h5_res.create_dataset(key + '/machine_summary', data=machine_summary)
                    h5_res.create_dataset(key + '/gtscore', data=gt)
                    h5_res.create_dataset(key + '/fm', data=fm)
                    h5_res.create_dataset(key + '/picks', data=positions)

                    video_name = key.split('/')[1]
                    if 'video_name' in d:
                        video_name = d['video_name'][...]
                    h5_res.create_dataset(key + '/video_name', data=video_name)

                    if att_vecs is not None:
                        h5_res.create_dataset(key + '/att', data=att_vecs[key])
        finally:
            # Close the results file even when a video fails to evaluate.
            if h5_res is not None:
                h5_res.close()

        mean_fm = np.mean(fms)

        return mean_fm, video_scores


### Auxiliary functions

In [None]:
def parse_splits_filename(splits_filename):
    """Derive the dataset name/type from a splits filename and load its splits.

    The file's base name is expected to look like ``<dataset>_splits`` or
    ``<dataset>_<type>_splits`` (e.g. ``tvsum_aug_splits.json``).

    Returns:
        Tuple ``(dataset_name, dataset_type, splits)``; ``dataset_type`` is ''
        when the name carries no type field, and ``splits`` is the parsed
        JSON content of the file.
    """
    stem, _ = os.path.splitext(os.path.basename(splits_filename))
    fields = stem.split('_')
    dataset_name = fields[0]   # e.g. tvsum
    dataset_type = fields[1]   # augmentation type, e.g. aug

    # 'splits' terminates the filename fields (historical reasons), so finding
    # it in second place means no type is present.
    if dataset_type == 'splits':
        dataset_type = ''

    with open(splits_filename, 'r') as splits_file:
        splits = json.load(splits_file)

    return dataset_name, dataset_type, splits

def weights_init(m):
    """Initialise a module in place; intended for use with ``model.apply``.

    Modules whose class is named exactly 'Linear' get Xavier-uniform weights
    (gain sqrt(2)) and, when present, a constant bias of 0.1. Every other
    module is left untouched.
    """
    if m.__class__.__name__ != 'Linear':
        return

    init.xavier_uniform_(m.weight, gain=np.sqrt(2.0))
    if m.bias is None:
        return
    init.constant_(m.bias, 0.1)

## Train function


### Definition

In [None]:
def train(hps, f_len = 2048):
    """Train one model per fold of every split file listed in ``hps.splits``.

    Creates ``splits/``, ``code/`` and ``models/`` under ``hps.output_dir``,
    copies ``splits/*.json`` and ``*.py`` from the working directory into
    them, writes per-fold and average F-scores to ``results.txt`` and keeps,
    for every fold, the weights of the best epoch in ``models/``.

    Args:
        hps: hyper-parameters (uses output_dir, splits, datasets and
            get_dataset_by_name).
        f_len: input feature size forwarded to ``AONet.initialize``.
    """
    import shutil  # local import: only this function needs it

    def _copy_matching(pattern, dest_dir):
        # Best-effort copy of every file matching `pattern`; failures are
        # reported but, like the shell `cp` used before, do not abort training.
        for src in glob.glob(pattern):
            try:
                shutil.copy(src, dest_dir)
            except OSError as err:
                print("Warning: could not copy {} to {}: {}".format(src, dest_dir, err))

    splits_dir = os.path.join(hps.output_dir, 'splits')
    code_dir = os.path.join(hps.output_dir, 'code')
    models_dir = os.path.join(hps.output_dir, 'models')
    for directory in (hps.output_dir, splits_dir, code_dir, models_dir):
        os.makedirs(directory, exist_ok=True)

    # File operations go through glob/shutil instead of string-built shell
    # commands, so paths containing spaces or shell metacharacters work.
    _copy_matching('splits/*.json', splits_dir)
    _copy_matching('*.py', code_dir)

    # File collecting the results from all splits; closed even on errors.
    with open(os.path.join(hps.output_dir, 'results.txt'), 'wt') as f:

        for split_filename in hps.splits:
            dataset_name, dataset_type, splits = parse_splits_filename(split_filename)

            # For no augmentation use only a dataset corresponding to the split file
            datasets = None
            if dataset_type == '':
                datasets = hps.get_dataset_by_name(dataset_name)

            if datasets is None:
                datasets = hps.datasets

            f_avg = 0
            n_folds = len(splits)
            for split_id in range(n_folds):
                ao = AONet(hps)
                ao.load_datasets(datasets=datasets)

                ao.initialize(f_len = f_len)

                ao.load_split_file(splits_file=split_filename)
                ao.select_split(split_id=split_id)

                fscore, fscore_epoch = ao.train(output_dir=hps.output_dir)
                f_avg += fscore

                # Log F-score for this split_id
                f.write(split_filename + ', ' + str(split_id) + ', ' + str(fscore) + ', ' + str(fscore_epoch) + '\n')
                f.flush()

                # Keep the weights of the epoch with the highest F-score and
                # drop the other per-epoch checkpoints.
                _, log_file = os.path.split(split_filename)
                log_dir, _ = os.path.splitext(log_file)
                log_dir += '_' + str(split_id)
                log_file = os.path.join(models_dir, log_dir) + '_' + str(fscore) + '.tar.pth'

                temp_dir = os.path.join(hps.output_dir, 'models_temp', log_dir)
                best_pattern = os.path.join(glob.escape(temp_dir), str(fscore_epoch) + '_*.pth.tar')
                best_checkpoints = glob.glob(best_pattern)
                if best_checkpoints:
                    try:
                        shutil.move(best_checkpoints[0], log_file)
                    except OSError as err:
                        print("Warning: could not move {} to {}: {}".format(best_checkpoints[0], log_file, err))
                else:
                    print("Warning: no checkpoint matching {}".format(best_pattern))
                shutil.rmtree(temp_dir, ignore_errors=True)

                print("Split: {0:}   Best F-score: {1:0.5f}   Model: {2:}".format(split_filename, fscore, log_file))

            # Write average F-score for all splits to the results.txt file
            if n_folds > 0:
                f_avg /= n_folds
            f.write(split_filename + ', ' + str('avg') + ', ' + str(f_avg) + '\n')
            f.flush()


### Calling the train function

Edit the paths to the h5 and split files before training.


In [None]:
import sys

import os

import argparse

# Command-line interface mirroring VASNet's main.py. The notebook never runs a
# real command line; the train_wrapper cells fake one through sys.argv and then
# call parser.parse_args().
# The text is passed as `description=`: the first positional parameter of
# ArgumentParser is `prog`, so passing it positionally would turn the whole
# sentence into the program name shown in usage/error messages.
parser = argparse.ArgumentParser(
    description="PyTorch implementation of paper \"Summarizing Videos with Attention\"")
parser.add_argument('-r', '--root', type=str, default='', help="Project root directory")
parser.add_argument('-d', '--datasets', type=str, help="Path to a comma separated list of h5 datasets")
parser.add_argument('-s', '--splits', type=str, help="Comma separated list of split files.")
parser.add_argument('-t', '--train', action='store_true', help="Train")
parser.add_argument('-v', '--verbose', action='store_true', help="Prints out more messages")
# Kept as the last expression of the cell so the notebook still displays the
# resulting _StoreAction. The value is stored as `output_dir`.
parser.add_argument('-o', '--output-dir', type=str, default='data', help="Experiment name")



_StoreAction(option_strings=['-o', '--output-dir'], dest='output_dir', nargs=None, const=None, default='data', type=<class 'str'>, choices=None, help='Experiment name', metavar=None)

In [None]:
# Root of the project folder on the mounted Google Drive. This is a hard-coded,
# user-specific location: edit it to match your own Drive layout.
szRootDir = '/content/drive/MyDrive/0.Desktop/VSUM-Colab' #Duy
# Sub-folder that holds the VASNet datasets and split files used below.
szVASNetRootDir = szRootDir + '/VASNet' 
# Sub-folder that holds the re-extracted feature file and the training outputs.
szUITTVSUMRootDir = szRootDir + '/UIT-VSUM' 
# Switch the working directory to the VASNet folder and print it as a sanity check.
# NOTE(review): presumably needed so that VASNet's own modules (e.g. `config`,
# `layer_norm`) can be imported by later cells - confirm.
%cd $szVASNetRootDir 
!pwd

# Paths consumed by the train_wrapper cells. The literals carry no trailing
# whitespace: stray spaces inside a path only worked as long as the consumer
# happened to whitespace-split the assembled command line.

# TVSum dataset with the GoogLeNet pool5 features (the file used for the baseline run).
h5_orig_file = szVASNetRootDir + '/datasets/eccv16_dataset_tvsum_google_pool5.h5'
# Dataset file for the InceptionV3 run (judging by its name, the same dataset
# with the features replaced by averaged InceptionV3 features).
h5_new_data_file = szUITTVSUMRootDir + '/eccv16_dataset_tvsum_google_pool5-xreplace-by-inceptionv3_avg.h5'
# Output directory of the InceptionV3 run.
output_dir = szUITTVSUMRootDir + '/vasnet_inceptionv3_avg/'
# Despite the name this is the split *file* (JSON), not a directory.
split_dir = szVASNetRootDir + '/splits/tvsum_splits.json'

# Output directory of the GoogLeNet baseline run.
output_dir_orig = szUITTVSUMRootDir + '/vasnet_googlenet/'

/content/drive/.shortcut-targets-by-id/1UnHkRTf9TYt790JF-rSwz0LnpK9HeZ6E/Colab/VASNet
/content/drive/.shortcut-targets-by-id/1UnHkRTf9TYt790JF-rSwz0LnpK9HeZ6E/Colab/VASNet


In [None]:
# googlenet
import timeit
import datetime 
import sys
import h5py
import json
import random

def train_wrapper(model):
  """Train VASNet on the original GoogLeNet (1024-d) feature file.

  Emulates the command line of VASNet's main.py: the arguments are placed in
  ``sys.argv``, parsed with the notebook-level ``parser``, loaded into an
  ``HParameters`` object and handed to ``train``.

  Args:
    model: Ignored. The dataset (``h5_orig_file``), the output directory
      (``output_dir_orig``) and the feature length are fixed below, so the
      value passed by the caller has no effect on what is trained.
  """
  # Build argv as a list instead of concatenating one string and splitting it
  # on whitespace: a path that contains a space then stays a single argument.
  # strip() keeps accepting path variables with stray leading/trailing blanks,
  # which the old whitespace split silently discarded.
  sys.argv = [
      "main.py",
      '-d', h5_orig_file.strip(),
      '-o', output_dir_orig.strip(),
      '-s', split_dir.strip(),
  ]

  args = parser.parse_args()

  # Feature length of the GoogLeNet pool5 descriptors. It is hard-coded; it
  # could instead be read from the h5 file, e.g. the second dimension of
  # h5['video_1']['features'] (TODO confirm the key layout before doing so).
  f_len = 1024

  # MAIN
  #======================
  hps = HParameters()
  hps.load_from_args(args.__dict__)

  train(hps, f_len)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from config import  *
from layer_norm import  *
import torch
from torchvision import transforms
import numpy as np
import time
import glob
import random
import argparse
import h5py
import json
import torch.nn.init as init

# NOTE(review): the train_wrapper definitions in this notebook ignore their
# `model` argument, so 'vgg16' is only a label here. The features actually
# trained on are the ones hard-coded in whichever train_wrapper cell was
# executed most recently.
train_wrapper('vgg16')

Loading: /content/drive/MyDrive/0.Desktop/VSUM-Colab/VASNet/datasets/eccv16_dataset_tvsum_google_pool5.h5
Setting CUDA device:  0
Loading splits from:  /content/drive/MyDrive/0.Desktop/VSUM-Colab/VASNet/splits/tvsum_splits.json
Selecting split:  0
Initializing VASNet model and optimizer...
Starting training...
Epoch: 0/300    Train loss: 0.12937   Test F-score avg/max: 0.50908/0.50908
Epoch: 50/300   Train loss: 0.03394   Test F-score avg/max: 0.59682/0.59682
Epoch: 100/300   Train loss: 0.02727   Test F-score avg/max: 0.59466/0.60108
Epoch: 150/300   Train loss: 0.02291   Test F-score avg/max: 0.59975/0.60508
Epoch: 200/300   Train loss: 0.02051   Test F-score avg/max: 0.58917/0.60775
Epoch: 250/300   Train loss: 0.01877   Test F-score avg/max: 0.57212/0.61289
Split: /content/drive/MyDrive/0.Desktop/VSUM-Colab/VASNet/splits/tvsum_splits.json   Best F-score: 0.61289   Model: /content/drive/MyDrive/0.Desktop/VSUM-Colab/UIT-VSUM/vasnet_googlenet/models/tvsum_splits_0_0.612890321913846.ta

In [None]:
# inceptionv3
import timeit
import datetime 
import sys
import h5py
import json
import random

def train_wrapper(model):
  """Train VASNet on the InceptionV3 (2048-d) feature file.

  Emulates the command line of VASNet's main.py: the arguments are placed in
  ``sys.argv``, parsed with the notebook-level ``parser``, loaded into an
  ``HParameters`` object and handed to ``train``.

  Running this cell replaces any ``train_wrapper`` defined earlier in the
  kernel, so later calls use the InceptionV3 settings.

  Args:
    model: Ignored. The dataset (``h5_new_data_file``), the output directory
      (``output_dir``) and the feature length are fixed below, so the value
      passed by the caller has no effect on what is trained.
  """
  # Build argv as a list instead of concatenating one string and splitting it
  # on whitespace: a path that contains a space then stays a single argument.
  # strip() keeps accepting path variables with stray leading/trailing blanks,
  # which the old whitespace split silently discarded.
  sys.argv = [
      "main.py",
      '-d', h5_new_data_file.strip(),
      '-o', output_dir.strip(),
      '-s', split_dir.strip(),
  ]

  args = parser.parse_args()

  # Feature length of the InceptionV3 descriptors. It is hard-coded; it could
  # instead be read from the h5 file, e.g. the second dimension of
  # h5['video_1']['features'] (TODO confirm the key layout before doing so).
  f_len = 2048

  # MAIN
  #======================
  hps = HParameters()
  hps.load_from_args(args.__dict__)

  train(hps, f_len)


In [None]:
# List the UIT-VSUM folder on Drive (the folder that holds the InceptionV3
# feature file and the training output directories) as a quick existence check.
!ls '/content/drive/MyDrive/0.Desktop/VSUM-Colab/UIT-VSUM/'


In [None]:
# Time one full training run per model and append the result to
# <colab_path>retrain_vasnet_with_<model>.log.
# NOTE(review): `colab_path` is only assigned in the "Show results" cells
# further down; on a fresh kernel it must be defined before this cell runs -
# TODO confirm it is set earlier in the notebook.
# Full list used for the comparison:
# for i in [  'resnet50', 'inceptionv3' ,'vgg16', 'efficientnet']:
for i in ['vgg16']:

  # Opened before training (append mode) exactly as before; the `with` block
  # guarantees the handle is closed even when training raises.
  with open(colab_path + "retrain_vasnet_with_" + i + ".log", "a") as f:

    start = time.perf_counter()

    train_wrapper(i)
    end = time.perf_counter()

    # One entry per run: "<date> <time> <model> <elapsed seconds>". The entry
    # starts with a newline, so a freshly created log begins with an empty line.
    report_time = "\n" + str(datetime.datetime.now()) + " " + str(i) + " " + str(end - start)
    print(report_time)
    f.write(report_time)


## Show results

In [None]:
 !awk '{ total += $4; count++ } END { print total/count/60 }'   /content/drive/MyDrive/VSum/Colab/retrain_vasnet_with_efficientnet.log

In [None]:
colab_path = '/content/drive/MyDrive/VSum/Colab/' # AN

# Print every model's timing log followed by its mean training time in seconds.
# Done in plain Python rather than `!awk '{...}' "$training_time_log"`: the
# awk braces are not valid Python, which makes IPython give up on expanding the
# whole line, so `$training_time_log` was never substituted.
for model in ['resnet50', 'inception_v3', 'vgg16', 'efficientnet']:

  training_time_log = colab_path + "retrain_vasnet_with_" + model + ".log"

  print("\n---------------", model, "-----s--------\n")

  try:
    with open(training_time_log) as log_file:
      log_text = log_file.read()
  except OSError as err:
    # Like `cat`, report a missing or unreadable log and carry on with the next model.
    print(err)
    continue

  print(log_text)

  # Entries look like "<date> <time> <model> <elapsed seconds>". Lines with
  # fewer than four fields (e.g. the empty line at the top of each log) are
  # skipped so they do not drag the average down.
  durations = []
  for line in log_text.splitlines():
    fields = line.split()
    if len(fields) < 4:
      continue
    try:
      durations.append(float(fields[3]))
    except ValueError:
      # Not a timing entry; leave it out of the average.
      continue

  if durations:
    print(sum(durations) / len(durations))
  else:
    print("no timing entries in", training_time_log)

In [None]:
colab_path = '/content/drive/MyDrive/VSum/Colab/' # AN

for model in [  'resnet50', 'inception_v3' ,'vgg16', 'efficientnet']:

  output_folder = colab_path + 'vasnet_'+ model + '_retrain'
  training_time_log = colab_path + "retrain_vasnet_with_" + model + ".log"

  print("\n---------------", model, "-----s--------\n")
  !cat "$output_folder"*/results.txt

| model | h5filesize | features length  | training time (minute) | avg f1 all split | best split 
--- | --- | --- | --- | --- | --- 
google net (original) |119 MiB | 1024 | 18.72 | 61.43% | 64.18% (split 3) 
resnet 50 | 211 MiB | 2048 | 53.569 | 57.05% | 60.72% (split 3) 
inception_v3 | 211 MiB | 2048 | 55  |  58.83% | 62.66% (split 3) 
vgg16 | 73 MiB | 512 | 11 | 55.10% | 56.81% (split 3)
efficientnet | 257 MiB | 2506 | 79 | 59.88%  | 62.88% (split 3)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the Drive paths used by the cells above all live under
# /content/drive, so on a fresh runtime this cell has to be executed before
# them even though it appears last in the notebook.
from google.colab import drive
drive.mount('/content/drive')

```