In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

# ORG-Module

The Object Relational Graph (ORG) is a module that learns to describe an object based on its relationships with the other objects in a video. The algorithm consists of the following steps, in order:

1. Apply a pretrained object detector to capture several class-agnostic proposals.
2. The object features are captured on each keyframe.
3. The object features are then stored in $R$, with entries $r_k^i$, where $i$ is the $i$-th keyframe and $k$ is the $k$-th object.
4. Five objects are extracted from each frame.
5. The $R$ variable of a frame therefore consists of 5 independent object features.
6. Define the object set $R \in \mathbb{R}^{K \times d}$, where $K$ is the number of object nodes and $d$ is the feature dimension.
7. Define $A \in \mathbb{R}^{K \times K}$, the relation coefficient matrix between the $K$ nodes.
8. Before computing $A$, the $R$ variable is fed to a **fully connected layer** with bias, resulting in $R'$ (the code below uses two such projections, $\phi(R) = R W_i + b_i$ and $\psi(R) = R W_j + b_j$).
9. $A$ is then the product of the two projected features: $A = \phi(R)\,\psi(R)^{\top}$.
10. The product is normalised with a softmax and named $\hat{A}$: $\hat{A} = \mathrm{softmax}(A)$.
11. Apply the GCN function $\hat{R} = \hat{A} \cdot R \cdot W_r$, where $W_r$ is a learnable parameter.
12. $\hat{R}$ is the enhanced object feature set, carrying interaction messages between objects.

# Develop Side

In [None]:
# In the full pipeline the object features have shape
# (frames, objects, features); with a batch dimension they become a 4-D tensor.

feat_dims = 512
k_objects = 5

# For this walkthrough we build the object set R of a single frame:
# a (k_objects, feat_dims) matrix with one row per detected object.

r_obj_feats = torch.rand(k_objects, feat_dims)

In [None]:
# Based on the ORG paper, A is equal to:
#   φ(R) . transpose(ψ(R))
# where:
#   φ(R) = R . Wi + bi
#   ψ(R) = R . Wj + bj

in_features = feat_dims
out_features = feat_dims

# `sigma_r` plays the role of φ(R) in the comment above; `psi_r` is ψ(R).
sigma_r = nn.Linear(in_features, out_features)
psi_r = nn.Linear(in_features, out_features)
# A is a 2-D (K, K) matrix in this walkthrough, so dim=1 normalises each row.
a_softmax = nn.Softmax(dim=1)

# Wr: bias-free projection used in the GCN step R^ = A^ . R . Wr.
w_r = nn.Linear(in_features, out_features, bias=False)

In [None]:
sigma_r_out = sigma_r(r_obj_feats)
psi_r_out = psi_r(r_obj_feats)

In [None]:
a_coeff_mat = torch.matmul(sigma_r_out, torch.t(psi_r_out))

In [None]:
a_hat = a_softmax(a_coeff_mat)

In [None]:
a_hat_mul_r = torch.matmul(a_hat, r_obj_feats)

In [None]:
output = w_r(a_hat_mul_r)

In [None]:
output

# BEFORE

In [None]:
class ORG_OLD(nn.Module):
    '''
    Object Relational Graph (ORG), reference implementation built on nn.Linear.

    ORG learns to describe an object based on its relationship with the
    other objects of the same frame:

        A     = phi(R) . psi(R)^T        (relation coefficients, K x K)
        A_hat = softmax(A) over each row
        R_hat = A_hat . R . W_r

    Arguments:
        cfg : configuration object; only `cfg.object_projected_size` (the
              object feature dimension d) is read.
    '''

    def __init__(self, cfg):
        super(ORG_OLD, self).__init__()
        feat_size = cfg.object_projected_size

        # phi(R) and psi(R): linear projections with bias.
        self.sigma_r = nn.Linear(feat_size, feat_size)
        self.psi_r = nn.Linear(feat_size, feat_size)

        # Normalise over the LAST axis so that every row of the (K, K)
        # relation matrix sums to 1. With a batched input, dim=1 would
        # normalise across the row index instead of within each row.
        self.a_softmax = nn.Softmax(dim=-1)

        # W_r: bias-free learnable projection of the GCN step.
        self.w_r = nn.Linear(feat_size, feat_size, bias=False)

    def forward(self, r_obj_feats):
        '''
        Arguments:
            r_obj_feats : object features of shape (batch, frames, K, d).
                          Any leading dimensions are accepted because every
                          operation acts on the trailing (K, d) matrix.
        Returns:
            Enhanced object features with the same shape as the input.
        '''
        # nn.Linear and torch.matmul both broadcast over leading dimensions,
        # so all frames are processed at once instead of in a Python loop.
        sigma_r_out = self.sigma_r(r_obj_feats)
        psi_r_out = self.psi_r(r_obj_feats)

        a_coeff_mat = torch.matmul(sigma_r_out, psi_r_out.transpose(-1, -2))
        a_hat = self.a_softmax(a_coeff_mat)

        a_hat_mul_r = torch.matmul(a_hat, r_obj_feats)
        r_hat = self.w_r(a_hat_mul_r)

        return r_hat

In [None]:
# Build the config inline: `cfg` is only created in a later cell, so relying
# on it here fails on a fresh kernel (Restart & Run All). 512 matches the
# feature dimension of the `r_feats` tensor used to time this module.
from types import SimpleNamespace

org_old = ORG_OLD(SimpleNamespace(object_projected_size=512))

In [None]:
r_feats = torch.randn(128 * 7, 512, 28, 5)

In [None]:
import time

start = time.time()

org_old(r_feats.permute(0, 2, 3, 1))

end = time.time()
total_time = end - start
print(f"Execution time: {total_time:.4f} seconds")

# Class Side (Alpha)

In [None]:
class ConfigORGTRL:
    '''Minimal configuration holder for the ORG experiments in this notebook.'''
    def __init__(self):
        # Presumably the size of the raw detector features; it is not read
        # by the ORG class defined in this cell.
        self.object_input_size = 1024
        # Channel count used as both in_channels and out_channels of the
        # ORG convolutions.
        self.object_projected_size = 512
        # 1x1 kernel: each (frame, object) position is projected on its own.
        self.object_kernel_size = (1, 1)


class ORG(nn.Module):
    '''
    Object Relational Graph (ORG): enhances each object feature with
    information from the other objects of the same frame.

    The three projections are 1x1-style convolutions over a channels-first
    tensor, so all frames and objects are handled in a single call.

    Arguments:
        cfg : configuration object providing
              `object_projected_size` (channel count of the object features)
              and `object_kernel_size` (kernel size of the convolutions).
    '''

    def __init__(self, cfg):
        super(ORG, self).__init__()
        channels = cfg.object_projected_size
        kernel = cfg.object_kernel_size

        # phi(R) and psi(R) carry a bias; W_r does not.
        self.sigma_r = nn.Conv2d(channels, channels, kernel)
        self.psi_r = nn.Conv2d(channels, channels, kernel)
        self.w_r = nn.Conv2d(channels, channels, kernel, bias=False)

    def forward(self, r_feats):
        '''
        Arguments:
            r_feats : tensor laid out as (batch, channels, frames, objects).
        Returns:
            Tensor laid out as (batch, frames, objects, channels).
        '''
        # (batch, frames, objects, channels)
        phi = self.sigma_r(r_feats).permute(0, 2, 3, 1)
        # (batch, frames, channels, objects): already transposed for the product
        psi_t = self.psi_r(r_feats).permute(0, 2, 1, 3)

        # Relation coefficients between objects, normalised per row.
        a_hat = F.softmax(torch.matmul(phi, psi_t), dim=-1)

        projected = self.w_r(r_feats).permute(0, 2, 3, 1)
        return torch.matmul(a_hat, projected)

In [None]:
cfg = ConfigORGTRL()
org = ORG(cfg)

In [None]:
import time

start = time.time()

org(r_feats)

end = time.time()
total_time = end - start
print(f"Execution time: {total_time:.4f} seconds")

In [None]:
sigma_r = nn.Conv2d(in_channels=cfg.object_projected_size, 
                     out_channels=cfg.object_projected_size,
                     kernel_size=cfg.object_kernel_size)
        
psi_r = nn.Conv2d(in_channels=cfg.object_projected_size, 
                 out_channels=cfg.object_projected_size,
                 kernel_size=cfg.object_kernel_size)

w_r = nn.Conv2d(in_channels=cfg.object_projected_size, 
                 out_channels=cfg.object_projected_size,
                 kernel_size=cfg.object_kernel_size,
                 bias=False)

In [None]:
sigma_r(r_feats).permute(0, 2, 3, 1)

In [None]:
psi_r(r_feats).permute(0, 2, 3, 1).transpose(2, -1)[0][1]

In [None]:
psi_r(r_feats).permute(0, 2, 1, 3)[0][1]

In [None]:
a_hat = F.softmax(torch.matmul(sigma_r(r_feats).permute(0, 2, 3, 1), 
                               psi_r(r_feats).permute(0, 2, 3, 1).transpose(2, -1)), dim=-1)

In [None]:
# a_hat is 4-D (batch, frames, K, K), so torch.bmm (3-D inputs only) cannot
# be used. Move the channels of the projected features last and use a
# broadcasting matmul, mirroring ORG.forward.
r_hat = torch.matmul(a_hat, w_r(r_feats).permute(0, 2, 3, 1))

# In Practice Using Faster R-CNN Object Features (Beta)

# Attention LSTM Class (alpha)

In [None]:
class TemporalAttention(nn.Module):
    '''
    Temporal Attention Module of ORG.
    It depends on the previous hidden state of the attention LSTM.

    Arguments:
      hidden_size   : size of the attention-LSTM hidden state.
      features_size : size of each per-frame video feature (the
                      concatenation of frame features and motion features).
      attn_size     : size of the intermediate attention space.
    '''

    def __init__(self,
                 hidden_size,
                 features_size,
                 attn_size):
        super(TemporalAttention, self).__init__()

        self.hidden_size = hidden_size
        self.features_size = features_size
        self.attn_size = attn_size

        # The layers must be attributes of the module: as plain local
        # variables they would be discarded when __init__ returns and would
        # never be registered as parameters.

        # W_a . V_i
        # : W_a holds the learnable params associated with the video features
        # : V_i is the concatenation of appearance and motion features
        self.encoder_projection = nn.Linear(features_size,
                                            attn_size,
                                            bias=False)

        # U_a . h_t_attn
        # : U_a holds the learnable params associated with the LSTM hidden state
        # : h_t_attn is the hidden state of the attention LSTM
        self.decoder_projection = nn.Linear(hidden_size,
                                            attn_size,
                                            bias=False)

        # w^T . tanh(W_v + U_h): one scalar energy per frame
        self.energy_projection = nn.Linear(attn_size,
                                           1,
                                           bias=False)

    def forward(self,
                h_attn_lstm,
                v_features):
        '''
        Arguments:
          h_attn_lstm : hidden state of the attention LSTM, shape
                        (batch_size, hidden_size). A tensor that already has
                        the same rank as `v_features` is used as is and must
                        be broadcastable against it.
          v_features  : video features, shape
                        (batch_size, n_frames, features_size).
        Returns:
          The attended global context, shape (batch_size, features_size).
        '''
        Wv = self.encoder_projection(v_features)
        Uh = self.decoder_projection(h_attn_lstm)

        # A (batch, attn) hidden projection needs a frame axis before it can
        # be added to the (batch, n_frames, attn) feature projection.
        if Uh.dim() == Wv.dim() - 1:
            Uh = Uh.unsqueeze(1)

        Ew = self.energy_projection(torch.tanh(Wv + Uh))
        # Normalise the energies over the frame axis.
        alpha = F.softmax(Ew, dim=1)

        weighted_feats = alpha * v_features
        context_global = weighted_feats.sum(dim=1)

        return context_global

In [None]:
# 1324 = 512 (mean video feature) + 300 (previous word embedding)
#        + 512 (previous language-LSTM hidden state), i.e. the width of the
# tensor concatenated a few cells below.
input_size = 1324
hidden_size = 512
features_size = 512
attn_size = 512
num_layers= 1 
dropout= 0.5

# NOTE(review): PyTorch applies LSTM dropout only between stacked layers, so
# with num_layers=1 this value presumably has no effect (and emits a
# warning) — confirm whether dropout is wanted here.
lstm_attn = nn.LSTM(input_size, 
                    hidden_size, 
                    num_layers, 
                    batch_first=True, 
                    dropout=dropout)

temporal_attn = TemporalAttention(hidden_size,
                                  features_size,
                                  attn_size)

In [None]:
feature_vector = torch.randn(1, 28, 512)
motion_vector = torch.randn(1, 28, 512)

video_features = torch.cat((feature_vector, motion_vector), dim=1)
video_features.shape

In [None]:
v_bar = torch.mean(video_features, dim=1, keepdim=True)

v_bar.shape

In [None]:
prev_word_emb = torch.randn(1, 1, 300)


prev_cell_lang_lstm = torch.rand(1, 1, 512)
prev_hidden_lang_lstm = torch.rand(1, 1, 512)

In [None]:
# Concatenate along the feature axis: 512 (v_bar) + 300 (word embedding)
# + 512 (language-LSTM hidden state) = 1324, the input_size of lstm_attn.
# The tensors created above are named prev_word_emb / prev_hidden_lang_lstm.
input_combined = torch.cat((v_bar, prev_word_emb, prev_hidden_lang_lstm), dim=-1)
prev_h_attn = (prev_hidden_lang_lstm, prev_cell_lang_lstm)

input_combined.shape

In [None]:
with torch.no_grad():
    # NOTE(review): `prev_h_attn` is built in the previous cell but is not
    # passed here, so the LSTM runs from its default initial state — confirm
    # whether `lstm_attn(input_combined, prev_h_attn)` was intended.
    outputs, hidden_attn_lstm = lstm_attn(input_combined)
    # hidden_attn_lstm is the (h_n, c_n) pair; index 0 selects the hidden state.
    context_global = temporal_attn(hidden_attn_lstm[0],
                                   video_features)

In [None]:
last_hidden_lang = hidden_attn_lstm[0]

In [None]:
n_layers = 1

last_hidden_lang = last_hidden_lang.view(n_layers, last_hidden_lang.size(1), last_hidden_lang.size(2))
# last_hidden_lang = last_hidden_lang[-1]

In [None]:
context_global.shape

In [None]:
input_features = 512
output_features = 512

# Encoder-side projection W_a . V_i
# : W_a holds the learnable params associated with the video features
# : V_i is the concatenation of appearance features
#   and motion features
encoder_projection = nn.Linear(input_features, 
                               output_features, 
                               bias=False)

# Decoder-side projection U_a . h_t_attn
# : U_a holds the learnable params associated with the LSTM attention hidden state
# : h_t_attn is the hidden state of the attention LSTM
decoder_projection = nn.Linear(input_features, 
                               output_features, 
                               bias=False)

# Energy projection w_T . tanh(W_v + U_h)
# : W_v is the result of the matrix multiplication between 
#   the video features and weight W
# : U_h is the result of the matrix multiplication between
#   the LSTM attention hidden state and weight U
# : tanh(.) is the tanh activation function
# : w_T is a vector of learnable params that maps the tanh output
#   to a single scalar energy
energy_projection = nn.Linear(input_features, 
                              1, 
                              bias=False)

# Normalises the energies over dim=1, which is the frame axis of the
# (batch, n_frames, features) video tensor used in this section.
softmax_activation = nn.Softmax(dim=1)

In [None]:
Wv = encoder_projection(video_features)
# `h_attn` is never defined. Use the attention-LSTM hidden state kept in
# `last_hidden_lang` (n_layers, batch, hidden): take the last layer and add
# a frame axis so it broadcasts against Wv (batch, n_frames, attn).
Uh = decoder_projection(last_hidden_lang[-1].unsqueeze(1))
Ew = energy_projection(torch.tanh(Wv + Uh))
alpha = softmax_activation(Ew)
weighted_feats = alpha * video_features
context_global = weighted_feats.sum(dim=1)

In [None]:
context_global.shape

In [None]:
input_size = 512 
hidden_size = 512
num_layers= 1 
dropout= 0.5

# NOTE(review): `AttentionLSTM` is not defined or imported in any visible
# cell, so this presumably raises NameError on a fresh kernel — confirm where
# it comes from. The call also passes literals (dropout 0.2) rather than the
# variables assigned above (dropout 0.5).
attlstm = AttentionLSTM(512,
                        512,
                        1,
                        0.2)

In [None]:
output, hidden = attlstm(prev_word, prev_hidden)

# Temporal Attention (alpha)

In [None]:
class TemporalAttention(nn.Module):
    '''
    Temporal attention over a sequence of features, conditioned on the
    previous decoder hidden state.

    The attention weights follow the "concat" (additive) alignment score:

        a_t(s)       = exp(score(h_t, h_s)) / sum_s' exp(score(h_t, h_s'))
        score(h_t, h_s) = v_a^T . tanh(W_a . h_t + U_a . h_s)

    Arguments:
      decoder_hidden_size : size of the decoder hidden state.
      feat_size           : size of each feature vector on the encoder side.
      attn_size           : size of the intermediate (bottleneck) space.
    '''

    def __init__(self,
                 decoder_hidden_size,
                 feat_size,
                 attn_size,):
        super(TemporalAttention, self).__init__()

        self.hidden_size = decoder_hidden_size
        self.feat_size = feat_size
        self.bottleneck_size = attn_size

        def _bias_free(n_in, n_out):
            # All three projections are plain matrix products without bias.
            return nn.Linear(n_in, n_out, bias=False)

        self.decoder_projection = _bias_free(self.hidden_size, self.bottleneck_size)
        self.encoder_projection = _bias_free(self.feat_size, self.bottleneck_size)
        self.final_projection = _bias_free(self.bottleneck_size, 1)

    def forward(self, hidden, feats):
        '''
        Arguments:
          hidden : decoder hidden state, shape (batch, hidden_size).
          feats  : features to attend over, shape (batch, steps, feat_size).
        Returns:
          (attn_feats, weights): the weighted sum of `feats` over the step
          axis, shape (batch, feat_size), and the attention weights, shape
          (batch, steps, 1).
        '''
        projected_feats = self.encoder_projection(feats)
        # Repeat the hidden projection along the step axis.
        projected_hidden = (self.decoder_projection(hidden)
                            .unsqueeze(1)
                            .expand_as(projected_feats))

        scores = self.final_projection(torch.tanh(projected_hidden + projected_feats))

        # Normalise over the step axis, then pool the features.
        weights = F.softmax(scores, dim=1)
        attn_feats = torch.sum(feats * weights.expand_as(feats), dim=1)

        return attn_feats, weights

In [None]:
hidden_size=512
bottleneck_size=512
feat_size=512

decoder_projection = nn.Linear(hidden_size,
                               bottleneck_size,
                               bias=False)

encoder_projection = nn.Linear(feat_size, 
                               bottleneck_size, 
                               bias=False)

final_projection = nn.Linear(bottleneck_size, 
                              1,
                              bias=False)

In [None]:
hidden = torch.randn((2, 512))
feats = torch.randn((2, 10, 512))

In [None]:
Wh = decoder_projection(hidden)
Uv = encoder_projection(feats)

In [None]:
result = Wh.unsqueeze(1).expand_as(Uv)

In [None]:
# Score each step the same way TemporalAttention.forward does: project
# tanh(Wh + Uv), not the broadcast hidden projection on its own.
alpha = final_projection(torch.tanh(result + Uv))

In [None]:
# Normalise the energies over the step axis (dim=1) so the attention weights
# sum to 1 before pooling; raw energies are not valid weights.
weights = F.softmax(alpha, dim=1)
weighted_feats = feats * weights.expand_as(feats)
attn_feats = weighted_feats.sum(dim=1)

In [None]:
attn_feats.shape

# Object Alignment Unit

In [None]:
import time

In [None]:
# Assume that the object features for all videos have been loaded as a PyTorch tensor,
# where the tensor has shape (batch_size, max_num_frames, max_num_objects, object_feature_dim).
videos = torch.randn(2, 3, 5, 512)
batch_size, max_num_frames, max_num_objects, object_feature_dim = videos.size()
# videos = torch.empty((2, 4, 5, 512)).uniform_(0, 1)

In [None]:
# videos = videos.to(device) # move tensor to GPU device
# %%time
start_time = time.time()

# One aligned (batch, 1, objects, features) tensor per non-anchor frame.
aligned_objects = []

anchor_frame = videos[:, 0].clone().detach()
# Every frame after the anchor. The slice end must be the size of the FRAME
# axis (dim 1); using the feature axis only works while it happens to be
# larger than the number of frames.
next_frame = videos[:, 1:videos.size(1)].clone().detach()

for i in range(1, videos.size(1)):
    # Original note: this step uses the R enhanced features.
    i_th_frame = videos[:, i].clone().detach()

    # Cosine similarity between each object in the anchor frame (rows) and
    # each object in the i-th frame (columns).
    similarity_scores_i = torch.bmm(anchor_frame, i_th_frame.transpose(1, 2)) / \
                           (torch.norm(anchor_frame, dim=2)[:, :, None] * torch.norm(i_th_frame, dim=2)[:, None, :])

    # For every anchor object, the index of its most similar object in frame i.
    max_similarities, max_similarity_indices = torch.max(similarity_scores_i, dim=2)

    # Reorder the objects of frame i so that position k holds the best match
    # of anchor object k.
    aligned_objects_i = torch.gather(i_th_frame,
                                     dim=1,
                                     index=max_similarity_indices[:, :, None].expand(-1, -1, i_th_frame.size(-1)))

    aligned_objects.append(aligned_objects_i.unsqueeze(1))

aligned_frames = torch.cat(aligned_objects, dim=1)
all_aligned_frames = torch.cat([anchor_frame.unsqueeze(1), aligned_frames], dim=1)

# weighted_frames = torch.mul(all_aligned_frames, alpha.unsqueeze(-1))
# sum_weighted_frames = torch.sum(weighted_frames, dim=1)

end_time = time.time()
total_time = end_time - start_time
print(f"Execution time: {total_time:.4f} seconds")

## Optimized Function

In [None]:
# videos = videos.to(device) # move tensor to GPU device
# %%time
start_time = time.time()

anchor_frame = videos[:, 0].clone().detach()
# Slice up to the size of the frame axis (dim 1), not the feature axis.
next_frame = videos[:, 1:videos.size(1)].clone().detach()

# Cosine similarity of every anchor object against every object of every
# later frame, in one batched product:
# (batch, 1, K, d) @ (batch, frames-1, d, K) -> (batch, frames-1, K, K)
similarity_score = (torch.matmul(anchor_frame.unsqueeze(1), next_frame.transpose(2, -1)) / \
                    (torch.norm(anchor_frame.unsqueeze(1), dim=-1)[:, :, :, None] * \
                     torch.norm(next_frame, dim=-1)[:, :, None, :]))

# Index of the best-matching object per anchor object, (batch, frames-1, K, 1),
# repeated along the feature axis so it can drive torch.gather.
best_match = similarity_score.topk(1, -1)[1]
aligned_frames = torch.gather(next_frame,
                              dim=2,
                              index=best_match.expand(-1, -1, -1, next_frame.size(-1)))

all_aligned_frames = torch.cat([anchor_frame.unsqueeze(1), aligned_frames], dim=1)


end_time = time.time()
total_time = end_time - start_time
print(f"Execution time: {total_time:.4f} seconds")

In [5]:
def align_object_variable(object_variable, r_hat):
    '''
    Align the objects of every frame to the objects of the first (anchor)
    frame, following the ORG-TRL paper:
    https://openaccess.thecvf.com/content_CVPR_2020/papers/Zhang_Object_Relational_Graph_With_Teacher-Recommended_Learning_for_Video_Captioning_CVPR_2020_paper.pdf

    The matching is computed on `object_variable` (cosine similarity between
    each anchor object and each object of a later frame); the resulting
    ordering is then applied to `r_hat`.

    args:
      object_variable : object features used for matching,
                        shape (batch, frames, objects, feat_dim)
      r_hat           : features to reorder,
                        shape (batch, frames, objects, r_dim)
    output:
      aligned r_hat of shape (batch, frames, objects, r_dim); frame 0 is
      returned unchanged and every later frame has its objects reordered so
      that position k holds the best match of anchor object k.
    '''
    # Use the first frame as the reference for every object and separate it
    # from the remaining frames.
    anchor_frame = object_variable[:, 0].detach()
    next_frame = object_variable[:, 1:].detach()

    # Cosine similarity scores:
    # matmul(anchor_frame, next_frame^T) / (|anchor_frame| * |next_frame|)
    # -> (batch, frames-1, objects, objects)
    similarity_score = (torch.matmul(anchor_frame.unsqueeze(1), next_frame.transpose(2, -1)) / \
                        (torch.norm(anchor_frame.unsqueeze(1), dim=-1)[:, :, :, None] * \
                         torch.norm(next_frame, dim=-1)[:, :, None, :]))

    # Index of the most similar object for each anchor object.
    best_match = similarity_score.topk(1, -1)[1]

    # Take ALL frames after the anchor. The slice must follow the frame axis;
    # bounding it by the feature size truncates the frames whenever the
    # feature dimension is smaller than the number of frames.
    # NOTE(review): the reordered frames are detached while frame 0 is not —
    # confirm that blocking gradients here is intended.
    aligned_frames = torch.gather(r_hat[:, 1:].detach(),
                                  dim=2,
                                  index=best_match.expand(-1, -1, -1, r_hat.size(-1)))

    return torch.cat([r_hat[:, 0].unsqueeze(1), aligned_frames], dim=1)

In [6]:
videos = torch.randn(128, 28, 5, 1024)
r_hat_input = torch.randn(128, 28, 5, 512)

In [7]:
r_hat = align_object_variable(videos.detach(), r_hat_input.detach())

In [9]:
# Frame 0 is the anchor; every later frame is compared against it.
anchor_frame = videos[:, 0].detach()
next_frame = videos[:, 1:videos.size(1)].detach()

## Compute the cosine similarity scores:
## matmul( anchor_frame, next_frame ) / ( | anchor_frame | * | next_frame | )
similarity_score = (torch.matmul(anchor_frame.unsqueeze(1), next_frame.transpose(2, -1)) / \
                    (torch.norm(anchor_frame.unsqueeze(1), dim=-1)[:, :, :, None] * \
                     torch.norm(next_frame, dim=-1)[:, :, None, :]))

## Method Fix Aligned Object

In [None]:
start_time = time.time()

anchor_frame = videos[:, 0].clone().detach()
# Slice up to the size of the frame axis (dim 1), not the feature axis.
i_th_frame = videos[:, 1:videos.size(1)].clone().detach()

# Flatten (frames-1, objects) into one axis so a single broadcasted call
# compares every anchor object with every object of every later frame.
# Sizes are read from the tensors themselves: `videos` is reassigned several
# times in this notebook, so previously unpacked sizes may be stale.
flat_frames = i_th_frame.view(i_th_frame.size(0), -1, i_th_frame.size(-1))

# (batch, objects, frames-1, objects_in_frame)
similarity_scores_ts = F.cosine_similarity(anchor_frame.unsqueeze(2),
                                           flat_frames.unsqueeze(1),
                                           dim=-1).view(anchor_frame.size(0),
                                                        anchor_frame.size(1),
                                                        i_th_frame.size(1),
                                                        i_th_frame.size(2))

# Best match per (anchor object, frame), rearranged to
# (batch, frames-1, objects, 1) so it can index the object axis of i_th_frame.
aligned_indices_ts = similarity_scores_ts.topk(1, -1)[1].squeeze(-1).transpose(1, -1).unsqueeze(-1)

aligned_frames_ts = torch.gather(i_th_frame,
                                 dim=2,
                                 index=aligned_indices_ts.expand(-1, -1, -1, anchor_frame.size(-1)))

all_aligned_frames = torch.cat([anchor_frame.unsqueeze(1), aligned_frames_ts], dim=1)

end_time = time.time()
total_time = end_time - start_time
print(f"Execution time: {total_time:.4f} seconds")

In [None]:
aligned_indices_ts.shape

## Method 3

In [None]:
a = torch.tensor([[[[0.1, 0.2, 0.3], [0.15, 0.25, 0.35]]],
                  
                  [[[0.71, 0.72, 0.73], [0.85, 0.88, 0.89]]]])

b = torch.tensor([[[[0.15, 0.25, 0.35], 
                    [0.5, 0.55, 0.58], 
                    [0.8, 0.85, 0.9],
                    [0.87, 0.85, 0.97]],
                   
                   [[0.15, 0.25, 0.35], 
                    [0.5, 0.55, 0.58], 
                    [0.8, 0.85, 0.9],
                    [0.87, 0.85, 0.97]],
                  
                  [[0.15, 0.25, 0.35], 
                    [0.5, 0.55, 0.58], 
                    [0.8, 0.85, 0.9],
                    [0.87, 0.85, 0.97]]],
                  
                  [[[0.3, 0.35, 0.3], 
                    [0.5, 0.55, 0.52], 
                    [0.95, 0.95, 0.91],
                    [0.87, 0.85, 0.97]],
                   
                   [[0.3, 0.35, 0.3], 
                    [0.5, 0.55, 0.52], 
                    [0.95, 0.95, 0.91],
                    [0.87, 0.85, 0.97]],
                   
                   [[0.15, 0.25, 0.35], 
                    [0.5, 0.55, 0.58], 
                    [0.8, 0.85, 0.9],
                    [0.87, 0.85, 0.97]]],
                                   
                 ])

# F.cosine_similarity(a, b.view(2, 2, 3), dim=3)

In [None]:
# Keep the result: later cells inspect `res`, which was otherwise never assigned.
# a -> (2, 2, 1, 3), b -> (2, 1, 12, 3); the broadcasted similarity is (2, 2, 12).
res = F.cosine_similarity(a.squeeze(1).unsqueeze(2), b.view(2,-1, 3).unsqueeze(1), dim=-1)
res

In [None]:
res.shape

In [None]:
a.squeeze(1).unsqueeze(2).shape

In [None]:
b.view(2,-1, 3).unsqueeze(1).shape

In [None]:
# hasil fungsi ini memiliki arti
# batch_size, num_objects, num_frames, num_objects_i_th_frame
res.view(2, 2, 3, 4)

In [None]:
videos = torch.empty((2, 4, 5, 512)).uniform_(0, 1)
anchor_frame = videos[:, 0].clone().detach()
# Slice up to the size of the frame axis (dim 1), not the feature axis.
i_th_frame = videos[:, 1:videos.size(1)].clone().detach()

# Flatten (frames-1, objects) so every anchor object is compared with every
# object of every later frame: the result is (batch, objects, (frames-1) * objects).
similarity_scores = F.cosine_similarity(anchor_frame.unsqueeze(2),
                                        i_th_frame.view(videos.size(0), -1, videos.size(-1)).unsqueeze(1),
                                        dim=-1)

In [None]:
# (batch_size, num_objects, num_frames, num_objects_i_th_frame)
similarity_scores.view(2, 5, 3, 5)

In [None]:
anchor_frame.unsqueeze(2).shape

In [None]:
i_th_frame.view(2, -1, 512).shape

In [None]:
i_th_frame.shape

In [None]:
b.shape

In [None]:
a.squeeze(1).unsqueeze(2).shape

In [None]:
b.shape

In [None]:
b.view(2, -1, 3).shape

In [None]:
i_th_frame.shape

## Method 2

In [None]:
import torch

videos = torch.randn(2, 10, 5, 512)
# Read the sizes from the tensor instead of repeating them by hand.
batch, frames, objects, objects_feat = videos.size()

# Define the anchor frame and the remaining frames
# (slice along the frame axis, dim 1).
anchor_frame = videos[:, 0]
i_th_frame = videos[:, 1:videos.size(1)]

# Give the anchor a frame axis of size 1 so it broadcasts over the frames.
anchor_frame_reshape = anchor_frame.view(batch, 1, objects, objects_feat)
i_th_frame_reshape = i_th_frame.view(batch, frames-1, objects, objects_feat)

# Cosine similarity between objects at the SAME index in the anchor and in
# each later frame (this is not an all-pairs comparison).
cos_similarity = torch.nn.functional.cosine_similarity(anchor_frame_reshape, i_th_frame_reshape, dim=-1)

# Print cosine similarity tensor shape
print(cos_similarity.shape) # Output: torch.Size([2, 9, 5])

## Method 1

In [None]:
def cosine_similarity(anchor_frame, i_th_frame, eps=1e-8):
    '''
    Pairwise cosine similarity between the objects of two frames.

    args:
      anchor_frame : tensor of shape (batch, objects_a, feat_dim)
      i_th_frame   : tensor of shape (batch, objects_b, feat_dim)
      eps          : lower bound on the product of the two norms; it keeps
                     all-zero feature vectors from producing 0/0 = NaN
    output:
      tensor of shape (batch, objects_a, objects_b); entry [b, i, j] is the
      similarity between anchor object i and object j of the other frame.
    '''
    dot_products = torch.bmm(anchor_frame, i_th_frame.transpose(1, 2))
    norm_products = torch.norm(anchor_frame, dim=2)[:, :, None] * torch.norm(i_th_frame, dim=2)[:, None, :]
    return dot_products / norm_products.clamp_min(eps)

In [None]:
anchor_frame = videos[:, 0]
# Every frame after the anchor. Slice to the end of the current `videos`
# tensor rather than to a frame count unpacked from an earlier tensor.
i_th_frame = videos[:, 1:]

# i_th_frame already excludes the anchor, so iterate from its first frame
# (index 0); starting at 1 would skip the frame right after the anchor.
similarity_indices_comp = [torch.max(cosine_similarity(anchor_frame, i_th_frame[:, idx]), dim=2)[1]
                           for idx in range(i_th_frame.size(1))]

In [None]:
# The list built above is named `similarity_indices_comp`.
similarity_indices_comp[0]

In [None]:
i_th_frame[:, 0].shape

In [None]:
# torch.gather needs an index with the same number of dimensions as the
# input, so repeat the per-object match index along the feature axis.
torch.gather(i_th_frame[:, 0],
             1,
             similarity_indices_comp[0][:, :, None].expand(-1, -1, i_th_frame.size(-1)))

In [None]:
decoder_hidden_size = 512
bottleneck_size = 512
feat_size = 1836
n_frames = 10
batch_size = 2

hidden_attn = torch.randn(batch_size, decoder_hidden_size)
v_feats = torch.randn(batch_size, n_frames, feat_size)

temporal_attn = TemporalAttention(decoder_hidden_size, 
                                  feat_size, 
                                  bottleneck_size)

att_feats, alpha = temporal_attn(hidden_attn, v_feats)

In [None]:
encoder_projection = nn.Linear(512, 
                               512, 
                               bias=False)

decoder_projection = nn.Linear(512, 
                               512, 
                               bias=False)

energy_projection = nn.Linear(512, 
                              1, 
                              bias=False)

h_attn_lstm = torch.randn(2, 512)

In [None]:
# NOTE(review): `sum_weighted_frames` is not assigned in any visible cell — it
# only appears in commented-out code in the object-alignment section — so this
# cell presumably fails with NameError on a fresh kernel. Confirm its source.
Wv = encoder_projection(sum_weighted_frames)
Uh = decoder_projection(h_attn_lstm)
# Insert an axis at dim 1 and repeat the hidden projection to the shape of Wv.
Uh = Uh.unsqueeze(1).expand_as(Wv)

Ew = energy_projection(torch.tanh(Wv + Uh))
# Attention weights, normalised over dim=1.
beta = F.softmax(Ew, dim=1)

# Weighted sum over dim=1 gives one context vector per batch element.
weighted_objs = torch.mul(sum_weighted_frames, beta)
local_context_feature = torch.sum(weighted_objs, dim=1)

In [None]:
local_context_feature.shape

In [None]:
res.shape

In [None]:
encoder_projection.weight.shape

# Spatial Attention (alpha)

In [None]:
class SpatialAttention(nn.Module):
    '''
    Spatial Attention module.
    It attends over object features, conditioned on the previous hidden
    state of the attention LSTM in the decoder.

    Arguments:
      decoder_hidden_size : size of the decoder hidden state.
      feat_size           : size of each object feature.
      attn_size           : size of the intermediate (bottleneck) space.
    '''

    def __init__(self,
                 decoder_hidden_size,
                 feat_size,
                 attn_size,):
        super(SpatialAttention, self).__init__()

        self.hidden_size = decoder_hidden_size
        self.feat_size = feat_size
        self.bottleneck_size = attn_size

        self.decoder_projection = nn.Linear(self.hidden_size,
                                            self.bottleneck_size,
                                            bias=False)
        self.encoder_projection = nn.Linear(self.feat_size,
                                            self.bottleneck_size,
                                            bias=False)
        self.energy_projection = nn.Linear(self.bottleneck_size,
                                           1,
                                           bias=False)

    def forward(self, h_attn_lstm, obj_feats):
        '''
        Arguments:
          h_attn_lstm : hidden state, shape (batch, hidden_size).
          obj_feats   : object features, shape (batch, n_objects, feat_size).
        Returns:
          (context_feature, beta): the attention-weighted sum of `obj_feats`
          over the object axis, shape (batch, feat_size), and the attention
          weights, shape (batch, n_objects, 1).
        '''
        Wv = self.encoder_projection(obj_feats)
        Uh = self.decoder_projection(h_attn_lstm)
        # Repeat the hidden projection along the object axis.
        Uh = Uh.unsqueeze(1).expand_as(Wv)

        Ew = self.energy_projection(torch.tanh(Wv + Uh))
        # The weights are named `beta` throughout; binding them to another
        # name left `beta` undefined in the lines below.
        beta = F.softmax(Ew, dim=1)

        weighted_objs = torch.mul(obj_feats, beta)
        global_context_feature = torch.sum(weighted_objs, dim=1)

        return global_context_feature, beta

In [None]:
obj_feat_size=512

spatial_attention = SpatialAttention(decoder_hidden_size,
                                     obj_feat_size,
                                     bottleneck_size)

global_context_feature, beta = spatial_attention(h_attn_lstm, sum_weighted_frames)

In [None]:
global_context_feature.shape

In [None]:
beta.shape

# Demo

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

from models.ORG_TRL.model import ORG_TRL
from models.ORG_TRL.model import Encoder
from models.ORG_TRL.model import DecoderRNN
from models.ORG_TRL.model import TemporalAttention
from models.ORG_TRL.model import SpatialAttention
from config import ConfigORGTRL
from config import Path
from dictionary import Vocabulary
from config import Path
from data import DataHandler

In [2]:
cfg = ConfigORGTRL(opt_encoder=True)
# specifying the dataset in configuration object from {'msvd','msrvtt'}
cfg.dataset = 'msrvtt'

voc = Vocabulary(cfg, gloVe=True)
path = Path(cfg, os.getcwd())
voc.load()

model = ORG_TRL(voc, cfg, path)

# data_handler = DataHandler(cfg, path, voc)
# train_dset, val_dset, test_dset = data_handler.getDatasets()
# train_loader, val_loader, test_loader = data_handler.getDataloader(train_dset, val_dset, test_dset)

# for data in train_loader:
#     appearance_features, targets, mask, max_length, _, motion_features, _ = data
#     print(appearance_features.shape)
#     print(targets.shape)
#     print(mask.shape)
#     print(max_length)
#     print(motion_features.shape)
#     break



In [3]:
appearance_feat = torch.randn(128, 28, 1536)
motion_feat = torch.randn(128, 28, 2048)
obj_feat = torch.randn(128, 28, 5, 1024)

In [4]:
import time

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_time = time.perf_counter()

# Encode the three feature streams, then align the object features.
v_feats, r_feats, r_hat = model.encoder(appearance_feat, motion_feat, obj_feat)
aligned_objects = model.align_object_variable(r_feats, r_hat)

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"Execution time: {total_time:.4f} seconds")

Execution time: 1.0831 seconds


In [5]:
r_hat.shape

torch.Size([128, 28, 5, 512])

In [6]:
r_feats.shape

torch.Size([128, 28, 5, 512])

In [43]:
Uh = torch.randn(128, 512)

In [33]:
# nn.LSTM has no default for input_size / hidden_size, so calling it with no
# arguments raises TypeError. The sizes mirror the attention LSTM configured
# further down in this notebook (1836 input features, 512 hidden units).
lstm = nn.LSTM(input_size=1836, hidden_size=512)

In [None]:
n_layers = 1
batch_size = 32
decoder_hidden_size = 512

# Zero initial state, shaped (n_layers, batch_size, decoder_hidden_size).
decoder_hidden = torch.zeros(n_layers,
                             batch_size,
                             decoder_hidden_size)

# The same zero tensor serves as both elements of the state tuple for the
# attention LSTM and for the language LSTM.
decoder_hidden_attn = (decoder_hidden, decoder_hidden)
decoder_hidden_lang = (decoder_hidden, decoder_hidden)

# One SOS token per sequence, shape (1, batch_size). It is sized from the local
# batch_size (not cfg.batch_size) so it always matches the tensors built below.
decoder_input = torch.LongTensor([[cfg.SOS_token] * batch_size])
appearance_features = torch.randn((batch_size, 28, 512))
motion_features = torch.randn((batch_size, 28, 512))

# Concatenate both streams along the feature axis: 512 + 512 = 1024.
v_features = torch.cat((appearance_features, motion_features), dim=-1)

# Mean over axis 1, then add a leading axis: (1, batch_size, 1024).
v_bar_features = torch.mean(v_features, dim=1).unsqueeze(0)

# Random stand-in for a 300-wide word embedding, shape (1, batch_size, 300).
embedded = torch.randn((1, batch_size, 300))

# Attention-LSTM input: 1024 + 300 + 512 = 1836 features per sequence.
input_attn_lstm = torch.cat((v_bar_features, embedded, decoder_hidden_lang[0]), dim=-1)

In [None]:
# Call the decoder with gradient tracking disabled; it returns three values.
# NOTE(review): `decoder` is not defined in the visible cells of this section —
# presumably it should be the decoder attached to `model`; confirm.
# NOTE(review): v_feats / aligned_objects were computed from 128-sample inputs
# while the hidden states above use batch_size = 32 — verify the batch sizes agree.
with torch.no_grad():
    output, h_lang_lstm, h_attn_lstm = decoder(decoder_input,
                                               decoder_hidden_attn,
                                               decoder_hidden_lang,
                                               v_feats, 
                                               aligned_objects)

In [None]:
result = model.BeamDecoding(appearance_feat, 
                            motion_feat, 
                            obj_feat, 
                            5)

In [None]:
decoder_hidden_size = 512
bottleneck_size = 512
feat_size = 1836

temporal_attention = TemporalAttention(cfg)

last_hidden_attn = torch.randn(32, 512)

In [None]:
context_global_vector, alpha = temporal_attention(last_hidden_attn, v_feats)

In [None]:
context_global_vector.shape

In [None]:
# Call the decoder with gradient tracking disabled, this time passing only the
# frame-level features (four arguments).
# NOTE(review): the earlier decoder call in this notebook passes a fifth
# argument (aligned_objects) — confirm which signature `decoder` actually has.
with torch.no_grad():
    output, h_lang_lstm, h_attn_lstm = decoder(decoder_input,
                                               decoder_hidden_attn,
                                               decoder_hidden_lang,
                                               v_features
                                               )

In [None]:
output.shape

In [None]:
h_attn_lstm[0].shape

In [None]:
output = [1749, 1649, 1549, 1449]
target = [628, 234, 76, 18]

In [None]:
# Translate predicted and ground-truth token ids into words. Lookups go through
# `.get`, so an id that is absent from the mapping does not raise.
caption = [voc.index2word.get(token_id) for token_id in output]
gt = [voc.index2word.get(token_id) for token_id in target]

print(caption)
print(gt)

In [None]:
v_features = torch.randn((32, 28, 512))
Uh = torch.rand((1, 32, 512))

In [None]:
res = Uh[0].unsqueeze(1).expand_as(v_features)
res.shape

In [None]:
v_bar_features.shape

In [None]:
# Input width 1836 = 1024 (mean video feature) + 300 (embedding) + 512 (hidden).
# nn.LSTM only applies dropout between stacked layers, so a non-zero value with
# num_layers=1 does nothing and triggers a UserWarning; it is left at the default.
attention_lstm = nn.LSTM(input_size=1836,
                         hidden_size=512,
                         num_layers=1,
                         batch_first=False)

In [None]:
output, h = attention_lstm(input_attn_lstm,
                           decoder_hidden_attn)

h[0].shape

In [None]:
def forward(self,
            inputs, 
            attn_hidden,
            lang_hidden, 
            v_features):
    """Signature sketch for the decoder's forward pass.

    The original cell held only the header, which is a syntax error. The body
    is intentionally unimplemented so the cell can be executed without
    pretending to compute anything.

    Raises:
      NotImplementedError : always.
    """
    raise NotImplementedError("forward is a signature sketch only")

In [None]:
inputs = torch.cat((v_bar_features, embedded), dim=-1)

In [None]:
t_targets = targets.T

In [None]:
t_targets.shape

In [None]:
# Show the first 11 (index, word) entries of the vocabulary mapping.
for e, (key, value) in zip(range(11), voc.index2word.items()):
    print(e, key, value)

In [None]:
targets[0]

# Encoder Object Projection

Execution time: 2.13 s for ~1000 samples.

In [None]:
# Dimensions used by the object-projection experiments below.
batch_size = 128
seq_len = 5        # not referenced in the visible cells of this section
in_channels = 1024   # input channels of the projection convolution
out_channels = 512   # output channels of the projection convolution
num_frames = 28
num_boxes = 5
num_feats = 512    # not referenced in the visible cells of this section
kernel_size = 1    # not referenced in the visible cells of this section
feature_dim = 1024

# Random object features laid out as (batch, frames, boxes, features).
obj_feats = torch.randn(batch_size, num_frames, num_boxes, feature_dim)

In [None]:
class ConfigORGTRL:
    """Minimal configuration holding only the object-projection settings.

    NOTE: this definition rebinds the name `ConfigORGTRL` that the Demo section
    imports from `config`; after this cell runs, `ConfigORGTRL(opt_encoder=True)`
    no longer works because this constructor takes no arguments.
    """

    def __init__(self):
        # Channel sizes of the projection: 1024 in, 512 out.
        self.object_input_size, self.object_projected_size = 1024, 512
        # Kernel size handed to the projection convolution.
        self.object_kernel_size = (1, 1)

In [None]:
class Encoder(nn.Module):
    """Project per-object detector features into a different feature space.

    Only the object features are transformed; appearance and motion features
    are passed through unchanged so the decoder receives all three.

    Arguments:
      cfg : configuration object providing
        object_input_size     : channel size of the incoming object features
                                (1024-D Faster RCNN features in this notebook).
        object_projected_size : dimension of the projected space.
        object_kernel_size    : kernel size of the projection convolution.
    """

    def __init__(self, cfg):
        super().__init__()
        self.object_projection = nn.Conv2d(cfg.object_input_size,
                                           cfg.object_projected_size,
                                           cfg.object_kernel_size)

    def forward(self, appearance_feat, motion_feat, object_feat):
        """Project the object features and return all three feature streams.

        Arguments:
          appearance_feat : returned as-is.
          motion_feat     : returned as-is.
          object_feat     : 4-D tensor with the feature axis last,
                            (batch, frames, objects, features).
        Returns:
          (appearance_feat, motion_feat, projected_object_feat) where the
          projected tensor stays channels-first:
          (batch, projected_features, frames, objects).
        """
        # Conv2d expects channels on axis 1, so the feature axis is moved there
        # with permute (a true axis swap, unlike view/reshape).
        channels_first = object_feat.permute(0, 3, 1, 2)
        object_feat = F.relu(self.object_projection(channels_first))

        return appearance_feat, motion_feat, object_feat

In [None]:
## Memeriksa bentuk tensor
batch_size, frame_len, num_objects, dim_feature = obj_feats[:2, :3, :2, :5].size()
obj_feats[:2, :3, :2, :5].reshape(batch_size * num_objects, dim_feature, frame_len)

In [None]:
## inisialisasi nn.Conv1d
object_projection = nn.Conv2d(in_channels=in_channels, 
                              out_channels=out_channels,
                              kernel_size=(1, 1))

In [None]:
batch_size, frame_len, num_objects, dim_feature = obj_feats.size()
# Conv2d needs the feature axis on dim 1. permute moves the axes:
# (batch, frames, objects, features) -> (batch, features, frames, objects).
# view() would only relabel the same memory order and mix values from different
# frames/objects into each "channel".
result = F.relu(object_projection(obj_feats.permute(0, 3, 1, 2)))

In [None]:
cfg = ConfigORGTRL()
encoder = Encoder(cfg)

In [None]:
_, _, r_feats = encoder(1, 2, obj_feats)

In [None]:
r_feats.shape

In [None]:
import time

In [None]:
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() follows the wall clock and can jump
# when the system time is adjusted.
start_time = time.perf_counter()

# Appearance and motion inputs are passed through untouched by this Encoder,
# so placeholder values are enough here.
encoder(1, 2, obj_feats)

end_time = time.perf_counter()
total_time = end_time - start_time
print(f"Execution time: {total_time:.4f} seconds")

In [None]:
obj_feats[0][:3]

In [None]:
obj_feats.permute(0, 3, 1, 2)[0]