In [1]:
from models import Decoder
from models import Encoder
from models import Joint_Representaion_Learner
from models import Seq2Seq

import torch
import torch.nn as nn
import torch.nn.functional as F

import pickle
import numpy as np

# Setara Opt Dictionary

In [2]:
# Options handed to the decoder class selected by config['decoder'].
config = {
    'pos_attention': False,
    'enhance_input': 2,
    'watch': 0,
    'num_hidden_layers_decoder': 1,
    'decoding_type': 'ARFormer',
    'decoder': 'BertDecoder',
    'vocab_size': 100,
    'dim_hidden': 512,
    'max_len': 30,
    'with_category': True,
    'num_category': 20,
    'layer_norm_eps': 0.00001,
    'hidden_dropout_prob': 0.5,
    'num_attention_heads': 8,
    'attention_probs_dropout_prob': 0.0,
    'with_layernorm': False,
    'intermediate_size': 2048,
    'hidden_act': 'gelu_new',
}

# Options handed to the encoder and the joint representation learner.
# Only the encoder-specific keys are spelled out; every decoder option is
# merged in from `config`, so shared keys ('vocab_size', 'dim_hidden', ...)
# are declared exactly once and the two dictionaries cannot drift apart.
opt = {
    'encoder': 'Encoder_HighWay',
    'modality': 'mio',
    'dim_m': 2048,
    'dim_i': 1536,
    'dim_o': 1024,
    'no_encoder_bn': False,
    **config,
}

# Sizes of the random dummy tensors built in the cells below.
batch_size = 8
frame_len = 6
num_objs = 5
seq_len = 16
feat_dim = 512

# Encoder Input

In [3]:
# Fix the RNG so the dummy features are the same on every run.
torch.manual_seed(1)

# Random stand-ins for the three feature streams. The feature width of each
# stream is read from `opt` (the same values the encoder is built from)
# instead of repeating the numbers here.
m = torch.randn(batch_size, frame_len, opt['dim_m'])
i = torch.randn(batch_size, frame_len, opt['dim_i'])
o = torch.randn(batch_size, frame_len, num_objs, opt['dim_o'])

# presumably the list order has to match opt['modality'] ('mio') - TODO confirm
feats = [m, i, o]

# Deklarasi Encoder

In [4]:
# Look the encoder class up by name in the `Encoder` module and build it.
# No `None` fallback is passed to getattr: a misspelled opt['encoder'] then
# raises an AttributeError naming the missing class instead of the opaque
# "'NoneType' object is not callable".
encoder = getattr(Encoder, opt['encoder'])(opt)
# Switch to evaluation mode; eval() returns the module, so the cell displays it.
encoder.eval()

Encoder_HighWay(
  (Encoder_M): Sequential(
    (0): Linear(in_features=2048, out_features=512, bias=True)
    (1): HighWay(
      (w1): Linear(in_features=512, out_features=512, bias=True)
      (w2): Linear(in_features=512, out_features=512, bias=True)
      (tanh): Tanh()
    )
    (2): Dropout(p=0.5, inplace=False)
  )
  (Encoder_I): Sequential(
    (0): Linear(in_features=1536, out_features=512, bias=True)
    (1): HighWay(
      (w1): Linear(in_features=512, out_features=512, bias=True)
      (w2): Linear(in_features=512, out_features=512, bias=True)
      (tanh): Tanh()
    )
    (2): Dropout(p=0.5, inplace=False)
  )
  (Encoder_O): ORG(
    (dropout): Dropout(p=0.3, inplace=False)
    (adjacency_dropout): Dropout(p=0.5, inplace=False)
    (object_projection): Linear(in_features=1024, out_features=512, bias=True)
    (sigma_r): Linear(in_features=1024, out_features=512, bias=True)
    (psi_r): Linear(in_features=1024, out_features=512, bias=True)
    (w_r): Linear(in_features=10

# Forward Pass Encoder

In [5]:
# Run the encoder on the dummy features; the call returns two values that are unpacked here.
enc_output, enc_hidden = encoder(feats) ## runs without problems

In [6]:
# The last entry of the encoder output holds two tensors; print the shape of each.
shape_report = ("Shape untuk output enhanced object features {} \n"
                "Shape untuk output proyeksi object features {}")
object_outputs = enc_output[-1]
print(shape_report.format(object_outputs[0].shape, object_outputs[1].shape))

Shape untuk output enhanced object features torch.Size([8, 6, 5, 512]) 
Shape untuk output proyeksi object features torch.Size([8, 6, 5, 512])


# Joint Representation

In [7]:
# One `dim_hidden` entry per character of opt['modality'].
feats_size = [opt['dim_hidden'] for _ in opt['modality']]
join_representation_learner = Joint_Representaion_Learner(feats_size, opt)

In [8]:
# NOTE(review): this rebinds enc_output / enc_hidden to the joint learner's results, so
# running the cell a second time feeds those results back in instead of the encoder
# outputs. Re-run the encoder forward-pass cell before re-running this one.
enc_output, enc_hidden, enc_obj_output = join_representation_learner(enc_output, enc_hidden)

In [9]:
# Shapes of the fused output and hidden state, one per line ...
for encoded in (enc_output, enc_hidden):
    print(encoded.shape)
# ... followed by the shapes of the two object-feature tensors on a single line.
print(enc_obj_output[0].shape, enc_obj_output[1].shape)

torch.Size([8, 12, 512])
torch.Size([8, 512])
torch.Size([8, 6, 5, 512]) torch.Size([8, 6, 5, 512])


## Ditampung di Dictionary

In [10]:
# Collect the joint-representation outputs under the keys that
# prepare_inputs_for_decoder reads.
results = {
    'enc_output': enc_output,          # already a tensor when stored here
    'enc_hidden': enc_hidden,          # likewise already a tensor
    'enc_obj_output': enc_obj_output,
}

# Deklarasi Decoder

In [11]:
# Look the decoder class up by name in the `Decoder` module and build it.
# No `None` fallback is passed to getattr: a misspelled config['decoder'] then
# raises an AttributeError naming the missing class instead of the opaque
# "'NoneType' object is not callable".
decoder = getattr(Decoder, config['decoder'])(config)
# Switch to evaluation mode; eval() returns the module, so the cell displays it.
decoder.eval()

BertDecoder(
  (embedding): BertEmbeddings(
    (word_embeddings): Embedding(100, 512, padding_idx=0)
    (position_embeddings): Embedding(30, 512)
    (category_embeddings): Embedding(20, 512)
    (LayerNorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (layer): ModuleList(
    (0): BertLayer(
      (attention): BertAttention(
        (self): BertSelfAttention(
          (query): Linear(in_features=512, out_features=512, bias=True)
          (key): Linear(in_features=512, out_features=512, bias=True)
          (value): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (output): BertSelfOutput(
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.5, inplace=False)
        )
      )
      (attend_to_enc_output): BertAttention(
        (self): BertSelfAttention(
          (query): Linear(in_features=512, o

# Deklarasi Data yang Dibutuhkan

In [12]:
# Fix the RNG so the dummy target tokens are the same on every run.
torch.manual_seed(1)

# torch.randint excludes `high`, so the upper bound is the vocabulary size
# itself; with `vocab_size - 1` the last token id could never be drawn.
tgt_tokens = torch.randint(0, config['vocab_size'], (batch_size, seq_len))
# A single category id as an int64 tensor (torch.tensor replaces the legacy
# torch.LongTensor constructor).
category = torch.tensor([2], dtype=torch.long)
decoding_type = config['decoding_type']

# Info Corpus

In [13]:
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# open corpus files that come from a trusted source.
with open('MSRVTT/info_corpus.pkl', 'rb') as corpus_file:
    info_corpus = pickle.load(corpus_file)

# Index-to-word dictionary of the corpus.
i2w = info_corpus['info']['itow']

Preparation before feeding

### Modified Preparation

In [14]:
def align_object_variable(r_feats, r_hat, eps=1e-8):
    '''
    Align the objects of every later frame to the objects of the first (anchor) frame,
    following the ORG-TRL paper.
    refers = https://openaccess.thecvf.com/content_CVPR_2020/papers/Zhang_Object_Relational_Graph_With_Teacher-Recommended_Learning_for_Video_Captioning_CVPR_2020_paper.pdf

    For each object of frame 0 in ``r_feats`` the most cosine-similar object of each
    following frame is located; the matching rows of ``r_hat`` are then gathered so
    that position ``k`` in every frame holds the object matched to anchor object ``k``.

    args:
        r_feats : 4-D tensor indexed [batch, frame, object, feature]. Only used to
                  compute the similarities (the original note describes these as
                  object features extracted with Faster R-CNN).
        r_hat   : 4-D tensor indexed the same way; the values that are reordered
                  and returned. Its feature size may differ from ``r_feats``.
        eps     : lower bound applied to the product of the two norms, so that an
                  all-zero feature row gives similarity 0 instead of 0/0 = NaN.
    output:
        aligned_object_variable : tensor shaped like ``r_hat``; frame 0 is copied
        unchanged and every later frame is reordered by the best matches.
    '''
    # Frame 0 is the anchor; keep a singleton frame axis so it broadcasts
    # against all remaining frames.
    anchor_frame = r_feats[:, 0].unsqueeze(1)
    next_frame = r_feats[:, 1:]

    # Cosine similarity: dot(anchor, next) / (|anchor| * |next|).
    # Resulting index order is [batch, frame, anchor object, candidate object].
    dot_products = torch.matmul(anchor_frame, next_frame.transpose(2, -1))
    norm_product = (torch.norm(anchor_frame, dim=-1)[:, :, :, None] *
                    torch.norm(next_frame, dim=-1)[:, :, None, :])
    similarity_score = dot_products / norm_product.clamp(min=eps)

    # Index of the best candidate per anchor object, repeated along the feature
    # axis because gather needs an index shaped like its output.
    best_match = similarity_score.topk(1, dim=-1)[1].expand(-1, -1, -1, r_hat.size(-1))
    aligned_frames = torch.gather(r_hat[:, 1:], dim=2, index=best_match)

    return torch.cat([r_hat[:, :1], aligned_frames], dim=1)


def prepare_inputs_for_decoder(encoder_outputs, category):
    """Build the keyword arguments passed to the decoder.

    Args:
        encoder_outputs: mapping that provides 'enc_output' and 'enc_obj_output'.
        category: stored unchanged under the 'category' key.

    Returns:
        dict with the keys 'category', 'enc_output' and 'enc_obj_output'.
        A one-element list under 'enc_output' is replaced by its single item.
        A two-element tuple under 'enc_obj_output' is passed through
        align_object_variable and its frame and object axes are merged.

    Raises:
        AssertionError: if the list does not hold exactly one item or the tuple
            does not hold exactly two items.
    """
    inputs_for_decoder = {
        'category': category,
        'enc_output': encoder_outputs['enc_output'],
        'enc_obj_output': encoder_outputs['enc_obj_output'],
    }

    # Unwrap a single-element list.
    enc_output = inputs_for_decoder['enc_output']
    if isinstance(enc_output, list):
        assert len(enc_output) == 1
        inputs_for_decoder['enc_output'] = enc_output[0]

    # Align the object features, then flatten [batch, frame, object, dim]
    # into [batch, frame * object, dim].
    obj_output = inputs_for_decoder['enc_obj_output']
    if isinstance(obj_output, tuple):
        assert len(obj_output) == 2
        b_size, f_len, n_obj, _ = obj_output[1].size()
        aligned = align_object_variable(obj_output[0], obj_output[1])
        inputs_for_decoder['enc_obj_output'] = aligned.view(b_size, f_len * n_obj, -1)

    return inputs_for_decoder

# Rerun this cell if Data is Updated

In [15]:
inputs_for_decoder = prepare_inputs_for_decoder(results, category)

# Drop the last position of the target sequence(s).
# NOTE(review): tgt_tokens is overwritten in place, so every re-run of this cell
# removes one more position; re-run the cell that creates tgt_tokens first.
if isinstance(tgt_tokens, list):
    tgt_tokens = [item[:, :-1] for item in tgt_tokens]
else:
    tgt_tokens = tgt_tokens[:, :-1]

inputs_for_decoder.keys()

dict_keys(['category', 'enc_output', 'enc_obj_output'])

In [16]:
# Shape check: prepare_inputs_for_decoder merged the frame and object axes
# via .view(b_size, f_len * n_obj, -1).
inputs_for_decoder['enc_obj_output'].shape

torch.Size([8, 30, 512])

In [17]:
# Decoder forward pass. Only the first two returned values are kept; any
# remaining ones are collected into `_`.
hidden_states, embs, *_ = decoder( 
    tgt_seq=tgt_tokens, 
    decoding_type=decoding_type,
    output_attentions=False,
    **inputs_for_decoder # every key/value pair is passed to the decoder's forward method as a keyword argument
)

# Connecting All the Modules
1. Connecting all the modules
2. Checking the forward pass

In [1]:
from models import Decoder
from models import Encoder
from models import Joint_Representaion_Learner
from models import Seq2Seq

import torch
import torch.nn as nn
import torch.nn.functional as F

import pickle
import numpy as np

In [2]:
# Options handed to the decoder class selected by config['decoder'].
config = {
    'pos_attention': False,
    'enhance_input': 2,
    'watch': 0,
    'num_hidden_layers_decoder': 1,
    'decoding_type': 'ARFormer',
    'decoder': 'BertDecoder',
    'vocab_size': 100,
    'dim_hidden': 512,
    'max_len': 30,
    'with_category': True,
    'num_category': 20,
    'layer_norm_eps': 0.00001,
    'hidden_dropout_prob': 0.5,
    'num_attention_heads': 8,
    'attention_probs_dropout_prob': 0.0,
    'with_layernorm': False,
    'intermediate_size': 2048,
    'hidden_act': 'gelu_new',
}

# Options handed to the encoder, the joint representation learner and Seq2Seq.
# Only the encoder-specific keys are spelled out; every decoder option is
# merged in from `config`, so shared keys ('vocab_size', 'dim_hidden', ...)
# are declared exactly once and the two dictionaries cannot drift apart.
opt = {
    'encoder': 'Encoder_HighWay',
    'modality': 'mio',
    'dim_m': 2048,
    'dim_i': 1536,
    'dim_o': 1024,
    'no_encoder_bn': False,
    **config,
}

# Sizes of the random dummy tensors built in the cells below.
batch_size = 8
frame_len = 6
num_objs = 5
seq_len = 16
feat_dim = 512

In [3]:
# Build the individual modules that Seq2Seq is assembled from.
# The encoder / decoder classes are looked up by name. No `None` fallback is
# passed to getattr: a misspelled class name then raises an AttributeError
# naming the missing class instead of "'NoneType' object is not callable".
encoder = getattr(Encoder, opt['encoder'])(opt)
# One `dim_hidden` entry per character of opt['modality'].
feats_size = [opt['dim_hidden']] * (len(opt['modality']))
joint_representation_learner = Joint_Representaion_Learner(feats_size, opt)
decoder = getattr(Decoder, config['decoder'])(config)
# Bias-free linear layer from the hidden size to the vocabulary size.
tgt_word_prj = nn.Linear(opt["dim_hidden"], opt["vocab_size"], bias=False)

## Instantiate the Model

In [4]:
# Assemble the separately built modules into one Seq2Seq model.
# The pre-encoder and auxiliary-task-predictor slots are left empty (None).
model = Seq2Seq(
    opt=opt,
    preEncoder=None,
    encoder=encoder,
    joint_representation_learner=joint_representation_learner,
    auxiliary_task_predictor=None,
    decoder=decoder,
    tgt_word_prj=tgt_word_prj,
)

## Preparing Input

In [5]:
# Fix the RNG so the dummy inputs are the same on every run.
torch.manual_seed(1)

# Random stand-ins for the three feature streams. The feature width of each
# stream is read from `opt` (the same values the encoder is built from)
# instead of repeating the numbers here.
m = torch.randn(batch_size, frame_len, opt['dim_m'])
i = torch.randn(batch_size, frame_len, opt['dim_i'])
o = torch.randn(batch_size, frame_len, num_objs, opt['dim_o'])

# presumably the list order has to match opt['modality'] ('mio') - TODO confirm
feats = [m, i, o]

# torch.randint excludes `high`, so the upper bounds are the vocabulary size and
# the number of categories themselves; with `vocab_size - 1` / `19` the last
# token id and the last category id could never be drawn. randint already
# returns int64, so no extra .long() conversion is needed.
tokens = torch.randint(0, config['vocab_size'], (batch_size, seq_len))
category = torch.randint(0, config['num_category'], (1,))
decoding_type = config['decoding_type']

In [6]:
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# open corpus files that come from a trusted source.
with open('MSRVTT/info_corpus.pkl', 'rb') as corpus_file:
    info_corpus = pickle.load(corpus_file)

# Index-to-word dictionary of the corpus.
vocab = info_corpus['info']['itow']

## Forward Pass

In [7]:
# Full forward pass through the assembled model with the dummy inputs.
# NOTE(review): `vocab` is loaded from the MSRVTT corpus while opt['vocab_size'] is 100
# and the tokens are random - confirm the model does not rely on the two agreeing.
results = model(feats=feats,
                tgt_tokens=tokens, 
                category=category,
                opt=opt,
                vocab=vocab)

### Forward Pass TEST is Passed!

# Original Code

In [14]:
def prepare_inputs_for_decoder(encoder_outputs, category):
    """Build the keyword arguments passed to the decoder (original version).

    This definition shares its name with the modified version earlier in the
    notebook; whichever cell runs last is the one in effect.

    Args:
        encoder_outputs: mapping that provides 'enc_output'.
        category: stored unchanged under the 'category' key.

    Returns:
        dict with the keys 'category' and 'enc_output'. A one-element list
        under 'enc_output' is replaced by its single item.

    Raises:
        AssertionError: if 'enc_output' is a list that does not hold exactly one item.
    """
    enc_output = encoder_outputs['enc_output']
    if isinstance(enc_output, list):
        # Only a single encoder output is supported; unwrap it.
        assert len(enc_output) == 1
        enc_output = enc_output[0]

    return {'category': category, 'enc_output': enc_output}