In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input Embedding Layer for Features

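The Input Embedding Layer (IEL) is the feature-encoding module used in the paper *Non-Autoregressive Coarse-to-Fine Video Captioning* (https://arxiv.org/abs/1911.12018) by Bang Yang et al. It projects raw video features into a lower-dimensional space and then mixes the linear projection with a non-linear transform of it through a learned sigmoid gate.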

In [None]:
class ConfigORGTRL:
    '''
    Hyperparameter container read by the feature encoders in this notebook.

    NOTE(review): the class name suggests ORG-TRL while the default
    ``model_name`` is 'sa-lstm' -- confirm which model these settings target.

    Args:
        model_name: Free-form model identifier. Stored as-is; nothing in this
            class interprets it.
        opt_encoder: Boolean flag stored as-is; its downstream meaning is not
            visible here (presumably toggles an optional encoder -- TODO confirm).

    Attributes:
        model_name: Value of the ``model_name`` argument.
        opt_encoder: Value of the ``opt_encoder`` argument.
        appearance_input_size: Width of the last axis of appearance features.
        motion_input_size: Width of the last axis of motion features.
        projected_size: Width of the embedding produced by the encoder.
        encoder_dropout_size: Dropout probability applied to the encoder output
            (despite the ``_size`` suffix it is passed to ``nn.Dropout`` as ``p``).
    '''
    def __init__(self, model_name='sa-lstm', opt_encoder=True):
        # These two arguments used to be accepted and then silently dropped;
        # keep them so the config records how it was constructed.
        self.model_name = model_name
        self.opt_encoder = opt_encoder

        self.appearance_input_size = 2048
        self.motion_input_size = 2048
        self.projected_size = 512
        self.encoder_dropout_size = 0.5

In [None]:
class InputEmbeddingLayer(nn.Module):
    '''
    Gated input embedding for feature vectors.

    The layer projects the last axis of the input to ``cfg.projected_size``
    and blends that linear projection with a tanh-transformed version of it,
    using a sigmoid gate computed from the transformed version:

        v_bar  = W1 x
        v_hat  = tanh(W2 v_bar)
        gate   = sigmoid(W3 v_hat)
        output = dropout(gate * v_bar + (1 - gate) * v_hat)

    Args:
        cfg: Object exposing ``appearance_input_size``, ``projected_size`` and
            ``encoder_dropout_size``.
        input_size: Width of the last axis of the incoming features. When
            ``None`` (the default) ``cfg.appearance_input_size`` is used, which
            is the previous hard-coded behaviour. Pass e.g.
            ``cfg.motion_input_size`` to build an encoder for a stream whose
            feature width differs from the appearance stream.
    '''
    def __init__(self, cfg, input_size=None):
        super(InputEmbeddingLayer, self).__init__()

        # Fall back to the appearance width so existing one-argument callers
        # get exactly the same layer as before.
        if input_size is None:
            input_size = cfg.appearance_input_size

        self.w1 = nn.Linear(input_size, cfg.projected_size)
        self.w2 = nn.Linear(cfg.projected_size, cfg.projected_size)
        self.w3 = nn.Linear(cfg.projected_size, cfg.projected_size)
        self.dropout = nn.Dropout(cfg.encoder_dropout_size)

    def forward(self, feats):
        '''
        Embed ``feats``.

        Args:
            feats: Tensor whose last axis has the width this layer was built
                for; any leading axes are carried through by ``nn.Linear``.

        Returns:
            Tensor with the same leading axes as ``feats`` and a last axis of
            width ``cfg.projected_size``. Dropout is applied, so the result is
            stochastic while the module is in training mode.
        '''
        v_bar = self.w1(feats)                  # linear projection
        v_hat = torch.tanh(self.w2(v_bar))      # non-linear transform of the projection
        gate = torch.sigmoid(self.w3(v_hat))    # per-element mixing weight in (0, 1)
        result = gate * v_bar + (1 - gate) * v_hat

        return self.dropout(result)

In [None]:
# Seed PyTorch's global RNG before any randomly initialised module is built,
# so that Restart Kernel -> Run All reproduces the same weights, inputs and
# dropout masks.
torch.manual_seed(0)

cfg = ConfigORGTRL()

# One independent encoder per feature stream (separate weights).
# NOTE(review): called with only `cfg`, InputEmbeddingLayer sizes its first
# projection from cfg.appearance_input_size, so the motion encoder matches its
# input only because both sizes are currently 2048.
iel_encoder_appearance = InputEmbeddingLayer(cfg)
iel_encoder_motion = InputEmbeddingLayer(cfg)

In [None]:
# Random stand-in features for smoke-testing the encoders.
# The last axis (2048) is the feature width; the leading axes are presumably
# (batch, frames) -- TODO confirm.
feature_shape = (32, 8, 2048)

appearance_feats = torch.randn(*feature_shape)
motion_feats = torch.randn(*feature_shape)

In [None]:
# Shape check: only the last axis should change, from the raw feature width
# to cfg.projected_size.
iel_encoder_appearance(appearance_feats).size()

In [None]:
# Same shape check for the motion stream: leading axes preserved, last axis
# projected to cfg.projected_size.
iel_encoder_motion(motion_feats).size()