In [1]:
import torch
import torch.nn as nn
import numpy as np
import os

# ORG-Module

Object Relational Graph is a module that learns to describe an object based on its relationship with others in a video. The algorithm consists many steps and stated in the following order:

1. Apply pretrained object detector to capture severall class-agnostic proposal.
2. The object features is captured on each keyframes.
3. The object features then stored in R, where i is the i-th keyframes, and k is the k-th object.
4. The number of objects extracted from each frames are five objects.
5. The R variable consist of 5 independent object features.
6. Define Object Set R K x d, where K is the number of object nodes, and d is the dimension features.
7. Define A, where A is a relation coefficient matrix between K nodes.
8. Before feeding to A, the R variable is feed to **Fully connected layer** with bias resulting in R'.
9. Then A is the product of fully connected layer between R' and R'T
10. After that, the product is activated using softmax function and named A^
11. Apply the GCN function, R^ = A^ . R . Wr, Where Wr is learnable parameter
12. R^ is the enhanced object features with interaction message between objects

# Develop Side

In [2]:
# the object feats has the dimension of Frames x Objs x features
# with batch dimension it becomes 4-D tensor

feat_dims = 512
k_objects = 5

# this means the object is the second object
# of the first frame

r_obj_feats = torch.rand(k_objects, feat_dims)

In [3]:
# based on ORG paper A is equal to:
# φ(R) . transpose(ψ(R))
# where : ...
# φ(R) = R . Wi + bi
# ψ(R) = R . wj + bj

in_features = feat_dims
out_features = feat_dims

sigma_r = nn.Linear(in_features, out_features)
psi_r = nn.Linear(in_features, out_features)
a_softmax = nn.Softmax(dim=1)

w_r = nn.Linear(in_features, out_features, bias=False)

In [4]:
sigma_r_out = sigma_r(r_obj_feats)
psi_r_out = psi_r(r_obj_feats)

In [5]:
a_coeff_mat = torch.matmul(sigma_r_out, torch.t(psi_r_out))

In [6]:
a_hat = a_softmax(a_coeff_mat)

In [7]:
a_hat_mul_r = torch.matmul(a_hat, r_obj_feats)

In [8]:
output = w_r(a_hat_mul_r)

In [9]:
output

tensor([[-7.9213e-02, -7.8978e-02,  7.4341e-02,  ...,  3.1808e-01,
          1.4916e-01,  2.8370e-01],
        [-1.1797e-01, -1.4182e-01,  4.4578e-02,  ...,  3.4157e-01,
          8.7550e-02,  2.5541e-01],
        [-1.4583e-01, -8.9589e-02,  2.8276e-04,  ...,  3.0307e-01,
          1.1692e-02,  1.8906e-01],
        [-1.2058e-01, -6.3393e-02,  8.3677e-02,  ...,  3.1532e-01,
          1.4132e-01,  2.3636e-01],
        [-6.8676e-02, -1.1910e-01,  7.7551e-02,  ...,  3.4590e-01,
          1.5732e-01,  3.2165e-01]], grad_fn=<MmBackward0>)

# Class Side (Alpha)

In [10]:
class ORG(nn.Module):
    
    def __init__(self, feat_dims):
        super(ORG, self).__init__()
        '''
        Object Relational Graph (ORG) is a module that learns 
        to describe an object based on its relationship 
        with others in a video.
        
        Arguments:
            feat_size : The object feature size that obtained from
                        the last fully-connected layer of the backbone
                        of Faster R-CNN
        '''
        
        sigma_r = nn.Linear(feat_dims, feat_dims)
        psi_r = nn.Linear(feat_dims, feat_dims)
        
        a_softmax = nn.Softmax(dim=1)
        
        w_r = nn.Linear(feat_dims, feat_dims, bias=False)
        
    def forward(self, r_obj_feat):
        sigma_r_out = sigma_r(r_obj_feats)
        psi_r_out = psi_r(r_obj_feats)
        
        a_coeff_mat = torch.matmul(sigma_r_out, torch.t(psi_r_out))
        a_hat = a_softmax(a_coeff_mat)
        
        a_hat_mul_r = torch.matmul(a_hat, r_obj_feats)
        output = w_r(a_hat_mul_r)
        
        return output

In [11]:
org_module = ORG(feat_dims)

In [12]:
r_hat = org_module(r_obj_feats)
r_hat

tensor([[-7.9213e-02, -7.8978e-02,  7.4341e-02,  ...,  3.1808e-01,
          1.4916e-01,  2.8370e-01],
        [-1.1797e-01, -1.4182e-01,  4.4578e-02,  ...,  3.4157e-01,
          8.7550e-02,  2.5541e-01],
        [-1.4583e-01, -8.9589e-02,  2.8276e-04,  ...,  3.0307e-01,
          1.1692e-02,  1.8906e-01],
        [-1.2058e-01, -6.3393e-02,  8.3677e-02,  ...,  3.1532e-01,
          1.4132e-01,  2.3636e-01],
        [-6.8676e-02, -1.1910e-01,  7.7551e-02,  ...,  3.4590e-01,
          1.5732e-01,  3.2165e-01]], grad_fn=<MmBackward0>)

In [13]:
r_hat.shape

torch.Size([5, 512])

# In Practice Using Faster R-CNN Object Features (Beta)

# Attention LSTM Class (alpha)

In [122]:
class TemporalAttention(nn.Module):
    def __init__(self, 
                 hidden_size,
                 features_size,
                 attn_size):
        super(TemporalAttention, self).__init__()
        '''
        Temporal Attention Module of ORG.
        It depends on previous hidden state of LSTM attention.
        Arguments:
          lstm_attn_hidden: The hidden state from LSTM attention
                            tensors of shape (batch_size, hidden_size).
          video_feats_size: The concatenation of frame features
                            and motion features.
                            tensors of shape (batch_size, n_frames, feats_size)
          attn_size       : The attention size of attention module.
        '''
        
        self.hidden_size = hidden_size
        self.features_size = features_size
        self.attn_size = attn_size
        
        # This layer is for the operation between W_a and V_i
        # : W_a is the learnable params associated with Video Features
        # : V_i is the concatenation between appearance features
        #   and motion features
        encoder_projection = nn.Linear(features_size, 
                                       attn_size, 
                                       bias=False)

        # This layer is for the operation between U_a and h_t_attn
        # : U_a is the learnable params associated with LSTM attn hidden states
        # : h_t_attn is the concatenation between appearance features
        #   and motion features
        decoder_projection = nn.Linear(hidden_size, 
                                       attn_size, 
                                       bias=False)

        # This layer is for the operation between w_T and result tanh(W_v + U_h)
        # : W_v is the result of matrix multiplication between 
        #   video features and weight W
        # : U_h is the result of matrix multiplication between
        #   LSTM attention hidden states and weight U
        # : tanh(.) is the tanh activation function
        # : w_T is a vector of learnable params for the result
        #   of the tanh activation
        energy_projection = nn.Linear(attn_size, 
                                      1, 
                                      bias=False)
        
    def forward(self,
                h_attn_lstm,
                v_features):
        '''
        shape of hidden attention lstm (batch_size, hidden_size)
        shape of video features input (batch_size, n_frames, features_size)
        '''
        Wv = encoder_projection(v_features)
        Uh = decoder_projection(h_attn_lstm)
        
        Ew = energy_projection(torch.tanh(Wv + Uh))
        alpha = softmax_activation(Ew)
        
        weighted_feats = alpha * v_features
        context_global = weighted_feats.sum(dim=1)
        
        return context_global

In [129]:
input_size = 1324
hidden_size = 512
features_size = 512
attn_size = 512
num_layers= 1 
dropout= 0.5

lstm_attn = nn.LSTM(input_size, 
                    hidden_size, 
                    num_layers, 
                    batch_first=True, 
                    dropout=dropout)

temporal_attn = TemporalAttention(hidden_size,
                                  features_size,
                                  attn_size)

In [124]:
feature_vector = torch.randn(1, 28, 512)
motion_vector = torch.randn(1, 28, 512)

video_features = torch.cat((feature_vector, motion_vector), dim=1)
video_features.shape

torch.Size([1, 56, 512])

In [126]:
v_bar = torch.mean(video_features, dim=1, keepdim=True)

v_bar.shape

torch.Size([1, 1, 512])

In [131]:
prev_word_emb = torch.randn(1, 1, 300)


prev_cell_lang_lstm = torch.rand(1, 1, 512)
prev_hidden_lang_lstm = torch.rand(1, 1, 512)

In [132]:
input_combined = torch.cat((v_bar, prev_word, prev_lang_hidden), dim=-1)
prev_h_attn = (prev_hidden_lang_lstm, prev_cell_lang_lstm)

input_combined.shape

torch.Size([1, 1, 1324])

In [160]:
with torch.no_grad():
    outputs, hidden_attn_lstm = lstm_attn(input_combined)
    context_global = temporal_attn(hidden_attn_lstm[0],
                                   video_features)

In [169]:
last_hidden_lang = hidden_attn_lstm[0]

In [170]:
n_layers = 1

last_hidden_lang = last_hidden_lang.view(n_layers, last_hidden_lang.size(1), last_hidden_lang.size(2))
# last_hidden_lang = last_hidden_lang[-1]

In [151]:
context_global.shape

torch.Size([1, 512])

In [88]:
input_features = 512
output_features = 512

# This layer is for the operation between W_a and V_i
# : W_a is the learnable params associated with Video Features
# : V_i is the concatenation between appearance features
#   and motion features
encoder_projection = nn.Linear(input_features, 
                               output_features, 
                               bias=False)

# This layer is for the operation between U_a and h_t_attn
# : U_a is the learnable params associated with LSTM attn hidden states
# : h_t_attn is the concatenation between appearance features
#   and motion features
decoder_projection = nn.Linear(input_features, 
                               output_features, 
                               bias=False)

# This layer is for the operation between w_T and result tanh(W_v + U_h)
# : W_v is the result of matrix multiplication between 
#   video features and weight W
# : U_h is the result of matrix multiplication between
#   LSTM attention hidden states and weight U
# : tanh(.) is the tanh activation function
# : w_T is a vector of learnable params for the result
#   of the tanh activation
energy_projection = nn.Linear(input_features, 
                              1, 
                              bias=False)

# This layer is for the normalization of all the weights
# corresponding to its frame
softmax_activation = nn.Softmax(dim=1)

In [112]:
Wv = encoder_projection(video_features)
Uh = decoder_projection(h_attn)
Ew = energy_projection(torch.tanh(Wv + Uh))
alpha = softmax_activation(Ew)
weighted_feats = alpha * video_features
context_global = weighted_feats.sum(dim=1)

In [115]:
context_global.shape

torch.Size([1, 512])

In [None]:
input_size = 512 
hidden_size = 512
num_layers= 1 
dropout= 0.5

attlstm = AttentionLSTM(512,
                        512,
                        1,
                        0.2)

In [29]:
output, hidden = attlstm(prev_word, prev_hidden)

TypeError: forward() missing 2 required positional arguments: 'memory' and 'video_features'

# Temporal Attention (alpha)

In [262]:
class TemporalAttention(nn.Module):
    def __init__(self, 
                 decoder_hidden_size, 
                 feat_size,
                 attn_size,):
        super(TemporalAttention, self).__init__()
        '''
        Temporal Attention module. 
        It depends on previous hidden memory in the decoder(of shape hidden_size),
        feature at the source side ( of shape(196, feat_size) ).  
        at(s) = align(ht,hs)
              = exp(score(ht,hs)) / Sum(exp(score(ht,hs')))  
        where
        score(ht,hs) = ht.t * hs                         (dot)
                     = ht.t * Wa * hs                  (general)
                     = va.t * tanh(Wa[ht;hs])           (concat)  
        Here we have used concat formulae.
        Argumets:
          hidden_size : hidden memory size of decoder. (batch,hidden_size)
          feat_size : feature size of each grid (annotation vector) at encoder side.
          bottleneck_size : intermediate size.
        '''


        self.hidden_size = decoder_hidden_size
        self.feat_size = feat_size
        self.bottleneck_size = attn_size
        
        self.decoder_projection = nn.Linear(self.hidden_size,
                                            self.bottleneck_size,
                                            bias=False)
        self.encoder_projection = nn.Linear(self.feat_size, 
                                            self.bottleneck_size, 
                                            bias=False)
        self.final_projection = nn.Linear(self.bottleneck_size, 
                                          1,
                                          bias=False)
     
    def forward(self, hidden, feats):
        '''
        shape of hidden (hidden_size) (batch,hidden_size) #(100, 512)
        shape of feats (batch size, ,feat_size)  #(100, 40, 1536)
        '''

        Wh = self.decoder_projection(hidden)  
        Uv = self.encoder_projection(feats)   
        Wh = Wh.unsqueeze(1).expand_as(Uv)

        energies = self.final_projection(torch.tanh(Wh + Uv))

        weights = F.softmax(energies, dim=1)
        weighted_feats = feats * weights.expand_as(feats)
        attn_feats = weighted_feats.sum(dim=1)

        return attn_feats, weights

In [240]:
hidden_size=512
bottleneck_size=512
feat_size=512

decoder_projection = nn.Linear(hidden_size,
                               bottleneck_size,
                               bias=False)

encoder_projection = nn.Linear(feat_size, 
                               bottleneck_size, 
                               bias=False)

final_projection = nn.Linear(bottleneck_size, 
                              1,
                              bias=False)

In [249]:
hidden = torch.randn((2, 512))
feats = torch.randn((2, 10, 512))

In [250]:
Wh = decoder_projection(hidden)
Uv = encoder_projection(feats)

In [251]:
result = Wh.unsqueeze(1).expand_as(Uv)

In [252]:
alpha = final_projection(result)

In [254]:
# weights = torch.nn.softmax(alpha, dim=1)
weighted_feats = feats * alpha.expand_as(feats)
attn_feats = weighted_feats.sum(dim=1)

In [255]:
attn_feats.shape

torch.Size([2, 512])

# Object Alignment Unit

In [237]:
# Assume that the object features for all videos have been loaded as a PyTorch tensor,
# where the tensor has shape (batch_size, max_num_frames, max_num_objects, object_feature_dim).
videos = torch.randn(2, 10, 5, 512)
# videos = videos.to(device) # move tensor to GPU device

batch_size, max_num_frames, max_num_objects, object_feature_dim = videos.size()

# Compute the similarity scores between each pair of frames
similarity_scores = []
aligned_objects = []

for i in range(1, max_num_frames):
    anchor_frame = videos[:, 0]
    i_th_frame = videos[:, i] # ini menggunakan R_Enhanced_Features

    # Compute the cosine similarity between each object in the anchor frame and the i-th frame.
    # Using R_features
    similarity_scores_i = torch.bmm(anchor_frame, i_th_frame.transpose(1, 2)) / \
                           (torch.norm(anchor_frame, dim=2)[:, :, None] * torch.norm(i_th_frame, dim=2)[:, None, :])
    
    max_similarities, max_similarity_indices = torch.max(similarity_scores_i, dim=2)
    
    aligned_objects_i = torch.gather(i_th_frame, 
                                     dim=1, 
                                     index=max_similarity_indices[:, :, None].expand(-1, -1, object_feature_dim))
    
    similarity_scores.append(similarity_scores_i)
    aligned_objects.append(aligned_objects_i.unsqueeze(1))

aligned_frames = torch.cat(aligned_objects, dim=1)
all_aligned_frames = torch.cat([anchor_frame.unsqueeze(1), aligned_frames], dim=1)

weighted_frames = torch.mul(all_aligned_frames, alpha.unsqueeze(-1))
sum_weighted_frames = torch.sum(weighted_frames, dim=1)

In [264]:
decoder_hidden_size = 512
bottleneck_size = 512
feat_size = 1836
n_frames = 10
batch_size = 2

hidden_attn = torch.randn(batch_size, decoder_hidden_size)
v_feats = torch.randn(batch_size, n_frames, feat_size)

temporal_attn = TemporalAttention(decoder_hidden_size, 
                                  feat_size, 
                                  bottleneck_size)

att_feats, alpha = temporal_attn(hidden_attn, v_feats)

In [298]:
encoder_projection = nn.Linear(512, 
                               512, 
                               bias=False)

decoder_projection = nn.Linear(512, 
                               512, 
                               bias=False)

energy_projection = nn.Linear(512, 
                              1, 
                              bias=False)

h_attn_lstm = torch.randn(2, 512)

In [314]:
Wv = encoder_projection(sum_weighted_frames)
Uh = decoder_projection(h_attn_lstm)
Uh = Uh.unsqueeze(1).expand_as(Wv)

Ew = energy_projection(torch.tanh(Wv + Uh))
beta = F.softmax(Ew, dim=1)

weighted_objs = torch.mul(sum_weighted_frames, beta)
local_context_feature = torch.sum(weighted_objs, dim=1)

In [317]:
local_context_feature.shape

torch.Size([2, 512])

In [293]:
res.shape

torch.Size([2, 5, 512])

In [295]:
encoder_projection.weight.shape

torch.Size([512, 512])

# Spatial Attention (alpha)

In [324]:
class SpatialAttention(nn.Module):
    def __init__(self, 
                 decoder_hidden_size, 
                 feat_size,
                 attn_size,):
        super(SpatialAttention, self).__init__()
        '''
        Spatial Attention module. 
        It depends on previous hidden attention memory in the decoder attention,
        and the size of object features.  
        Argumets:
          decoder_hidden_size : hidden memory size of decoder. (batch, hidden_size)
          feat_size : feature size of object features.
          bottleneck_size : intermediate size.
        '''

        self.hidden_size = decoder_hidden_size
        self.feat_size = feat_size
        self.bottleneck_size = attn_size
        
        self.decoder_projection = nn.Linear(self.hidden_size,
                                            self.bottleneck_size,
                                            bias=False)
        self.encoder_projection = nn.Linear(self.feat_size, 
                                            self.bottleneck_size, 
                                            bias=False)
        self.energy_projection = nn.Linear(self.bottleneck_size, 
                                          1,
                                          bias=False)
     
    def forward(self, h_attn_lstm, obj_feats):
        '''
        shape of hidden (hidden_size) (batch,hidden_size) #(100, 512)
        shape of feats (batch size, ,feat_size)  #(100, 40, 1536)
        '''

        Wv = self.encoder_projection(obj_feats)
        Uh = self.decoder_projection(h_attn_lstm)
        Uh = Uh.unsqueeze(1).expand_as(Wv)

        Ew = self.energy_projection(torch.tanh(Wv + Uh))
        alpha = F.softmax(Ew, dim=1)
        
        weighted_objs = torch.mul(obj_feats, beta)
        global_context_feature = torch.sum(weighted_objs, dim=1)

        return global_context_feature, beta

In [325]:
obj_feat_size=512

spatial_attention = SpatialAttention(decoder_hidden_size,
                                     obj_feat_size,
                                     bottleneck_size)

global_context_feature, beta = spatial_attention(h_attn_lstm, sum_weighted_frames)

In [327]:
global_context_feature.shape

torch.Size([2, 512])

In [328]:
beta.shape

torch.Size([2, 5, 1])

# Demo

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

from models.ORG_TRL.model import Encoder
from models.ORG_TRL.model import DecoderRNN
from models.ORG_TRL.model import TemporalAttention
from config import ConfigORGTRL
from config import Path
from dictionary import Vocabulary
from config import Path
from data import DataHandler

In [12]:
cfg = ConfigORGTRL(opt_encoder=True)
# specifying the dataset in configuration object from {'msvd','msrvtt'}
cfg.dataset = 'msrvtt'

voc = Vocabulary(cfg, gloVe=True)
path = Path(cfg, os.getcwd())
voc.load()

encoder = Encoder(cfg)
decoder = DecoderRNN(cfg, voc)

data_handler = DataHandler(cfg, path, voc)
train_dset, val_dset, test_dset = data_handler.getDatasets()
train_loader, val_loader, test_loader = data_handler.getDataloader(train_dset, val_dset, test_dset)

for data in train_loader:
    appearance_features, targets, mask, max_length, _, motion_features, _ = data
    print(appearance_features.shape)
    print(targets.shape)
    print(mask.shape)
    print(max_length)
    print(motion_features.shape)
    break

torch.Size([32, 28, 1536])
torch.Size([14, 32])
torch.Size([14, 32])
14
torch.Size([32, 28, 2048])


In [None]:
target = targets[0].view(-1, 1)

In [13]:
targets

tensor([[   6,  255,   77,    6,  140,    6,    6,    6,    6,    6,    6,    6,
            6,   77,   92,    6,    6,    6,    6,  135,    6,    6,  451,   11,
            6,    9, 2708,    6,    6,  353,    6, 3782],
        [  14,   25,   25,   92,   25,   92,  263,   85,  202,   92,  525,   79,
         1649,   25,  462,  469,   61, 1340,   92,  187,  165,  263,    6,   21,
         2303,   28, 2707,  219,   87,  167,  630, 3780],
        [  15,   34,  255,   25,    6,   25,  629,   86,  249,  283,  330,  283,
           25,    6,   70,   92,   25,   79,  283,  195,  299, 1117,  187,   22,
           25,  996,    5,   34,  103,  748,    6,   25],
        [  53,   99, 3622,   97,   99, 1428,    6,   87, 2016,   12,  235,  748,
          607,   92,  243,   34,  624,   25,  198,   17,  247, 1115,   99,  499,
          352,   28,   28,  117, 1850,   28, 2462,  579],
        [  21,  206,   39,    2,   12,   39,  519,    2,    6, 2383,   28,    6,
            2,  631,    6,    6,    2, 

In [3]:
appearance_feat = torch.randn(32, 4, 1536)
motion_feat = torch.rand(32, 4, 2048)

v_feats = encoder(appearance_feat, motion_feat)

In [4]:
v_feats.shape

torch.Size([32, 4, 512])

In [7]:
with torch.no_grad():
    output, h_lang_lstm, h_attn_lstm = decoder(decoder_input,
                                               decoder_hidden_attn,
                                               decoder_hidden_lang,
                                               v_feats)

In [8]:
output.shape

torch.Size([32, 5044])

In [13]:
decoder_hidden_size = 512
bottleneck_size = 512
feat_size = 1836

temporal_attention = TemporalAttention(cfg)

last_hidden_attn = torch.randn(32, 512)

In [14]:
context_global_vector, alpha = temporal_attention(last_hidden_attn, v_feats)

In [15]:
context_global_vector.shape

torch.Size([32, 512])

In [6]:
n_layers = 1
batch_size = 32
decoder_hidden_size = 512

decoder_hidden = torch.zeros(n_layers, 
                             batch_size,
                             decoder_hidden_size)

decoder_hidden_attn = (decoder_hidden, decoder_hidden)
decoder_hidden_lang = (decoder_hidden, decoder_hidden)

decoder_input = torch.LongTensor([[cfg.SOS_token for _ in range(cfg.batch_size)]])
appearance_features = torch.randn((32, 28, 512))
motion_features = torch.randn((32, 28, 512))

v_features = torch.cat((appearance_features, motion_features), dim=-1)

v_bar_features = torch.mean(v_features, dim=1, keepdim=True).squeeze(1).unsqueeze(0)

embedded = torch.randn((1, 32, 300))

input_attn_lstm = torch.cat((v_bar_features, embedded, decoder_hidden_lang[0]), dim=-1)

In [4]:
with torch.no_grad():
    output, h_lang_lstm, h_attn_lstm = decoder(decoder_input,
                                               decoder_hidden_attn,
                                               decoder_hidden_lang,
                                               v_features
                                               )

torch.Size([1, 32, 512])


In [5]:
output.shape

torch.Size([32, 5044])

In [8]:
h_attn_lstm[0].shape

torch.Size([1, 32, 512])

In [10]:
output = [1749, 1649, 1549, 1449]
target = [628, 234, 76, 18]

In [11]:
caption = []
gt = []

for word in output:
    caption.append(voc.index2word.get(word))

for word in target:
    gt.append(voc.index2word.get(word))
    
print(caption)
print(gt)

['erase', 'puppy', 'denver', 'do']
['son', 'rolls', 'several', 'gathered']


In [10]:
v_features = torch.randn((32, 28, 512))
Uh = torch.rand((1, 32, 512))

In [11]:
res = Uh[0].unsqueeze(1).expand_as(v_features)
res.shape

torch.Size([32, 28, 512])

In [19]:
v_bar_features.shape

torch.Size([1, 32, 1024])

In [20]:
attention_lstm = nn.LSTM(input_size=1836, 
                         hidden_size=512,
                         num_layers=1, 
                         dropout=0.2,
                         batch_first=False)

In [21]:
output, h = attention_lstm(input_attn_lstm,
                           decoder_hidden_attn)

h[0].shape

In [None]:
def forward(self,
            inputs, 
            attn_hidden,
            lang_hidden, 
            v_features):

In [71]:
inputs = torch.cat((v_bar_features, embedded), dim=-1)

In [37]:
t_targets = targets.T

In [38]:
t_targets.shape

torch.Size([32, 23])

In [None]:
for e, (key, value) in enumerate(voc.index2word.items()):
    if e < 11:
        print(e, key, value)
    else:
        break

In [31]:
targets[0]

tensor([  77,    6,    6,  144,  280,   28,    6,    6,  140,    6,    4,    6,
         144,    6,    6,    6,    6,    6,  144,    6,  255, 2989,    6,    6,
         135,  191,    6,    6,  140,  267,    6,    6])