# Check Encoder and Decoder

In [53]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

device = torch.device('cuda: 0') if torch.cuda.is_available() else torch.device('cpu')

## Encoder

In [72]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        
    def forward(self, images):
        """
        Extract feature vectors from input images.
        @param images: [batch, 3, 224, 224]
        """
        with torch.no_grad():
            features = self.resnet(images)
        print("resnet_out: ", features.shape)
        features = features.reshape(features.size(0), -1)
        print("reshape: ", features.shape)
        features = self.linear(features)
        print("linear: ", features.shape)
        features = self.bn(features)
        return features

encoder = EncoderCNN(256).to(device)

In [59]:
Input = torch.rand((128, 3, 224, 224)).to(device)

In [73]:
encoder(Input).shape

resnet_out:  torch.Size([128, 2048, 1, 1])
reshape:  torch.Size([128, 2048])
linear:  torch.Size([128, 256])


torch.Size([128, 256])

### Check Module.childen() and Module.modules
https://discuss.pytorch.org/t/module-children-vs-module-modules/4551/4

In [27]:
# 此段代码只用于检查module.children() and module.modules()，没有任何实际作用
model = nn.Sequential(
    nn.Linear(10,20), 
    nn.ReLU(),
    nn.Sequential(
        nn.Sigmoid(), 
        nn.ReLU()
    )
)

In [37]:
modules = list(model.children())
nn.Sequential(*modules)

Sequential(
  (0): Linear(in_features=10, out_features=20, bias=True)
  (1): ReLU()
  (2): Sequential(
    (0): Sigmoid()
    (1): ReLU()
  )
)

In [38]:
modules = list(model.modules())
nn.Sequential(*modules)

Sequential(
  (0): Sequential(
    (0): Linear(in_features=10, out_features=20, bias=True)
    (1): ReLU()
    (2): Sequential(
      (0): Sigmoid()
      (1): ReLU()
    )
  )
  (1): Linear(in_features=10, out_features=20, bias=True)
  (2): ReLU()
  (3): Sequential(
    (0): Sigmoid()
    (1): ReLU()
  )
  (4): Sigmoid()
  (5): ReLU()
)

由此来看，以一个Sequence 为一个块，modules.children()得到的是最外层的结果，如果内层有Sequence，是当做一整块来处理的，不会进入到里面。
而modules.modules()是以迭代的形式遍历所有的块，上面的例子中则是遍历了三次的结果。

### BatchNorm1d
$y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$  
为什么要BatchNorm: https://morvanzhou.github.io/tutorials/machine-learning/ML-intro/3-08-batch-normalization/  

In [162]:
bat = nn.BatchNorm1d(2, momentum=0.01, affine=False)
ins = nn.InstanceNorm1d(2, momentum=0.01, affine=False, track_running_stats=True)

In [174]:
cache = torch.rand(2, 2, 3)

In [176]:
bat(cache)

tensor([[[ 1.3568,  0.3336, -1.3398],
         [-0.2027, -1.1410, -1.3697]],

        [[ 1.0965, -0.7566, -0.6906],
         [ 0.5478,  1.3426,  0.8230]]])

In [175]:
ins(cache)

tensor([[[ 1.1154,  0.1950, -1.3104],
         [ 1.3896, -0.4684, -0.9212]],

        [[ 1.4133, -0.7451, -0.6682],
         [-1.0818,  1.3289, -0.2471]]])

Note: 在 batch 为 1 的情况下，BatchNorm1d 和 InstanceNorm1d 执行的是相同的操作，
将 affine 设为 False，便可得到相同的结果.  
affine 控制的是 affine parameters， 即公式上的 $\lambda$ 和 $\beta$  
仿射变换： https://www.zhihu.com/question/20666664   

需要注意的是：InstanceNorm1d 的输入必须是(N, C, L)， 而 BatchNorm1d 的输入可以是 (N, C, L) 或者 (N, C)  
其中：N 为 batch, C 必须与定义的 InstanceNorm1d 和 BatchNorm1d Size 一致

## Decoder

In [219]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
        
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        out_list = []
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            out_list.append(outputs)
            
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids, out_list

vocab_size = 10000
decoder = DecoderRNN(256, 512, vocab_size, 1).to(device)

In [230]:
# 模拟数据
batch = 2

feature = torch.rand((batch, 256)).to(device)
caption = torch.randint(vocab_size, (batch, 15)).to(device)
lengths = [15, 13]

- feature: Encoder 的输出, [batch, 256]  
- caption: Caption对应的词典id, [batch, max_length]  
- lengths: 每一个Caption的有效长度，list   
注意，一个Batch中每一个Caption的有效长度不是固定的, caption是按照有效长度降序排序的。

In [232]:
emb = nn.Embedding(vocab_size, 256)

TODO: 无法模拟数据，如何解决这个问题？

In [233]:
emb(caption)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got CUDAFloatTensor instead (while checking arguments for embedding)