In [1]:
import torch
import torch.nn as nn

# Single source of truth for the RNG seed, so the tensors printed below are
# reproducible and changing the seed only requires editing one line.
seed = 42
torch.manual_seed(seed)
# Safe to call on CPU-only machines: it is silently ignored without CUDA.
torch.cuda.manual_seed(seed)

### NLP Example

* Sample Embedding 생성

In [2]:
# Toy NLP batch: 2 sentences, 4 tokens per sentence, 5-dimensional embeddings.
batch = 2
sentence_len = 4
embed_dim = 5

# Laid out as (batch, embed_dim, sentence_len), i.e. channels-first.
embed_shape = (batch, embed_dim, sentence_len)
batch_embed = torch.randn(embed_shape)
print(batch_embed)

tensor([[[ 1.9269,  1.4873,  0.9007, -2.1055],
         [ 0.6784, -1.2345, -0.0431, -1.6047],
         [-0.7521,  1.6487, -0.3925, -1.4036],
         [-0.7279, -0.5594, -0.7688,  0.7624],
         [ 1.6423, -0.1596, -0.4974,  0.4396]],

        [[-0.7581,  1.0783,  0.8008,  1.6806],
         [ 0.0349,  0.3211,  1.5736, -0.8455],
         [ 1.3123,  0.6872, -1.0892, -0.3553],
         [-1.4181,  0.8963,  0.0499,  2.2667],
         [ 1.1790, -0.4345, -1.3864, -1.2862]]])


* nn.BatchNorm1d 함수를 사용한 Normalize 
    * batch를 기준으로 계산한 $\mu, \sigma^2$를 이용하여 Normalize
    * nn.BatchNorm1d은 입력을 (batch, embed_dim, sentence_len) 형태로 받으므로, 위에서 생성한 Sample Embedding을 그대로 입력으로 사용 (아래의 직접 구현에서는 transpose를 통해 (batch, sentence_len, embed_dim)으로 형태를 변경하여 계산)
    * embedding 차원(embed_dim)별로, batch와 sentence 내 모든 위치의 값을 모아 $\mu, \sigma^2$를 계산
        * eg. $\mu_{1} = \frac{\sum_{b=1}^{batch size} \sum_{t=1}^{sentence len} embed_{b,1,t}}{batch size \times sentence len}$


In [None]:
# Built-in batch norm with one feature (channel) per embedding dimension.
batch_1d_norm = nn.BatchNorm1d(num_features=embed_dim)

# batch_embed is still (batch, embed_dim, sentence_len) here, which is the
# channels-first layout that BatchNorm1d accepts for 3D input.
output = batch_1d_norm(batch_embed)
print(output)

tensor([[[ 1.0112,  0.6694,  0.2133, -2.1240],
         [ 0.8342, -1.1157,  0.0988, -1.4930],
         [-0.6723,  1.6039, -0.3313, -1.2899],
         [-0.7092, -0.5581, -0.7460,  0.6279],
         [ 1.6754, -0.0950, -0.4269,  0.4937]],

        [[-1.0764,  0.3514,  0.1356,  0.8197],
         [ 0.1783,  0.4700,  1.7466, -0.7191],
         [ 1.2850,  0.6923, -0.9918, -0.2960],
         [-1.3285,  0.7479, -0.0114,  1.9775],
         [ 1.2202, -0.3651, -1.3004, -1.2020]]],
       grad_fn=<NativeBatchNormBackward0>)


* Batch Normalization 구현 
* $y = \frac{x - E[x]}{\sqrt{Var[x] + \epsilon}}*\gamma + \beta$
* $\gamma, \beta$는 학습 가능한 parameters (weight, bias)
* var 계산 시, unbiased=False로 설정하지 않으면 Bessel’s correction이 적용되어 분모로 n이 아닌 n-1을 사용하게 된다  

In [None]:
# Move the embedding axis last: (batch, embed_dim, sentence_len) ->
# (batch, sentence_len, embed_dim).
# NOTE: this rebinds `batch_embed`, so running the cell a second time swaps
# the axes back and the statistics below would be taken over the wrong axes.
batch_embed = batch_embed.transpose(1, 2)

# Reduce over batch and sentence positions, leaving one value per embedding dim.
reduce_dims = (0, 1)
eg_mean = batch_embed.mean(dim=reduce_dims)
# unbiased=False divides by n (not n-1).
eg_var = batch_embed.var(dim=reduce_dims, unbiased=False)

print('mean:\n ', eg_mean)
print(' ')
print('var:\n', eg_var)

mean:
  tensor([ 0.6264, -0.1400, -0.0431,  0.0626, -0.0629])
 
var:
 tensor([1.6543, 0.9625, 1.1125, 1.2423, 1.0358])


In [None]:
# 위의 결과와 동일
eg_x_hat = (batch_embed - eg_mean) / torch.sqrt(eg_var + batch_1d_norm.eps)
print((batch_1d_norm.weight * eg_x_hat + batch_1d_norm.bias).transpose(1, 2))

tensor([[[ 1.0112,  0.6694,  0.2133, -2.1240],
         [ 0.8342, -1.1157,  0.0988, -1.4930],
         [-0.6723,  1.6039, -0.3313, -1.2899],
         [-0.7092, -0.5581, -0.7460,  0.6279],
         [ 1.6754, -0.0950, -0.4269,  0.4937]],

        [[-1.0764,  0.3514,  0.1356,  0.8197],
         [ 0.1783,  0.4700,  1.7466, -0.7191],
         [ 1.2850,  0.6923, -0.9918, -0.2960],
         [-1.3285,  0.7479, -0.0114,  1.9775],
         [ 1.2202, -0.3651, -1.3004, -1.2020]]], grad_fn=<TransposeBackward0>)


### Image Example

In [None]:
# Toy image batch: 2 images, 3 channels, 5x5 pixels each.
batch, channel = 2, 3
height, width = 5, 5

# Laid out as (batch, channel, height, width), i.e. channels-first.
img_shape = (batch, channel, height, width)
batch_img = torch.randn(img_shape)
print(batch_img)

tensor([[[[-0.8371, -0.9224,  1.8113,  0.1606,  0.3672],
          [ 0.1754,  1.3852, -0.4459, -1.2024,  0.7078],
          [-1.0759,  0.5357,  1.1754,  0.5612, -0.4527],
          [-0.7718,  0.1453,  0.2311,  0.0087, -0.1423],
          [ 0.1971, -1.1441,  0.3383,  1.6992,  2.8140]],

         [[ 0.3598, -0.0898,  0.4584, -0.5644,  1.0563],
          [-1.4692,  1.4332,  0.7281, -0.7106, -0.6021],
          [ 0.9604,  0.4048, -1.3543, -0.4976,  0.4747],
          [-0.1976,  1.2683,  1.2243,  0.0981,  1.7423],
          [-1.3527,  0.2191,  0.5526, -0.6788,  0.5743]],

         [[ 0.1877, -0.3576, -0.3165,  0.5886, -0.8905],
          [ 0.4098, -0.9864,  0.1233,  0.3499,  0.6173],
          [-0.1693,  0.2332,  4.0356,  1.2795,  1.0311],
          [-0.7048,  1.0131, -0.3308,  0.5177,  0.3878],
          [-0.5797, -0.1691, -0.5733,  0.5069, -0.4752]]],


        [[[-0.4920,  0.2704, -0.5628,  0.6793,  0.4405],
          [-0.3609, -0.0606,  0.0733,  0.8187,  1.4805],
          [ 0.3449, -1.

* nn.BatchNorm2d 함수를 사용한 Normalize 
    * batch를 기준으로 계산한 $\mu, \sigma^2$를 이용하여 Normalize
    * nn.BatchNorm2d은 입력을 (batch, channel, height, width) 형태로 받음 (직접 구현 시에는 permute를 통해 (batch, height, width, channel)으로 형태를 변경하여 계산)
    * channel별로, batch 내 모든 이미지의 모든 (height, width) 위치의 값을 모아 $\mu, \sigma^2$를 계산
        * eg. $\mu_{1} = \frac{\sum_{b=1}^{batch size} \sum_{h=1}^{height} \sum_{w=1}^{width} x_{b,1,h,w}}{batch size \times height \times width}$
