In [1]:
import torch
import torch.nn as nn

There are two inputs that feed a normalization layer in the Transformer:
1. The output of multihead attention layer.
2. The output of embedding layer. (word embedding + positional embedding)

In [8]:
# Toy batch of embeddings with shape [batch_size, seq_len, emb_size] = [2, 1, 3].
emb_seq = torch.tensor(
    [
        [[0.2, 0.1, 0.3]],
        [[0.5, 0.1, 0.1]],
    ]
)

# Statistics over the last (embedding) axis; keepdim=True keeps a size-1
# axis so the results broadcast against emb_seq.
mean = torch.mean(emb_seq, dim=-1, keepdim=True)
# torch.std is called with its default settings here.
std = torch.std(emb_seq, dim=-1, keepdim=True)

In [7]:
# Display the per-token mean taken over the embedding dimension (dim=2, keepdim=True).
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [15]:
# Population (biased) variance: mean of squared deviations, i.e. divide by N.
(emb_seq - mean).pow(2).mean(dim=-1, keepdim=True)

tensor([[[0.0067]],

        [[0.0356]]])

In [14]:
# Variance as computed by torch.var with its default settings; compare with
# the manually computed mean of squared deviations.
torch.var(emb_seq, dim=-1, keepdim=True)

tensor([[[0.0100]],

        [[0.0533]]])

In [9]:
# Display the standard deviation obtained from emb_seq.std(dim=2, keepdim=True).
std

tensor([[[0.1000]],

        [[0.2309]]])

In [11]:
# Manual normalization: center each token, then scale by the std computed
# earlier; the 1e-5 term keeps the denominator away from zero.
emb_seq_normalized = torch.div(emb_seq - mean, std + 1e-5)
emb_seq_normalized

tensor([[[ 0.0000, -0.9999,  0.9999]],

        [[ 1.1547, -0.5773, -0.5773]]])

In [17]:
# Shape of everything after the batch axis.
emb_seq.shape[1:]

torch.Size([1, 3])

In [29]:
# Broadcasting check: a 1-D parameter of length 5 is added to a [2, 1, 5]
# tensor, and the three shapes are printed one per line.
p = nn.Parameter(torch.ones(5))
input = torch.randn(2, 1, 5)
p_input = input + p
print(p.shape, input.shape, p_input.shape, sep="\n")

torch.Size([5])
torch.Size([2, 1, 5])
torch.Size([2, 1, 5])


In [38]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of the input.

    The input is standardized with the mean and the biased (divide-by-N)
    variance computed over its last ``len(params_shape)`` dimensions, then
    scaled and shifted by the learnable ``gamma`` and ``beta``:

        out = gamma * (input - mean) / sqrt(var + esp) + beta

    Args:
        params_shape: Shape of the trailing dimensions to normalize over
            (also the shape of ``gamma`` and ``beta``). A single int is
            accepted as shorthand for a one-element shape.
        esp: Small constant added to the variance for numerical stability
            (the parameter name is kept as-is for backward compatibility).
    """

    def __init__(self, params_shape:list, esp:float=1e-5):
        super().__init__()
        # Accept a bare int, and copy the sequence so that later mutation
        # of the caller's list cannot change which dims are normalized.
        if isinstance(params_shape, int):
            params_shape = [params_shape]
        self.params_shape = list(params_shape)
        self.gamma = nn.Parameter(torch.ones(self.params_shape))
        self.beta = nn.Parameter(torch.zeros(self.params_shape))
        self.esp = esp

    def forward(self, input):
        """Normalize ``input`` over its trailing ``len(params_shape)`` dims.

        Returns a tensor with the same shape as ``input``.
        """
        # Trailing axes to reduce over, e.g. (-2, -1) for a 2-D params_shape.
        dims = tuple(range(-len(self.params_shape), 0))
        mean = input.mean(dim=dims, keepdim=True)
        centered = input - mean
        var = centered.pow(2).mean(dim=dims, keepdim=True)
        # Epsilon goes inside the square root: sqrt(var) has an infinite
        # derivative at var == 0, which produces NaN gradients whenever a
        # normalized slice is constant. sqrt(var + esp) stays finite.
        y = centered / torch.sqrt(var + self.esp)
        return self.gamma * y + self.beta

In [39]:
# Normalize over the last dimension only (emb_size = 3).
layer_norm = LayerNormalization(params_shape=[3])

In [40]:
# Call the module itself rather than .forward(): nn.Module.__call__ is the
# supported entry point and also runs any registered hooks.
emb_seq_normalized = layer_norm(emb_seq)
emb_seq_normalized.shape

torch.Size([2, 1, 3])