# Understanding Layer Normalization

In [127]:
import torch
from torch import nn

sequence_length = 10
batch_size = 1
input_dim = 512

# Seed the global RNG so that Restart & Run All regenerates the same x_prime.
torch.manual_seed(42)

# x_prime stands in for X', the linear transformation of the original input
# sequence X coming from the previous layer; here it is standard-normal noise.
# Note the axis order: (sequence_length, batch_size, input_dim).
x_prime = torch.randn((sequence_length, batch_size, input_dim))

In [128]:
# gamma (scale) and beta (shift) are the learnable affine parameters of the
# normalization. They span the last two axes of x_prime, and start out as the
# identity transform (all ones / all zeros).
parameter_shape = x_prime.shape[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))

(gamma.shape, beta.shape)

(torch.Size([1, 512]), torch.Size([1, 512]))

In [129]:
# Axes to reduce over, counted from the end: one negative index per entry of
# parameter_shape, starting at -1.
dims = list(range(-1, -len(parameter_shape) - 1, -1))

dims

[-1, -2]

In [130]:
# Average over the axes listed in dims; keepdim=True leaves them in place as
# size-1 axes so the result broadcasts against x_prime.
mean = torch.mean(x_prime, dim=dims, keepdim=True)

mean

tensor([[[ 1.4665e-02]],

        [[ 2.8745e-02]],

        [[-2.7341e-02]],

        [[ 5.5685e-03]],

        [[-4.1234e-02]],

        [[ 5.0047e-02]],

        [[ 2.7336e-02]],

        [[-4.9045e-05]],

        [[-1.3190e-03]],

        [[ 1.1447e-02]]])

In [131]:
epsilon = 1e-5  # added under the square root so std can never be exactly zero
# Biased variance: the mean of the squared deviations over the same axes as `mean`.
var = (x_prime - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)

std

tensor([[[1.0420]],

        [[0.9436]],

        [[0.9886]],

        [[0.9858]],

        [[0.9641]],

        [[1.0130]],

        [[1.0716]],

        [[1.0272]],

        [[1.0307]],

        [[0.9771]]])

In [132]:
# Same evaluation order as the one-line form: scale the centered input by
# gamma, divide by std, then shift by beta.
scaled = gamma * (x_prime - mean)
y = scaled / std + beta

y

tensor([[[-0.6089,  0.4439, -0.8347,  ..., -2.5158,  0.6711, -0.7572]],

        [[-0.5859,  0.9957, -0.4961,  ...,  1.7858,  0.5414, -1.2964]],

        [[-1.1442, -0.3652,  1.1107,  ...,  0.0585,  0.0890,  0.1515]],

        ...,

        [[-0.7613,  0.6279, -1.3385,  ..., -1.0500,  0.9928, -1.4089]],

        [[-1.8199,  0.7994, -1.6335,  ...,  0.5430, -0.1617, -1.0083]],

        [[-1.1573, -0.0651,  1.2419,  ...,  0.6839, -0.2760,  0.0400]]],
       grad_fn=<AddBackward0>)

# Class Representation

In [133]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
  '''
  Layer normalization with a learnable element-wise scale (gamma) and shift (beta).

  Subclasses nn.Module so that gamma and beta are registered parameters: they
  show up in .parameters() / state_dict(), follow .to(device), and the layer
  can be invoked as layer(x) as well as layer.forward(x).
  '''

  def __init__(self, parameters_shape, eps=1e-5):
    '''
    :param parameters_shape: sizes of the trailing dimensions of the input that
      are normalized together, e.g. [input_dim] or [batch_size, input_dim].
      gamma and beta are created with exactly this shape. A single int is
      treated as a one-element shape.
    :param eps: small constant added to the variance before the square root so
      the division never hits zero.
    '''
    # Must run before any nn.Parameter is assigned, otherwise nn.Module
    # cannot register the parameters.
    super().__init__()
    if isinstance(parameters_shape, int):
      parameters_shape = (parameters_shape,)
    self.parameters_shape = parameters_shape
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(parameters_shape))
    self.beta = nn.Parameter(torch.zeros(parameters_shape))

  def forward(self, x):
    '''
    :param x: input tensor whose trailing dimensions match parameters_shape,
      e.g. [sequence_length, batch_size, input_dim]
    :return: tensor of the same shape as x, normalized over the trailing
      len(parameters_shape) dimensions, then scaled by gamma and shifted by beta
    '''
    # Reduce over the last len(parameters_shape) axes: [-1, -2, ...].
    dims = [-(i + 1) for i in range(len(self.parameters_shape))]
    mean = x.mean(dim=dims, keepdim=True)
    # Biased variance (divide by N), computed from the centered input.
    var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
    std = (var + self.eps).sqrt()
    y = self.gamma * (x - mean) / std + self.beta
    return y

In [134]:
# Build the layer for the trailing two axes of x_prime and run it once on the
# same input used in the step-by-step cells.
norm_shape = x_prime.shape[-2:]
layer_norm = LayerNormalization(norm_shape)
y = layer_norm.forward(x_prime)

y

tensor([[[-0.6089,  0.4439, -0.8347,  ..., -2.5158,  0.6711, -0.7572]],

        [[-0.5859,  0.9957, -0.4961,  ...,  1.7858,  0.5414, -1.2964]],

        [[-1.1442, -0.3652,  1.1107,  ...,  0.0585,  0.0890,  0.1515]],

        ...,

        [[-0.7613,  0.6279, -1.3385,  ..., -1.0500,  0.9928, -1.4089]],

        [[-1.8199,  0.7994, -1.6335,  ...,  0.5430, -0.1617, -1.0083]],

        [[-1.1573, -0.0651,  1.2419,  ...,  0.6839, -0.2760,  0.0400]]],
       grad_fn=<AddBackward0>)