# Understanding Layer Normalization

In [9]:
import torch
from torch import nn

sequence_length = 10
batch_size = 1
input_dim = 512

# Dedicated, seeded generator so that x_prime (and every value derived from it)
# is identical on each run, without touching the global RNG state.
seed = 0
generator = torch.Generator().manual_seed(seed)

# X' is generated by linear transformation of original input sequence X from the previous layer
# Mind the dimension order: (sequence_length, batch_size, input_dim)
x_prime = torch.randn((sequence_length, batch_size, input_dim), generator=generator)

In [10]:
# Shape of the two trailing dimensions of x_prime, i.e. (batch_size, input_dim).
parameter_shape = x_prime.shape[-2:]

# Learnable scale (gamma) starts at one and learnable shift (beta) at zero,
# so the affine step is initially the identity.
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

(gamma.size(), beta.size())

(torch.Size([1, 512]), torch.Size([1, 512]))

In [11]:
# Negative indices of the trailing dimensions to reduce over, innermost first
# (one index per entry of parameter_shape).
dims = list(range(-1, -len(parameter_shape) - 1, -1))

dims

[-1, -2]

In [12]:
# Mean over the dimensions listed in `dims`; keepdim=True keeps the reduced
# axes as size 1 so the result broadcasts against x_prime.
mean = torch.mean(x_prime, dim=dims, keepdim=True)

mean

tensor([[[-0.0068]],

        [[ 0.0178]],

        [[-0.0527]],

        [[ 0.0270]],

        [[-0.0360]],

        [[ 0.0268]],

        [[-0.0213]],

        [[ 0.0770]],

        [[-0.0013]],

        [[ 0.0203]]])

In [13]:
epsilon = 1e-5  # Added under the square root to prevent std being zero

# Biased variance: mean of squared deviations over the dimensions in `dims`.
var = (x_prime - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)

std

tensor([[[1.0428]],

        [[0.9649]],

        [[1.0358]],

        [[0.9930]],

        [[0.9665]],

        [[1.0093]],

        [[0.9559]],

        [[0.9482]],

        [[1.0401]],

        [[1.0553]]])

In [14]:
# Scale the centered input by gamma, divide by std, then shift by beta.
y = (gamma * (x_prime - mean)).div(std).add(beta)

y

tensor([[[ 0.8458,  0.3772, -0.0408,  ...,  1.4994, -0.2065,  0.6609]],

        [[ 1.2053,  1.0577,  0.7136,  ..., -0.0042, -0.6064, -2.9316]],

        [[-1.1550,  0.0506,  1.0039,  ..., -1.2100, -0.2253,  0.0459]],

        ...,

        [[ 0.2312,  0.9849,  1.0613,  ...,  0.0413, -0.4037,  0.1094]],

        [[ 1.8095, -0.1417, -0.7581,  ..., -0.5188,  1.7325, -0.9280]],

        [[ 0.5270,  0.7654,  1.1790,  ...,  0.4023,  0.1251,  0.2346]]],
       grad_fn=<AddBackward0>)

# Class Representation

In [15]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
  '''
  Layer normalization over the trailing dimensions of the input.

  The input is standardized (zero mean, unit variance, using the biased
  variance) over its last ``len(parameters_shape)`` dimensions and then
  transformed by a learnable scale ``gamma`` and shift ``beta``.
  '''

  def __init__(self, parameters_shape, eps=1e-5):
    '''
    :param parameters_shape: shape of the trailing dimensions to normalize
        over, which is also the shape of gamma and beta
        (e.g. [batch_size, input_dim]). A plain int is treated as a
        one-dimensional shape.
    :param eps: constant added to the variance before the square root so
        that the standard deviation is never zero
    '''
    super().__init__()
    # torch.ones/zeros accept a bare int, but forward() needs len() of the
    # shape, so wrap an int into a 1-tuple up front.
    if isinstance(parameters_shape, int):
      parameters_shape = (parameters_shape,)
    self.parameters_shape = parameters_shape
    self.eps = eps
    # Scale starts at one and shift at zero: the affine step is initially
    # the identity.
    self.gamma = nn.Parameter(torch.ones(parameters_shape))
    self.beta = nn.Parameter(torch.zeros(parameters_shape))

  def forward(self, x):
    '''
    :param x: input whose trailing dimensions correspond to parameters_shape,
        e.g. [sequence_length, batch_size, input_dim] when parameters_shape
        is [batch_size, input_dim]
    :return: normalized, scaled and shifted tensor (same shape as x when its
        trailing dimensions equal parameters_shape)
    '''
    # Negative indices of the trailing dimensions to reduce over: -1, -2, ...
    dims = [-(i + 1) for i in range(len(self.parameters_shape))]
    # keepdim=True keeps reduced axes as size 1 so mean/std broadcast against x.
    mean = x.mean(dim=dims, keepdim=True)
    var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
    std = (var + self.eps).sqrt()
    y = self.gamma * (x - mean) / std + self.beta
    return y

In [16]:
# gamma and beta take the shape of the two trailing dimensions of x_prime.
layer_norm = LayerNormalization(x_prime.size()[-2:])
# Call the module itself rather than .forward(): nn.Module.__call__ dispatches
# to forward() and also runs any registered hooks.
y = layer_norm(x_prime)

y

tensor([[[ 0.8458,  0.3772, -0.0408,  ...,  1.4994, -0.2065,  0.6609]],

        [[ 1.2053,  1.0577,  0.7136,  ..., -0.0042, -0.6064, -2.9316]],

        [[-1.1550,  0.0506,  1.0039,  ..., -1.2100, -0.2253,  0.0459]],

        ...,

        [[ 0.2312,  0.9849,  1.0613,  ...,  0.0413, -0.4037,  0.1094]],

        [[ 1.8095, -0.1417, -0.7581,  ..., -0.5188,  1.7325, -0.9280]],

        [[ 0.5270,  0.7654,  1.1790,  ...,  0.4023,  0.1251,  0.2346]]],
       grad_fn=<AddBackward0>)