In [2]:
import torch
from torch import nn

In [3]:
# Toy batch laid out as (batch, sequence, embedding) = (1, 2, 3).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Move to a sequence-first layout (S, B, E). `transpose` really swaps the two
# axes; `reshape(S, B, E)` would only reinterpret the flat memory and would
# scramble tokens across sequences as soon as B > 1.
inputs = inputs.transpose(0, 1)
inputs.size()

torch.Size([2, 1, 3])

In [4]:
# Learnable scale (gamma, starts at 1) and shift (beta, starts at 0).
# Their shape is the trailing two dims of `inputs`, i.e. (batch, embedding)
# for the sequence-first tensor built above.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [5]:
# Both parameters share `parameter_shape`.
gamma.shape, beta.shape

(torch.Size([1, 3]), torch.Size([1, 3]))

In [6]:
# Negative indices of the trailing axes covered by `parameter_shape`,
# counted from the end: one entry per parameter dimension.
dims = [-k for k in range(1, len(parameter_shape) + 1)]

In [7]:
# Show the axes that are passed as `dim=` to the mean/variance reductions.
dims

[-1, -2]

In [8]:
# Average over the normalized axes; keepdim=True keeps them as size-1 axes
# so the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [9]:
# One mean per sequence position (reduced axes retained as size 1).
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [10]:
# Population variance (mean of squared deviations) over the same axes,
# then a standard deviation with epsilon added under the square root so the
# later division stays finite when the variance is zero.
epsilon = 1e-5
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [11]:
# Standardize: subtract the mean and divide by the standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [12]:
# Apply the learnable affine transform: scale by gamma, then shift by beta.
out = torch.add(gamma * y, beta)

In [13]:
# gamma is all ones and beta all zeros here, so `out` has the same values as
# `y`; it additionally carries a grad_fn because the parameters require grad.
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## Class

In [14]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of a tensor.

    The input is standardized (zero mean, unit population variance) across
    its last ``len(parameters_shape)`` dimensions, then scaled by ``gamma``
    and shifted by ``beta``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over;
            also the shape of the learnable ``gamma`` and ``beta``.
        eps: Small constant added to the variance before the square root so
            the division stays finite when the variance is zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # Subclassing nn.Module registers gamma/beta so they appear in
        # `.parameters()` / `.state_dict()` and follow `.to(device)`.
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and apply the affine transform.

        Prints the intermediate mean, standard deviation, normalized values
        and final output for inspection.

        Args:
            input: Tensor whose trailing dimensions match ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        # Trailing axes to reduce over, e.g. [-1] for a 1-D parameters_shape.
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the `input` argument, not a notebook-level global tensor.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [15]:
# Random sequence-first batch: (sentence_length, batch_size, embedding_dim).
sentence_length, batch_size, embedding_dim = 3, 2, 4
inputs = torch.randn((sentence_length, batch_size, embedding_dim))

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([3, 2, 4])) = 
 tensor([[[ 2.4057,  2.5422,  1.7291,  0.1332],
         [ 0.5426, -1.2210, -0.2213, -0.4375]],

        [[ 0.2198, -0.0851,  2.1993, -1.5436],
         [-0.9357, -1.6414, -0.9650, -0.3945]],

        [[-0.7204, -3.1008,  0.2288, -0.1439],
         [-0.9144,  0.0583,  0.7968,  0.2167]]])


In [16]:
# Normalize over the embedding axis only: parameters_shape is the last dim.
layer_norm = LayerNormalization(inputs.shape[-1:])

In [17]:
# Run the normalization on the random batch; forward() also prints the
# intermediate mean, standard deviation, normalized values and output.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([3, 2, 1])): 
 tensor([[[ 1.7026],
         [-0.3343]],

        [[ 0.1976],
         [-0.9842]],

        [[-0.9341],
         [ 0.0394]]])
Standard Deviation 
 (torch.Size([3, 2, 1])): 
 tensor([[[0.9569],
         [0.6282]],

        [[1.3341],
         [0.4422]],

        [[1.2959],
         [0.6155]]])
y 
 (torch.Size([3, 2, 4])) = 
 tensor([[[ 0.7348,  0.8774,  0.0277, -1.6399],
         [ 1.3958, -1.4115,  0.1798, -0.1642]],

        [[ 0.0166, -0.2119,  1.5004, -1.3052],
         [ 0.1095, -1.4861,  0.0434,  1.3332]],

        [[ 0.1649, -1.6720,  0.8974,  0.6098],
         [-1.5496,  0.0308,  1.2307,  0.2881]]])
out 
 (torch.Size([3, 2, 4])) = 
 tensor([[[ 0.7348,  0.8774,  0.0277, -1.6399],
         [ 1.3958, -1.4115,  0.1798, -0.1642]],

        [[ 0.0166, -0.2119,  1.5004, -1.3052],
         [ 0.1095, -1.4861,  0.0434,  1.3332]],

        [[ 0.1649, -1.6720,  0.8974,  0.6098],
         [-1.5496,  0.0308,  1.2307,  0.2881]]], grad_fn=<AddBackward0>)


In [18]:
# Sanity check on the first sequence position. `out[0]` holds 2 rows x 4
# features, each row normalized with the population variance, while
# `torch.std` applies Bessel's correction by default. The pooled std over
# those 8 values is therefore about sqrt(8/7) ~= 1.069 rather than exactly 1.
torch.mean(out[0]), torch.std(out[0])

(tensor(8.9407e-08, grad_fn=<MeanBackward0>),
 tensor(1.0690, grad_fn=<StdBackward0>))