References:
- https://www.youtube.com/watch?v=G45TuC6zRf4

In [1]:
import torch

In [2]:
"""
A batch dimension has been added to the same input as before. During
training we typically work with batches, because batching lets the
computation run in parallel and makes training faster.

The input will then be reshaped to (number of words, batch size, embedding):
- the number of words is 2
- the batch size is 1 in this case
- the embedding of each word has size 3
"""

# Build the toy input directly from literal data. torch.tensor infers float32
# from the Python floats and is the documented factory for pre-existing data
# (torch.Tensor is the legacy class constructor).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])

# batch, number of words, embedding
B, S, E = inputs.size()

print(B, S, E)
print(inputs)

1 2 3
tensor([[[0.2000, 0.1000, 0.3000],
         [0.5000, 0.1000, 0.1000]]])


In [3]:
# Move the word axis to the front: (B, S, E) -> (S, B, E).
# transpose swaps the two axes and keeps every word vector intact. reshape only
# reinterprets the flat memory layout, so it matches a transpose only while
# B == 1 and would mix words from different batch entries for larger batches.
# NOTE: this rebinds `inputs`; running the cell a second time swaps the axes back.
inputs = inputs.transpose(0, 1)

print(inputs.size())
print(inputs)

torch.Size([2, 1, 3])
tensor([[[0.2000, 0.1000, 0.3000]],

        [[0.5000, 0.1000, 0.1000]]])


In [4]:
"""
Because the input now has a batch dimension, the normalization statistics
here are computed not only over the last (embedding) dimension but over
the last two dimensions: batch and embedding.

In this case the batch dimension is 1, so it does not make much of a
difference.

For reference: in this walkthrough layer normalization is computed
across the layer (embedding) dimension and also the batch dimension.
"""

# Shape of the learnable parameters: the last two axes of `inputs`.
parameter_shape = inputs.shape[-2:]

# gamma is the learnable scale and beta the learnable shift applied after
# normalization. gamma starts as all ones and beta as all zeros, so the layer
# initially leaves the normalized values unchanged.
# Because parameter_shape spans the last two axes, both are 1 x 3 matrices;
# taking only the last axis would give plain 3-element vectors instead.
gamma, beta = (
    torch.nn.Parameter(torch.ones(parameter_shape)),
    torch.nn.Parameter(torch.zeros(parameter_shape)),
)

(gamma.shape, beta.shape)

(torch.Size([1, 3]), torch.Size([1, 3]))

In [5]:
"""
Compute the dimensions over which layer normalization is applied, that is:
the batch dimension as well as the embedding dimension.

These are the last two dimensions of the input.
"""
# One negative axis index per entry of parameter_shape, counted backwards from
# the last axis: a two-entry shape yields [-1, -2].
dims = list(range(-1, -len(parameter_shape) - 1, -1))

(parameter_shape, dims)

(torch.Size([1, 3]), [-1, -2])

In [6]:
"""
Take the mean across the batch dimension and the layer (embedding)
dimension; with keepdim=True we end up with a 2 x 1 x 1 tensor.
"""
# Average over the axes listed in `dims`; keepdim=True keeps those axes with
# size 1 so the result broadcasts against `inputs` in the following cells.
mean = torch.mean(inputs, dim=dims, keepdim=True)
(mean.shape, mean)

(torch.Size([2, 1, 1]), tensor([[[0.2000]],
 
         [[0.2333]]]))

In [7]:
"""
The standard deviation is computed in the same way as before.

Notice that a small epsilon value is added to the variance. The standard
deviation is used as a denominator in the next step, and epsilon ensures
that this denominator never becomes zero.
"""

# Small constant that keeps the denominator away from zero.
epsilon = 1e-5

# Variance as the mean of the squared deviations over `dims`.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)

# epsilon is added under the square root, i.e. to the variance.
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [8]:
"""
Computing (inputs - mean) / std gives exactly the same matrix that we
worked out by hand, which is great.
"""
# Normalize: subtract the mean, then divide by the standard deviation.
# mean and std have size-1 axes, so both operations broadcast over `inputs`.
y = torch.div(inputs - mean, std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [9]:
"""
Multiplying y by gamma (a matrix of ones) and adding beta (a matrix of
zeros) still gives exactly the same matrix.

This time, however, the output carries an additional attribute (grad_fn=...).
It shows that the result depends on learnable parameters, in this case
gamma and beta, which are updated during training via backpropagation.
"""
# Apply the learnable scale (gamma) and shift (beta) to the normalized values.
out = torch.add(gamma * y, beta)
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

## Class

In [10]:
import torch

class LayerNormalization():
    """Layer normalization over the trailing axes of a tensor.

    The input is normalized to zero mean and unit (biased) variance over its
    last ``len(parameters_shape)`` axes, then scaled by ``gamma`` and shifted
    by ``beta``.

    Args:
        parameters_shape: Shape of the trailing axes to normalize over, and of
            ``gamma`` / ``beta``. A single int is treated as a one-axis shape.
        eps: Small constant added to the variance before the square root so
            the division never hits zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # forward() calls len() on the shape, so wrap a bare int in a tuple.
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Learnable scale (starts at ones) and shift (starts at zeros).
        self.gamma = torch.nn.Parameter(torch.ones(parameters_shape))
        self.beta = torch.nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and apply the learnable scale and shift.

        Prints the intermediate mean, standard deviation, normalized values
        and final output while computing them.

        Args:
            input: Tensor whose trailing axes broadcast against
                ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        # One negative axis index per normalized axis, e.g. [-1] or [-1, -2].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument itself, not a notebook-global tensor.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [11]:
batch_size = 3
sentence_length = 5
embedding_dim = 8

# Fix the seed so the random input, and every output derived from it, is the
# same on each Restart & Run All.
torch.manual_seed(0)
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.9336,  1.4350,  1.4550, -1.1124, -0.0604,  1.7027,  0.4845,
          -0.6321],
         [-2.0648,  0.9175,  0.9180,  0.7445,  0.7821, -0.8418, -0.0627,
          -0.2638],
         [-1.5282, -0.0943,  1.2761,  0.6899,  0.0288,  1.3201, -0.1306,
          -1.8454]],

        [[-1.3317,  2.0521,  0.7397,  0.2874, -1.7054, -0.2183,  0.1137,
          -0.5181],
         [ 0.7227, -0.0262, -1.6567, -2.0064, -0.0487, -1.1105,  0.2263,
          -0.2071],
         [ 0.4981, -1.0664,  0.1809,  0.0433,  0.0584, -0.9919, -1.3965,
           0.3221]],

        [[-0.8001,  1.4854,  0.1499,  0.1294,  1.0086,  0.2764,  0.3969,
           0.0314],
         [ 2.0431,  0.5838, -0.0537, -0.7300, -0.6818, -0.1660, -0.7880,
          -0.3153],
         [ 0.6759,  0.5725, -0.8026,  1.2495,  0.5475,  0.7430, -0.1975,
           0.0752]],

        [[-0.7739,  0.5131,  1.1141,  0.8539,  2.9295,  1.5332, -1.1278,
          -1.5058],
         [ 1.0423,  0.1607, 

In [12]:
# The parameter shape is only the last axis of `inputs`, so the layer
# normalizes over that single axis.
layer_norm = LayerNormalization(parameters_shape=inputs.shape[-1:])

In [13]:
# Run the layer on the random batch. forward() prints the mean, the standard
# deviation, the normalized values and the final output as it computes them.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[ 0.2923],
         [ 0.0161],
         [-0.0355]],

        [[-0.0726],
         [-0.5133],
         [-0.2940]],

        [[ 0.3347],
         [-0.0135],
         [ 0.3579]],

        [[ 0.4420],
         [-0.3397],
         [-0.1422]],

        [[ 0.2058],
         [-0.2580],
         [-0.0630]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[1.0688],
         [0.9934],
         [1.0957]],

        [[1.1073],
         [0.9024],
         [0.6865]],

        [[0.6366],
         [0.8848],
         [0.5981]],

        [[1.4027],
         [1.0988],
         [0.9590]],

        [[0.6784],
         [0.6441],
         [0.8274]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-1.1470,  1.0691,  1.0878, -1.3143, -0.3301,  1.3196,  0.1798,
          -0.8649],
         [-2.0948,  0.9074,  0.9079,  0.7332,  0.7711, -0.8637, -0.0794,
          -0.2818],
         [-1.3623, -0.0537,  1.1970,  0.6620,  0.0586,  1.2371, -0.0868,
          -1.6518]],



In [14]:
# Sanity check on the first slice of the output: the mean should be ~0.
# std() applies Bessel's correction by default, so over the 3 x 8 = 24 values
# of out[0] it reads about sqrt(24 / 23) ~= 1.0215 rather than exactly 1.
torch.mean(out[0]), torch.std(out[0])

(tensor(2.9802e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))