In [1]:
import torch
import torch.nn as nn

In [2]:
GPT_CONFIG_124M = {
    "vocab_size":50257,
    "context_length":1024,
    "embedding_dim":768,
    "num_of_heads":12,
    "num_of_layers":12,
    "drop_rate":0.1,
    "qkv_bias":False
}

In [3]:
torch.manual_seed(123)
batch_example = torch.randn(2,5)
batch_example

tensor([[-0.1115,  0.1204, -0.3696, -0.2404, -1.1969],
        [ 0.2093, -0.9724, -0.7550,  0.3239, -0.1085]])

### Without Normalization

In [4]:
layer = nn.Sequential(nn.Linear(5,6), nn.ReLU())
layer

Sequential(
  (0): Linear(in_features=5, out_features=6, bias=True)
  (1): ReLU()
)

In [5]:
output = layer(batch_example)
output

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [6]:
mean = output.mean(dim=-1, keepdim=True)
variance = output.var(dim=-1, keepdim=True)
print(mean)
print("----------------")
print(variance)

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
----------------
tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


### Using Layer Normalization

In [7]:
mean = output.mean(dim=-1, keepdim=True)
variance = output.var(dim=-1, keepdim=True)

output_normalized =( output - mean) / torch.sqrt(variance)
output_normalized

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)

In [8]:
mean = output_normalized.mean(dim=-1, keepdim=True)
variance = output_normalized.var(dim=-1, keepdim=True)

torch.set_printoptions(sci_mode=False)
print(mean)
print("----------------")
print(variance)

tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
----------------
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


# Build Class

In [9]:
class LayerNormalization(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))
        
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True)
        
        x_normalized = (x - mean)/ torch.sqrt(variance + self.eps)
        return self.scale * x_normalized + self.shift

In [12]:
batch_size, embedding_size = output.shape
embedding_size

6

In [14]:
lm = LayerNormalization(embedding_size)

In [16]:
normalized_from_class =  lm(output)

In [17]:
mean = normalized_from_class.mean(dim=-1, keepdim=True)
variance = normalized_from_class.var(dim=-1, keepdim=True)

torch.set_printoptions(sci_mode=False)
print(mean)
print("----------------")
print(variance)

tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
----------------
tensor([[0.9996],
        [0.9997]], grad_fn=<VarBackward0>)
