In [1]:
import torch
import torch.nn as nn

In [22]:
input = torch.tensor([[0.2260, 0.3470, 0.000, 0.2216, 0.000, 0.000],
                      [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.000]])

mean = torch.mean(input,1, keepdim=True)
var = torch.var(input, 1, keepdim=True)

## layer Normalization
layer_norm_inp = (input -mean)/torch.sqrt(var)

In [26]:
layer_norm_inp

tensor([[ 0.6160,  1.4126, -0.8719,  0.5870, -0.8719, -0.8719],
        [-0.0187,  0.1121, -1.0877,  1.5173,  0.5646, -1.0877]])

In [24]:
layer_mean = layer_norm_inp.mean(dim =-1, keepdim=True)
layer_var = layer_norm_inp.var(dim =-1, keepdim=True)

print(f"The mean after layer normalization is {layer_mean}")
print(f"The variance after layer normalization is {layer_var}")

The mean after layer normalization is tensor([[-3.9736e-08],
        [-1.9868e-08]])
The variance after layer normalization is tensor([[1.0000],
        [1.0000]])


In [72]:
input

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]])

In [78]:
eps = 1e-5
import math
scale = nn.Parameter(torch.ones(6))
shift = nn.Parameter(torch.zeros(6))

mean = input.mean(dim =-1, keepdim=True)
var = input.var(dim =-1, keepdim= True)

linear_norm_input = (input-mean)/torch.sqrt(var+eps)
linear_norm_input = linear_norm_input*scale + shift

In [79]:
linear_norm_input

tensor([[ 0.6159,  1.4123, -0.8717,  0.5869, -0.8717, -0.8717],
        [-0.0187,  0.1121, -1.0875,  1.5171,  0.5646, -1.0875]],
       grad_fn=<AddBackward0>)

## Modelling Layer normalization

In [49]:
class Layer_normalization(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.scale = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):

        mean = x.mean(dim =-1, keepdim = True)
        var = x.var(dim =-1, keepdim = True, unbiased = False)
        layer_norm = (x-mean)/torch.sqrt(var+self.eps)
        print(layer_norm)
        print(self.scale*layer_norm+ self.shift)

        return self.scale*layer_norm + self.shift



In [50]:
ln = Layer_normalization(emb_dim =6)
out_ln = ln(input)

tensor([[ 0.6746,  1.5470, -0.9548,  0.6429, -0.9548, -0.9548],
        [-0.0205,  0.1228, -1.1913,  1.6619,  0.6184, -1.1913]])
tensor([[ 0.6746,  1.5470, -0.9548,  0.6429, -0.9548, -0.9548],
        [-0.0205,  0.1228, -1.1913,  1.6619,  0.6184, -1.1913]],
       grad_fn=<AddBackward0>)


In [51]:
ln_mean = out_ln.mean(dim =-1, keepdim=True)
ln_var = out_ln.var(dim =-1, keepdim=True)
print(ln_mean)
print(ln_var)

tensor([[-4.9671e-08],
        [-1.9868e-08]], grad_fn=<MeanBackward1>)
tensor([[1.1994],
        [1.1996]], grad_fn=<VarBackward0>)


In [52]:
out_ln

tensor([[ 0.6746,  1.5470, -0.9548,  0.6429, -0.9548, -0.9548],
        [-0.0205,  0.1228, -1.1913,  1.6619,  0.6184, -1.1913]],
       grad_fn=<AddBackward0>)

## Implementing Shirtcut connections

In [64]:
import math
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):

       return 0.5*x*(1+ torch.tanh(torch.sqrt(2/(torch.Tensor([math.pi])))*(x + 0.044715*(torch.pow(x, 3)))))

In [65]:
class Shortconnections(nn.Module):
    def __init__(self, layer_size, use_shortcut):
        super().__init__()

        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_size[0], layer_size[1]), GELU()),
            nn.Sequential(nn.Linear(layer_size[1], layer_size[2]), GELU()),
            nn.Sequential(nn.Linear(layer_size[2], layer_size[3]), GELU()),
            nn.Sequential(nn.Linear(layer_size[3], layer_size[4]), GELU()),
            nn.Sequential(nn.Linear(layer_size[4], layer_size[5]), GELU())]
        )

    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x+layer_output

            else :
                x = layer_output
        return x

In [67]:
layer_size = [3,3,3,3,3,1]

sample_input = torch.tensor([[1., 0., -1.]])

torch.manual_seed(123)

model_without_shortcut = Shortconnections(layer_size, use_shortcut= False)


In [69]:
def print_gradients(model, x):
    output = model(x)
    loss = nn.MSELoss()
    targets = torch.tensor([[0.]])
    loss = loss(output, targets)

    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [70]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.00020173587836325169
layers.1.0.weight has gradient mean of 0.00012011159560643137
layers.2.0.weight has gradient mean of 0.0007152039906941354
layers.3.0.weight has gradient mean of 0.0013988736318424344
layers.4.0.weight has gradient mean of 0.005049645435065031


In [71]:
model_without_shortcut = Shortconnections(layer_size, use_shortcut= True)
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.0014432291500270367
layers.1.0.weight has gradient mean of 0.004846952389925718
layers.2.0.weight has gradient mean of 0.004138893447816372
layers.3.0.weight has gradient mean of 0.005915115587413311
layers.4.0.weight has gradient mean of 0.032659437507390976
