In [1]:
import torch
import torch.nn as nn

In [4]:
# Implementing the GELU activation function
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi, dtype=x.dtype, device=x.device)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [5]:
class ExampleDeepNeuralMNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut): #layer size indicates how many neurons each layer should contain.
        super().__init__()
        self.use_shortcut = use_shortcut
        #
        self.layers = nn.ModuleList([ #constructing the neural network
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1], GELU())),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2], GELU())),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3], GELU())),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4], GELU())),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5], GELU()))
        ])

    def forward(self, x):
        for layer in self.layers:
            #compute the output of the current layer
            layer_output = layer(x) #the output of the first layer serves as input to the second layer
            #check if shortcut can be applied
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x


In [6]:
layer_sizes = [3, 3, 3, 3, 3, 1] #five layers with 3 neuron and the last layer with one neuron
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralMNetwork(
    layer_sizes, use_shortcut=False
)

In [None]:
#print the gradients for every layer
def print_gradients(model, x):
    #forward pass
    output = model(x)
    target = torch.tensor([[0.]])

    #calculate loss based on how close the target and output are
    loss = nn.MSELoss()
    loss = loss(output, target)

    #backward pass to calculate the loss gradients for each layer
    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            #print the mean absolute gradient of the weughts
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [8]:
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.0015313407639041543
layers.1.0.weight has gradient mean of 0.0008734685834497213
layers.2.0.weight has gradient mean of 0.002111609559506178
layers.3.0.weight has gradient mean of 0.0030934568494558334
layers.4.0.weight has gradient mean of 0.007880656979978085


This is a clear represntation of the varnishing gradient problem

In [9]:
#setting the shortcut to be true
model_with_shortcut = ExampleDeepNeuralMNetwork(
    layer_sizes, use_shortcut=True
)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.24866610765457153
layers.1.0.weight has gradient mean of 0.8006523251533508
layers.2.0.weight has gradient mean of 0.3836197853088379
layers.3.0.weight has gradient mean of 0.3954206109046936
layers.4.0.weight has gradient mean of 1.001085877418518


This illustration clearly shows how the shortcut connection solves the varbishing gradients problem