In [1]:
import torch
import torch.nn as nn

In [2]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self,x):
        return 0.5 * x * (
            1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.44715 * torch.pow(x,3)))
        )

In [3]:
class DeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])
        
    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x           


In [4]:
layer_sizes = [3,3,3,3,3,1]

In [5]:
inputs = torch.tensor([[1.0, 0.0, 1.0]])

In [6]:
def print_gradient(model, x):
    output = model(x)
    target = torch.tensor([[0.]])
    
    loss = nn.MSELoss()
    loss = loss(output, target)
    
    loss.backward()
    
    for name, param in model_without_shortcut.named_parameters():
        if 'weight' in name:
            print(name, param.grad.abs().mean().item())
        

In [7]:
torch.manual_seed(123)
model_without_shortcut = DeepNeuralNetwork(layer_sizes, use_shortcut=False)
model_without_shortcut

DeepNeuralNetwork(
  (layers): ModuleList(
    (0-3): 4 x Sequential(
      (0): Linear(in_features=3, out_features=3, bias=True)
      (1): GELU()
    )
    (4): Sequential(
      (0): Linear(in_features=3, out_features=1, bias=True)
      (1): GELU()
    )
  )
)

In [8]:
print_gradient(model_without_shortcut,inputs)

layers.0.0.weight 0.00014525471488013864
layers.1.0.weight 0.00014241492317523807
layers.2.0.weight 0.0006289510056376457
layers.3.0.weight 0.0010982871754094958
layers.4.0.weight 0.004437715280801058


In [9]:
model_using_shortcut = DeepNeuralNetwork(layer_sizes, use_shortcut=True)
model_using_shortcut

DeepNeuralNetwork(
  (layers): ModuleList(
    (0-3): 4 x Sequential(
      (0): Linear(in_features=3, out_features=3, bias=True)
      (1): GELU()
    )
    (4): Sequential(
      (0): Linear(in_features=3, out_features=1, bias=True)
      (1): GELU()
    )
  )
)

In [10]:
print_gradient(model_using_shortcut, inputs)

layers.0.0.weight 0.00014525471488013864
layers.1.0.weight 0.00014241492317523807
layers.2.0.weight 0.0006289510056376457
layers.3.0.weight 0.0010982871754094958
layers.4.0.weight 0.004437715280801058
