# The Vanishing Gradients Problem

# Code Example

In [9]:
import torch
import torch.nn as nn

# Create a deep network with sigmoid activations
class DeepNet(nn.Module):
    def __init__(self, depth=20):
        super().__init__()
        layers = []
        for _ in range(depth):
            layers.append(nn.Linear(100, 100))
            layers.append(nn.Sigmoid())  # <- prone to vanishing gradients
        self.net = nn.Sequential(*layers)
        self.output = nn.Linear(100, 1)
    
    def forward(self, x):
        return self.output(self.net(x))

# Create random input
x = torch.randn(1, 100)
model = DeepNet(depth=20)
loss_fn = nn.MSELoss()

# Forward + backward pass
target = torch.tensor([[1.0]])
output = model(x)
loss = loss_fn(output, target)
loss.backward()

# Inspect average gradient magnitude per layer
for i, layer in enumerate(model.net):
    if isinstance(layer, nn.Linear):
        #print(layer.weight.grad)
        grad_mean = layer.weight.grad.abs().mean().item()
        print(f"Layer {i//2 + 1}: mean |grad| = {grad_mean:.10f}")


Layer 1: mean |grad| = 0.0000000000
Layer 2: mean |grad| = 0.0000000000
Layer 3: mean |grad| = 0.0000000000
Layer 4: mean |grad| = 0.0000000000
Layer 5: mean |grad| = 0.0000000000
Layer 6: mean |grad| = 0.0000000000
Layer 7: mean |grad| = 0.0000000000
Layer 8: mean |grad| = 0.0000000000
Layer 9: mean |grad| = 0.0000000000
Layer 10: mean |grad| = 0.0000000000
Layer 11: mean |grad| = 0.0000000002
Layer 12: mean |grad| = 0.0000000012
Layer 13: mean |grad| = 0.0000000074
Layer 14: mean |grad| = 0.0000000617
Layer 15: mean |grad| = 0.0000004269
Layer 16: mean |grad| = 0.0000029049
Layer 17: mean |grad| = 0.0000205589
Layer 18: mean |grad| = 0.0001422858
Layer 19: mean |grad| = 0.0011019657
Layer 20: mean |grad| = 0.0079036895


In [2]:
model

DeepNet(
  (net): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=100, out_features=100, bias=True)
    (5): Sigmoid()
    (6): Linear(in_features=100, out_features=100, bias=True)
    (7): Sigmoid()
    (8): Linear(in_features=100, out_features=100, bias=True)
    (9): Sigmoid()
    (10): Linear(in_features=100, out_features=100, bias=True)
    (11): Sigmoid()
    (12): Linear(in_features=100, out_features=100, bias=True)
    (13): Sigmoid()
    (14): Linear(in_features=100, out_features=100, bias=True)
    (15): Sigmoid()
    (16): Linear(in_features=100, out_features=100, bias=True)
    (17): Sigmoid()
    (18): Linear(in_features=100, out_features=100, bias=True)
    (19): Sigmoid()
    (20): Linear(in_features=100, out_features=100, bias=True)
    (21): Sigmoid()
    (22): Linear(in_features=100, out_features=100, bias=True)
    