# Initialization of Parameters

- `init_normal`: weights ~ Gaussian(mean 0, std 0.01), bias = 0
- `init_constant`: weights = a constant, bias = 0
- `init_xavier`: weights ~ Xavier uniform (bias left unchanged)


In [3]:
import torch
from torch import nn

In [4]:
# Small MLP: two lazily-shaped linear layers around a ReLU. The first forward
# pass on X (2 samples, 4 features) fixes the layers' input sizes.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)
X = torch.rand(2, 4)
net(X).shape



torch.Size([2, 1])

In [5]:
def init_normal(module):
    """Initialize an ``nn.Linear`` layer: weights ~ N(mean=0, std=0.01), bias = 0.

    Meant to be passed to ``net.apply``. Any module whose type is not exactly
    ``nn.Linear`` is left untouched.

    Args:
        module: The module to (possibly) initialize in place.
    """
    # Exact type match, so subclasses of nn.Linear are not touched.
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    # A layer created with ``bias=False`` has ``bias`` set to None; skip it
    # rather than crash inside ``nn.init.zeros_``.
    if module.bias is not None:
        nn.init.zeros_(module.bias)

# Run init_normal on the modules of `net` (only the Linear layers react).
net.apply(init_normal)
# Cell output: the first layer's weights and bias after initialization.
net[0].weight.data, net[0].bias.data


(tensor([[-0.0085, -0.0126, -0.0120,  0.0078],
         [-0.0261,  0.0178,  0.0016,  0.0127],
         [-0.0085, -0.0095, -0.0125,  0.0281],
         [-0.0114,  0.0078, -0.0137,  0.0058],
         [ 0.0001,  0.0048, -0.0048, -0.0080],
         [ 0.0158, -0.0047,  0.0032,  0.0069],
         [ 0.0045,  0.0091,  0.0108,  0.0189],
         [ 0.0005, -0.0257, -0.0002, -0.0016]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [6]:
def init_constant(module):
    """Initialize an ``nn.Linear`` layer: every weight = 2, bias = 0.

    Meant to be passed to ``net.apply``. Any module whose type is not exactly
    ``nn.Linear`` is left untouched.

    Args:
        module: The module to (possibly) initialize in place.
    """
    # Exact type match, so subclasses of nn.Linear are not touched.
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 2)  # 2 is the constant
    # A layer created with ``bias=False`` has ``bias`` set to None; skip it
    # rather than crash inside ``nn.init.zeros_``.
    if module.bias is not None:
        nn.init.zeros_(module.bias)

# Run init_constant on the modules of `net` (only the Linear layers react).
net.apply(init_constant)
# Cell output: the first layer's weights and bias after initialization.
net[0].weight.data, net[0].bias.data


(tensor([[2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [7]:
# We can initialize different layers with different initializers
def init_xavier(module):
    """Fill the weight of an ``nn.Linear`` layer with Xavier-uniform values.

    The bias is not modified. Any module whose type is not exactly
    ``nn.Linear`` is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)


def init_42(module):
    """Set every weight of an ``nn.Linear`` layer to the constant 42.

    The bias is not modified. Any module whose type is not exactly
    ``nn.Linear`` is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)


# Apply a different initializer to each Linear layer instead of the whole net.
net[0].apply(init_xavier)
net[2].apply(init_42)
# Print both weight matrices to show the two layers were initialized differently.
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[ 0.5305,  0.6949,  0.7057,  0.5652],
        [ 0.6541, -0.4367, -0.5844, -0.2961],
        [-0.5202, -0.3214, -0.2099,  0.2223],
        [ 0.3149,  0.3967, -0.2504, -0.2714],
        [ 0.2533, -0.3579,  0.0315,  0.0318],
        [ 0.5440,  0.5466,  0.5503, -0.2221],
        [ 0.0311,  0.2209,  0.0752, -0.3390],
        [ 0.5204, -0.6107, -0.2125, -0.0064]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [11]:
# Custom initialisation over a Uniform distribution
def custom_init(module):
    """Custom in-place initializer for ``nn.Linear`` layers.

    Weights are drawn from Uniform(-10, 10); every entry whose absolute
    value is below 5 is then zeroed. Before initializing, the name and shape
    of the module's first named parameter are printed. Any module whose type
    is not exactly ``nn.Linear`` is left untouched.
    """
    if type(module) is not nn.Linear:
        return

    # Only the first (name, parameter) pair is reported.
    first_name, first_param = next(iter(module.named_parameters()))
    print("Init", first_name, first_param.shape)

    nn.init.uniform_(module.weight, -10, 10)
    # Boolean mask: True where the magnitude is at least 5. Multiplying by it
    # keeps those entries and turns the rest into zeros.
    keep = module.weight.data.abs() >= 5
    module.weight.data *= keep


# Run custom_init on the modules of `net`; it prints one "Init ..." line per Linear layer.
net.apply(custom_init)
# Cell output: the first layer's weight Parameter after the custom initialization.
net[0].weight

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


Parameter containing:
tensor([[-5.2647,  8.9670, -6.9094, -0.0000],
        [-9.6208,  0.0000, -6.8990, -8.5384],
        [ 5.4891,  0.0000, -0.0000, -7.5526],
        [ 9.0367, -7.7021, -0.0000, -5.8225],
        [ 5.9627, -0.0000,  0.0000,  5.9515],
        [ 0.0000, -0.0000, -0.0000, -6.3688],
        [ 5.0961, -9.5561,  9.6942,  5.8251],
        [ 0.0000,  0.0000, -0.0000, -0.0000]], requires_grad=True)