In [1]:
import torch
from torch import nn

In [3]:
# Seed the global RNG so the randomly initialised weights and the random
# input below are identical on every "Restart Kernel -> Run All".
torch.manual_seed(42)

# A small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
# A batch of 2 examples with 4 features each, sampled uniformly from [0, 1).
X = torch.rand(size=(2, 4))

In [4]:
# Display the input batch created with torch.rand(size=(2, 4)).
X

tensor([[0.5362, 0.6675, 0.6359, 0.4147],
        [0.8710, 0.0728, 0.2609, 0.7490]])

In [5]:
# Forward pass: run the batch through the MLP (one output value per example).
net(X)

tensor([[-0.1695],
        [-0.2550]], grad_fn=<AddmmBackward>)

In [6]:
# net[2] is the third module of the Sequential, i.e. the output layer
# nn.Linear(8, 1); its state_dict contains that layer's 'weight' and 'bias'.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0280,  0.1599,  0.3015, -0.2890,  0.0346, -0.2274, -0.1465,  0.1489]])), ('bias', tensor([-0.3451]))])


In [8]:
# Same inspection for net[0], the first layer nn.Linear(4, 8):
# the weight has shape (8, 4) and the bias has 8 entries.
print(net[0].state_dict())

OrderedDict([('weight', tensor([[-0.0229,  0.3560, -0.0398, -0.2329],
        [ 0.4544, -0.0760, -0.3926,  0.1075],
        [ 0.3718,  0.4046, -0.1221, -0.4719],
        [ 0.3979, -0.3642,  0.0400, -0.2827],
        [ 0.0929,  0.2155, -0.1533,  0.0279],
        [-0.1972, -0.3355, -0.0970, -0.2790],
        [ 0.4034, -0.3808, -0.1592, -0.2471],
        [ 0.0919, -0.3958,  0.4377, -0.0621]])), ('bias', tensor([ 0.0146, -0.2239,  0.3753, -0.2280, -0.3303, -0.2759,  0.1523, -0.2200]))])


In [11]:
# Look at the output layer's bias at three levels:
#   1. its Python type (an nn.Parameter),
#   2. the Parameter itself (value plus requires_grad flag),
#   3. the plain tensor stored in `.data`.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.3451], requires_grad=True)
tensor([-0.3451])


In [12]:
# No backward pass has been run so far, so the gradient is still unset.
# Test for the None singleton with `is`: `==` would dispatch to the tensor's
# element-wise comparison once a gradient tensor exists.
net[2].weight.grad is None

True

In [13]:
# Print a (name, shape) pair for every parameter of the first layer only.
print(*((pname, tensor.shape) for pname, tensor in net[0].named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [14]:
# Print a (name, shape) pair for every parameter of the whole network;
# names are prefixed with the index of the layer that owns them.
print(*((pname, tensor.shape) for pname, tensor in net.named_parameters()))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [15]:
# Parameters can also be looked up by their dotted name in the network-level
# state_dict: '2.bias' is the bias of the layer at index 2.
net.state_dict()['2.bias'].data

tensor([-0.3451])

In [16]:
def block1():
    """Build one MLP block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    The block maps 4 features to 4 features, so several blocks can be
    chained one after another.

    Returns:
        nn.Sequential: a freshly constructed block with its own parameters.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four independent ``block1()`` instances in one Sequential.

    The children are registered under the names 'block 0' ... 'block 3',
    which is how they show up in ``print(...)`` and in state_dict keys.

    Returns:
        nn.Sequential: container holding the four named blocks in order.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        child_name = f'block {idx}'
        stacked.add_module(child_name, block1())
    return stacked

In [17]:
# Nested network: the four-block stack from block2() followed by a final
# Linear(4, 1) layer that produces one output per example.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))

In [18]:
# Keys of the nested state_dict are dotted paths through the containers,
# e.g. '0.block 0.0.weight' = module 0 -> 'block 0' -> layer 0 -> weight.
rgnet.state_dict()

OrderedDict([('0.block 0.0.weight',
              tensor([[ 0.1695, -0.1930,  0.3745, -0.0270],
                      [-0.4655,  0.2678, -0.0999,  0.3553],
                      [ 0.1100, -0.0027,  0.3710, -0.0774],
                      [-0.2249,  0.1939, -0.2567, -0.4822],
                      [-0.1197, -0.4308,  0.4424,  0.1172],
                      [ 0.3740, -0.0036, -0.4100, -0.2992],
                      [-0.3497, -0.0665, -0.1358,  0.2861],
                      [-0.2603,  0.2789,  0.1813, -0.4825]])),
             ('0.block 0.0.bias',
              tensor([ 0.2771,  0.4359,  0.3635,  0.3965, -0.3362, -0.0587,  0.2717,  0.2688])),
             ('0.block 0.2.weight',
              tensor([[-0.1798,  0.0150,  0.1819,  0.0305, -0.0918, -0.1879,  0.1521, -0.1230],
                      [-0.1897, -0.1221, -0.0811, -0.0496, -0.1819, -0.3209, -0.0616, -0.1625],
                      [ 0.1498, -0.0541, -0.0096,  0.2542, -0.2662,  0.3211,  0.0981,  0.1223],
                      [ 0.

In [19]:
# Forward pass through the nested network with the same input batch X.
rgnet(X)

tensor([[0.5338],
        [0.5339]], grad_fn=<AddmmBackward>)

In [20]:
# Print the module tree to see how the blocks are nested and named.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [21]:
# Nested containers are indexed like nested lists:
# rgnet[0] -> the block stack, [1] -> its second child ('block 1'),
# [0] -> that block's first layer Linear(4, 8), whose bias has 8 entries.
rgnet[0][1][0].bias.data

tensor([-0.4760, -0.3020,  0.2361, -0.2255, -0.4179,  0.3804,  0.1959,  0.3445])

In [23]:
def init_normal(m):
    """Initialise an ``nn.Linear`` layer: weights ~ N(0, 0.01**2), bias = 0.

    Intended to be passed to ``Module.apply``. Modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise, modified in place.

    Layers created with ``bias=False`` have ``m.bias is None``; for those
    only the weight is initialised instead of failing on the missing bias.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A Linear built with bias=False has no bias tensor to zero.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# Run init_normal over the modules of `net`, then show the first weight row
# and the first bias entry of layer 0 to confirm the new values.
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0141,  0.0214, -0.0059, -0.0226]), tensor(0.))

In [25]:
def init_constant(m):
    """Initialise an ``nn.Linear`` layer: every weight = 1, bias = 0.

    Intended to be passed to ``Module.apply``. Modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: the module to (possibly) initialise, modified in place.

    Layers created with ``bias=False`` have ``m.bias is None``; for those
    only the weight is initialised instead of failing on the missing bias.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    # A Linear built with bias=False has no bias tensor to zero.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
    
# Run init_constant over the modules of `net`, then show the first weight row
# and the first bias entry of layer 0 (expected: all ones and zero).
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [27]:
def xavier(m):
    """Apply Xavier-uniform initialisation to the weight of an ``nn.Linear``.

    Modules of any other type are ignored, and the bias is not modified.

    Args:
        m: the module to (possibly) initialise, modified in place.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Set every weight of an ``nn.Linear`` to the constant 42.

    Modules of any other type are ignored, and the bias is not modified.

    Args:
        m: the module to (possibly) initialise, modified in place.
    """
    if type(m) != nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# `apply` can be called on individual layers too, so each layer can get its
# own scheme: Xavier-uniform for layer 0, the constant 42 for layer 2.
net[0].apply(xavier)
net[2].apply(init_42)
# Print both weight tensors to verify the two different initialisations.
print(net[0].weight.data)
print(net[2].weight.data)

tensor([[ 0.2224,  0.3784,  0.3046, -0.6209],
        [ 0.2200, -0.2506,  0.0429, -0.4970],
        [ 0.6614, -0.4702, -0.5930, -0.2566],
        [ 0.6299,  0.2765, -0.6093,  0.4132],
        [ 0.2841,  0.5701,  0.6561, -0.1526],
        [ 0.2083, -0.6029,  0.3729, -0.2087],
        [ 0.4830, -0.6506, -0.2148, -0.1209],
        [-0.6796, -0.1479,  0.1410, -0.6508]])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [30]:
def my_init(m):
    """Custom initialiser for ``nn.Linear`` layers.

    Prints "init", the name and the shape of the layer's first parameter,
    draws the weights uniformly from [-10, 10), and then zeroes every weight
    whose magnitude is below 5. Other module types are ignored and the bias
    is not modified.

    Args:
        m: the module to (possibly) initialise, modified in place.
    """
    if type(m) != nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    weights = m.weight.data
    # Multiplying by the boolean mask keeps entries with |w| >= 5, zeroes the rest.
    weights.mul_(weights.abs() >= 5)

# Run the custom initialiser over `net` (it prints one "init ..." line per
# Linear layer), then display the first layer's weight Parameter.
net.apply(my_init)
net[0].weight

init weight torch.Size([8, 4])
init weight torch.Size([1, 8])


Parameter containing:
tensor([[ 0.0000,  0.0000,  8.6284, -0.0000],
        [-7.3751, -0.0000, -0.0000, -0.0000],
        [ 5.0226,  0.0000,  0.0000,  0.0000],
        [ 7.7219,  6.2083,  0.0000,  8.5271],
        [-9.7977,  0.0000,  6.9495, -8.1519],
        [ 0.0000,  5.1169,  0.0000,  0.0000],
        [ 0.0000,  0.0000, -0.0000, -0.0000],
        [-0.0000, -0.0000,  0.0000, -8.2361]], requires_grad=True)

In [33]:
# Parameters can be edited directly through `.data`: add 1 to every weight of
# layer 0, then overwrite the single entry [0, 0] with 42.
# NOTE(review): this cell is not idempotent - every re-run adds another 1.
# The recorded output is 3 higher than the weights displayed by the previous
# cell (e.g. 8.6284 -> 11.6284), which suggests the cell was executed three
# times; a fresh Restart & Run All would show +1 instead.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data

tensor([[42.0000,  3.0000, 11.6284,  3.0000],
        [-4.3751,  3.0000,  3.0000,  3.0000],
        [ 8.0226,  3.0000,  3.0000,  3.0000],
        [10.7219,  9.2083,  3.0000, 11.5271],
        [-6.7977,  3.0000,  9.9495, -5.1519],
        [ 3.0000,  8.1169,  3.0000,  3.0000],
        [ 3.0000,  3.0000,  3.0000,  3.0000],
        [ 3.0000,  3.0000,  3.0000, -5.2361]])

In [34]:
# Parameter sharing: one Linear(8, 8) module object is placed at positions 2
# and 4 of the Sequential, so both positions use the same weight and bias.
shared = nn.Linear(8, 8)
# Note: this rebinds the name `net` to a new, deeper network.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1))
net(X)

tensor([[0.0768],
        [0.0363]], grad_fn=<AddmmBackward>)

In [35]:
# Compare the first weight row of the layers at positions 2 and 4
# element-wise; both positions hold the `shared` module.
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


In [37]:
# Change one weight through net[2] and compare again: the rows still match,
# showing that net[2] and net[4] are the same object rather than two copies
# that merely started with equal values.
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
