### 引言

In [1]:
import torch
from torch import nn

In [2]:
# Two-layer MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Random batch of 2 samples with 4 features each, drawn uniformly from [0, 1).
X = torch.rand(2, 4)

In [3]:
# Forward pass: maps the (2, 4) batch X to one output value per sample.
net(X)

tensor([[-0.0555],
        [-0.1251]], grad_fn=<AddmmBackward0>)

### 5.2.1 参数访问

In [7]:
# Parameters of the module at index 2 (the final nn.Linear), keyed by name.
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.0924,  0.1858,  0.0620,  0.1767,  0.2851,  0.2124,  0.2702, -0.2105]])),
             ('bias', tensor([-0.2046]))])

In [8]:
# nn.ReLU holds no parameters, so its state_dict is empty.
net[1].state_dict()

OrderedDict()

In [18]:
# Whole-network state_dict: each key is prefixed with the submodule's index in the Sequential.
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.4319, -0.2772, -0.3791,  0.3995],
                      [ 0.3984, -0.4986,  0.1091,  0.4254],
                      [-0.0505, -0.3083,  0.0405, -0.1974],
                      [-0.3434,  0.4668, -0.0685,  0.2471],
                      [ 0.1730,  0.4733, -0.0888, -0.3485],
                      [-0.1124,  0.3707,  0.1142, -0.2572],
                      [ 0.3586,  0.4052, -0.4735, -0.0536],
                      [ 0.2905,  0.2718, -0.1157,  0.4460]])),
             ('0.bias',
              tensor([ 0.0674, -0.0938, -0.2005, -0.0995, -0.0564,  0.4143, -0.4239, -0.2433])),
             ('2.weight',
              tensor([[-0.0924,  0.1858,  0.0620,  0.1767,  0.2851,  0.2124,  0.2702, -0.2105]])),
             ('2.bias', tensor([-0.2046]))])

目标参数

In [9]:
# The bias is stored wrapped in an nn.Parameter.
type(net[2].bias)

torch.nn.parameter.Parameter

In [10]:
# Displaying the Parameter shows its values together with its requires_grad flag.
net[2].bias

Parameter containing:
tensor([-0.2046], requires_grad=True)

In [11]:
# .data exposes the underlying values as a tensor, without the Parameter wrapper.
net[2].bias.data

tensor([-0.2046])

In [13]:
# No backward pass has been run in the cells above, so the gradient is still unset.
net[2].weight.grad is None

True

一次性访问所有参数

In [17]:
# Name and shape of every parameter: first layer only, then the whole network.
print(*((key, tensor.shape) for key, tensor in net[0].named_parameters()))
print('=' * 40)
print(*((key, tensor.shape) for key, tensor in net.named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
========================================
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [19]:
# Look up a single parameter through its fully qualified state_dict key.
net.state_dict()['2.bias'].data

tensor([-0.2046])

从嵌套块收集参数

In [20]:
def block1():
    """Build a small MLP block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    Input and output both have 4 features, so blocks can be chained.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)


def block2(num_blocks=4):
    """Stack independent copies of ``block1`` inside one ``nn.Sequential``.

    Args:
        num_blocks: How many ``block1()`` instances to chain. Defaults to 4,
            the depth that was previously hard-coded.

    Returns:
        An ``nn.Sequential`` whose children are registered under the names
        ``'block 0'``, ``'block 1'``, ... in order.
    """
    # Named ``stacked`` rather than ``net`` so it does not shadow the
    # notebook-level ``net``.
    stacked = nn.Sequential()
    for i in range(num_blocks):
        # Nest a freshly constructed block here, under an explicit name.
        stacked.add_module(f'block {i}', block1())
    return stacked


# Nest the stacked blocks inside another Sequential and append a 4 -> 1 output layer.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.2773],
        [0.2773]], grad_fn=<AddmmBackward0>)

In [21]:
# Print the module tree to see how the nested blocks are organized.
rgnet

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [22]:
# Index level by level: outer child 0 -> its second child ('block 1') -> that block's first Linear -> bias.
rgnet[0][1][0].bias.data

tensor([-0.4890, -0.3511,  0.1636,  0.1827,  0.0656,  0.3588, -0.3530,  0.1390])

### 5.2.2 参数初始化

In [30]:
# Show the architecture of net before re-initializing its parameters.
net

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

内置初始化

In [26]:
def init_normal(m):
    """Initialize a linear layer with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``nn.Module.apply``, which calls it on every
    submodule; modules that are not linear layers are left untouched.

    Args:
        m: The module currently visited by ``apply``.
    """
    # isinstance (rather than ``type(m) ==``) also covers nn.Linear subclasses.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # A layer built with bias=False has ``m.bias is None``; skip it
        # instead of crashing inside nn.init.zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)


# apply() calls init_normal on every submodule of net and on net itself.
net.apply(init_normal)
# Inspect the first row of the first layer's weights and its first bias entry.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0031,  0.0065,  0.0030, -0.0020]), tensor(0.))

In [27]:
def init_constant(m):
    """Initialize a linear layer with all weights set to 1 and a zero bias.

    Meant to be passed to ``nn.Module.apply``; modules that are not linear
    layers are left untouched.

    Args:
        m: The module currently visited by ``apply``.
    """
    # isinstance (rather than ``type(m) ==``) also covers nn.Linear subclasses.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 1)
        # A layer built with bias=False has ``m.bias is None``; skip it
        # instead of crashing inside nn.init.zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)


# apply() calls init_constant on every submodule of net and on net itself.
net.apply(init_constant)
# Inspect the first row of the first layer's weights and its first bias entry.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [29]:
def init_xavier(m):
    """Initialize the weights of a linear layer with Xavier-uniform values.

    The bias is left unchanged. Modules that are not linear layers are
    ignored, so the function is safe to pass to ``nn.Module.apply``.

    Args:
        m: The module currently visited by ``apply``.
    """
    # isinstance (rather than ``type(m) ==``) also covers nn.Linear subclasses.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)


def init_42(m):
    """Set every weight of a linear layer to the constant 42.

    The bias is left unchanged. Modules that are not linear layers are
    ignored, so the function is safe to pass to ``nn.Module.apply``.

    Args:
        m: The module currently visited by ``apply``.
    """
    # isinstance (rather than ``type(m) ==``) also covers nn.Linear subclasses.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)


# Use a different initializer per layer: Xavier-uniform for layer 0, constant 42 for layer 2.
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.5760,  0.3098, -0.3278, -0.6494])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


自定义初始化

In [31]:
def my_init(m):
    """Custom initializer: sparse, large-magnitude weights for linear layers.

    Each weight is drawn from U(-10, 10) and then set to zero if its
    magnitude is below 5. Surviving weights therefore lie in [-10, -5] or
    [5, 10]. The bias is left unchanged. The parameter names and shapes of
    every initialized layer are printed.

    Args:
        m: The module currently visited by ``nn.Module.apply``.
    """
    # isinstance (rather than ``type(m) ==``) also covers nn.Linear subclasses.
    if isinstance(m, nn.Linear):
        print("Init", *[[name, param.shape] for name, param in m.named_parameters()])

        nn.init.uniform_(m.weight, -10, 10)
        # The boolean mask acts as 0/1 under multiplication, zeroing small entries.
        m.weight.data *= m.weight.data.abs() >= 5


# apply() calls my_init on every submodule of net and on net itself.
net.apply(my_init)
# First two rows of the first layer's weight matrix after the custom initialization.
net[0].weight[:2]

Init ['weight', torch.Size([8, 4])] ['bias', torch.Size([8])]
Init ['weight', torch.Size([1, 8])] ['bias', torch.Size([1])]


tensor([[-0.0000, -9.9706, -5.8567, -0.0000],
        [-0.0000, -0.0000, -0.0000,  5.5585]], grad_fn=<SliceBackward0>)

注意: 我们始终可以直接设置参数

In [32]:
# Parameters can be overwritten directly through .data.
# NOTE: `+= 1` is not idempotent; re-running this cell shifts the weights again.
net[0].weight.data += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000, -8.9706, -4.8567,  1.0000])

### 5.2.3 参数绑定

In [33]:
# Create the layer once and keep a handle to it, so the very same object
# (and therefore the same parameters) can be placed at two positions.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(),
                    shared, nn.ReLU(), nn.Linear(8, 1))

# Forward pass through the network in which the `shared` layer appears twice.
net(X)

tensor([[-0.1761],
        [-0.1658]], grad_fn=<AddmmBackward0>)

In [34]:
# Check that the two layers hold the same parameter values
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


In [35]:
# Make sure they are actually the same object rather than merely equal in value:
# a write through net[2] must also be visible through net[4]
net[2].weight.data[0, 0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


共享参数通常可以节省内存，并在以下方面具有特定的好处：

- 对于图像识别中的CNN，共享参数使网络能够在图像中的任何位置（而不是仅在某个区域中）查找给定的特征。
- 对于RNN，它在序列的各个时间步之间共享参数，因此可以很好地推广到不同序列长度的示例。
- 对于自动编码器，编码器和解码器共享参数。在具有线性激活的单层自动编码器中，共享权重会强制权重矩阵的不同分量之间相互正交。