1. 参数访问
    * 目标参数：每个参数都表示为参数（parameter）类的一个实例。要对参数执行任何操作，首先我们需要访问底层的数值。
    * 一次性访问所有参数
    * 从嵌套块收集参数
2. 参数初始化
    * 内置初始化
    * 自定义初始化
3. 参数绑定
4. 小结

In [1]:
# A multilayer perceptron with a single hidden layer.
import torch
from torch import nn

# 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# A random mini-batch: 2 examples with 4 features each.
X = torch.rand(2, 4)
net(X)

tensor([[0.4296],
        [0.4028]], grad_fn=<AddmmBackward>)

In [2]:
# Parameter access: print the state dict (weight and bias) of the layer net[2].
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.1570,  0.1388,  0.0965, -0.0047, -0.3511,  0.3259,  0.1751,  0.2166]])), ('bias', tensor([0.3295]))])


In [3]:
# Extract the bias of the second fully connected layer (net[2]). The attribute is an
# instance of the Parameter class; its underlying value is then read through `.data`.
# A parameter is a composite object that holds a value, a gradient and extra information.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.3295], requires_grad=True)
tensor([0.3295])


In [4]:
# Besides the value, each parameter also exposes its gradient. Backpropagation has not
# been called for this network, so the gradient is still in its initial state (None).
# Use an identity test (`is None`): it is the reliable way to check for the None singleton.
net[2].weight.grad is None

True

In [9]:
# Access all parameters at once.
# Names and shapes of the parameters of the first layer only.
print(*((name, param.shape) for name, param in net[0].named_parameters()))
# Names and shapes of the parameters of the whole network.
print(*((name, param.shape) for name, param in net.named_parameters()))
# The state dict offers another way to reach a parameter, by its key.
net.state_dict()['2.bias'].data

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


tensor([0.3295])

In [10]:
# Collect parameters from nested blocks

def block1():
    """Build a small block: Linear(4 -> 8), ReLU, Linear(8 -> 4), ReLU."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Stack four separately built ``block1`` modules inside one ``nn.Sequential``.

    Each sub-block is registered under the name ``'block <index>'``.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        # Nest a freshly built block under an explicit name.
        stacked.add_module(name=f'block {idx}', module=block1())
    return stacked

# Combine the nested block with a final linear layer (4 -> 1) and run the batch X through it.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.3266],
        [0.3266]], grad_fn=<AddmmBackward>)

In [11]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [12]:
# Nested blocks can be indexed like nested lists: first child of rgnet -> its second
# sub-block ('block 1') -> that sub-block's first linear layer -> bias values.
rgnet[0][1][0].bias.data

tensor([-0.1088,  0.0520, -0.2078, -0.0831, -0.1747, -0.2374, -0.2178, -0.4653])

In [13]:
# Parameter initialization
# Gaussian (normal) distribution
def init_normal(m):
    """Initialize a linear layer with small Gaussian weights and a zero bias.

    Intended to be passed to ``nn.Module.apply``. Modules that are not
    ``nn.Linear`` instances are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also matches subclasses.
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # Layers created with bias=False have no bias tensor to reset.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Run the initializer over net and its submodules, then show the first weight row and
# the first bias entry of the first layer.
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0147,  0.0044, -0.0047, -0.0027]), tensor(0.))

In [14]:
# Constant
def init_constant(m):
    """Initialize a linear layer with all weights set to 1 and a zero bias.

    Intended to be passed to ``nn.Module.apply``. Modules that are not
    ``nn.Linear`` instances are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also matches subclasses.
    if not isinstance(m, nn.Linear):
        return
    nn.init.constant_(m.weight, 1)
    # Layers created with bias=False have no bias tensor to reset.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Run the constant initializer over net and its submodules, then show the first weight
# row and the first bias entry of the first layer.
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [15]:
# Use different initializers for different blocks
def xavier(m):
    """Apply Xavier uniform initialization to the weight of a linear layer.

    Modules that are not ``nn.Linear`` instances are left untouched; the bias
    is not modified.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also matches subclasses.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Fill the weight of a linear layer with the constant 42.

    Modules that are not ``nn.Linear`` instances are left untouched; the bias
    is not modified.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also matches subclasses.
    if isinstance(m, nn.Linear):
        nn.init.constant_(m.weight, 42)

# Initialize the first layer with `xavier` and the layer net[2] with `init_42`, then print
# the first weight row of net[0] and the whole weight of net[2] to check the result.
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.5934,  0.0626, -0.3637,  0.2423])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [16]:
# Custom parameter initialization
def my_init(m):
    """Custom initializer: uniform weights between -10 and 10, small ones zeroed.

    For an ``nn.Linear`` module this prints the name and shape of its first
    parameter, re-samples the weight with ``nn.init.uniform_(..., -10, 10)``
    and then sets every entry whose absolute value is below 5 to zero. Other
    modules are left untouched.

    Args:
        m: The module to (possibly) initialize.
    """
    # isinstance (rather than an exact type comparison) also matches subclasses.
    if not isinstance(m, nn.Linear):
        return
    # Only the first parameter is reported, so take it without building a list.
    name, param = next(m.named_parameters())
    print("Init", name, param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Multiplying by the boolean mask keeps entries with |w| >= 5 and zeroes the rest.
    m.weight.data *= m.weight.data.abs() >= 5

# Run the custom initializer over net and its submodules, then display the first layer's weight.
net.apply(my_init)
net[0].weight

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


Parameter containing:
tensor([[-5.4169,  6.9345, -5.3274,  6.7716],
        [ 8.4636,  0.2145,  6.9587,  1.5237],
        [-5.5604,  8.0154, -4.5076, -9.5305],
        [ 9.0585, -6.1241,  8.2844, -8.7465],
        [ 6.6681, -3.4718, -7.3977,  9.0384],
        [ 1.4992,  2.2234,  6.6491,  4.0744],
        [-3.2752,  8.6998, -5.5834,  3.9857],
        [ 4.8742,  8.1861, -3.1011,  4.0511]], requires_grad=True)

In [17]:
# Arbitrary assignment: parameters can also be set directly by writing to their data.
# Add 1 to every weight of the first layer, overwrite entry [0, 0] with 42, then show row 0.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.0000,  7.9345, -4.3274,  7.7716])

In [18]:
# The shared layer needs a name so that its parameters can be referenced.
shared = nn.Linear(8, 8)
# The same `shared` module object is placed at positions 2 and 4 of the network.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure they are actually the same object rather than just having equal values:
# the change made through net[2] must also be visible through net[4].
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [19]:
# Show the first weight row of net[2]; its entry [0, 0] was set to 100 in the previous cell.
net[2].weight.data[0]

tensor([ 1.0000e+02,  8.9864e-03,  2.7858e-01,  1.6640e-01, -7.8594e-02,
         4.4907e-02, -4.4838e-02,  2.4383e-01])