## 1 模型参数的访问、初始化与共享

定义含单个隐藏层的感知机：

In [2]:
import torch
import torch.nn as nn
import torch.nn.init as init

# Each nn.Linear initializes its own weight and bias when it is constructed,
# so the model is usable as soon as it has been assembled.
net = nn.Sequential(
    nn.Linear(4, 3),  # input (4 features) -> hidden (3 units)
    nn.ReLU(),
    nn.Linear(3, 1),  # hidden (3 units) -> output (1 unit)
)
print(net)

# A random batch of two 4-feature samples; the outputs are summed into a
# scalar so that backward() can later be called on it.
X = torch.rand(2, 4)
Y = net(X).sum()

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)


### 1.1 通过net.parameters()或net.named_parameters()访问参数
参数的类型为torch.nn.parameter.Parameter，它是Tensor的子类；将该类型的实例赋值为Module的属性时，它会被自动加入到模型的参数列表中。

In [6]:
# named_parameters() returns a generator of (name, parameter) pairs.
params_gen = net.named_parameters()
print(type(params_gen))
for key, value in params_gen:
    print(key, value.size(), type(value), '\n', value, '\n')

<class 'generator'>
0.weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'> 
 Parameter containing:
tensor([[ 0.0346,  0.2906,  0.1959, -0.1518],
        [-0.0540,  0.4834,  0.2244,  0.2504],
        [ 0.1455, -0.1077, -0.0040, -0.0552]], requires_grad=True) 

0.bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'> 
 Parameter containing:
tensor([ 0.0441, -0.4389,  0.1332], requires_grad=True) 

2.weight torch.Size([1, 3]) <class 'torch.nn.parameter.Parameter'> 
 Parameter containing:
tensor([[0.0758, 0.3708, 0.3032]], requires_grad=True) 

2.bias torch.Size([1]) <class 'torch.nn.parameter.Parameter'> 
 Parameter containing:
tensor([0.5126], requires_grad=True) 



In [10]:
class MyNet(nn.Module):
    """Toy module contrasting a registered parameter with a plain tensor.

    ``w1`` is an ``nn.Parameter`` and is therefore listed by
    ``named_parameters()``; ``w2`` is an ordinary tensor attribute and is not.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Registered with the module because it is a Parameter instance.
        self.w1 = nn.Parameter(torch.rand(4, 4))
        # Plain tensor: kept as an attribute, but not a model parameter.
        self.w2 = torch.rand(4, 4)

n = MyNet()
# w2 is a plain tensor attribute, so it does not appear in this listing.
for entry_name, entry in n.named_parameters():
    print(entry_name, '\n', entry)

w1 
 Parameter containing:
tensor([[0.8450, 0.5001, 0.7558, 0.7901],
        [0.2242, 0.0298, 0.2684, 0.5284],
        [0.2449, 0.8873, 0.8072, 0.2524],
        [0.3379, 0.0736, 0.4378, 0.8302]], requires_grad=True)


通过.data和.grad访问数值和梯度值：

In [11]:
# Take the first parameter of layer 0, i.e. its weight matrix.
weight_0 = list(net[0].parameters())[0]
print(weight_0.data)
print(weight_0.grad) # the gradient is None until backward() has been run
# Y is the scalar computed from net(X) in an earlier cell.
Y.backward()
print(weight_0.grad)

tensor([[ 0.0346,  0.2906,  0.1959, -0.1518],
        [-0.0540,  0.4834,  0.2244,  0.2504],
        [ 0.1455, -0.1077, -0.0040, -0.0552]])
None
tensor([[0.0559, 0.0652, 0.0695, 0.0154],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.2237, 0.2610, 0.2780, 0.0618]])


### 1.2 模型参数的初始化
通过torch.nn.init模块中提供的初始化函数来实现。

In [12]:
# Keyword found in the parameter name -> in-place initializer to apply.
# Weights are drawn from a normal distribution (mean 0, std 0.01) and
# biases from a uniform distribution on [-1, 1).
initializers = (
    ('weight', lambda t: init.normal_(t, mean=0, std=0.01)),
    ('bias', lambda t: init.uniform_(t, a=-1., b=1.)),
)
for pname, p in net.named_parameters():
    for keyword, initialize in initializers:
        if keyword in pname:
            initialize(p)
            print(pname, p.data)

0.weight tensor([[-0.0043,  0.0049,  0.0025, -0.0073],
        [-0.0070,  0.0055, -0.0011,  0.0107],
        [ 0.0039, -0.0043, -0.0109, -0.0051]])
0.bias tensor([0.6006, 0.2985, 0.9304])
2.weight tensor([[ 0.0048, -0.0116,  0.0120]])
2.bias tensor([0.9254])


实现一个自定义的初始化方法。在下面的例子里，我们令权重有一半概率初始化为0，有另一半概率初始化为$[-10,-5]$和$[5,10]$两个区间里均匀分布的随机数。

In [17]:
def init_weight_(tensor):
    """Fill ``tensor`` in place with zeros or large-magnitude uniform values.

    Every element is first drawn uniformly from [-10, 10); draws whose
    absolute value is below 5 are then zeroed, so roughly half of the
    elements end up as 0 and the rest lie in [-10, -5] or [5, 10).
    The work is done under ``torch.no_grad()`` so the in-place writes are
    not recorded by autograd. Returns ``None``.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where the draw is kept, 0.0 where it has to be zeroed.
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)

# Apply the custom initializer to the weight tensors only.
for pname, p in net.named_parameters():
    if 'weight' not in pname:
        continue
    init_weight_(p)
    print(pname, p.data)

0.weight tensor([[-0.0000, -7.9060,  0.0000,  8.1568],
        [-6.9980,  0.0000,  5.2687, -0.0000],
        [ 0.0000,  0.0000, -0.0000, -5.3343]])
2.weight tensor([[-0., -0., -0.]])


### 1.3 模型参数的共享

传入Sequential的模块是同一个Module的实例：

In [25]:
# One Linear instance is placed in both slots of the Sequential.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
for pname, p in net.named_parameters():
    if 'weight' not in pname:
        continue
    init.constant_(p, val=3)
    print(pname, p.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [26]:
# Both slots refer to one and the same module object, so the weight is
# shared between them as well.
print(net[0] is net[1])
print(net[0].weight is net[1].weight)

True
True


因为模型参数里包含了梯度，所以在反向传播计算时，这些共享的参数的梯度是累加的：

In [27]:
x = torch.ones(1, 1)
y = net(x).sum()    # y = 3 * (3 * x): the shared weight (set to 3 earlier) is applied twice
print('y = %f' % y)
y.backward()
# The shared weight is used once per layer; each use contributes a gradient
# of 3, and the two contributions are accumulated into the same .grad.
print(net[0].weight.grad)

y = 9.000000
tensor([[6.]])
