In [1]:
import torch.nn as nn
from d2l import torch as d2l
import torch
import numpy as np

## 参数管理

In [2]:
# Small MLP used throughout: 4 input features -> 10 hidden units (ReLU) -> 1 output.
net = nn.Sequential(nn.Linear(4, 10), nn.ReLU(), nn.Linear(10, 1))
# Random sample batch: 2 examples with 4 features each.
x = torch.randn(2, 4)

#### 访问参数

二、model.state_dict()方法
PyTorch 中的 state_dict 是一个普通的 Python 字典对象（OrderedDict），它把每一层的参数名映射到对应的参数张量（例如模型每一层的 weight 和 bias 等）。

注意：

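（1）state_dict 中保存的是各层的可学习参数（如卷积层、线性层的 weight 和 bias）以及已注册的持久缓冲区（buffer）；像池化层、ReLU 这类本身没有参数的层不会出现在这个字典中。注意 BN 层是有参数的（weight、bias），并且还带有 running_mean、running_var 等缓冲区，因此 BN 层会出现在 state_dict 中；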

（2）这个方法的作用一方面是方便查看某一个层的权值和偏置数据，另一方面更多的是在模型保存的时候使用。

In [3]:
# state_dict() maps each parameter name to its tensor. Build the dict once and
# walk its (name, tensor) pairs instead of rebuilding it for every key.
for name, tensor in net.state_dict().items():
    print(name, '\t', tensor.shape)

0.weight 	 torch.Size([10, 4])
0.bias 	 torch.Size([10])
2.weight 	 torch.Size([1, 10])
2.bias 	 torch.Size([1])


In [4]:
# Inspect the bias of the last layer (index 2) three ways -- its Python type,
# the Parameter itself, and the raw tensor behind it -- then show the whole model.
print(
    type(net[2].bias),
    net[2].bias,
    net[2].bias.data,
    net,
    sep='\n',
)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0646], requires_grad=True)
tensor([-0.0646])
Sequential(
  (0): Linear(in_features=4, out_features=10, bias=True)
  (1): ReLU()
  (2): Linear(in_features=10, out_features=1, bias=True)
)


In [5]:
# Parameter names and shapes for the whole network, printed as a single list.
print([(label, tensor.shape) for label, tensor in net.named_parameters()])
# Same for the first layer only, unpacked so each pair is printed on its own.
print(*((label, tensor.shape) for label, tensor in net[0].named_parameters()))

[('0.weight', torch.Size([10, 4])), ('0.bias', torch.Size([10])), ('2.weight', torch.Size([1, 10])), ('2.bias', torch.Size([1]))]
('weight', torch.Size([10, 4])) ('bias', torch.Size([10]))


In [6]:
# state_dict() can be used like a dict keyed by parameter name; "2.bias" is the
# bias of the layer at index 2. The value is displayed as the cell output.
net.state_dict()["2.bias"].data

tensor([-0.0646])

In [7]:
def block1():
    """Build one block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU.

    Input and output both have 4 features, so blocks can be chained.
    """
    layers = (
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    )
    return nn.Sequential(*layers)
def block2():
    """Stack four separately constructed ``block1()`` modules in one Sequential.

    The sub-blocks are registered under the names ``block0`` .. ``block3``.
    """
    stacked = nn.Sequential()
    child_names = [f'block{i}' for i in range(4)]
    for child_name in child_names:
        # add_module registers the child under an explicit name instead of an
        # auto-generated numeric index.
        stacked.add_module(child_name, block1())
    return stacked
# Nest the four-block stack inside an outer Sequential and finish with a
# 4 -> 1 linear head.
nett = nn.Sequential(
    block2(),
    nn.Linear(4, 1),
)
# Forward pass on the sample batch; the result is displayed as the cell output.
nett(x)

tensor([[0.0173],
        [0.0173]], grad_fn=<AddmmBackward0>)

In [8]:
# Show the nested module tree: the outer Sequential, the named sub-blocks
# inside it, and the final linear layer.
print(nett)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [9]:
# Nested modules are indexed like nested lists: [0] is the inner stack,
# [1] its second sub-block (named block1), [2] that block's second Linear layer.
print(nett[0][1][2].bias.data)

tensor([-0.2152, -0.2384,  0.0516,  0.3022])


#### 自定义参数：正态分布

In [15]:
def init_normal(m):
    """Initialise a linear layer with weights ~ N(0, 0.01^2) and zero bias.

    Intended for ``net.apply(init_normal)``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: any ``nn.Module``.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer built with bias=False has ``bias`` set to None; zeros_ would
    # raise on it, so only reset the bias when one exists.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Module.apply(fn) calls fn recursively on every submodule and finally on the
# module itself, so each layer of the model is visited.
net.apply(init_normal)
print(net[0].bias.data)
# named_parameters() already yields (name, parameter) tuples.
print(list(net.named_parameters()))

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
[('0.weight', Parameter containing:
tensor([[-0.0089,  0.0011, -0.0025,  0.0136],
        [-0.0081, -0.0195, -0.0001,  0.0036],
        [-0.0111,  0.0025,  0.0087,  0.0053],
        [ 0.0102, -0.0029, -0.0047,  0.0051],
        [-0.0072, -0.0208,  0.0160, -0.0070],
        [-0.0074, -0.0034, -0.0064,  0.0139],
        [ 0.0037,  0.0013,  0.0082, -0.0129],
        [ 0.0076, -0.0053, -0.0078, -0.0069],
        [ 0.0017,  0.0077,  0.0272, -0.0026],
        [ 0.0067,  0.0062,  0.0166, -0.0063]], requires_grad=True)), ('0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)), ('2.weight', Parameter containing:
tensor([[-0.0029, -0.0006, -0.0046, -0.0095, -0.0005, -0.0069,  0.0024,  0.0008,
          0.0014,  0.0038]], requires_grad=True)), ('2.bias', Parameter containing:
tensor([0.], requires_grad=True))]


#### 自定义参数：常数

In [16]:
def init_constant(m):
    """Initialise a linear layer with every weight set to 1 and zero bias.

    Intended for ``net.apply(init_constant)``. Modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: any ``nn.Module``.
    """
    if type(m) is not nn.Linear:
        return
    # Mind the spelling: the in-place initialiser is ``constant_`` with a
    # trailing underscore.
    nn.init.constant_(m.weight, 1)
    # A layer built with bias=False has ``bias`` set to None; zeros_ would
    # raise on it, so only reset the bias when one exists.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Re-initialise every layer of the model, then list all parameters to confirm.
net.apply(init_constant)
print(list(net.named_parameters()))

[('0.weight', Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)), ('0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)), ('2.weight', Parameter containing:
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], requires_grad=True)), ('2.bias', Parameter containing:
tensor([0.], requires_grad=True))]


#### 注意：`xavier_uniform_`（Xavier 均匀初始化）是一种常用的神经网络权重初始化方法，配合 tanh 效果很好，但配合 ReLU 时表现较差
#### 作为补充：使用 Kaiming 初始化（如 `nn.init.kaiming_uniform_`）可以在 ReLU 上取得较好的效果

In [19]:
def Xavier(m):
    """Initialise a linear layer with Xavier-uniform weights and zero bias.

    Intended for ``module.apply(Xavier)``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.

    Args:
        m: any ``nn.Module``.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
    # A layer built with bias=False has ``bias`` set to None; zeros_ would
    # raise on it, so only reset the bias when one exists.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
def init_constant(m):
    """Initialise a linear layer with every weight set to 42 and zero bias.

    NOTE: this rebinds the name ``init_constant``, shadowing the fill-with-1
    version defined earlier in this notebook; any later use of the name gets
    this version.

    Modules whose type is not exactly ``nn.Linear`` are left untouched.

    Args:
        m: any ``nn.Module``.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)
    # A layer built with bias=False has ``bias`` set to None; zeros_ would
    # raise on it, so only reset the bias when one exists.
    if m.bias is not None:
        nn.init.zeros_(m.bias)
# Different initialisers can be applied to individual layers: Xavier for the
# first layer, the constant fill for the last one.
net[0].apply(Xavier)
net[2].apply(init_constant)
print(list(net.named_parameters()))

[('0.weight', Parameter containing:
tensor([[ 0.2235, -0.2157, -0.4504, -0.6087],
        [ 0.0236,  0.3859, -0.6317, -0.1297],
        [ 0.1158,  0.0858, -0.5168, -0.4140],
        [-0.1544, -0.4032, -0.3478, -0.3715],
        [ 0.0038,  0.6455,  0.1789, -0.1064],
        [-0.1521,  0.3185,  0.2916,  0.2536],
        [-0.4378,  0.1692,  0.3530,  0.2215],
        [-0.5978,  0.6303, -0.0479,  0.3094],
        [-0.0064, -0.3371,  0.2196, -0.0501],
        [-0.5319, -0.5221, -0.6529,  0.2456]], requires_grad=True)), ('0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)), ('2.weight', Parameter containing:
tensor([[42., 42., 42., 42., 42., 42., 42., 42., 42., 42.]],
       requires_grad=True)), ('2.bias', Parameter containing:
tensor([0.], requires_grad=True))]


#### 这一段比较抽象，需要仔细理解
#### 实现：参数自定义
$$
w \sim \begin{cases}
U(5, 10) & \text{概率为 } \frac{1}{4} \\
0 & \text{概率为 } \frac{1}{2} \\
U(-10, -5) & \text{概率为 } \frac{1}{4}
\end{cases}
$$

In [32]:
def init_self(m):
    """Custom initialiser for linear layers.

    Draws each weight from U(-10, 10) and then zeroes every entry whose
    magnitude is below 5, so surviving weights lie in [-10, -5] or [5, 10].
    The bias is not modified. Modules whose type is not exactly ``nn.Linear``
    are left untouched.

    Args:
        m: any ``nn.Module``.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.uniform_(m.weight, -10, 10)
    drawn = m.weight.data
    # Boolean mask: True where the magnitude is at least 5; multiplying by it
    # zeroes the remaining entries.
    keep = drawn.abs() >= 5
    m.weight.data = drawn * keep
# Apply the custom initialiser to every layer and list the resulting parameters.
net.apply(init_self)
print(list(net.named_parameters()))

[('0.weight', Parameter containing:
tensor([[-5.9383, -8.9986, -0.0000,  6.7914],
        [ 9.8173,  5.6307, -0.0000,  8.0374],
        [ 0.0000,  7.8136,  0.0000,  6.4271],
        [ 0.0000,  0.0000,  7.4778,  9.2649],
        [-0.0000,  8.5363,  0.0000, -0.0000],
        [-0.0000, -9.3629,  7.5027,  0.0000],
        [-0.0000, -9.7523, -8.8350,  9.6947],
        [-0.0000, -8.3388,  5.1251, -6.9979],
        [ 0.0000, -0.0000, -5.0203, -7.7817],
        [-0.0000, -0.0000, -9.7697,  0.0000]], requires_grad=True)), ('0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)), ('2.weight', Parameter containing:
tensor([[-6.0311, -9.9391, -9.8441,  8.2084, -5.8184,  5.2010,  6.8500,  0.0000,
         -5.5040,  0.0000]], requires_grad=True)), ('2.bias', Parameter containing:
tensor([0.], requires_grad=True))]


# 参数共享：重点

In [38]:
# Parameter sharing: one Linear instance is placed at two positions in the
# model, so both positions refer to the same module and the same parameters.
share = nn.Linear(10, 10)
net = nn.Sequential(
    nn.Linear(4, 10), nn.ReLU(),
    share, nn.ReLU(),
    share, nn.ReLU(),
    nn.Linear(10, 1),
)
# Before any change, the weights at the two positions compare equal element-wise.
print(net[2].weight.data == net[4].weight.data)
# Write one element through position 2 ...
net[2].weight.data[0, 0] = 100
print(net[2].weight.data)
# ... and position 4 still matches: the change is visible through both.
print(net[2].weight.data == net[4].weight.data)

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])
tensor([[ 1.0000e+02, -1.8356e-01,  1.2982e-04,  2.4792e-01, -2.4462e-02,
         -2.8970e-01, -2.2396e-01,  1.2569e-02,  2.7265e-01, -1.8294e-01],
        [-1.7607e-01, -3.0515e-01, -2.7216e-01, -3.0695e-01,  2.4779e-01,
          1.7051e-01, -1.2374e-01,  2.1619e-01, -2.0694e-01,  1.4515e-01],
 