In [49]:
import torch
from torch import nn

# A small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)

# Run a random batch of two samples through the network and show the result.
X = torch.randn(2, 4)
print(net(X))

tensor([[-0.0639],
        [-0.3197]], grad_fn=<AddmmBackward0>)


1. 参数访问

In [3]:
# state_dict() of a single layer: here the module stored at index 2 of `net`.
print(net[2].state_dict())

OrderedDict({'weight': tensor([[ 0.1494, -0.1076,  0.0313, -0.2263,  0.1407,  0.2457,  0.1315, -0.1452]]), 'bias': tensor([-0.0832])})


- 目标参数

In [8]:
# Look at one specific parameter, the bias of the layer at index 2:
# its Python type, the Parameter itself, its requires_grad flag and its raw tensor.
bias = net[2].bias
for item in (type(bias), bias, bias.requires_grad, bias.data):
    print(item)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0832], requires_grad=True)
True
tensor([-0.0832])


- 一次性访问所有参数

In [13]:
# Access all parameters at once: first for the layer at index 0 only, then
# for the whole network. Each parameter is printed as a (name, shape) tuple.
for module in (net[0], net):
    print(*[(name, param.shape) for name, param in module.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [18]:
# state_dict() stores parameters as key-value pairs, so a single parameter
# can be fetched by its name.
output_bias = net.state_dict()['2.bias']
print(output_bias)
print(output_bias.data)

tensor([-0.0832])
tensor([-0.0832])


- 从嵌套块收集参数

In [21]:
import torch
from torch import nn

def block1():
    """Return a fresh 4 -> 8 -> 4 MLP block with a ReLU after each linear layer."""
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four block1() instances named 'block 0' .. 'block 3'."""
    stacked = nn.Sequential()
    names = [f'block {i}' for i in range(4)]
    for name in names:
        # Each iteration nests a newly built block under its own name.
        stacked.add_module(name, block1())
    return stacked

# Nested network: the stacked blocks followed by a final 4 -> 1 linear layer.
stacked_blocks = block2()
output_layer = nn.Linear(4, 1)
rgnet = nn.Sequential(stacked_blocks, output_layer)

# Forward a random batch of two samples through the nested network.
X = torch.randn(2, 4)
print(rgnet(X))

tensor([[-0.6040],
        [-0.6038]], grad_fn=<AddmmBackward0>)


In [22]:
# Printing a module shows its nested layer hierarchy.
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [25]:
# Nested containers are indexed like nested lists:
# child 0 of rgnet -> its child at position 1 -> that child's layer at position 0.
selected_layer = rgnet[0][1][0]
print(selected_layer)
print(selected_layer.bias.data)

Linear(in_features=4, out_features=8, bias=True)
tensor([-0.4836,  0.3687,  0.1627,  0.3175,  0.1345, -0.2167,  0.3229,  0.2263])


2. 参数初始化

- 内置初始化

In [33]:
def init_normal(m):
    """Initialise an nn.Linear in place: weights ~ N(0, 0.01^2), bias = 0.

    Modules whose type is not exactly nn.Linear are left untouched, so the
    function can be passed to ``Module.apply``.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    nn.init.zeros_(m.bias)

# Run init_normal on every sub-module of the network.
net.apply(init_normal)

first_layer = net[0]
weights = first_layer.weight.data

# Weights follow a normal distribution with mean 0 and standard deviation 0.01:
# show the full matrix, its first row and its first entry.
print(weights)
print(weights[0])
print(weights[0][0])

# The bias was initialised to 0: show it as a Parameter and as a raw tensor.
print(first_layer.bias)
print(first_layer.bias.data)

tensor([[ 4.1900e-03, -6.0065e-03,  1.8607e-03,  9.5762e-03],
        [-1.7931e-03,  4.2967e-03,  3.3903e-03, -1.2399e-02],
        [-3.4824e-03, -8.5898e-03,  7.2548e-03, -7.6122e-03],
        [ 7.4625e-03, -4.3153e-03,  3.9301e-03,  9.4779e-04],
        [ 8.0677e-03,  1.0073e-02,  2.5424e-03, -7.7675e-03],
        [ 3.8879e-03,  5.6094e-03, -1.0333e-03,  2.3389e-05],
        [ 5.5408e-03,  1.7183e-02, -2.2808e-03,  1.4229e-02],
        [ 1.8873e-02,  1.7025e-03,  9.4418e-03,  3.8361e-02]])
tensor([ 0.0042, -0.0060,  0.0019,  0.0096])
tensor(0.0042)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
tensor([0., 0., 0., 0., 0., 0., 0., 0.])


In [36]:
def init_constant(m):
    """Initialise an nn.Linear in place: every weight set to 1, bias set to 0.

    Modules whose type is not exactly nn.Linear are skipped.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.ones_(m.weight)
    nn.init.zeros_(m.bias)

# Run init_constant on every sub-module of the network.
net.apply(init_constant)

first_layer = net[0]
# First weight row, the whole bias vector, and the first bias entry.
print(first_layer.weight.data[0])
first_bias = first_layer.bias.data
print(first_bias)
print(first_bias[0])

tensor([1., 1., 1., 1.])
tensor([0., 0., 0., 0., 0., 0., 0., 0.])
tensor(0.)


In [41]:
def init_xavier(m):
    """Apply Xavier-uniform initialisation to the weight of an nn.Linear.

    The bias is not modified; modules whose type is not exactly nn.Linear
    are skipped.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init42(m):
    """Fill the weight of an nn.Linear with the constant 42.

    The bias is not modified; modules whose type is not exactly nn.Linear
    are skipped.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)
# Different initialisers can be applied to different layers:
# Xavier-uniform for the layer at index 0, the constant 42 for the layer at index 2.
hidden_layer, last_layer = net[0], net[2]
hidden_layer.apply(init_xavier)
last_layer.apply(init42)

print(hidden_layer.weight.data[0])
print(last_layer.weight.data)

tensor([ 0.6070, -0.2725, -0.5733, -0.6994])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


- 自定义初始化

In [54]:
def my_init(m):
    """Custom initialiser for nn.Linear layers.

    Prints the name and shape of the layer's first parameter, draws the
    weights uniformly from [-10, 10], then zeroes every weight whose absolute
    value is below 5. Modules whose type is not exactly nn.Linear are skipped.
    """
    if type(m) is not nn.Linear:
        return
    name, param = list(m.named_parameters())[0]
    print('Init ', (name, param.shape))
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed.
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep


# Run the custom initialiser on every sub-module of the network.
net.apply(my_init)

# Show the full weight matrix of the layer at index 0, then its first two rows.
first_weights = net[0].weight.data
print(first_weights)
print(first_weights[:2])


Init  ('weight', torch.Size([8, 4]))
my_init  tensor([[-3.3797,  2.2061,  4.1332, -8.9523],
        [-0.8433, -7.8924, -9.9782,  9.7691],
        [ 8.0386,  6.5668, -7.0756,  6.0646],
        [ 9.2750, -0.0383, -5.3722,  2.5742],
        [-4.3707,  8.4483,  3.8494, -8.5503],
        [-5.9884,  0.0209,  0.9016, -1.2862],
        [ 2.6331, -8.6587, -3.5106,  0.0434],
        [ 1.2449, -1.5677, -3.9434, -8.1932]])
Init  ('weight', torch.Size([1, 8]))
my_init  tensor([[-0.0000,  0.0000,  0.0000, -8.9523],
        [-0.0000, -7.8924, -9.9782,  9.7691],
        [ 8.0386,  6.5668, -7.0756,  6.0646],
        [ 9.2750, -0.0000, -5.3722,  0.0000],
        [-0.0000,  8.4483,  0.0000, -8.5503],
        [-5.9884,  0.0000,  0.0000, -0.0000],
        [ 0.0000, -8.6587, -0.0000,  0.0000],
        [ 0.0000, -0.0000, -0.0000, -8.1932]])
tensor([[-0.0000,  0.0000,  0.0000, -8.9523],
        [-0.0000, -7.8924, -9.9782,  9.7691],
        [ 8.0386,  6.5668, -7.0756,  6.0646],
        [ 9.2750, -0.0000, -5.37

- 直接设置参数

In [60]:
# Parameters can be modified directly through `.data`.
# NOTE: the edits below are applied in place, so running this cell again
# adds 1 to the weights a second time.
first_weights = net[0].weight.data
print(first_weights.shape)
print(net[2].weight.data.shape)

first_weights += 1
first_weights[0, 0] = 42
print(first_weights)
print(first_weights[0])

torch.Size([8, 4])
torch.Size([1, 8])
tensor([[42.0000,  2.0000,  2.0000, -6.9523],
        [ 2.0000, -5.8924, -7.9782, 11.7691],
        [10.0386,  8.5668, -5.0756,  8.0646],
        [11.2750,  2.0000, -3.3722,  2.0000],
        [ 2.0000, 10.4483,  2.0000, -6.5503],
        [-3.9884,  2.0000,  2.0000,  2.0000],
        [ 2.0000, -6.6587,  2.0000,  2.0000],
        [ 2.0000,  2.0000,  2.0000, -6.1932]])
tensor([42.0000,  2.0000,  2.0000, -6.9523])


3. 参数绑定

In [63]:
# The same Linear instance is placed at positions 2 and 4 of the network.
shared = nn.Linear(8, 8)
layers = [
    nn.Linear(4, 8), nn.ReLU(),
    shared, nn.ReLU(),
    shared, nn.ReLU(),
    nn.Linear(8, 1),
]
net = nn.Sequential(*layers)
X = torch.randn(2, 4)
net(X)

# Check whether the parameters are identical.
tied_a, tied_b = net[2], net[4]
print(torch.eq(tied_a.weight.data[0], tied_b.weight.data[0]))
# Write through one of the two layers to confirm they are the same object,
# not merely holding equal values.
tied_a.weight.data[0, 0] = 100
print(torch.eq(tied_a.weight.data[0], tied_b.weight.data[0]))

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
