In [13]:
# 1.层和块
import torch
from torch import nn
from torch.nn import functional as F

In [16]:
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

In [17]:
x = torch.randn(2, 20)
x

tensor([[-0.6083,  0.7508, -1.3111, -0.7779,  0.1394,  0.5039, -0.3343,  0.3619,
          1.1843, -0.5676, -0.3472, -0.7191, -0.3773, -0.6165, -0.0377,  1.4400,
          1.1858, -0.2270,  0.3796,  1.5167],
        [ 2.1568, -0.4487, -1.0302, -0.4681, -0.7847, -0.5237, -0.8615, -0.4804,
         -0.3147, -1.5417,  0.1084,  2.2572, -0.0193,  0.0664, -1.1158, -0.7542,
         -1.9071,  0.0811,  0.2072, -0.7747]])

In [18]:
net(x)

tensor([[ 0.1268,  0.0915, -0.0997, -0.0676,  0.1270, -0.1315, -0.0066,  0.0375,
          0.0128, -0.3469],
        [-0.1970,  0.0740, -0.0157,  0.1457, -0.2055, -0.3382,  0.2366,  0.0570,
          0.1274, -0.4828]], grad_fn=<AddmmBackward>)

In [19]:
# 自定义块
class MLP(nn.Module):
    # 用模型参数声明层，这里，我们声明两个全连接层
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    def forward(self, x):
        return self.out(F.relu(self.hidden(x)))

In [20]:
net = MLP()
net(x)

tensor([[ 0.0148,  0.1265,  0.0196, -0.0249,  0.0019, -0.1003, -0.1698,  0.1692,
          0.0874,  0.2383],
        [-0.3927, -0.0334, -0.0707, -0.0559, -0.1147,  0.2072, -0.0680,  0.1037,
          0.3289,  0.0964]], grad_fn=<AddmmBackward>)

In [21]:
# 顺序块
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for block in args:
            self._modules[block] = block
    def forward(self, x):
        for block in self._modules.values():
            x = block(x)
        return x

In [22]:
# MySequential 的⽤法与之前为 nn.Sequential 类编写的代码相同
net = MySequential(nn.Linear(20, 256), nn.ReLU(), 
                   nn.Linear(256, 10))

In [23]:
net(x)

tensor([[-0.0952, -0.4357,  0.4080,  0.2082,  0.1714,  0.0109, -0.1182, -0.0049,
          0.3946, -0.0089],
        [ 0.0136,  0.0487,  0.0975, -0.0450,  0.1551, -0.3226,  0.4254, -0.3063,
          0.0153,  0.3306]], grad_fn=<AddmmBackward>)

In [24]:
# 在正向传播中执行代码
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.rand_weight = torch.rand((20, 20), requires_grad = False)
        self.linear = nn.Linear(20, 20)
    def forward(self, x):
        x = self.linear(x)
        x = F.relu(torch.mm(x, self.rand_weight) + 1)
        x = self.linear(x)
        while x.abs().sum() > 1:
            x /= 2
        return x.sum()

In [25]:
net = FixedHiddenMLP()
net(x)

tensor(0.1334, grad_fn=<SumBackward0>)

In [26]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), 
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)
    def forward(self, x):
        return self.linear(self.net(x))

In [29]:
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(x)

tensor(-0.1163, grad_fn=<SumBackward0>)

In [30]:
# 2.参数管理
import torch
from torch import nn

In [31]:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
x = torch.rand(size = (2, 4))

In [32]:
net(x)

tensor([[0.3492],
        [0.2608]], grad_fn=<AddmmBackward>)

In [34]:
# 参数访问
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.3038, -0.2647, -0.0713, -0.1663, -0.1995, -0.3382,  0.2605,  0.1802]])),
             ('bias', tensor([0.2519]))])

In [35]:
# 目标参数
type(net[2].bias)

torch.nn.parameter.Parameter

In [36]:
net[2].bias

Parameter containing:
tensor([0.2519], requires_grad=True)

In [37]:
net[2].bias.data

tensor([0.2519])

In [39]:
net[2].weight

Parameter containing:
tensor([[ 0.3038, -0.2647, -0.0713, -0.1663, -0.1995, -0.3382,  0.2605,  0.1802]],
       requires_grad=True)

In [41]:
net[2].weight.grad == None

True

In [44]:
# 一次性访问所有参数
print(*[(name, param.shape) for name, param 
        in net[0].named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [46]:
print(*[(name, param.shape) for name, param in net.named_parameters()])

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [48]:
net.state_dict()['2.bias'].data, net[2].bias.data

(tensor([0.2519]), tensor([0.2519]))

In [49]:
# 从嵌套块收集参数
def block1():
    return nn.Sequential(nn.Linear(4,8), nn.ReLU(), 
                         nn.Linear(8, 4), nn.ReLU())
def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block{i}', block1())
    return net

In [55]:
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))

In [56]:
rgnet(x)

tensor([[0.3161],
        [0.3161]], grad_fn=<AddmmBackward>)

In [57]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [61]:
rgnet[0][1][0].bias.data

tensor([ 0.2812, -0.1229, -0.1619,  0.2384,  0.4275,  0.3126,  0.2231, -0.1510])

In [63]:
# 参数初始化
# 内置初始化
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean = 0, std = 0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [65]:
net[0].weight.data, net[0].weight.data[0]

(tensor([[-9.1205e-03,  5.6484e-05,  8.6692e-03, -9.9888e-03],
         [ 3.7502e-03,  1.0786e-02, -1.1967e-02,  1.3620e-03],
         [ 8.3149e-03, -1.9608e-03,  1.0101e-02,  6.8813e-03],
         [-1.0054e-03,  9.9220e-03, -5.3052e-03,  1.0659e-02],
         [ 6.1779e-03, -1.5938e-02,  1.8262e-02, -1.5318e-03],
         [ 1.1609e-02, -6.7187e-03,  4.3718e-03, -2.3187e-03],
         [ 7.3073e-03,  1.0687e-02,  1.1814e-03, -3.0647e-03],
         [ 1.6954e-02,  4.5879e-03,  1.7537e-02,  3.7927e-03]]),
 tensor([-9.1205e-03,  5.6484e-05,  8.6692e-03, -9.9888e-03]))

In [69]:
net[0].bias.data, net[0].bias.data[0]

(tensor([0., 0., 0., 0., 0., 0., 0., 0.]), tensor(0.))

In [70]:
net[2].weight.data, net[2].weight.data[0]

(tensor([[-0.0083, -0.0064, -0.0156, -0.0099, -0.0057,  0.0002,  0.0089, -0.0078]]),
 tensor([-0.0083, -0.0064, -0.0156, -0.0099, -0.0057,  0.0002,  0.0089, -0.0078]))

In [71]:
# Xavier 初始化
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
# 设置常量
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

In [72]:
net[0].apply(xavier)

Linear(in_features=4, out_features=8, bias=True)

In [73]:
net[2].apply(init_42)

Linear(in_features=8, out_features=1, bias=True)

In [74]:
net[0].weight.data[0]

tensor([-0.6775, -0.5376, -0.0301, -0.6153])

In [75]:
net[2].weight.data

tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])

In [79]:
# 自定义初始化
def my_init(m):
    if type(m) == nn.Linear:
        print('Init', *[(name, param.shape) for name, param 
                        in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

In [80]:
net.apply(my_init)

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=1, bias=True)
)

In [81]:
net[0].weight[:2]

tensor([[ 6.4071,  7.2905,  0.0000, -7.6448],
        [-0.0000, -0.0000,  0.0000,  9.6140]], grad_fn=<SliceBackward>)

In [83]:
# 直接设置参数
net[0].weight.data[:] += 1
net[0].weight.data[:2]

tensor([[ 8.4071,  9.2905,  2.0000, -5.6448],
        [ 2.0000,  2.0000,  2.0000, 11.6140]])

In [84]:
net[0].weight.data[0, 0] = 42
net[0].weight.data[:2]

tensor([[42.0000,  9.2905,  2.0000, -5.6448],
        [ 2.0000,  2.0000,  2.0000, 11.6140]])

In [85]:
net[0].weight.data[0]

tensor([42.0000,  9.2905,  2.0000, -5.6448])

In [86]:
# 参数绑定
# 我们需要给共享一个名称，以便可以引用它的参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, 
                    nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1))

In [87]:
net(x)

tensor([[0.3508],
        [0.3482]], grad_fn=<AddmmBackward>)

In [89]:
# 检查参数是否相同
net[2].weight.data[0] == net[4].weight.data[0]

tensor([True, True, True, True, True, True, True, True])

In [94]:
print(net[2].weight.data[0])
net[2].weight.data[0][0] = 0
print(net[2].weight.data[0])

tensor([ 0.1266,  0.2781, -0.0684,  0.2484, -0.2573,  0.1240,  0.0226,  0.3347])
tensor([ 0.0000,  0.2781, -0.0684,  0.2484, -0.2573,  0.1240,  0.0226,  0.3347])


In [95]:
net[4].weight.data[0]

tensor([ 0.0000,  0.2781, -0.0684,  0.2484, -0.2573,  0.1240,  0.0226,  0.3347])

In [96]:
# 3.自定义层
# 不带参数的层
import torch
from torch import nn
import torch.nn.functional as F

In [104]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x - x.mean()

In [105]:
x = torch.FloatTensor([1, 2, 3, 4, 5])

In [106]:
x - x.mean()

tensor([-2., -1.,  0.,  1.,  2.])

In [107]:
layer = CenterdLayer()
layer(x)

tensor([-2., -1.,  0.,  1.,  2.])

In [111]:
net = nn.Sequential(nn.Linear(8, 2), CenteredLayer())

In [114]:
y = net(torch.rand(4, 8))
y

tensor([[ 0.3212, -0.2637],
        [ 0.3552, -0.1811],
        [ 0.3088, -0.3900],
        [ 0.2973, -0.4476]], grad_fn=<SubBackward0>)

In [115]:
y.mean()

tensor(-7.4506e-09, grad_fn=<MeanBackward0>)

In [116]:
# 带参数的层
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, x):
        linear = torch.matmul(x, self.weight.data) + self.bias.data
        return F.relu(linear)

In [118]:
linear = MyLinear(5, 3)
linear.weight.data

tensor([[-0.7016,  0.0124, -0.8048],
        [ 0.0429, -0.7089,  0.4540],
        [-0.0190,  0.8339, -0.7874],
        [-0.3630, -0.7206,  0.3361],
        [-0.6368,  0.4950, -1.0861]])

In [119]:
linear(torch.rand(2, 5))

tensor([[0.4285, 0.0000, 0.9317],
        [0.0106, 0.2474, 0.0000]])

In [120]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))

In [121]:
z = torch.rand(2, 64)

In [122]:
net(z)

tensor([[20.3018],
        [12.1266]])

In [123]:
# 4.读写文件
# 加载和保存变量
import torch 
import torch.nn
from torch.nn import functional as F

In [127]:
x1 = torch.arange(4)

In [128]:
torch.save(x1, 'x_file')

In [130]:
x2 = torch.load('x_file')
x2

tensor([0, 1, 2, 3])

In [131]:
y = torch.zeros(4)
torch.save([x, y], 'x_y_files')

In [132]:
x2, y2 = torch.load('x_y_files')

In [133]:
x2, y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [134]:
my_dict = {'x':x, 'y':y}
torch.save(my_dict, 'my_dict')

In [135]:
my_dict2 = torch.load('my_dict')
my_dict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [136]:
# 加载和保存模型参数
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)
    def forward(self, x):
        return self.out(F.relu(self.hidden(x)))

In [137]:
net = MLP()
x = torch.randn(size = (2, 20))

In [138]:
y = net(x)
y

tensor([[ 0.1712,  0.3211, -0.3849,  0.4120,  0.1188,  0.0589, -0.0013, -0.0207,
          0.0521,  0.1220],
        [ 0.4110,  0.1116, -0.4006,  0.1067, -0.2067,  0.1287, -0.1596, -0.1717,
          0.1752, -0.2565]], grad_fn=<AddmmBackward>)

In [139]:
torch.save(net.state_dict(), 'mlp.params')

In [140]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=10, bias=True)
)

In [141]:
y_clone = clone(x)
y_clone == y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

In [12]:
# 5.GPU
!nvidia-smi

'nvidia-smi' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [6]:
# 计算设备
import torch
from torch import nn

In [7]:
torch.device('cpu')

device(type='cpu')

In [8]:
torch.cuda.device('cuda')

<torch.cuda.device at 0x29b40963d00>

In [9]:
torch.cuda.device('cuda:1')

<torch.cuda.device at 0x29b40963c40>

In [10]:
torch.cuda.device_count()

1

In [16]:
def try_gpu(i = 0):  #@save
    """如果存在，则返回 gpu(i)，否则返回 cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f"cuda:{i}")
    return torch.device('cpu')

In [18]:
def try_all_gpu(): #@save
    """返回所有的 GPU ，如果没有 GPU ，则返回 CPU """
    devices = [torch.device(f"cuda:{i}") for i in range(
        torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

In [19]:
try_gpu(), try_gpu(), try_all_gpu()

(device(type='cuda', index=0),
 device(type='cuda', index=0),
 [device(type='cuda', index=0)])

In [20]:
# 张量与 gpu 
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [21]:
# 储存在 gpu 上
x = torch.ones(2, 3, device = try_gpu())
x

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [22]:
Y = torch.rand(2, 3, device=try_gpu(1))
Y

tensor([[0.6404, 0.7799, 0.3205],
        [0.7856, 0.9673, 0.6261]])