# 4.1 模型构造

In [1]:
import torch
from torch import nn

class MLP(nn.Module):
    """Multilayer perceptron: 784 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self, **kwargs):
        # Any keyword arguments are forwarded unchanged to nn.Module.
        super().__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        """Apply the hidden layer, the activation, then the output layer to ``x``."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

In [2]:
# Random batch of 2 samples with 784 features each.
X = torch.rand(2, 784)
net = MLP()
# Printing a module lists its registered submodules.
print(net)
# Last expression of the cell: the notebook displays the (2, 10) output tensor.
net(X)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[-0.1946,  0.0439, -0.0275,  0.1823, -0.0086, -0.1431, -0.2265, -0.0637,
         -0.1311,  0.1625],
        [-0.0968,  0.0871, -0.0212,  0.1506, -0.1333, -0.1903, -0.2147, -0.0074,
         -0.1683,  0.1196]], grad_fn=<AddmmBackward>)

In [3]:
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Accepts either a single ``OrderedDict`` mapping names to modules, or any
    number of modules as positional arguments (registered under the names
    "0", "1", ... in the order given).
    """

    def __init__(self, *args):
        # Imported here rather than in the class body: names bound in a class
        # body are not visible as bare names inside methods, so the isinstance
        # check below would raise NameError for every single-argument call.
        from collections import OrderedDict

        super(MySequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # Named form: keep the caller's keys as submodule names.
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            # Positional form: name each submodule after its position.
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, input):
        """Feed ``input`` through every registered submodule, one after another."""
        for module in self._modules.values():
            input = module(input)
        return input

In [4]:
# Build the same 784-256-10 network from positional modules; they are
# registered under the names "0", "1", "2".
net = MySequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10))

print(net)
# Reuses the batch X created in an earlier cell.
net(X)

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[ 0.0468, -0.2429, -0.1627, -0.1966, -0.0832, -0.0109, -0.0360, -0.1659,
          0.1672, -0.1601],
        [ 0.0248, -0.2007, -0.0566, -0.0167, -0.0041, -0.0735, -0.1430, -0.0556,
          0.1975, -0.1876]], grad_fn=<AddmmBackward>)

In [5]:
# A fresh MLP instance: its _modules mapping holds the submodules that were
# assigned as attributes in __init__.
for module in MLP()._modules.values():
    print(module)

Linear(in_features=784, out_features=256, bias=True)
ReLU()
Linear(in_features=256, out_features=10, bias=True)


In [6]:
# Same inspection on the current `net`: submodules added via add_module
# end up in the same _modules mapping.
for module in net._modules.values():
    print(module)

Linear(in_features=784, out_features=256, bias=True)
ReLU()
Linear(in_features=256, out_features=10, bias=True)


In [7]:
# ModuleList is not a network by itself; it only stores Modules together.
net = nn.ModuleList([nn.Linear(784, 256), nn.ReLU()])
# It supports list-style append and indexing.
net.append(nn.Linear(256, 10))
print(net[-1])
print(net)

Linear(in_features=256, out_features=10, bias=True)
ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [24]:
# ModuleDict stores modules under string keys.
net = nn.ModuleDict({
    'linear': nn.Linear(784, 256),
    'act': nn.ReLU(),
})
# Entries can be added by item assignment ...
net['output'] = nn.Linear(256, 10)
# ... and read back either by key or as an attribute.
print(net['linear'])
print(net.output)
print(net)

Linear(in_features=784, out_features=256, bias=True)
Linear(in_features=256, out_features=10, bias=True)
ModuleDict(
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


In [48]:
class FancyMLP(nn.Module):
    """MLP combining a trainable linear layer, a fixed random weight matrix and
    data-dependent Python control flow in its forward pass."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Plain tensor attribute (not an nn.Parameter), created with requires_grad=False.
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar: the sum of the transformed and rescaled activations."""
        hidden = self.linear(x)
        # Multiply by the constant weight, apply ReLU, then shift by one.
        projected = torch.mm(hidden, self.rand_weight.data)
        hidden = nn.functional.relu(projected) + 1

        # The same linear layer (same parameters) is applied a second time.
        out = self.linear(hidden)
        # Halve in place until the norm is no larger than 1 ...
        while out.norm().item() > 1:
            out /= 2
        # ... then scale up by 10 if the norm ended up below 0.8.
        if out.norm().item() < 0.8:
            out *= 10
        return out.sum()

In [49]:
# Random batch of 2 samples with 20 features, matching FancyMLP's Linear(20, 20).
X = torch.rand(2, 20)
net = FancyMLP()
# Only `linear` shows up in the printout.
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(-0.1949, grad_fn=<SumBackward0>)

In [50]:
class NestMLP(nn.Module):
    """Module wrapping an inner ``nn.Sequential`` (Linear 40 -> 30 followed by ReLU)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        inner_layers = [nn.Linear(40, 30), nn.ReLU()]
        self.net = nn.Sequential(*inner_layers)

    def forward(self, x):
        """Delegate directly to the wrapped sequential block."""
        return self.net(x)

# Custom modules and built-in layers can be mixed freely inside nn.Sequential:
# 40 -> 30 (NestMLP) -> 20 (Linear) -> scalar (FancyMLP).
net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())

X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(8.4418, grad_fn=<SumBackward0>)

# 4.2 模型参数的访问、初始化和共享

In [4]:
import torch
from torch import nn
from torch.nn import init

# Small 4 -> 3 -> 1 network used throughout this section.
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))

print(net)
X = torch.rand(2, 4)
# Scalar used later to call backward(); keeps a reference to the autograd graph.
Y = net(X).sum()
# Note: this runs a second forward pass on the same input.
print(net(X))
print(Y)

Sequential(
  (0): Linear(in_features=4, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)
tensor([[-0.4607],
        [-0.2303]], grad_fn=<AddmmBackward>)
tensor(-0.6909, grad_fn=<SumBackward0>)


In [5]:
# named_parameters() returns a generator of (name, parameter) pairs.
print(type(net.named_parameters()))
# Names are prefixed with the layer's index in the Sequential; the ReLU at
# index 1 contributes no entries.
for name, param in net.named_parameters():
    print(name, param.size())

<class 'generator'>
0.weight torch.Size([3, 4])
0.bias torch.Size([3])
2.weight torch.Size([1, 3])
2.bias torch.Size([1])


In [15]:
# Querying a single layer yields names without the index prefix; each entry
# is an nn.Parameter.
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([3, 4]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([3]) <class 'torch.nn.parameter.Parameter'>


In [27]:
class MyModel(nn.Module):
    """Contrast between an ``nn.Parameter`` attribute and a plain tensor attribute."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Wrapped in Parameter.
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Plain tensor: stored as an ordinary attribute.
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass; performs no computation and returns None."""
        return None
    
# Only the attribute wrapped in nn.Parameter is reported; the plain tensor
# weight2 is not listed.
n = MyModel()
for name, param in n.named_parameters():
    print(name)

weight1


In [32]:
# First parameter of the first layer (its weight matrix).
weight_0 = list(net[0].parameters())[0]
print(weight_0.data)
# No gradient exists before backward() has been called.
print(weight_0.grad)
# Y is the scalar computed from net(X).sum() in an earlier cell.
Y.backward()
print(weight_0.grad)

tensor([[ 0.0583,  0.2956, -0.1465,  0.3597],
        [ 0.4133, -0.4733,  0.3021,  0.0765],
        [-0.3240, -0.3876,  0.3738,  0.2905]])
None
tensor([[-0.5680, -0.5933, -0.6820, -0.4550],
        [-0.5606, -0.5856, -0.6731, -0.4491],
        [ 0.0000,  0.0000,  0.0000,  0.0000]])


In [33]:
# Re-initialise every weight from a normal distribution (mean 0, std 0.01).
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.weight tensor([[-0.0075,  0.0020,  0.0037, -0.0139],
        [ 0.0119, -0.0266, -0.0107,  0.0001],
        [-0.0019,  0.0198, -0.0133, -0.0085]])
2.weight tensor([[-0.0011,  0.0032,  0.0040]])


In [37]:
# Reset every bias to the constant 0.
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param)

0.bias Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
2.bias Parameter containing:
tensor([0.], requires_grad=True)


In [36]:
def normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with normally distributed samples.

    The in-place update runs under ``torch.no_grad()`` so it is not recorded
    by autograd. Returns the result of the in-place ``normal_`` call.
    """
    with torch.no_grad():
        filled = tensor.normal_(mean, std)
    return filled

In [46]:
def init_weight_(tensor):
    """Custom in-place initialiser.

    Samples uniformly from [-10, 10], then zeroes every entry whose absolute
    value is below 5. Runs without autograd tracking and returns nothing.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, 0.0 elsewhere; multiplying zeroes the rest.
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)

# Apply the custom initialiser to every weight; the parameters keep
# requires_grad=True afterwards.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param)

0.weight Parameter containing:
tensor([[ 9.0238, -7.7786, -0.0000,  9.9776],
        [ 0.0000,  6.7165, -6.5606, -9.5663],
        [-7.4009,  5.9940,  0.0000,  6.6667]], requires_grad=True)
2.weight Parameter containing:
tensor([[-9.1437, -8.1545, -0.0000]], requires_grad=True)


In [51]:
# Modify parameter values through .data, which is not tracked by autograd.
# Note: the update is cumulative, so re-running this cell adds 1 again.
for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, param.data)

0.bias tensor([1., 1., 1.])
2.bias tensor([1.])


In [52]:
# The same Linear instance is passed twice, so both positions share one weight.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
# The shared parameter is reported only once (under "0.weight").
for name, param in net.named_parameters():
    print(name, param)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight Parameter containing:
tensor([[-0.9785]], requires_grad=True)


In [53]:
# Both entries are the very same module object, and so is their weight.
print(id(net[0]) == id(net[1]))
print(id(net[0].weight) == id(net[1].weight))

True
True


# 4.4 自定义层

In [4]:
import torch
from torch import nn

class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all input elements."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` shifted by its overall mean."""
        mean_value = x.mean()
        return x - mean_value

In [5]:
# The mean of 1..5 is 3, so the result is centred around zero.
layer = CenteredLayer()
layer(torch.tensor([[1, 2, 3, 4, 5]], dtype=torch.float))

tensor([[-2., -1.,  0.,  1.,  2.]])

In [14]:
# A custom layer can be composed with built-in layers like any other module.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [18]:
# After centring, the mean is zero up to floating-point rounding error.
y = net(torch.rand(4, 8))
y.mean().item()

-3.91155481338501e-08

In [20]:
class MyDense(nn.Module):
    """Chain of matrix multiplications over an ``nn.ParameterList``:
    three 4x4 matrices followed by one 4x1 matrix."""

    def __init__(self):
        super().__init__()
        square_weights = [nn.Parameter(torch.randn(4, 4)) for _ in range(3)]
        self.params = nn.ParameterList(square_weights)
        # ParameterList supports list-style append.
        self.params.append(nn.Parameter(torch.randn(4, 1)))

    def forward(self, x):
        """Right-multiply ``x`` by each stored matrix in order."""
        for weight in self.params:
            x = torch.mm(x, weight)
        return x

# Printing the module lists the four entries of its ParameterList.
net = MyDense()
print(net)

MyDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.FloatTensor of size 4x4]
      (1): Parameter containing: [torch.FloatTensor of size 4x4]
      (2): Parameter containing: [torch.FloatTensor of size 4x4]
      (3): Parameter containing: [torch.FloatTensor of size 4x1]
  )
)


In [24]:
class MyDictDense(nn.Module):
    """Layer whose weight matrix is picked by name from an ``nn.ParameterDict``."""

    def __init__(self):
        super().__init__()
        self.params = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        })
        # Further entries can be added after construction.
        self.params['linear3'] = nn.Parameter(torch.randn(4, 2))

    def forward(self, x, choice='linear1'):
        """Multiply ``x`` by the parameter stored under ``choice``."""
        selected = self.params[choice]
        return torch.mm(x, selected)

# Printing the module lists the three named entries of its ParameterDict.
net = MyDictDense()
print(net)

MyDictDense(
  (params): ParameterDict(
      (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
      (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
      (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)


In [31]:
# The chosen key decides the output width: 4, 1 and 2 columns respectively.
x = torch.ones(1, 4)
print(net(x, 'linear1'))
print(net(x, 'linear2'))
print(net(x, 'linear3'))

tensor([[ 0.6885, -1.2643, -2.0893,  1.3118]], grad_fn=<MmBackward>)
tensor([[-0.4006]], grad_fn=<MmBackward>)
tensor([[-1.1349,  2.8811]], grad_fn=<MmBackward>)


In [33]:
# Stack both custom layers. Sequential calls MyDictDense with its default
# choice ('linear1'), whose (1, 4) output feeds MyDense.
net = nn.Sequential(
    MyDictDense(),
    MyDense(),
)
print(net)
# Reuses x = torch.ones(1, 4) from the previous cell.
print(net(x))

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 4x4]
        (1): Parameter containing: [torch.FloatTensor of size 4x4]
        (2): Parameter containing: [torch.FloatTensor of size 4x4]
        (3): Parameter containing: [torch.FloatTensor of size 4x1]
    )
  )
)
tensor([[18.9635]], grad_fn=<MmBackward>)


# 4.5 读取和存储

In [41]:
import torch
from torch import nn

# Save a single tensor; the file is written relative to the current
# working directory.
x = torch.ones(3)
torch.save(x, 'x.pt')

In [42]:
# Read the tensor back from disk.
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [43]:
# A list of tensors round-trips through save/load as a list.
y = torch.zeros(4)
torch.save([x, y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [44]:
# A dict mapping strings to tensors round-trips the same way.
torch.save({'x': x, 'y': y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

In [68]:
class MLP(nn.Module):
    """Tiny multilayer perceptron: 3 inputs -> 2 hidden units (ReLU) -> 1 output.

    Note: this reuses the name ``MLP`` from section 4.1, so running this cell
    rebinds that name to this smaller network.
    """

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        """Apply the hidden layer, the activation, then the output layer to ``x``."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

# state_dict() maps each parameter's name to its tensor; the ReLU layer
# contributes no entries.
net = MLP()
net.state_dict()

OrderedDict([('hidden.weight',
              tensor([[ 0.0227,  0.2822, -0.2726],
                      [ 0.1490, -0.3357,  0.0452]])),
             ('hidden.bias', tensor([-0.4504,  0.2108])),
             ('output.weight', tensor([[0.4097, 0.5618]])),
             ('output.bias', tensor([-0.0552]))])

In [61]:
# Optimizers expose a state_dict as well: their hyperparameters plus
# references to the parameters they manage.
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()

{'state': {},
 'param_groups': [{'lr': 0.001,
   'momentum': 0.9,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'params': [0, 1, 2, 3]}]}

In [62]:
# Reference output from the original network.
X = torch.randn(2, 3)
Y = net(X)

# Persist only the parameters (state_dict), not the module object itself.
PATH = './net.pt'
torch.save(net.state_dict(), PATH)

# Rebuild the architecture and restore the saved parameters into it.
net2 = MLP()
net2.load_state_dict(torch.load(PATH))
# Evaluate the *restored* model: comparing net with itself would be trivially
# True and would not verify the save/load round trip.
Y2 = net2(X)
Y2 == Y

tensor([[True],
        [True]])

In [63]:
# Save the entire module object rather than just its state_dict. This
# overwrites the state_dict file written to the same PATH earlier.
torch.save(net, PATH)

In [64]:
# Loading a whole module unpickles arbitrary objects: only load files you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules - confirm against the installed version.
net3 = torch.load(PATH)

In [65]:
# The reloaded module produces the same outputs as the original.
net3(X) == net(X)

tensor([[True],
        [True]])

# 4.6 GPU计算

In [1]:
import torch
from torch import nn


# Check whether CUDA is usable from this PyTorch installation.
torch.cuda.is_available()

True

In [3]:
# Number of GPUs visible to PyTorch.
torch.cuda.device_count()

1

In [5]:
# Index of the currently selected GPU.
torch.cuda.current_device()

0

In [8]:
# Human-readable name of GPU 0.
torch.cuda.get_device_name(0)

'GeForce RTX 2080 with Max-Q Design'

In [10]:
# No device argument is given; the displayed repr shows no device suffix.
x = torch.tensor([1, 2, 3])
x

tensor([1, 2, 3])

In [12]:
# Create the tensor directly on GPU 0; the device is hard-coded, so this
# cell needs a CUDA-capable machine.
torch.tensor([1, 2, 3], device='cuda:0')

tensor([1, 2, 3], device='cuda:0')

In [14]:
# Pick the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x = torch.tensor([1, 2, 3], device=device)
print(x)

tensor([1, 2, 3], device='cuda:0')


In [15]:
# The result of an operation lives on the same device as its input.
y = x**2
print(y)

tensor([1, 4, 9], device='cuda:0')


In [16]:
# Operands on different devices cannot be combined: when `y` lives on the GPU,
# adding a CPU copy of `x` raises RuntimeError. The error is the point of this
# cell, so catch and display it instead of aborting "Restart & Run All".
try:
    z = y + x.cpu()
except RuntimeError as err:
    print('RuntimeError:', err)

RuntimeError: expected device cuda:0 but got device cpu

In [17]:
# A freshly constructed module keeps its parameters on the CPU.
net = nn.Linear(3, 1)
list(net.parameters())[0].device

device(type='cpu')

In [20]:
# Move the module's parameters to the GPU (requires a CUDA device; unlike
# `device` above there is no CPU fallback here).
net.cuda()
list(net.parameters())[0].device

device(type='cuda', index=0)

In [21]:
# Create the input on `device` so that it sits alongside the model's parameters.
x = torch.rand(2, 3, device=device)

In [22]:
# Forward pass with model and input on the same device.
net(x)

tensor([[-0.1526],
        [ 0.3222]], device='cuda:0', grad_fn=<AddmmBackward>)