## 深度学习的模型构造方法

### 1.1 继承Module类

In [1]:
import torch
import torch.nn as nn

使用该方式构造一个含有两个隐藏层的MLP：

In [2]:
class MLP(nn.Module):
    """Multilayer perceptron with two hidden layers: 784 -> 256 -> 64 -> 10.

    Each hidden ``nn.Linear`` layer is followed by a ReLU; the output layer
    returns its linear result without an activation.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The attribute names double as the sub-module names shown by print(net).
        self.hidden1 = nn.Linear(784, 256)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(256, 64)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(64, 10)

    def forward(self, x):
        """Compute the forward pass for ``x``.

        Only the forward computation has to be written here; the backward
        pass is derived automatically by torch's autograd.
        """
        out = self.hidden1(x)
        out = self.act1(out)
        out = self.hidden2(out)
        out = self.act2(out)
        return self.output(out)

测试：

In [3]:
# Random 2 x 784 input tensor; 784 matches in_features of MLP.hidden1.
X = torch.rand(2, 784)
# Instantiate the network defined above.
net = MLP()
# Show the registered sub-modules.
print(net)
# Run the forward pass on X and print the resulting tensor.
print(net(X))

MLP(
  (hidden1): Linear(in_features=784, out_features=256, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=256, out_features=64, bias=True)
  (act2): ReLU()
  (output): Linear(in_features=64, out_features=10, bias=True)
)
tensor([[-0.0484, -0.1706,  0.0212, -0.0876, -0.2173,  0.1117,  0.0234,  0.0559,
          0.1111,  0.0213],
        [-0.0710, -0.1611,  0.0223, -0.0313, -0.1808,  0.1131,  0.1164,  0.1169,
          0.1109,  0.0529]], grad_fn=<AddmmBackward>)


### 1.2 使用Module类的子类
Module的子类包括Sequential、ModuleList和ModuleDict等，可以直接用它们构建模型。

#### 1.2.1 Sequential类
当模型的前向计算为**简单串联各个层**的计算时，Sequential类可以通过更加简单的方式定义模型。这正是Sequential类的目的：它可以接收一个子模块的有序字典（OrderedDict）或者一系列子模块作为参数来逐一添加Module的实例，而模型的前向计算就是将这些实例**按添加的顺序**逐一计算。

实现一个与Sequential类有相同功能的MySequential类：

In [9]:
from collections import OrderedDict
class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``.

    Accepts either a single ``OrderedDict`` mapping names to modules, or any
    number of modules passed positionally. Positional modules are registered
    under their index converted to a string ("0", "1", ...).
    """

    def __init__(self, *args):
        super().__init__()
        given_as_dict = len(args) == 1 and isinstance(args[0], OrderedDict)
        if given_as_dict:
            # The dict keys are the user-chosen module names (e.g. "linear").
            named_modules = args[0].items()
        else:
            named_modules = ((str(position), module)
                             for position, module in enumerate(args))
        for name, module in named_modules:
            self.add_module(name, module)

    def forward(self, input):
        """Feed ``input`` through every registered module in insertion order."""
        # self._modules is iterated in the order the modules were added.
        result = input
        for layer in self._modules.values():
            result = layer(result)
        return result

测试MySequential：

In [10]:
# Pass the layers positionally, so MySequential registers them as "0".."4".
net = MySequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, 10)
)
print(net)
net(X)  # Each new net instance gets freshly randomised parameters, so this output differs from the one above

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
)


tensor([[-0.0240, -0.1164,  0.0289,  0.1467,  0.0451, -0.1431, -0.0349, -0.0894,
         -0.1172, -0.1007],
        [ 0.0132, -0.0532,  0.0970,  0.1642,  0.0874, -0.1490, -0.0247, -0.0631,
         -0.1271, -0.0863]], grad_fn=<AddmmBackward>)

In [17]:
# Build the same network from an OrderedDict so that each layer gets an
# explicit name instead of a positional index.
modules = OrderedDict()
modules['linear1'] = nn.Linear(784, 256)
modules['relu1'] = nn.ReLU()
modules['linear2'] = nn.Linear(256, 64)
modules['relu2'] = nn.ReLU()
# NOTE: despite its key, this entry is a plain Linear layer; no softmax is applied.
modules['softmax'] = nn.Linear(64, 10)
net = MySequential(modules)
print(net)
net(X)

MySequential(
  (linear1): Linear(in_features=784, out_features=256, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=256, out_features=64, bias=True)
  (relu2): ReLU()
  (softmax): Linear(in_features=64, out_features=10, bias=True)
)


tensor([[-0.0818, -0.0254,  0.0361,  0.1228,  0.0149,  0.0416,  0.0267, -0.0448,
          0.1175,  0.0269],
        [-0.0938, -0.0360, -0.0097,  0.0760,  0.0036,  0.0617,  0.0236,  0.0049,
          0.1063,  0.0309]], grad_fn=<AddmmBackward>)

#### 1.2.2 ModuleList类
ModuleList仅仅是一个储存各种模块的列表：模块按索引有序存放，但存放顺序并不代表前向计算的顺序，模块之间也没有必然联系（所以不用保证相邻层的输入输出维度匹配），而且ModuleList没有实现forward功能，需要自己实现。

In [18]:
# Start from a two-element ModuleList and grow it with append().
net = nn.ModuleList([nn.Linear(784, 256), nn.ReLU()])
net.append(nn.Linear(256, 64))
net.append(nn.ReLU())
net.append(nn.Linear(64, 10))
# Print the whole container, then its last element via negative indexing.
print(net, '\n', net[-1])

ModuleList(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
) 
 Linear(in_features=64, out_features=10, bias=True)


实际上，ModuleList出现的最初目的是让网络在定义前向传播时更加灵活，它常和Module结合起来使用。下面是一个示例：

In [21]:
class MyNet(nn.Module):
    """Example of driving a ``nn.ModuleList`` from a hand-written forward pass.

    A 784 -> 10 linear layer is followed by ten 10 -> 10 linear layers that
    are stored in a ModuleList and combined pairwise in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(784, 10)
        stacked = [nn.Linear(10, 10) for _ in range(10)]
        self.linears = nn.ModuleList(stacked)

    def forward(self, x):
        """Apply ``linear``, then for each step i add the outputs of layers i // 2 and i."""
        output = self.linear(x)
        for step, layer in enumerate(self.linears):
            # Layers may be picked freely by index: here step i reuses layer i // 2.
            partner = self.linears[step // 2]
            output = partner(output) + layer(output)
        return output

In [23]:
# Instantiate MyNet, list its sub-modules, then run it on the shared input X.
net = MyNet()
print(net)
print(net(X))

MyNet(
  (linear): Linear(in_features=784, out_features=10, bias=True)
  (linears): ModuleList(
    (0): Linear(in_features=10, out_features=10, bias=True)
    (1): Linear(in_features=10, out_features=10, bias=True)
    (2): Linear(in_features=10, out_features=10, bias=True)
    (3): Linear(in_features=10, out_features=10, bias=True)
    (4): Linear(in_features=10, out_features=10, bias=True)
    (5): Linear(in_features=10, out_features=10, bias=True)
    (6): Linear(in_features=10, out_features=10, bias=True)
    (7): Linear(in_features=10, out_features=10, bias=True)
    (8): Linear(in_features=10, out_features=10, bias=True)
    (9): Linear(in_features=10, out_features=10, bias=True)
  )
)
tensor([[ 0.1733, -0.6189,  0.1815, -0.0954,  0.0341,  1.2951,  0.2058,  0.4890,
         -0.0624, -1.0287],
        [ 0.1723, -0.6207,  0.2078, -0.0972,  0.0355,  1.2948,  0.2018,  0.4912,
         -0.0593, -1.0282]], grad_fn=<AddBackward0>)


ModuleList不同于一般的Python的list：加入到ModuleList里面的所有模块的参数会被自动添加到整个网络中，而放在普通list里的模块的参数则不会。

In [24]:
class Module_ModuleList(nn.Module):
    """Holds one 10 -> 10 linear layer inside a ``nn.ModuleList``.

    Because the container is a ModuleList, the layer's weight and bias are
    reported by ``parameters()``.
    """

    def __init__(self):
        super().__init__()
        container = nn.ModuleList()
        container.append(nn.Linear(10, 10))
        self.linears = container

class Module_List(nn.Module):
    """Holds one 10 -> 10 linear layer inside a plain Python list.

    This is the deliberate counter-example: a plain list is not registered
    with the module, so the layer's parameters are not reported by
    ``parameters()``.
    """

    def __init__(self):
        super().__init__()
        layer = nn.Linear(10, 10)
        self.linears = [layer]

# Instantiate both variants to compare which parameters each one reports.
net1 = Module_ModuleList()
net2 = Module_List()

# Parameters of the ModuleList-based model: prints the size of each one.
print("net1:")
for p in net1.parameters():
    print(p.size())

# Parameters of the plain-list model: the loop prints whatever is registered.
print("net2:")
for p in net2.parameters():
    print(p)

net1:
torch.Size([10, 10])
torch.Size([10])
net2:


#### 1.2.3 ModuleDict类
和ModuleList一样，ModuleDict实例仅仅是存放了一些模块的字典，并没有定义forward函数，需要自己定义。同样，ModuleDict也与Python的dict有所不同，ModuleDict里的所有模块的参数会被自动添加到整个网络中。

In [25]:
# Create a ModuleDict from a regular dict of named modules.
net = nn.ModuleDict({
    'linear': nn.Linear(784, 256),
    'act': nn.ReLU(),
})
net['output'] = nn.Linear(256, 10) # add a module by key
print(net['linear']) # access a module by key
# Modules can also be reached as attributes.
print(net.output)
print(net)

Linear(in_features=784, out_features=256, bias=True)
Linear(in_features=256, out_features=10, bias=True)
ModuleDict(
  (act): ReLU()
  (linear): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)


### 1.3 使用Module构造复杂的模型
下面我们构造一个稍微复杂点的网络FancyMLP。在这个网络中，我们用torch.rand创建一个不需要梯度（requires_grad=False）的张量，作为训练中不被迭代的常数参数。在前向计算中，除了使用创建的常数参数外，我们还使用Tensor的函数和Python的控制流，并多次调用相同的层。

In [30]:
class FancyMLP(nn.Module):
    """MLP that mixes a constant tensor, a reused layer and Python control flow.

    The forward pass applies the same 20 -> 20 linear layer twice, multiplies
    by a fixed random 20 x 20 matrix in between, rescales the result based on
    its norm and returns the sum of all elements as a scalar tensor.
    """

    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # Register the constant as a buffer rather than a plain attribute:
        # it is still excluded from parameters() (never trained), but it is
        # saved in state_dict() and moved together with the module by
        # .to()/.cuda(). It remains accessible as self.constants.
        self.register_buffer('constants', torch.rand(20, 20))
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar tensor computed from ``x``.

        ``x`` must have 20 columns so that it matches ``self.linear``.
        """
        x = self.linear(x)
        # Use the constant buffer together with tensor functions.
        x = nn.functional.relu(torch.mm(x, self.constants) + 1)
        # Reuse the same layer: both calls share one set of parameters.
        x = self.linear(x)
        # Data-dependent control flow: halve until the norm is at most 1.
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [36]:
# Rebind X to a random 2 x 20 tensor; 20 matches in_features of FancyMLP.linear.
X = torch.rand(2, 20)
net = FancyMLP()
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(5.5928, grad_fn=<SumBackward0>)

可以将Module定义的模型和Sequential结合起来定义新的复杂模型：

In [37]:
class NestMLP(nn.Module):
    """Module that wraps a small ``nn.Sequential`` (Linear 40 -> 30, then ReLU)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        inner_layers = [nn.Linear(40, 30), nn.ReLU()]
        self.net = nn.Sequential(*inner_layers)

    def forward(self, x):
        """Delegate the forward pass to the wrapped Sequential."""
        hidden = self.net(x)
        return hidden

# Define a new, more complex model by chaining NestMLP (40 -> 30),
# a Linear layer (30 -> 20) and FancyMLP (which takes 20 input features).
net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())

# Rebind X to a random 2 x 40 tensor matching NestMLP's first Linear layer.
X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(1.8082, grad_fn=<SumBackward0>)