# 4.1 模型构造

In [None]:
import torch
from torch import nn

print((torch.__version__))

## 4.1.1 继承`Module`类来构造模型

In [None]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 784 -> 256 -> ReLU -> 10."""

    def __init__(self, **kwargs):
        """Declare the layers that hold the model parameters.

        Any keyword arguments are forwarded unchanged to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.hidden = nn.Linear(in_features=784, out_features=256)  # hidden layer
        self.act = nn.ReLU()
        self.output = nn.Linear(in_features=256, out_features=10)  # output layer

    def forward(self, x):
        """Compute the model output for input ``x`` (forward computation)."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

In [None]:
# Push a random 2 x 784 input through a freshly constructed MLP.
X = torch.rand(2, 784)
net = MLP()
print(net)
# Last expression of the cell: the forward result is what the notebook displays.
net(X)

## 4.1.2 `Module`的子类
### 4.1.2.1 `Sequential`类

In [None]:
class MySequential(nn.Module):
    """Minimal sequential container that chains its sub-modules in order.

    Accepts either a single ``OrderedDict`` mapping names to modules, or any
    number of modules as positional arguments (named "0", "1", ...).
    """

    def __init__(self, *args):
        """Register the given modules as children of this container.

        Args:
            *args: one ``OrderedDict`` of ``name -> module``, or a sequence of
                modules.
        """
        # Imported inside the method on purpose: a name bound in the class body
        # is not visible from within methods, so a class-level import would
        # raise NameError as soon as exactly one argument is passed.
        from collections import OrderedDict

        super(MySequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # A single OrderedDict was passed: keep the caller's names.
            for key, module in args[0].items():
                # add_module stores the module in self._modules (an ordered mapping).
                self.add_module(key, module)
        else:
            # Plain modules were passed: name them by position.
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, input):
        """Feed ``input`` through every registered module, in insertion order."""
        # self._modules preserves the order in which members were added.
        for module in self._modules.values():
            input = module(input)
        return input

In [None]:
# Rebuild the 784 -> 256 -> ReLU -> 10 network with MySequential.
net = MySequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10), 
        )
print(net)
# X comes from an earlier cell (assumed to still be the 2 x 784 tensor).
net(X)

### 4.1.2.2 `ModuleList`类

In [None]:
# nn.ModuleList holds sub-modules and supports list-style operations.
net = nn.ModuleList([
    nn.Linear(784, 256),
    nn.ReLU(),
])
net.append(nn.Linear(256, 10))  # list-style append
print(net[-1])  # list-style index access
print(net)

### 4.1.2.3 `ModuleDict`类

In [None]:
# nn.ModuleDict holds sub-modules and supports dict-style operations.
net = nn.ModuleDict({'linear': nn.Linear(784, 256), 'act': nn.ReLU()})
net['output'] = nn.Linear(256, 10)  # add an entry
print(net['linear'])  # access by key
print(net.output)  # access as an attribute
print(net)

## 4.1.3 构造复杂的模型

In [None]:
class FancyMLP(nn.Module):
    """Model mixing a constant weight, a reused linear layer and Python control flow."""

    def __init__(self, **kwargs):
        """Create the constant 20 x 20 weight and the shared 20 -> 20 linear layer.

        Any keyword arguments are forwarded unchanged to ``nn.Module.__init__``.
        """
        super(FancyMLP, self).__init__(**kwargs)

        # Non-trainable constant. Registering it as a buffer (instead of a
        # plain attribute) makes it follow the module through .to()/.cuda()/
        # .double(); persistent=False keeps it out of state_dict, exactly as
        # before.
        self.register_buffer(
            'rand_weight',
            torch.rand((20, 20), requires_grad=False),
            persistent=False,
        )
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar: the sum of the rescaled activations for input ``x``."""
        x = self.linear(x)
        # Use the constant weight together with relu and mm.
        x = nn.functional.relu(torch.mm(x, self.rand_weight) + 1)

        # Reuse the linear layer: equivalent to two layers sharing parameters.
        x = self.linear(x)
        # Control flow: item() turns the norm into a Python scalar for comparison.
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [None]:
# Run FancyMLP on a random 2 x 20 input; its forward returns the sum as a scalar.
X = torch.rand(2, 20)
net = FancyMLP()
print(net)
net(X)

In [None]:
class NestMLP(nn.Module):
    """Module that wraps an ``nn.Sequential`` of Linear(40, 30) followed by ReLU."""

    def __init__(self, **kwargs):
        """Build the nested sequential sub-network.

        Any keyword arguments are forwarded unchanged to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        layers = (nn.Linear(40, 30), nn.ReLU())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the nested sub-network for ``x``."""
        out = self.net(x)
        return out

# Chain the custom modules: NestMLP (40 -> 30), Linear (30 -> 20), then
# FancyMLP, which takes 20 features and returns a scalar sum.
net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())

X = torch.rand(2, 40)
print(net)
net(X)

# 4.2 模型参数的访问、初始化和共享

In [None]:
import torch
from torch import nn
from torch.nn import init

print((torch.__version__))

In [None]:
# Small 4 -> 3 -> 1 network used throughout the parameter-access examples.
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # PyTorch has already applied its default initialization

print(net)
X = torch.rand(2, 4)
# Reduce the output to a scalar so that backward() can be called on it later.
Y = net(X).sum()

## 4.2.1 访问模型参数

In [None]:
# Show what kind of object named_parameters() returns, then list each
# parameter's name and size as a tuple.
print(type(net.named_parameters()))
for name, param in net.named_parameters():
    entry = (name, param.size())
    print(entry)

In [None]:
# Index into the container to inspect only the first layer's parameters,
# printing name, size and Python type of each.
for name, param in net[0].named_parameters():
    print((name, param.size(), type(param)))

In [None]:
class MyModel(nn.Module):
    """Contrast an ``nn.Parameter`` attribute with a plain tensor attribute."""

    def __init__(self, **kwargs):
        """Create one wrapped and one unwrapped 20 x 20 random tensor.

        Any keyword arguments are forwarded unchanged to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        # Wrapped in nn.Parameter.
        self.weight1 = nn.Parameter(torch.rand(20, 20))
        # Deliberately left as a plain tensor for comparison.
        self.weight2 = torch.rand(20, 20)

    def forward(self, x):
        """Placeholder forward pass; performs no computation and returns None."""
        return None
    
# List the names reported by named_parameters(). weight2 was assigned as a
# plain tensor rather than an nn.Parameter, so it is presumably not listed.
n = MyModel()
for name, param in n.named_parameters():
    print(name)

In [None]:
# Take the first parameter of the first layer and inspect its values and
# gradient before and after back-propagating from Y.
weight_0 = next(iter(net[0].parameters()))
print(weight_0.data)
print(weight_0.grad)  # gradient before backward()
Y.backward()
print(weight_0.grad)  # gradient after backward()

## 4.2.2 初始化模型参数

In [None]:
# Re-initialize every weight from a normal distribution (mean 0, std 0.01)
# and print the result; parameters are selected by 'weight' in their name.
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print((name, param.data))

In [None]:
# Set every bias to the constant 0 and print the result; parameters are
# selected by 'bias' in their name.
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print((name, param.data))

## 4.2.3 自定义初始化方法

In [None]:
def init_weight_(tensor):
    """Initialize ``tensor`` in place: uniform in [-10, 10), small values zeroed.

    Every entry whose absolute value is below 5 is set to zero. The update is
    done under ``torch.no_grad()`` and nothing is returned.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, else 0.0; multiplying zeroes the rest.
        keep_mask = tensor.abs().ge(5).float()
        tensor.mul_(keep_mask)

In [None]:
# Apply the custom initializer init_weight_ to every weight and print the result.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print((name, param.data))

In [None]:
# Add 1 to every bias by editing the parameter's underlying values through
# .data, then print the result.
for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print((name, param.data))

## 4.2.4 共享模型参数

In [None]:
# Place the very same Linear instance at both positions of the Sequential so
# that the two positions share one weight, then set that weight to 3.
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear) 
print(net)
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print((name, param.data))

In [None]:
# Confirm that both positions hold the same module object, and hence the
# same weight object.
print(net[0] is net[1])
print(net[0].weight is net[1].weight)

In [None]:
# Back-propagate through the network built from the shared layer. Assuming
# the weight is still 3 from the earlier cell, y = 3 * 3 * 1 = 9 and the
# gradient accumulates over both uses of the weight (2 * 3 = 6).
x = torch.ones(1, 1)
y = net(x).sum()
print(y)
y.backward()
print(net[0].weight.grad)

# 4.4 自定义层
## 4.4.1 不含模型参数的自定义层

In [None]:
import torch
from torch import nn

print((torch.__version__))

In [None]:
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the overall mean from its input."""

    def __init__(self, **kwargs):
        """Forward any keyword arguments unchanged to ``nn.Module.__init__``."""
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` shifted by the mean taken over all of its elements."""
        center = x.mean()
        return x - center

In [None]:
# Centre the values 1..5; their mean is 3, so the result is [-2, -1, 0, 1, 2].
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))

In [None]:
# Use the custom layer as a building block: Linear (8 -> 128), then centring.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [None]:
# The last layer subtracts the mean, so the mean of the output should be
# (numerically close to) zero.
y = net(torch.rand(4, 8))
y.mean().item()

## 4.4.2 含模型参数的自定义层

In [None]:
class MyListDense(nn.Module):
    """Chain of matrix multiplications whose weights live in an ``nn.ParameterList``."""

    def __init__(self):
        """Create three 4 x 4 weights followed by one 4 x 1 weight."""
        super().__init__()
        shapes = [(4, 4)] * 3 + [(4, 1)]
        self.params = nn.ParameterList(
            [nn.Parameter(torch.randn(*shape)) for shape in shapes]
        )

    def forward(self, x):
        """Multiply ``x`` by each stored weight in turn and return the product."""
        for weight in self.params:
            x = torch.mm(x, weight)
        return x
# Instantiate the ParameterList-based layer and print its structure.
net = MyListDense()
print(net)

In [None]:
class MyDictDense(nn.Module):
    """Layer that multiplies its input by one of several named weights."""

    def __init__(self):
        """Create the weights 'linear1' (4 x 4), 'linear2' (4 x 1) and 'linear3' (4 x 2)."""
        super().__init__()
        initial = {
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        }
        self.params = nn.ParameterDict(initial)
        # Add a further entry after construction.
        self.params['linear3'] = nn.Parameter(torch.randn(4, 2))

    def forward(self, x, choice='linear1'):
        """Return ``x`` matrix-multiplied by the weight stored under ``choice``."""
        weight = self.params[choice]
        return torch.mm(x, weight)

# Instantiate the ParameterDict-based layer and print its structure.
net = MyDictDense()
print(net)

In [None]:
# Run the same 1 x 4 input through each named weight in turn.
x = torch.ones(1, 4)
for choice in ('linear1', 'linear2', 'linear3'):
    print(net(x, choice))

In [None]:
# Custom layers compose like built-in ones: MyDictDense (default 'linear1'
# weight, 4 x 4) feeds MyListDense. x is the 1 x 4 tensor from the cell above.
net = nn.Sequential(MyDictDense(), MyListDense())
print(net)
print(net(x))

# 4.5 读取和存储

In [None]:
import torch
from torch import nn

print(torch.__version__)

## 4.5.1 读写`Tensor`

In [None]:
# Save a single tensor to the file 'x.pt' in the current working directory.
x = torch.ones(3)
torch.save(x, 'x.pt')

In [None]:
# Read the tensor back from 'x.pt' and display it.
x2 = torch.load('x.pt')
x2

In [None]:
# A list of tensors can be saved and loaded the same way.
y = torch.zeros(4)
torch.save([x, y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

In [None]:
# So can a dict mapping strings to tensors.
torch.save({'x': x, 'y': y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

## 4.5.2 读写模型
### 4.5.2.1 `state_dict`

In [None]:
class MLP(nn.Module):
    """Tiny multilayer perceptron: 3 -> 2 -> ReLU -> 1."""

    def __init__(self):
        """Create the hidden layer, the activation and the output layer."""
        super().__init__()
        self.hidden = nn.Linear(in_features=3, out_features=2)
        self.act = nn.ReLU()
        self.output = nn.Linear(in_features=2, out_features=1)

    def forward(self, x):
        """Return the network output for input ``x``."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

# Display the model's state_dict.
net = MLP()
net.state_dict()

In [None]:
# Optimizers expose a state_dict as well; display the one of an SGD optimizer
# built over the model's parameters.
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()

### 4.5.2.2 保存和加载模型

In [None]:
# Round-trip check: save the parameters of net, load them into a second MLP
# instance and compare both models' outputs on the same input.
X = torch.randn(2, 3)
Y = net(X)

PATH = "./net.pt"
torch.save(net.state_dict(), PATH)

net2 = MLP()
net2.load_state_dict(torch.load(PATH))
Y2 = net2(X)
# Element-wise comparison; displayed as a boolean tensor.
Y2 == Y

# 4.6 GPU计算

In [None]:
!nvidia-smi # 对Linux/macOS用户有效

In [None]:
import torch
from torch import nn

print(torch.__version__)

## 4.6.1 计算设备

In [None]:
torch.cuda.is_available()  # whether CUDA is available

In [None]:
torch.cuda.device_count()  # number of GPUs

In [None]:
torch.cuda.current_device()  # index of the current device, starting from 0

In [None]:
torch.cuda.get_device_name(0)  # name of GPU 0

## 4.6.2 `Tensor`的GPU计算

In [None]:
# Create a tensor without specifying a device and display it.
x = torch.tensor([1, 2, 3])
x

In [None]:
# Copy the tensor to GPU 0. NOTE(review): this hard-codes CUDA device 0 and
# presumably fails on a machine without a GPU.
x = x.cuda(0)
x

In [None]:
# Show which device the tensor currently lives on.
x.device

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Either create the tensor directly on the chosen device ...
x = torch.tensor([1, 2, 3], device=device)
# ... or create it first and move it afterwards.
x = torch.tensor([1, 2, 3]).to(device)
x

In [None]:
# Element-wise square of x; display the result.
y = x**2
y

In [None]:
# z = y + x.cpu()

## 4.6.3 模型的GPU计算

In [None]:
# Inspect the device of the first parameter of a freshly created Linear layer.
net = nn.Linear(3, 1)
list(net.parameters())[0].device

In [None]:
# Move the model to the GPU and inspect the parameter's device again.
# NOTE(review): requires a CUDA device to be present.
net.cuda()
list(net.parameters())[0].device

In [None]:
# The input is moved to the GPU as well before being passed to the model,
# which the previous cell moved to the GPU.
x = torch.rand(2,3).cuda()
net(x)