In [1]:
# 丢弃法　dropout　也是应对过拟合的方法之一
# (另外两个：权重衰减, 增加训练数据的数量)

In [2]:
# 方法:
# 以多层感知机为例，可以丢弃隐藏层中的某个神经元．

# 正向传播时，是　全连接(仿射变换)＋激活后的神经元输出被随机丢弃(置零)
# 反向传播时，是该神经元所涉及的所有权重其梯度为零．
# 测试时，不使用丢弃法（为了使结果更具确定性）

In [3]:
# 背后包含的思想：
# 假设某个隐藏层包含５个神经元，这五个神经元被丢弃的概率为drop_prob.
# 由于哪个神经元被丢弃是随机的，因此网络训练时不会特定地依赖于某个神经元
#（否则它一旦被丢弃那么训练所得的模型其精度必定要大打折扣）
# 正因为如此，丢弃的过程就起到对模型训练过程正则化的作用，也就是说，可以防止过拟合．

In [4]:
### 此处可以将＂weight decay＂和＂dropout＂关联起来：
# 它们的实质都是正则化，抑制模型过度依赖个别参数：权重衰减直接惩罚过大的权重，丢弃法则防止网络依赖于某个特定的神经元．

### １．从零开始实现(以多层感知机为例)

In [23]:
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append('./d2lzh/')
import d2lzh_pytorch as d2l

In [24]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is independently zeroed with probability ``drop_prob``; the
    surviving elements are divided by ``1 - drop_prob`` so that the expected
    value of the output matches the input.

    Args:
        X: Input tensor. It is converted with ``X.float()`` before masking.
        drop_prob: Probability of zeroing an element; must lie in ``[0, 1]``.

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Everything is dropped: return zeros directly instead of dividing by 0 below.
    if keep_prob == 0:
        return torch.zeros_like(X)
    # rand_like draws uniform samples in [0, 1) with X's shape *and* device, so
    # the mask can be multiplied with X even when X is not on the default device.
    mask = (torch.rand_like(X) < keep_prob).float()

    # Rescale the survivors by 1 / keep_prob (this is why kept values look "stretched").
    return mask * X / keep_prob

In [25]:
X = torch.arange(16).view(2, 8)
dropout(X, 0)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])

In [26]:
dropout(X, 0.5)

tensor([[ 0.,  2.,  0.,  0.,  0., 10., 12.,  0.],
        [16.,  0.,  0., 22.,  0., 26.,  0., 30.]])

In [27]:
dropout(X, 0.5)   # 与上不一致，结果被拉伸

tensor([[ 0.,  2.,  4.,  6.,  8.,  0., 12.,  0.],
        [ 0., 18., 20.,  0.,  0.,  0.,  0., 30.]])

In [28]:
dropout(X, 1.0)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [29]:
### 1.1 定义模型参数／定义模型

In [30]:
# Layer widths used by both the from-scratch model and the torch.nn model.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256


def _init_layer(fan_in, fan_out):
    """Return a (weight, bias) pair of trainable leaf tensors for one dense layer.

    The weight has shape (fan_in, fan_out) and is sampled with numpy from a
    normal distribution with mean 0 and standard deviation 0.01; the bias has
    shape (fan_out,) and starts at zero.
    """
    weight = torch.tensor(
        np.random.normal(0, 0.01, size=(fan_in, fan_out)),
        dtype=torch.float,
        requires_grad=True,
    )
    bias = torch.zeros(fan_out, requires_grad=True)
    return weight, bias


# Layers are created in input -> output order so the random draws happen in
# the order w1, w2, w3.
w1, b1 = _init_layer(num_inputs, num_hiddens1)
w2, b2 = _init_layer(num_hiddens1, num_hiddens2)
w3, b3 = _init_layer(num_hiddens2, num_outputs)

# Flat list consumed by the model: [w1, b1, w2, b2, w3, b3].
params = [w1, b1, w2, b2, w3, b3]

In [31]:
drop_prob1 = 0.2  # drop probability applied after the first hidden layer
drop_prob2 = 0.5  # drop probability applied after the second hidden layer


def net(X, is_training=True):
    """Forward pass of the two-hidden-layer perceptron built from ``params``.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)`` before use.
        is_training: When true, ``dropout`` is applied to the output of each
            hidden layer; when false, both dropout steps are skipped.

    Returns:
        The output of the final affine layer (no softmax is applied here).
    """
    flat = X.view(-1, num_inputs)

    # Hidden layer 1: affine transform, ReLU, then optional dropout.
    hidden1 = torch.relu(flat @ params[0] + params[1])
    if is_training:
        hidden1 = dropout(hidden1, drop_prob1)

    # Hidden layer 2: same pattern with its own drop probability.
    hidden2 = torch.relu(hidden1 @ params[2] + params[3])
    if is_training:
        hidden2 = dropout(hidden2, drop_prob2)

    # Output layer: affine transform only.
    return hidden2 @ params[4] + params[5]

In [32]:
# 评估模型准确度时，使用标准库定义的模型　和　自定义模型有所区别

def evaluate_accuracy(data_iter, net):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable of ``(X, y)`` batches, where ``y`` holds the class
            index of each sample.
        net: Either a ``torch.nn.Module`` or a plain callable. A module is put
            into evaluation mode for the duration of the call (which disables
            dropout) and is afterwards restored to the mode it was in before.
            A plain callable that accepts an ``is_training`` parameter is
            called with ``is_training=False``.

    Returns:
        The fraction of samples whose arg-max prediction along dim 1 equals
        the label, as a float. Returns 0.0 when ``data_iter`` yields no samples.
    """
    import inspect  # local import: only needed to look at ``net``'s signature

    is_module = isinstance(net, torch.nn.Module)

    # Inspect the real call signature instead of ``__code__.co_varnames``:
    # co_varnames also lists local variables, and not every callable has __code__.
    accepts_flag = False
    if not is_module:
        try:
            accepts_flag = 'is_training' in inspect.signature(net).parameters
        except (TypeError, ValueError):
            accepts_flag = False

    acc_sum, n = 0.0, 0
    if is_module:
        was_training = net.training
        net.eval()  # evaluation mode: disables dropout
    try:
        # Gradients are not needed for evaluation.
        with torch.no_grad():
            for X, y in data_iter:
                y_hat = net(X, is_training=False) if accepts_flag else net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]  # running count of samples
    finally:
        if is_module:
            net.train(was_training)  # put the module back in its previous mode

    # The original never returned the result; also guard against an empty iterator.
    return acc_sum / n if n else 0.0

In [33]:
### 训练 ＋测试
# Hyperparameters for the from-scratch run.
num_epochs = 5
lr = 100.0  # NOTE(review): unusually large; presumably d2l.train_ch3 rescales the update by batch_size - confirm
batch_size = 256

# Cross-entropy loss over the raw outputs of `net`.
loss = torch.nn.CrossEntropyLoss()

# Fashion-MNIST iterators, batched with batch_size.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# NOTE(review): the recorded output of this cell is
# "RuntimeError: leaf variable has been moved into the graph interior".
# The cause is not visible in this notebook; presumably it lies in how the
# local d2l.train_ch3 updates `params` in place - verify against d2lzh_pytorch.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

RuntimeError: leaf variable has been moved into the graph interior

### 2．pytorch实现(以多层感知机为例)

In [34]:
# 不同点：
# 1. 模型和初始化参数都不用手动设置
# 2. 优化算法

In [36]:
# torch.nn version of the two-hidden-layer perceptron: each hidden Linear layer
# is followed by ReLU and a Dropout layer with the matching drop probability.
net = torch.nn.Sequential(
    d2l.FlattenLayer(),  # presumably flattens each sample to a vector - confirm in d2lzh_pytorch
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    # Use num_outputs instead of a literal 10 so the output width stays in sync
    # with the class count configured for the from-scratch model.
    nn.Linear(num_hiddens2, num_outputs)
)

# Re-initialise every parameter (weights and biases alike) from a normal
# distribution with mean 0 and standard deviation 0.01.
for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)


In [38]:
# Plain SGD over all parameters of the torch.nn model, with learning rate 0.5.
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)

# The two None arguments fill the positional slots that the from-scratch call
# used for `params` and `lr`; the optimizer is passed in their place.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1, loss 0.0049, train acc 0.522, test acc 0.746
epoch 2, loss 0.0023, train acc 0.779, test acc 0.760
epoch 3, loss 0.0020, train acc 0.817, test acc 0.786
epoch 4, loss 0.0018, train acc 0.835, test acc 0.822
epoch 5, loss 0.0017, train acc 0.843, test acc 0.788
