In [2]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append('..')
import d2lzh_pytorch as d2l

In [3]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability ``drop_prob``;
    surviving elements are divided by ``1 - drop_prob``.

    Args:
        X: Input tensor of any shape; converted to float via ``X.float()``.
        drop_prob: Probability of dropping each element, in ``[0, 1]``.

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    X = X.float()

    # Reject probabilities outside the valid range.
    assert 0 <= drop_prob <= 1

    keep_prob = 1 - drop_prob

    # Everything is dropped; return early to avoid dividing by zero below.
    if keep_prob == 0:
        return torch.zeros_like(X)

    # Draw the mask on the same device as X so the multiplication below does
    # not fail with a device mismatch when X lives on an accelerator.
    mask = (torch.rand(X.shape, device=X.device) < keep_prob).float()

    return mask * X / keep_prob

In [5]:
# Quick check on a 2x8 tensor holding 0..15: with drop_prob=0.5 every entry is
# either zeroed or doubled (scaled by 1 / 0.5). The mask is random, so the
# displayed result differs from run to run.
X = torch.arange(16).view(2, -1)

dropout(X, drop_prob=0.5)

tensor([[ 0.,  0.,  4.,  6.,  0., 10., 12.,  0.],
        [ 0., 18.,  0.,  0., 24., 26., 28., 30.]])

In [10]:
# Fashion-MNIST multilayer perceptron with two hidden layers:
# 784 inputs -> 256 hidden units -> 256 hidden units -> 10 outputs.
num_inputs, num_outputs = 784, 10
num_hiddens1, num_hiddens2 = 256, 256


def _normal_weight(n_in, n_out):
    """Return an (n_in, n_out) float weight drawn from a normal with mean 0, std 0.01."""
    values = np.random.normal(0, 0.01, size=(n_in, n_out))
    return torch.tensor(values, dtype=torch.float, requires_grad=True)


def _zero_bias(n_out):
    """Return a zero-initialised bias vector of length n_out that tracks gradients."""
    return torch.zeros(n_out, requires_grad=True)


# Weights are created in layer order so the NumPy random draws happen in the
# order w1, w2, w3.
w1 = _normal_weight(num_inputs, num_hiddens1)
b1 = _zero_bias(num_hiddens1)
w2 = _normal_weight(num_hiddens1, num_hiddens2)
b2 = _zero_bias(num_hiddens2)
w3 = _normal_weight(num_hiddens2, num_outputs)
b3 = _zero_bias(num_outputs)

# All trainable tensors, in the order they are handed to the training loop.
params = [w1, b1, w2, b2, w3, b3]

In [14]:
# The model applies dropout to the output of each hidden-layer activation;
# these are the drop probabilities for the first and second hidden layers.
drop_prob1 = 0.2
drop_prob2 = 0.5

def net(X, is_training=True):
    """Forward pass of the two-hidden-layer MLP built from w1..w3 and b1..b3.

    Args:
        X: Input batch; flattened to shape ``(-1, num_inputs)`` with ``view``.
        is_training: When True, dropout is applied to the ReLU output of each
            hidden layer (``drop_prob1``, then ``drop_prob2``). When False,
            dropout is skipped.

    Returns:
        The output of the final linear layer (no softmax is applied here).
    """
    flat = X.view(-1, num_inputs)

    # First hidden layer: linear + ReLU, then dropout in training mode only.
    hidden1 = (flat @ w1 + b1).relu()
    if is_training:
        hidden1 = dropout(hidden1, drop_prob1)

    # Second hidden layer: same pattern with its own drop probability.
    hidden2 = (hidden1 @ w2 + b2).relu()
    if is_training:
        hidden2 = dropout(hidden2, drop_prob2)

    return hidden2 @ w3 + b3

In [15]:

def evaluate_accuracy(data_iter, net):
    """Compute the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches, where ``y`` holds the
            class indices and ``y.shape[0]`` is the batch size.
        net: Either a ``torch.nn.Module`` or a plain callable. A module is
            switched to eval mode for the duration of the evaluation and then
            restored to whatever mode it was in before. A plain callable that
            accepts an ``is_training`` parameter is called with
            ``is_training=False``; any other callable is called as ``net(X)``.

    Returns:
        The fraction of samples whose ``argmax(dim=1)`` prediction equals ``y``.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    import inspect  # local import keeps this cell self-contained

    is_module = isinstance(net, torch.nn.Module)

    # Decide once (not per batch) whether a custom model takes ``is_training``.
    # Inspecting the signature looks only at real parameters and also works
    # for callables that have no ``__code__`` attribute.
    pass_training_flag = False
    if not is_module:
        try:
            pass_training_flag = 'is_training' in inspect.signature(net).parameters
        except (TypeError, ValueError):
            # Signature not introspectable; fall back to calling net(X).
            pass_training_flag = False

    if is_module:
        # Remember the current mode so a model already in eval mode is not
        # silently flipped to training mode afterwards.
        was_training = net.training
        net.eval()  # evaluation mode, which turns dropout off

    acc_sum, n = 0.0, 0
    try:
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for X, y in data_iter:
                if pass_training_flag:
                    y_hat = net(X, is_training=False)
                else:
                    y_hat = net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)  # restore the caller's mode

    return acc_sum / n


In [16]:
# Train the from-scratch model on Fashion-MNIST.
# NOTE(review): lr=100.0 is unusually large; presumably the update performed
# inside d2l.train_ch3 rescales the gradient (e.g. by batch_size) -- confirm
# against the d2lzh_pytorch source before changing it.
num_epochs = 5
lr = 100.0
batch_size = 256

loss = nn.CrossEntropyLoss()

train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

d2l.train_ch3(
    net, train_iter, test_iter, loss,
    num_epochs, batch_size, params, lr,
)

epoch 1, loss 0.0045, train acc 0.546, test acc 0.685
epoch 2, loss 0.0023, train acc 0.785, test acc 0.803
epoch 3, loss 0.0019, train acc 0.825, test acc 0.765
epoch 4, loss 0.0017, train acc 0.840, test acc 0.825
epoch 5, loss 0.0016, train acc 0.848, test acc 0.783


In [17]:
# Concise implementation: the same architecture built from torch.nn layers.
# Note that this rebinds the name `net`, which previously referred to the
# from-scratch forward function.
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    # Use num_outputs rather than a hard-coded 10 so the output width stays
    # in sync with the from-scratch model's configuration.
    nn.Linear(num_hiddens2, num_outputs),
)

# Re-initialise every parameter (weights and biases alike) from a normal
# distribution with mean 0 and standard deviation 0.01.
for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)

In [18]:
# Train the nn.Sequential model with PyTorch's built-in SGD optimizer.
# The two None arguments sit where `params` and `lr` were passed for the
# from-scratch run; presumably train_ch3 ignores them when an optimizer is
# supplied -- confirm against the d2lzh_pytorch source.
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)

d2l.train_ch3(
    net, train_iter, test_iter, loss,
    num_epochs, batch_size, None, None, optimizer,
)

epoch 1, loss 0.0047, train acc 0.540, test acc 0.710
epoch 2, loss 0.0023, train acc 0.778, test acc 0.737
epoch 3, loss 0.0020, train acc 0.817, test acc 0.827
epoch 4, loss 0.0018, train acc 0.836, test acc 0.841
epoch 5, loss 0.0017, train acc 0.846, test acc 0.831
