In [1]:
import d2lzh as d2l
import torch
from torch import nn
from torch.nn import init
from torch import optim
from torch.nn import functional as F

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element of ``X`` is zeroed independently with probability
    ``drop_prob``; the elements that survive are divided by
    ``1 - drop_prob``.

    Args:
        X: Input tensor.
        drop_prob: Probability of zeroing an element; must lie in [0, 1].

    Returns:
        A tensor with the same shape as ``X``. When ``drop_prob`` is 1 the
        result is ``torch.zeros_like(X)``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Dropping everything: return early so we never divide by zero below.
    if keep_prob == 0:
        return torch.zeros_like(X)
    # Sample the mask on the same device as X; a mask created on the default
    # device cannot be multiplied with a tensor living on another device.
    mask = (torch.rand(X.shape, device=X.device) < keep_prob).float()
    return mask * X / keep_prob

In [2]:
# A 2x8 float tensor holding 0..15; with drop_prob=0 nothing is dropped,
# so the call below displays X unchanged.
X = torch.arange(16).float().reshape(2, 8)
dropout(X, drop_prob=0)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])

In [3]:
# Each element is kept with probability 0.5; kept elements are scaled by 1 / 0.5 = 2.
dropout(X, drop_prob=0.5)

tensor([[ 0.,  0.,  0.,  0.,  8., 10., 12., 14.],
        [16.,  0.,  0., 22., 24.,  0., 28.,  0.]])

In [4]:
# drop_prob=1 drops every element, so the result is all zeros.
dropout(X, drop_prob=1)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [5]:
# Layer sizes: 784 inputs, two hidden layers of 256 units each, 10 outputs.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

# Weights are standard-normal samples scaled by 0.01; biases start at zero.
W1 = 0.01 * torch.randn(num_inputs, num_hiddens1)
b1 = torch.zeros(num_hiddens1)
W2 = 0.01 * torch.randn(num_hiddens1, num_hiddens2)
b2 = torch.zeros(num_hiddens2)
W3 = 0.01 * torch.randn(num_hiddens2, num_outputs)
b3 = torch.zeros(num_outputs)

# Turn on gradient tracking only after the scaling above, so every entry
# stays a leaf tensor.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.requires_grad_(True)

In [6]:
# Drop probabilities for the first and second hidden layer.
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X, is_training=True):
    """Forward pass of the two-hidden-layer MLP with dropout.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: When True (the default, matching the previous
            behaviour) dropout is applied after each hidden layer. Pass
            False when evaluating so that no activations are dropped.
            NOTE(review): whether the d2l training/evaluation helpers pass
            this flag is not visible here -- confirm against their source.

    Returns:
        The output-layer logits, one row per input example.
    """
    X = X.reshape((-1, num_inputs))
    H1 = F.relu(X @ W1 + b1)
    if is_training:
        H1 = dropout(H1, drop_prob1)  # dropout after the first fully connected layer
    H2 = F.relu(H1 @ W2 + b2)
    if is_training:
        H2 = dropout(H2, drop_prob2)  # dropout after the second fully connected layer
    return H2 @ W3 + b3

In [7]:
# Training hyperparameters for the from-scratch model.
num_epochs = 5
lr = 0.5
batch_size = 256

# Cross-entropy summed over the mini-batch (reduction='sum').
loss = nn.CrossEntropyLoss(reduction='sum')

# Load the Fashion-MNIST iterators and train with the hand-written parameters.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 1.1598, train acc 0.549, test acc 0.768
epoch 2, loss 0.5748, train acc 0.787, test acc 0.802
epoch 3, loss 0.4901, train acc 0.820, test acc 0.829
epoch 4, loss 0.4416, train acc 0.839, test acc 0.843
epoch 5, loss 0.4242, train acc 0.845, test acc 0.845


In [8]:
# The same architecture as the from-scratch model, assembled from torch.nn
# modules. nn.Dropout takes the probability of zeroing an element and is only
# active while the module is in training mode.
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    nn.Linear(num_hiddens2, num_outputs)
)
# Default reduction here, unlike the reduction='sum' loss used earlier.
loss = nn.CrossEntropyLoss()
# Re-initialise only the weights of the linear layers; biases keep the
# default nn.Linear initialisation.
for layer in net:
    if isinstance(layer, nn.Linear):
        init.normal_(layer.weight, std=0.01)

In [9]:
# Plain SGD over all parameters of the Sequential model, reusing the learning
# rate defined for the from-scratch run.
# NOTE(review): `loss` now uses the default reduction while the earlier run
# used reduction='sum'; the printed loss values of the two runs differ in
# scale -- confirm how train_ch3 normalises the reported loss.
optimizer = optim.SGD(params=net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1, loss 0.0049, train acc 0.528, test acc 0.719
epoch 2, loss 0.0024, train acc 0.768, test acc 0.746
epoch 3, loss 0.0020, train acc 0.815, test acc 0.815
epoch 4, loss 0.0018, train acc 0.832, test acc 0.824
epoch 5, loss 0.0017, train acc 0.842, test acc 0.803
