In [1]:
# 丢弃法在训练模型时起到正则化的作用，并可以用来应对过拟合。
# 在测试模型时，我们为了拿到更加确定性的结果，一般不使用丢弃法。

import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

In [2]:
# 倒置丢弃法（inverted dropout）

def dropout(X, drop_prob):
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # 这种情况下把全部元素都丢弃
    if keep_prob == 0:
        return X.zeros_like()
    # 下面的比较会采用广播机制，返回一个真值向量
    # 这是获取特定概率分布数据的技巧
    mask = nd.random.uniform(0, 1, X.shape) < keep_prob
    return mask * X / keep_prob

In [3]:
# 测试

X = nd.arange(16).reshape((2, 8))
#dropout(X, 0)
dropout(X, 0.5)
#dropout(X, 1)


[[  0.   0.   0.   0.   0.   0.   0.   0.]
 [ 16.   0.   0.  22.  24.  26.   0.  30.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# 初始化模型参数

num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens1))
b1 = nd.zeros(num_hiddens1)
W2 = nd.random.normal(scale=0.01, shape=(num_hiddens1, num_hiddens2))
b2 = nd.zeros(num_hiddens2)
W3 = nd.random.normal(scale=0.01, shape=(num_hiddens2, num_outputs))
b3 = nd.zeros(num_outputs)

params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

In [5]:
# 定义模型

drop_prob1, drop_prob2 = 0.2, 0.5

def net(X):
    # 通过reshape函数将每张原始图像改成长度为num_inputs的向量
    X = X.reshape((-1, num_inputs))
    H1 = (nd.dot(X, W1) + b1).relu()
    # 只在训练的时候使用丢弃法
    if autograd.is_training():
        H1 = dropout(H1, drop_prob1)
    H2 = (nd.dot(H1, W2) + b2).relu()
    if autograd.is_training():
        H2 = dropout(H2, drop_prob2)
    return nd.dot(H2, W3) + b3

In [6]:
# 训练及测试

num_epochs, lr, batch_size = 5, 0.5, 256
loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 1.1926, train acc 0.535, test acc 0.771
epoch 2, loss 0.6001, train acc 0.775, test acc 0.833
epoch 3, loss 0.5047, train acc 0.816, test acc 0.844
epoch 4, loss 0.4544, train acc 0.835, test acc 0.857
epoch 5, loss 0.4242, train acc 0.845, test acc 0.863


In [7]:
# 简洁实现

net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"),
        nn.Dropout(drop_prob1),
        nn.Dense(256),
        nn.Dropout(drop_prob2),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [8]:
# 训练及测试

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 1.4464, train acc 0.469, test acc 0.761
epoch 2, loss 0.6815, train acc 0.747, test acc 0.821
epoch 3, loss 0.5838, train acc 0.785, test acc 0.798
epoch 4, loss 0.5169, train acc 0.812, test acc 0.844
epoch 5, loss 0.5777, train acc 0.794, test acc 0.837


# 练习:

如果把本节中的两个丢弃概率超参数对调，会有什么结果？

增大迭代周期数，比较使用丢弃法与不使用丢弃法的结果。

如果将模型改得更加复杂，如增加隐藏层单元，使用丢弃法应对过拟合的效果是否更加明显？

以本节中的模型为例，比较使用丢弃法与权重衰减的效果。如果同时使用丢弃法和权重衰减，效果会如何？