In [1]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append('..')
import d2lzh_pytorch as d2l

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability ``drop_prob``; the
    surviving elements are divided by ``1 - drop_prob`` so that the expected
    value of every element stays the same.

    Args:
        X: Input tensor of any shape. It is cast to float32 before masking.
        drop_prob: Probability of dropping an element, within ``[0, 1]``.

    Returns:
        A float32 tensor with the same shape and device as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Everything is dropped; return early so we never divide by zero below.
        return torch.zeros_like(X)
    # rand_like samples on X's own device, so the mask can be multiplied with X
    # even when X does not live on the default device.
    mask = (torch.rand_like(X) < keep_prob).float()

    return mask * X / keep_prob

In [2]:
# A 2x8 grid of the integers 0..15; with drop_prob 0 nothing is dropped,
# so the result is the same values cast to float.
X = torch.arange(16).reshape(2, 8)
dropout(X, drop_prob=0)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])

In [3]:
# Each entry is zeroed with probability 0.5; survivors are divided by 0.5, i.e. doubled.
dropout(X, 0.5)

tensor([[ 0.,  2.,  0.,  6.,  8.,  0.,  0., 14.],
        [ 0., 18.,  0., 22.,  0.,  0.,  0., 30.]])

In [4]:
# keep_prob is 0 here, so dropout takes its early-return branch and yields all zeros.
dropout(X, 1.0)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [5]:
# Layer widths: input features, output scores, and the two hidden layers.
num_inputs, num_outputs, num_hidden1, num_hiddens2 = 784, 10, 256, 266


def _normal_weight(n_in, n_out):
    """Return a trainable float32 (n_in, n_out) matrix sampled from N(0, 0.01**2)."""
    samples = np.random.normal(0, 0.01, size=(n_in, n_out))
    return torch.tensor(samples, dtype=torch.float, requires_grad=True)


def _zero_bias(width):
    """Return a trainable all-zero bias vector with ``width`` entries."""
    return torch.zeros(width, requires_grad=True)


# Weights are drawn in layer order (W1, W2, W3) so the NumPy RNG is consumed
# in the same sequence as a straight-line initialisation would.
W1, b1 = _normal_weight(num_inputs, num_hidden1), _zero_bias(num_hidden1)
W2, b2 = _normal_weight(num_hidden1, num_hiddens2), _zero_bias(num_hiddens2)
W3, b3 = _normal_weight(num_hiddens2, num_outputs), _zero_bias(num_outputs)

# Flat list of every trainable tensor, in layer order.
params = [W1, b1, W2, b2, W3, b3]

In [6]:
# Drop probabilities applied after the first and second hidden layers.
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X, is_training=True):
    """Forward pass of the two-hidden-layer MLP with optional dropout.

    Args:
        X: Input batch; reshaped to ``(-1, num_inputs)`` before the first layer.
        is_training: When true, ``dropout`` is applied after each hidden ReLU;
            when false, both dropout steps are skipped.

    Returns:
        The output-layer scores computed from the second hidden activation,
        ``W3`` and ``b3``.
    """
    flat = X.view(-1, num_inputs)

    hidden1 = torch.relu(flat @ W1 + b1)
    if is_training:
        hidden1 = dropout(hidden1, drop_prob1)

    hidden2 = torch.relu(hidden1 @ W2 + b2)
    if is_training:
        hidden2 = dropout(hidden2, drop_prob2)

    return hidden2 @ W3 + b3

In [7]:
def evaluate_accuracy(data_iter, net):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    A prediction is the ``argmax`` over dimension 1 of the model output and is
    counted as correct when it equals the label.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches, where ``y`` holds the
            labels and ``y.shape[0]`` is the batch size.
        net: Either a ``torch.nn.Module`` or a plain callable. A module is put
            in eval mode for the duration of the evaluation and then returned
            to whatever mode it was in before. A plain callable that declares
            an ``is_training`` parameter is called with ``is_training=False``.

    Returns:
        Fraction of correctly classified samples as a Python float.

    Raises:
        ValueError: If ``data_iter`` yields no samples.
    """
    import inspect

    is_module = isinstance(net, torch.nn.Module)

    # Decide once, not per batch, whether the callable takes an is_training flag.
    # Inspecting the signature looks only at real parameters and also works for
    # callables that have no __code__ attribute.
    pass_flag = False
    if not is_module:
        try:
            pass_flag = 'is_training' in inspect.signature(net).parameters
        except (TypeError, ValueError):
            pass_flag = False

    acc_sum, n = 0.0, 0
    was_training = net.training if is_module else False
    if is_module:
        net.eval()
    try:
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for X, y in data_iter:
                y_hat = net(X, is_training=False) if pass_flag else net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        # Restore the caller's mode even if a batch raised.
        if is_module and was_training:
            net.train()

    if n == 0:
        raise ValueError('data_iter yielded no samples')
    return acc_sum / n

In [8]:
# Hyperparameters for the from-scratch run.
num_epochs = 5
# NOTE(review): 100.0 is unusually large; presumably d2l.train_ch3 rescales the
# step internally — confirm against the d2l implementation.
lr = 100.0
batch_size = 256

loss = nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, root='../data')
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 0.0045, train acc 0.554, test acc 0.740
epoch 2, loss 0.0023, train acc 0.787, test acc 0.803
epoch 3, loss 0.0019, train acc 0.824, test acc 0.801
epoch 4, loss 0.0017, train acc 0.839, test acc 0.828
epoch 5, loss 0.0016, train acc 0.849, test acc 0.807


In [11]:
def _hidden_block(n_in, n_out, drop_prob):
    """Return the [Linear, ReLU, Dropout] layers that make up one hidden stage."""
    return [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(drop_prob)]


# Same layer sequence as the hand-written network, built from library modules.
net = nn.Sequential(
    d2l.FlattenLayer(),
    *_hidden_block(num_inputs, num_hidden1, drop_prob1),
    *_hidden_block(num_hidden1, num_hiddens2, drop_prob2),
    nn.Linear(num_hiddens2, num_outputs),
)

# Re-initialise every tensor returned by net.parameters() from N(0, 0.01**2).
for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)

In [12]:
# SGD over the module's parameters with learning rate 0.5.
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
# The two None arguments occupy the positions where the from-scratch run passed
# `params` and `lr`; here the optimizer object is supplied instead.
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1, loss 0.0048, train acc 0.541, test acc 0.682
epoch 2, loss 0.0023, train acc 0.780, test acc 0.748
epoch 3, loss 0.0019, train acc 0.819, test acc 0.803
epoch 4, loss 0.0018, train acc 0.833, test acc 0.828
epoch 5, loss 0.0016, train acc 0.849, test acc 0.849
