5.10 批量归一化
    -由于深度网络底部层的梯度下降较慢，导致收敛变慢，于是将数值归一化到[0,1]内，许多激活函数在这个区间的收敛速度很快。
    -作用在全连接层和卷积层的输出上，激活函数前。
    -作用在全连接层和卷积层的输入上。
    -对于全连接层，作用在特征维上。
    -对于卷积层，作用在通道维上。
    -批量归一化中有两个超参数。
    -尽量使批量大小设置的大一点，从而使批量内样本的均值和方差的计算都较为准确。

5.10.2 从零开始实现

In [2]:
import torch
import time
from torch import nn, optim
import torch.nn.functional as F

import sys
sys.path.append('..')
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    if not is_training:
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            #使用全连接层的情况
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 使用二维卷积层的情况，计算通道维上（axis=1）的均值和方差
            mean = X.mean(dim=0, keep_dim=True).mean(dim=1, keep_dim=True).mean(dim=2, keep_dim=True)
            var = ((X - mean) ** 2).mean(dim=0, keep_dim=True).mean(dim=1, keep_dim=True).mean(dim=2, keep_dim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta

    return Y, moving_mean, moving_var

class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims) -> None:
        super(BatchNorm, self).__init__()

        #对于全连接层和卷积层来说分别为2和4。
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))

        # 不参与求梯度和迭代的变量，全在<内存>上初始化成0
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)
    
    def forward(self, X):
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Module实例的traning属性默认为true, 调用.eval()后设成false
        Y, self.moving_mean, self.moving_var = batch_norm(self.training, X, self.gamma, self.beta,
                                                            self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        return Y

5.10.2.1 使用批量归一化层的LeNet

In [6]:
net = nn.Sequential(
    nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
    BatchNorm(6, 4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
net

Sequential(
  (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (1): BatchNorm()
  (2): Sigmoid()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (5): BatchNorm()
  (6): Sigmoid()
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): FlattenLayer()
  (9): Linear(in_features=256, out_features=120, bias=True)
  (10): BatchNorm()
  (11): Sigmoid()
  (12): Linear(in_features=120, out_features=84, bias=True)
  (13): BatchNorm()
  (14): Sigmoid()
  (15): Linear(in_features=84, out_features=10, bias=True)
)

In [12]:
import torchvision
import torchvision.transforms as transforms
batch_size = 256
train_data = torchvision.datasets.FashionMNIST(root='../Datasets/', train=False, transform=transforms.ToTensor())
test_data = torchvision.datasets.FashionMNIST(root='../Datasets/', train=False, transform=transforms.ToTensor())
train_iter = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True)
test_iter = torch.utils.data.DataLoader(test_data, batch_size, shuffle=True)


lr, num_epochs = 0.001, 5
optimizer = optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)


training on  cuda


TypeError: mean() received an invalid combination of arguments - got (keep_dim=bool, dim=int, ), but expected one of:
 * (*, torch.dtype dtype)
      didn't match because some of the keywords were incorrect: keep_dim, dim
 * (tuple of ints dim, bool keepdim, *, torch.dtype dtype)
 * (tuple of names dim, bool keepdim, *, torch.dtype dtype)


5.10.3 简洁实现

In [13]:
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(6, 16, 5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16*4*4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)

In [14]:
import torchvision
import torchvision.transforms as transforms
batch_size = 256
train_data = torchvision.datasets.FashionMNIST(root='../Datasets/', train=False, transform=transforms.ToTensor())
test_data = torchvision.datasets.FashionMNIST(root='../Datasets/', train=False, transform=transforms.ToTensor())
train_iter = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True)
test_iter = torch.utils.data.DataLoader(test_data, batch_size, shuffle=True)


lr, num_epochs = 0.001, 5
optimizer = optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.6834, train acc 0.622, test acc 0.674, time 13.4 sec
epoch 2, loss 1.2318, train acc 0.776, test acc 0.771, time 2.3 sec
epoch 3, loss 0.9691, train acc 0.803, test acc 0.812, time 2.3 sec
epoch 4, loss 0.7875, train acc 0.830, test acc 0.831, time 2.3 sec
epoch 5, loss 0.6634, train acc 0.838, test acc 0.846, time 2.2 sec
