# 批量归一化层

## 全连接层做批量归一化

通常，将批量归一化层置于全连接层中的仿射变换和激活函数之间。

## 卷积层做批量归一化

对卷积层来说，批量归一化发生在卷积计算之后、应用激活函数之前。如果卷积计算输出多个通道，需要对每个通道的输出分别做批量归一化，且每个通道都拥有独立的拉伸和偏移参数。

## 预测时的批量归一化

预测时不再使用当前小批量的均值和方差，而是使用训练过程中通过移动平均估算得到的均值和方差来做归一化。


In [1]:
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
import d2lzh_pytorch as d2l

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the updated moving statistics.

    Args:
        is_training: If true, normalize with the statistics of ``X`` itself
            and update the moving averages. If false, normalize with
            ``moving_mean`` / ``moving_var`` and return them unchanged.
        X: Input tensor. In training mode it must be 2-D (statistics are
            taken over dim 0) or 4-D (statistics are taken over dims 0, 2
            and 3, i.e. one mean/variance per index of dim 1).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Current moving average of the mean.
        moving_var: Current moving average of the variance.
        eps: Small constant added to the variance before the square root.
        momentum: Weight kept from the previous moving statistic; the
            current batch statistic is weighted by ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y`` is
        ``gamma * X_hat + beta``.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: use the accumulated moving statistics as-is.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected layer: statistics over the batch dimension.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 4-D input: reduce over every dim except dim 1 and keep the
            # reduced dims so the result broadcasts against X.
            reduce_dims = (0, 2, 3)
            mean = X.mean(dim=reduce_dims, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=reduce_dims, keepdim=True)

        # Training: normalize with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)

        # The moving statistics are bookkeeping, not part of the loss.
        # Updating them outside autograd keeps them from requiring grad and
        # from chaining each iteration's graph to the previous one.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var

    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var


In [2]:
class BatchNorm(nn.Module):
    """Batch-normalization layer that delegates the math to ``batch_norm``.

    Args:
        num_features: Size of the feature/channel dimension (dim 1).
        num_dims: ``2`` creates parameters of shape ``(1, num_features)``;
            any other value creates ``(1, num_features, 1, 1)``.
    """

    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)

        # Learnable scale and shift, initialized to 1 and 0 respectively.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Moving statistics take no gradient. Registering them as buffers
        # puts them in state_dict() and lets Module.to()/cuda()/cpu() move
        # them together with the parameters.
        self.register_buffer('moving_mean', torch.zeros(shape))
        # Start from unit variance so that eval mode on a fresh layer does
        # not divide by sqrt(eps).
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self, X):
        """Normalize ``X`` and, in training mode, update the moving statistics."""
        Y, moving_mean, moving_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)

        # Store the statistics without autograd history so that successive
        # iterations do not keep earlier graphs reachable.
        self.moving_mean = moving_mean.detach()
        self.moving_var = moving_var.detach()

        return Y

In [3]:
# Network with the hand-written BatchNorm layer inserted after each
# convolution / hidden linear layer and before its sigmoid activation.
net = nn.Sequential(
    # First convolution stage.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Second convolution stage.
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten, then the fully connected head.
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

# Data iterators.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimization settings and the training run.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.0037, train acc 0.783, test acc 0.836, time 15.5 sec
epoch 2, loss 0.4576, train acc 0.864, test acc 0.849, time 13.3 sec
epoch 3, loss 0.3671, train acc 0.879, test acc 0.854, time 13.5 sec
epoch 4, loss 0.3282, train acc 0.887, test acc 0.837, time 13.4 sec
epoch 5, loss 0.3047, train acc 0.894, test acc 0.846, time 13.2 sec


In [4]:
# Show the learned scale and shift of the first batch-norm layer (net[1]),
# each flattened to one dimension.
first_bn = net[1]
first_bn.gamma.view(-1), first_bn.beta.view(-1)

(tensor([1.1662, 1.1112, 0.9872, 1.0552, 1.0212, 1.1489], device='cuda:0',
        grad_fn=<ViewBackward>),
 tensor([ 0.2454,  0.0990,  0.0874, -0.5748,  0.4119,  0.4154], device='cuda:0',
        grad_fn=<ViewBackward>))

# PyTorch自带的BatchNorm1d和BatchNorm2d

In [5]:
# Same layer layout, but using PyTorch's built-in batch-norm modules:
# BatchNorm2d after the convolutions, BatchNorm1d after the linear layers.
net = nn.Sequential(
    # First convolution stage.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(num_features=6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Second convolution stage.
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(num_features=16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten, then the fully connected head.
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

# Data iterators.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimization settings and the training run.
lr = 0.001
num_epochs = 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9908, train acc 0.790, test acc 0.843, time 11.7 sec
epoch 2, loss 0.4552, train acc 0.864, test acc 0.833, time 10.8 sec
epoch 3, loss 0.3676, train acc 0.879, test acc 0.854, time 11.4 sec
epoch 4, loss 0.3325, train acc 0.886, test acc 0.867, time 11.0 sec
epoch 5, loss 0.3096, train acc 0.892, test acc 0.873, time 11.0 sec
