In [30]:
import torch
from torch import nn
from torch.utils import data
import torchvision
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
import time
from IPython import display
import torch.nn.functional as F

## 训练深层网络

## 批量规范化层

## 从零实现

In [31]:
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Apply batch normalization to X and return updated moving statistics.

    When autograd is enabled the function normalizes X with the statistics
    of the current batch and blends those statistics into the moving
    averages. When autograd is disabled it normalizes X with the supplied
    moving statistics and leaves them unchanged.

    Args:
        X: Input tensor. With autograd enabled it must be 2-D (statistics
            are taken over dim 0) or 4-D (statistics are taken over dims
            0, 2 and 3, keeping dim 1).
        gamma: Scale applied to the normalized values.
        beta: Shift added after scaling.
        moving_mean: Moving average of the mean.
        moving_var: Moving average of the variance.
        eps: Constant added to the variance before the square root.
        momentum: Weight given to the previous moving statistic; the batch
            statistic receives ``1.0 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)`` where the two statistics
        are returned through ``.data`` (detached from the autograd graph).

    Raises:
        ValueError: If autograd is enabled and X is neither 2-D nor 4-D.
    """
    if torch.is_grad_enabled():
        rank = len(X.shape)
        if rank == 2:
            # One mean/variance per column (feature).
            batch_mean = X.mean(dim=0)
            batch_var = ((X - batch_mean) ** 2).mean(dim=0)
        elif rank == 4:
            # One mean/variance per index of dim 1; keepdim so the result
            # broadcasts against X.
            batch_mean = X.mean(dim=(0, 2, 3), keepdim=True)
            batch_var = ((X - batch_mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        else:
            raise ValueError(f"Invalid X shape dims: {rank}")
        X_hat = (X - batch_mean) / torch.sqrt(batch_var + eps)
        # Exponential moving average of the batch statistics.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * batch_mean
        moving_var = momentum * moving_var + (1.0 - momentum) * batch_var
    else:
        # Autograd is off: normalize with the accumulated statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    scaled = gamma * X_hat + beta
    return scaled, moving_mean.data, moving_var.data

In [32]:
"""example
X = torch.arange(24, dtype=torch.float32).reshape(2, 2, 2, 3)
X, X.mean(dim=(0, 2, 3), keepdim=True), X.mean(dim=(0, 2, 3), keepdim=True).shape
"""
"""output
(tensor([[[[ 0.,  1.,  2.],
           [ 3.,  4.,  5.]],
 
          [[ 6.,  7.,  8.],
           [ 9., 10., 11.]]],
 
 
         [[[12., 13., 14.],
           [15., 16., 17.]],
 
          [[18., 19., 20.],
           [21., 22., 23.]]]]),
 tensor([[[[ 8.5000]],
 
          [[14.5000]]]]),
 torch.Size([1, 2, 1, 1]))
"""

'output\n(tensor([[[[ 0.,  1.,  2.],\n           [ 3.,  4.,  5.]],\n \n          [[ 6.,  7.,  8.],\n           [ 9., 10., 11.]]],\n \n \n         [[[12., 13., 14.],\n           [15., 16., 17.]],\n \n          [[18., 19., 20.],\n           [21., 22., 23.]]]]),\n tensor([[[[ 8.5000]],\n \n          [[14.5000]]]]),\n torch.Size([1, 2, 1, 1]))\n'

In [33]:
class BatchNorm(nn.Module):
    """Batch normalization layer built on the ``batch_norm`` function.

    Args:
        num_features: Number of features (2-D input) or channels (4-D input).
        num_dims: 2 selects parameter shape ``(1, num_features)``; any other
            value selects ``(1, num_features, 1, 1)``.

    Attributes:
        gamma: Learnable scale, initialized to ones.
        beta: Learnable shift, initialized to zeros.
        moving_mean: Buffer holding the moving mean, initialized to zeros.
        moving_var: Buffer holding the moving variance, initialized to ones.
    """
    def __init__(self, num_features, num_dims):
        super().__init__()
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        # Learnable affine parameters, updated by the optimizer.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Registered as buffers (not plain attributes) so the statistics are
        # included in state_dict() and follow the module through .to()/.cuda()
        # without being treated as trainable parameters.
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, X):
        """Normalize X and store the moving statistics returned by batch_norm."""
        # Assigning a tensor to a registered buffer name replaces the buffer,
        # so the updated statistics stay part of the module state.
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9)
        return Y

## 使用批量规范化的LeNet

In [34]:
net = nn.Sequential(
    # Convolution block 1: conv -> batch norm -> sigmoid -> average pooling.
    nn.Conv2d(1, 6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Convolution block 2.
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Flatten the feature maps before the fully connected layers.
    nn.Flatten(),
    # Fully connected block 1.
    nn.Linear(16*4*4, 120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    # Fully connected block 2.
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    # Output layer with 10 outputs.
    nn.Linear(84, 10),
)

In [35]:
"""example
X = torch.randn(size=(1,1,28,28), dtype=torch.float32)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, f'output shape: \t{X.shape}')
"""
"""output
Conv2d output shape: 	torch.Size([1, 6, 24, 24])
BatchNorm output shape: 	torch.Size([1, 6, 24, 24])
Sigmoid output shape: 	torch.Size([1, 6, 24, 24])
AvgPool2d output shape: 	torch.Size([1, 6, 12, 12])
Conv2d output shape: 	torch.Size([1, 16, 8, 8])
BatchNorm output shape: 	torch.Size([1, 16, 8, 8])
Sigmoid output shape: 	torch.Size([1, 16, 8, 8])
AvgPool2d output shape: 	torch.Size([1, 16, 4, 4])
Flatten output shape: 	torch.Size([1, 256])
Linear output shape: 	torch.Size([1, 120])
BatchNorm output shape: 	torch.Size([1, 120])
Sigmoid output shape: 	torch.Size([1, 120])
Linear output shape: 	torch.Size([1, 84])
BatchNorm output shape: 	torch.Size([1, 84])
Sigmoid output shape: 	torch.Size([1, 84])
Linear output shape: 	torch.Size([1, 10])
"""

'output\nConv2d output shape: \ttorch.Size([1, 6, 24, 24])\nBatchNorm output shape: \ttorch.Size([1, 6, 24, 24])\nSigmoid output shape: \ttorch.Size([1, 6, 24, 24])\nAvgPool2d output shape: \ttorch.Size([1, 6, 12, 12])\nConv2d output shape: \ttorch.Size([1, 16, 8, 8])\nBatchNorm output shape: \ttorch.Size([1, 16, 8, 8])\nSigmoid output shape: \ttorch.Size([1, 16, 8, 8])\nAvgPool2d output shape: \ttorch.Size([1, 16, 4, 4])\nFlatten output shape: \ttorch.Size([1, 256])\nLinear output shape: \ttorch.Size([1, 120])\nBatchNorm output shape: \ttorch.Size([1, 120])\nSigmoid output shape: \ttorch.Size([1, 120])\nLinear output shape: \ttorch.Size([1, 84])\nBatchNorm output shape: \ttorch.Size([1, 84])\nSigmoid output shape: \ttorch.Size([1, 84])\nLinear output shape: \ttorch.Size([1, 10])\n'

In [36]:
def load_data_fashion_mnist(batch_size, resize=None):
    """Download Fashion-MNIST and wrap both splits in data loaders.

    Args:
        batch_size: Batch size passed to both data loaders.
        resize: Optional target size; when truthy a ``transforms.Resize``
            step is applied before the tensor conversion.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader
        shuffles its data; the test loader does not.
    """
    if resize:
        steps = [transforms.Resize(resize), transforms.ToTensor()]
    else:
        steps = [transforms.ToTensor()]
    pipeline = transforms.Compose(steps)
    # Build the training split first, then the test split.
    train_set, test_set = (
        torchvision.datasets.FashionMNIST(
            root="../data", train=is_train, transform=pipeline, download=True
        )
        for is_train in (True, False)
    )
    train_loader = data.DataLoader(train_set, batch_size, shuffle=True, num_workers=4)
    test_loader = data.DataLoader(test_set, batch_size, shuffle=False, num_workers=4)
    return (train_loader, test_loader)

In [37]:
# Batch size handed to load_data_fashion_mnist for both returned loaders.
batch_size = 128
train_iter, test_iter = load_data_fashion_mnist(batch_size)

In [38]:
def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    Args:
        y_hat: Predictions. If it has more than one dimension and more than
            one column, the argmax over dim 1 is used as the predicted
            class; otherwise it is compared to ``y`` directly.
        y: Ground-truth labels.

    Returns:
        The count (not the ratio) of entries where the prediction equals
        the label.
    """
    has_class_scores = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(axis=1) if has_class_scores else y_hat
    # Cast to the label dtype so the comparison is done on matching types.
    hits = predicted.type(y.dtype) == y
    num_correct = hits.type(y.dtype).sum()
    return float(num_correct)

In [39]:
class Accumulator:
    """Keeps a list of running float totals."""
    def __init__(self, n):
        # Start with n totals, all zero.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument (converted to float) to the matching total."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every total back to zero, keeping the current length."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, i):
        """Return the total stored at index i."""
        return self.data[i]

In [40]:
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of ``net`` over all batches of ``data_iter``.

    Args:
        net: Model to evaluate. If it is an ``nn.Module`` it is switched to
            evaluation mode, and its first parameter's device is used when
            ``device`` is not given.
        data_iter: Iterable yielding ``(X, y)`` batches; ``X`` may be a
            tensor or a list of tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    if isinstance(net, nn.Module):
        net.eval()
        if not device:
            device = next(net.parameters()).device
    # Slot 0: correct predictions, slot 1: number of labels seen.
    totals = Accumulator(2)
    with torch.no_grad():
        for features, labels in data_iter:
            if isinstance(features, list):
                features = [part.to(device) for part in features]
            else:
                features = features.to(device)
            labels = labels.to(device)
            totals.add(accuracy(net(features), labels), labels.numel())
    num_correct, num_seen = totals[0], totals[1]
    return num_correct / num_seen
            

In [41]:
class Timer:
    """Records the duration of multiple timed runs."""
    def __init__(self):
        self.times = []
        # The timer starts running as soon as it is created.
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer, record the elapsed time and return it."""
        elapsed = time.time() - self.tik
        self.times.append(elapsed)
        return self.times[-1]

    def avg(self):
        """Return the mean of the recorded times."""
        total = sum(self.times)
        return total / len(self.times)

    def sum(self):
        """Return the total of the recorded times."""
        return sum(self.times)

    def cumsum(self):
        """Return the running totals of the recorded times as a list."""
        running = np.cumsum(self.times)
        return running.tolist()

In [42]:
class Animator:
    """Incrementally plots several curves on the current matplotlib axes."""
    def __init__(self, xlabel=None, ylabel=None, xlim=None, ylim=None,
                 xscale='linear', yscale='linear', legend=None,
                 fmts=('-', 'm--', 'g-.', 'r:')):
        if legend is None:
            legend = []
        self.fig = plt.gcf()
        self.axes = plt.gca()

        # Closure that captures the axis settings so add() can re-apply them
        # after every redraw.
        def configure():
            return self.set_axes(xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

        self.config_axes = configure
        self.X = None
        self.Y = None
        self.fmts = fmts

    def set_axes(self, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
        """Apply labels, limits, scales, legend and grid to the axes."""
        axes = self.axes
        axes.set_xlabel(xlabel)
        axes.set_ylabel(ylabel)
        axes.set_xlim(xlim)
        axes.set_ylim(ylim)
        axes.set_xscale(xscale)
        axes.set_yscale(yscale)
        axes.legend(legend)
        axes.grid()

    def add(self, x, y):
        """Append one point per curve and redraw the figure.

        ``y`` may be a single value or a sequence with one value per curve;
        a scalar ``x`` is repeated for every curve. Pairs in which either
        coordinate is None are skipped.
        """
        if not hasattr(y, "__len__"):
            y = [y]
        if not hasattr(x, "__len__"):
            x = [x] * len(y)
        # Create one empty series per curve on the first call.
        if self.X is None:
            self.X = [[] for _ in range(len(y))]
        if self.Y is None:
            self.Y = [[] for _ in range(len(y))]
        for series, (x_val, y_val) in enumerate(zip(x, y)):
            if x_val is None or y_val is None:
                continue
            self.X[series].append(x_val)
            self.Y[series].append(y_val)
        # Clear and redraw every curve, then restore the axis settings.
        self.axes.cla()
        for xs, ys, fmt in zip(self.X, self.Y, self.fmts):
            self.axes.plot(xs, ys, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)

In [43]:
def train(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD and cross-entropy loss, plotting progress.

    Args:
        net: Model to train. Its ``nn.Linear`` and ``nn.Conv2d`` weights are
            re-initialized with Xavier uniform initialization.
        train_iter: Iterable of ``(X, y)`` training batches; must support
            ``len()``.
        test_iter: Iterable of ``(X, y)`` batches used for the test accuracy.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for ``torch.optim.SGD``.
        device: Device the model and the batches are moved to.
    """
    def init_weights(m):
        if type(m) in (nn.Linear, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs],
                        legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = Timer(), len(train_iter)
    # Plot roughly five training points per epoch; clamp to 1 so that fewer
    # than five batches does not produce a modulo-by-zero.
    plot_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Sum of training loss, number of correct predictions, number of examples.
        metric = Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l*X.shape[0], accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i+1) % plot_every == 0 or i == num_batches-1:
                animator.add(epoch+(i+1)/num_batches, (train_l, train_acc, None))
        # Evaluate once per epoch, after the batch loop. Doing this inside the
        # batch loop would run a full test pass after every mini-batch and,
        # because evaluation calls net.eval(), leave the model in evaluation
        # mode for the remaining training batches of the epoch.
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch+1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2]*num_epochs/timer.sum():.1f} examples/sec '
          f'on {str(device)}')

In [None]:
# Learning rate and epoch count for this run; training is done on the CPU.
lr, num_epochs = 1.0, 10
train(net, train_iter, test_iter, num_epochs, lr, torch.device('cpu'))

## 简洁实现

In [46]:
net = nn.Sequential(
    # Convolution block 1: conv -> batch norm -> sigmoid -> average pooling.
    nn.Conv2d(1, 6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Convolution block 2.
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Flatten the feature maps before the fully connected layers.
    nn.Flatten(),
    # Fully connected block 1.
    nn.Linear(16*4*4, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    # Fully connected block 2.
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    # Output layer with 10 outputs.
    nn.Linear(84, 10),
)

In [None]:
# Same learning rate, epoch count and CPU device as the earlier training call.
lr, num_epochs = 1.0, 10
train(net, train_iter, test_iter, num_epochs, lr, torch.device('cpu'))

## 争议

## Exercises

1. 对于后面紧跟批量规范化层的全连接层或卷积层，我们是否可以删除其偏置参数？为什么？

2. 比较LeNet在使用和不使用批量规范化情况下的学习率
    - 绘制训练和测试准确度的提高
    - 学习率最高可以设置到多大？

3. 我们是否需要在每个层中进行批量规范化？尝试一下？

4. 可以通过批量规范化来替换暂退法吗？行为会如何改变？

5. 固定参数beta和gamma（不让它们参与训练），并观察和分析结果

6. 查看高级API中有关BatchNorm的在线文档，以查看其他批量规范化的应用

7. 研究思路：可以应用的其他“规范化”转换？可以应用概率积分变换吗？全秩协方差估计可以么？