# 1. Regularization

以下两种写法在参数更新上是等价的（使用不带动量的SGD，且 weight_decay = 2 * l2_lambda 时）；区别仅在于写法一打印出的损失值包含了正则项。

In [1]:
# Approach 1: add the L2 regularization term to the loss by hand
def training_loop(n_epochs, optimizer, model, loss_fn, train_loader,
                  l2_lambda=0.001):
    """Train ``model`` with an explicit L2 penalty added to every batch loss.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer; ``zero_grad()`` and ``step()`` are called once
            per batch.
        model: Callable model that exposes ``parameters()``.
        loss_fn: Loss callable, invoked as
            ``loss_fn(input=outputs, target=labels)``.
        train_loader: Iterable yielding ``(imgs, labels)`` batches.
        l2_lambda: Coefficient of the L2 penalty ``sum(p ** 2)`` taken over
            all model parameters. Defaults to 0.001.

    After each epoch the mean per-batch loss (penalty included) is printed;
    ``nan`` is printed when the loader yielded no batches.
    """
    for epoch in range(n_epochs):
        total_loss = 0.0
        n_batches = 0
        for imgs, labels in train_loader:
            outputs = model(imgs)
            loss = loss_fn(input=outputs, target=labels)

            # Regularization: squared L2 norm over every parameter tensor.
            l2_norm = sum(p.pow(2).sum() for p in model.parameters())
            # Out-of-place add so the tensor returned by loss_fn is not mutated.
            loss = loss + l2_lambda * l2_norm

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        # Count batches instead of calling len(train_loader): this works for
        # loaders without __len__ and avoids dividing by zero when empty.
        mean_loss = total_loss / n_batches if n_batches else float("nan")
        print(f"epoch: {epoch} training loss: {mean_loss}")

In [2]:
import torch
import torch.nn as nn
model = nn.Sequential(nn.Linear(3, 5))

# Approach 2: pass weight_decay when creating the optimizer.
# SGD adds weight_decay * p to each gradient, while the hand-written penalty
# l2_lambda * sum(p ** 2) contributes 2 * l2_lambda * p. The two approaches
# therefore match only when weight_decay == 2 * l2_lambda (l2_lambda = 0.001
# in approach 1).
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=2 * 0.001)

# 2. Dropout

Dropout的流程：

每一个mini-batch都在训练不同的网络：
1. 随机删除网络中一定比例的隐藏层的神经元（输入输出神经元不变）。
2. 将输入x通过修改后的网络前向传播。
3. 将得到的损失通过修改后的网络反向传播。
4. 重复这一过程。

In [3]:
# Stand-alone Dropout layer applied directly to a tensor, for demonstration.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because it is a module-level variable of this notebook.
input = torch.randn(5, 3)
dropout = nn.Dropout(p=0.1)  # p is the probability of zeroing an element
output = dropout(input)
output

tensor([[-0.9123, -0.2944, -0.4776],
        [-0.0000,  1.3038,  0.0000],
        [-0.4803,  1.6488,  0.0000],
        [ 0.0773, -0.5763, -1.1433],
        [-0.0206,  0.1642, -1.1635]])

In [4]:
# Dropout as a layer class, placed inside an nn.Sequential model.
dropout_net = nn.Sequential(
    # Convolution stage 1: 3 -> 16 channels, spatial size halved by pooling.
    nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
    nn.Tanh(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout2d(p=0.4),

    # Convolution stage 2: 16 -> 8 channels, spatial size halved again.
    nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3, padding=1),
    nn.Tanh(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout2d(p=0.4),

    nn.Flatten(),

    # Classifier head: 8 channels * 8 * 8 spatial positions for 32x32 inputs.
    nn.Linear(in_features=8 * 8 * 8, out_features=32),
    nn.Dropout(p=0.4),
    nn.Tanh(),
    nn.Linear(in_features=32, out_features=2),
)

img = torch.randn((1000, 3, 32, 32))

# Training mode: Dropout is active. train() returns the module itself,
# so the forward call can be chained onto it.
out = dropout_net.train()(img)
print(out.shape)

# Evaluation mode: Dropout is disabled.
out = dropout_net.eval()(img)
print(out.shape)

torch.Size([1000, 2])
torch.Size([1000, 2])


# 3. Batch Normalization

Batch Normalization的主要思想：对每一层送入激活函数之前的值按mini-batch做标准化，
使其满足均值为0、方差为1的分布（之后再经过可学习的缩放和平移），以缓解梯度消失。

Batch Normalization的好处：允许使用更高的学习率、
使训练过程更少依赖于初始值、自身带有一定的正则化效果（在一定程度上可以代替Dropout）。

In [5]:
# Batch Normalization is usually placed before the activation function.
bn_net = nn.Sequential(
    # Convolution stage 1: 3 -> 16 channels, normalized per channel.
    nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=16),
    nn.Tanh(),
    nn.MaxPool2d(kernel_size=2),

    # Convolution stage 2: 16 -> 8 channels, normalized per channel.
    nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3, padding=1),
    nn.BatchNorm2d(num_features=8),
    nn.Tanh(),
    nn.MaxPool2d(kernel_size=2),

    nn.Flatten(),

    # Classifier head: 8 channels * 8 * 8 spatial positions for 32x32 inputs.
    nn.Linear(in_features=8 * 8 * 8, out_features=32),
    nn.BatchNorm1d(num_features=32),
    nn.Tanh(),
    nn.Linear(in_features=32, out_features=2),
)

img = torch.randn((1000, 3, 32, 32))

# Training mode. train() returns the module itself, so the forward call
# can be chained onto it.
out = bn_net.train()(img)
print(out.shape)

# Evaluation mode: Batch Normalization follows different rules than during
# training (running statistics are used instead of the current batch's).
out = bn_net.eval()(img)
print(out.shape)

torch.Size([1000, 2])
torch.Size([1000, 2])
