# PyTorch-Lightning 实现 ResNet18

In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.nn import functional as F
import pytorch_lightning as pl
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR

## 定义残差块

在 ResNet18 中的包含两种类型的残差块：
- 输入维度和输出维度相同：当第一个卷积层的步幅设置为 1 时输入的每个通道的大小和输出相同，所以残差连接时直接将输入和输出相加即可。
- 输入维度和输出维度不同：当第一个卷积层的步幅设置为 2 时输入的每个通道的大小是输入的2倍，所以在残差连接之前需要先使用步幅为2的1x1卷积对输入进行下采样以匹配输出的大小，然后才能进行相加操作。

参考：[http://zh-v2.d2l.ai/chapter_convolutional-modern/resnet.html#id3](http://zh-v2.d2l.ai/chapter_convolutional-modern/resnet.html#id3)

In [2]:
class Residual(nn.Module):
    """残差块"""
    def __init__(self, in_channels, out_channels, strides=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        # 如果第一个卷积层使用的步幅大于1则需要使用1x1卷积对输入x进行下采样以匹配输出尺寸
        if strides > 1:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        if self.conv3:
            x = self.conv3(x)  # 1x1卷积对输入下采样
        y += x
        return F.relu(y)

## 定义ResNet18

按照原文 ResNet18 的前两层应该是在输出通道数为64、步幅为2的卷积层后接步幅为2的的最大池化层，但是考虑到 CIFAR-10 图片本身很小，所以这里将第一层替换为步幅为1的卷积层，并且取消最大池化层，以免图片信息过早丢失。

其次，为了降低过拟合风险，在最后一个全连接层之前添加了一个dropout。

In [3]:
class ResNet18(pl.LightningModule):
    def __init__(self, lr: float=1e-3, dropout: float=0.2):
        super().__init__()
        # 将超参数存储在 self.hparams 中
        self.save_hyperparameters()
        
        # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = nn.Sequential(
            Residual(64, 64), 
            Residual(64, 64)
        )
        self.layer2 = nn.Sequential(
            Residual(64, 128, strides=2),
            Residual(128, 128)
        )
        self.layer3 = nn.Sequential(
            Residual(128, 256, strides=2),
            Residual(256, 256)
        )
        self.layer4 = nn.Sequential(
            Residual(256, 512, strides=2),
            Residual(512, 512)
        )
        
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # 增加一个 dropout 降低过拟合风险
        self.dropout = nn.Dropout(self.hparams.dropout)
        self.fc = nn.Linear(512, 10)
    
    
    def forward(self, x: torch.Tensor):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        # x = self.maxpool(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        return self.fc(x)
    
    
    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self(X)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss)
        return loss
    
    
    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self(X)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).to(torch.float).mean()
        self.log("val_loss", loss)
        self.log("val_acc", acc)
    
    
    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=self.hparams.lr)
        scheduler = MultiStepLR(optimizer, milestones=[10, 20], gamma=0.1)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

## 加载数据集

加载 CIFAR-10 训练集和验证集，对训练集使用 随机裁剪和水平翻转进行数据增强。

In [4]:
train_transforms = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


train_data = datasets.CIFAR10(
    root=".",
    train=True,
    download=True,
    transform=train_transforms
)
val_data = datasets.CIFAR10(
    root=".",
    train=False,
    download=True,
    transform=test_transforms
)
train_dataloader = DataLoader(train_data, batch_size=64, num_workers=12)
val_dataloader = DataLoader(val_data, batch_size=64, num_workers=12)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
X, y = next(iter(train_dataloader))
X.size()

torch.Size([64, 3, 32, 32])

In [6]:
resnet18 = ResNet18()
logits = resnet18(X)
logits.size()

torch.Size([64, 10])

In [7]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')

trainer = pl.Trainer(max_epochs=30, accelerator="gpu", devices=[1], callbacks=[lr_monitor])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
trainer.fit(resnet18, train_dataloader, val_dataloader)

Missing logger folder: /home/wh/workspace/pytorch-lightning/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name    | Type              | Params
----------------------------------------------
0 | conv1   | Conv2d            | 1.8 K 
1 | bn1     | BatchNorm2d       | 128   
2 | relu    | ReLU              | 0     
3 | layer1  | Sequential        | 148 K 
4 | layer2  | Sequential        | 525 K 
5 | layer3  | Sequential        | 2.1 M 
6 | layer4  | Sequential        | 8.4 M 
7 | avgpool | AdaptiveAvgPool2d | 0     
8 | dropout | Dropout           | 0     
9 | fc      | Linear            | 5.1 K 
----------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

训练过程中在命令行使用`tensorboard --logdir lightning_logs` 启动 TensorBoard 查看训练日志

In [9]:
trainer.validate(resnet18, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.9111999869346619
        val_loss            0.32236620783805847
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.32236620783805847, 'val_acc': 0.9111999869346619}]

最终在 CIFAR-10 验证集上得到了 91.11% 的准确率。