In [1]:
import d2lzh as d2l
import torch
from torch.nn import init
from torch import nn
from torch.nn import functional as F
from torch import optim

class Residual(nn.Module):
    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return F.relu(Y + X)

In [2]:
block = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
block(X).shape

torch.Size([4, 3, 6, 6])

In [3]:
block = Residual(3, 6, use_1x1conv=True, stride=2)
block(X).shape

torch.Size([4, 6, 3, 3])

In [4]:
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(3, stride=2, padding=1)
)

In [5]:
def resnet_block(in_channels, out_channels, num_residual, first_block=False):
    block = nn.Sequential()
    for i in range(num_residual):
        if i == 0 and not first_block:
            block.add_module('residual-%d' % i, Residual(in_channels, out_channels, use_1x1conv=True, stride=2))
        else:
            block.add_module('residual-%d' % i, Residual(out_channels, out_channels))
    return block

In [6]:
net.add_module('block-1', resnet_block(64, 64, 2, first_block=True))
net.add_module('block-2', resnet_block(64, 128, 2))
net.add_module('block-3', resnet_block(128, 256, 2))
net.add_module('block-4', resnet_block(256, 512, 2))

In [7]:
net.add_module('avgpool', d2l.GlobalAvgPool2d())
net.add_module('flatten', d2l.FlattenLayer())
net.add_module('fc', nn.Linear(512, 10))

In [8]:
X = torch.rand((1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print('output shape:\t', X.shape)

output shape:	 torch.Size([1, 64, 112, 112])
output shape:	 torch.Size([1, 64, 112, 112])
output shape:	 torch.Size([1, 64, 112, 112])
output shape:	 torch.Size([1, 64, 56, 56])
output shape:	 torch.Size([1, 64, 56, 56])
output shape:	 torch.Size([1, 128, 28, 28])
output shape:	 torch.Size([1, 256, 14, 14])
output shape:	 torch.Size([1, 512, 7, 7])
output shape:	 torch.Size([1, 512, 1, 1])
output shape:	 torch.Size([1, 512])
output shape:	 torch.Size([1, 10])


In [9]:
lr, num_epochs, batch_size = 0.05, 5, 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'
for name, layer in net.named_modules():
    if type(layer) in (nn.Conv2d, nn.Linear):
        init.xavier_normal_(layer.weight)
optimizer = optim.SGD(net.parameters(), lr=lr)
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on cuda
epoch 1, loss 0.0021, train acc 0.815, test acc 0.862, time 21.0 sec
epoch 2, loss 0.0012, train acc 0.890, test acc 0.872, time 20.4 sec
epoch 3, loss 0.0010, train acc 0.909, test acc 0.888, time 20.5 sec
epoch 4, loss 0.0009, train acc 0.923, test acc 0.881, time 20.5 sec
epoch 5, loss 0.0008, train acc 0.931, test acc 0.913, time 20.5 sec
