# 卷积层
图像卷积

## 互相关运算
输出矩阵的形状为（输入形状为 $n_h\times n_w$，卷积核形状为 $k_h\times k_w$）

$(n_h-k_h+1)\times (n_w-k_w+1)$

In [1]:
import torch 
from torch import nn
from d2l import torch as d2l 

In [2]:
def corr2d(X, K):
    """Compute the 2-D cross-correlation of X with the kernel K.

    The kernel is slid over X one position at a time (no padding); each
    output element is the sum of the element-wise product between K and the
    window of X it currently covers.

    Args:
        X: Input tensor, indexed as a 2-D matrix.
        K: 2-D kernel tensor.

    Returns:
        A tensor created with torch.zeros of shape
        (X.shape[0] - h + 1, X.shape[1] - w + 1), where (h, w) = K.shape.
    """
    k_rows, k_cols = K.shape
    out_rows = X.shape[0] - k_rows + 1
    out_cols = X.shape[1] - k_cols + 1
    Y = torch.zeros(out_rows, out_cols)
    for r in range(out_rows):
        for c in range(out_cols):
            # Window of X currently covered by the kernel.
            window = X[r:r + k_rows, c:c + k_cols]
            Y[r, c] = (window * K).sum()
    return Y

In [3]:
# 验证互相关运算
X = torch.tensor([[0.0,1.0,2.0],[3.0,4.0,5.0],[6.0,7.0,8.0]])
K = torch.tensor([[0.0,1.0],[2.0,3.0]])
corr2d(X,K)

tensor([[19., 25.],
        [37., 43.]])

## 卷积层
1. 进行输入与卷积核的互相关运算
2. 加上偏置项
3. 得到最后输出

In [4]:
class Conv2D(nn.Module):
    """Minimal 2-D convolutional layer built on top of corr2d.

    The forward pass cross-correlates the input with a learnable kernel and
    then adds a learnable scalar bias.
    """

    def __init__(self, kernel_size):
        """Create the learnable kernel and bias.

        Args:
            kernel_size: Shape handed to torch.rand for the kernel,
                e.g. (height, width).
        """
        super().__init__()
        # Kernel weights start as random samples from torch.rand; the scalar
        # bias starts at zero.
        initial_kernel = torch.rand(kernel_size)
        initial_bias = torch.zeros(1)
        self.weight = nn.Parameter(initial_kernel)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, X):
        """Return corr2d(X, weight) with the bias added to every element."""
        response = corr2d(X, self.weight)
        return response + self.bias

# 图像中目标的边缘检测
卷积层的一个简单应用

In [5]:
X = torch.ones(6,8)
X[:,2:6] = 0
X

tensor([[1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.]])

构造一个 $1\times 2$ 的卷积核，进行互相关运算时，若是水平相邻的两个元素相同，则得到结果为0，否则非0

In [6]:
K = torch.tensor([[1.0,-1.0]])

结果中 $1$ 代表从白到黑的边缘（白色的像素值更大，相减结果为正），$-1$ 代表从黑到白的边缘，其他位置为 $0$

In [7]:
Y = corr2d(X,K)
Y

tensor([[ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.]])

将输入转置，再进行互相关运算，此时原来的垂直边缘变成了水平边缘，而该卷积核只能检测垂直边缘（即水平方向上相邻元素的变化），无法检测水平边缘，因此结果全为0

In [8]:
corr2d(X.t(),K)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

# 学习卷积核
对于更加复杂的任务，难以手动指定卷积核，需要通过学习训练的方法。
1. 初始化卷积核权重
2. 更新卷积核参数
   1. 先前向传播得到输出结果
   2. 计算结果与真实 $\mathbf{Y}$ 之间的平方误差（代码中对所有元素求和，而非取平均的 `MSE`）
   3. 反向传播，利用梯度对卷积核参数进行更新

（忽略偏置）

In [9]:
# Build a 2-D convolutional layer with 1 input channel, 1 output channel and
# a kernel of shape (1, 2). The bias is left out (bias=False).
conv2d = nn.Conv2d(1,1,kernel_size=(1,2),bias=False)

# nn.Conv2d works on four-dimensional inputs and outputs laid out as
# (batch_size, channel, height, width).
X = X.reshape(1,1,6,8)
Y = Y.reshape(1,1,6,7)
lr = 3e-2  # learning rate

for i in range(10):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2  # element-wise squared error
    loss = l.sum()
    conv2d.zero_grad()
    loss.backward()
    # Plain gradient-descent step. Doing the in-place update under
    # torch.no_grad() keeps it out of the autograd graph without reaching
    # through `.data`, which bypasses autograd's bookkeeping.
    with torch.no_grad():
        conv2d.weight -= lr * conv2d.weight.grad

    if (i + 1) % 2 == 0:
        print(f"epoch {i+1}, loss {loss:.3f}")

epoch 2, loss 12.299
epoch 4, loss 2.172
epoch 6, loss 0.409
epoch 8, loss 0.087
epoch 10, loss 0.022


查看当前卷积核权重

In [10]:
conv2d.weight.data.reshape(1,2)

tensor([[ 0.9937, -0.9705]])

这与之前手动定义的卷积核很接近

# 填充

In [11]:
def comp_conv2d(conv2d, X):
    """Run a convolution layer on an un-batched input and strip the extra dims.

    Args:
        conv2d: Callable applied to the reshaped input (e.g. an nn.Conv2d).
        X: Input tensor; two leading dimensions of size 1 are prepended
            before the call.

    Returns:
        The layer output reshaped to its shape without the first two
        dimensions.
    """
    # Prepend the batch-size and channel dimensions, both of size 1.
    batched = X.reshape(1, 1, *X.shape)
    out = conv2d(batched)
    # Drop the leading batch-size and channel dimensions again.
    spatial_shape = out.shape[2:]
    return out.reshape(spatial_shape)

# 这里的padding=1指每边都填充了1行或1列，因此总共添加了2行2列
conv2d = nn.Conv2d(1,1,kernel_size=3,padding=1)
X = torch.rand(size=(8,8))
comp_conv2d(conv2d,X)

tensor([[ 0.3996,  0.3685,  0.5199,  0.3312,  0.4455,  0.3291,  0.6515,  0.7110],
        [ 0.6548,  0.7531,  0.9232,  0.6899,  0.5685,  0.3645,  1.0758,  0.7272],
        [ 0.7727,  0.6601,  0.8418,  0.5716,  0.6020,  0.7242,  0.8847,  0.4896],
        [ 0.7219,  0.6256,  0.7898,  0.4810,  0.7979,  0.9729,  1.0473,  0.5666],
        [ 0.9236,  0.3687,  0.5537,  0.6873,  0.4901,  0.3966,  0.9801,  0.6681],
        [ 0.6594,  0.2367,  0.8908,  0.9591,  0.7761,  0.7799,  0.5310,  0.2995],
        [ 0.5597,  0.7099,  0.7016,  0.7472,  0.7089,  0.9325,  0.9890,  0.3391],
        [ 0.5228,  0.3182,  0.3597,  0.5453,  0.0543,  0.4066,  0.4735, -0.0270]],
       grad_fn=<ReshapeAliasBackward0>)

卷积核高度和宽度不同时，可以填充不同的高度和宽度，使输入和输出具有相同形状

In [12]:
conv2d = nn.Conv2d(1,1,kernel_size=(5,3),padding=(2,1))
comp_conv2d(conv2d,X)

tensor([[ 0.1131,  0.2865,  0.1526,  0.5786,  0.3448,  0.3066, -0.0010,  0.4695],
        [ 0.1016,  0.6861,  0.2926,  0.4483,  0.3911,  0.6994,  0.2409,  0.3696],
        [ 0.1683,  0.8271,  0.3489,  0.5280,  0.2540,  0.6718,  0.9124,  0.4339],
        [ 0.1209,  0.7737,  0.5261,  0.6060,  0.3929,  0.4872,  0.7636,  0.6328],
        [ 0.1418,  0.7242,  0.4915,  0.4857,  0.8019,  0.5707,  0.6731,  0.6340],
        [ 0.3633,  0.3315,  0.5328,  0.6310,  0.6902,  0.5789,  0.5479,  0.7128],
        [ 0.2413,  0.4409,  0.5399,  0.4978,  0.7804,  0.6827,  0.3848,  0.4191],
        [ 0.3089,  0.4131,  0.5471,  0.4439,  0.5613,  0.6888,  0.4876,  0.3602]],
       grad_fn=<ReshapeAliasBackward0>)

# 步幅
- 高效计算
- 缩减采样次数


In [13]:
conv2d = nn.Conv2d(1,1,kernel_size=3,padding=1,stride=2)
comp_conv2d(conv2d,X).shape

torch.Size([4, 4])

In [14]:
conv2d = nn.Conv2d(1,1,kernel_size=(3,5),padding=(0,1),stride=(3,4))
comp_conv2d(conv2d,X)

tensor([[-0.3122, -0.0341],
        [-0.4073, -0.4644]], grad_fn=<ReshapeAliasBackward0>)

# 多输入通道

In [15]:
def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    X and K are iterated in lockstep along their first dimension; each
    (input channel, kernel channel) pair is passed to d2l.corr2d and the
    per-channel results are added together.

    Args:
        X: Input whose first dimension indexes the input channels.
        K: Kernel whose first dimension indexes the input channels.

    Returns:
        The sum of the per-channel cross-correlation results.
    """
    total = 0
    for channel, kernel in zip(X, K):
        total = total + d2l.corr2d(channel, kernel)
    return total

In [16]:
X = torch.tensor([[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]],
               [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]])
K = torch.tensor([[[0.0, 1.0], [2.0, 3.0]], [[1.0, 2.0], [3.0, 4.0]]])

corr2d_multi_in(X,K)
#X.shape

tensor([[ 56.,  72.],
        [104., 120.]])

# 多输出通道


为每个输出通道建立 $c_i\times k_h\times k_w$ 的卷积核，这样卷积核就是四维张量 $c_o\times c_i\times k_h\times k_w$ ，在互相关运算中，每个输出通道先获取所有输入通道，再以对应该通道的卷积核计算出结果

In [17]:
def corr2d_multi_in_out(X, K):
    """Cross-correlate a multi-channel input with one kernel per output channel.

    Args:
        X: Multi-channel input, passed unchanged to corr2d_multi_in.
        K: Kernel whose first dimension indexes the output channels; each
            slice along it is used as the kernel for one output channel.

    Returns:
        The per-output-channel results stacked along a new first dimension.
    """
    per_output_channel = []
    for kernel in K:
        per_output_channel.append(corr2d_multi_in(X, kernel))
    return torch.stack(per_output_channel, dim=0)

通过将卷积核 $K$、$K+1$、$K+2$ 堆叠起来，构成了具有三个输出通道的卷积核

In [18]:
K = torch.stack((K,K+1,K+2),0)
K.shape

torch.Size([3, 2, 2, 2])

In [19]:
corr2d_multi_in_out(X,K)

tensor([[[ 56.,  72.],
         [104., 120.]],

        [[ 76., 100.],
         [148., 172.]],

        [[ 96., 128.],
         [192., 224.]]])

# $1\times 1$ 卷积层
- 改变输出通道数，输出的高和宽不变（每个输出元素是同一位置上各输入通道元素的线性组合）
- 等价于在每个像素位置上应用的全连接层，没有实现卷积的“提取相邻像素的相关特征”的作用

In [20]:
def corr2d_multi_in_out_11(X, K):
    """Apply 1x1 kernels to a multi-channel input with one matrix product.

    Args:
        X: Input tensor of shape (c_i, h, w).
        K: Kernel tensor whose first dimension is c_o and which can be
            reshaped to (c_o, c_i), e.g. shape (c_o, c_i, 1, 1).

    Returns:
        A tensor of shape (c_o, h, w).
    """
    in_channels, height, width = X.shape
    out_channels = K.shape[0]
    # One row per input channel, one column per pixel: (c_i, h*w).
    pixels = X.reshape(in_channels, height * width)
    # The kernels collapse to a (c_o, c_i) weight matrix.
    weights = K.reshape(out_channels, in_channels)
    # Fully-connected-style product: the columns of `weights` line up with
    # the rows of `pixels`, giving a (c_o, h*w) result.
    mixed = weights @ pixels
    return mixed.reshape(out_channels, height, width)

In [21]:
X = torch.normal(0,1,(3,3,3))
K = torch.normal(0,1,(2,3,1,1))

In [22]:
Y1 = corr2d_multi_in_out_11(X,K)
Y2 = corr2d_multi_in_out(X,K)
assert float(torch.abs(Y1 - Y2).sum()) < 1e-6
Y1.shape,Y2.shape

(torch.Size([2, 3, 3]), torch.Size([2, 3, 3]))

# 汇聚层 `Pooling`
作用：
- 降低卷积层对位置的敏感性
- 对特征表示进行空间降采样（缩小特征图的高和宽）

In [23]:
def pool2d(X, pool_size, mode="max"):
    """Apply 2-D pooling to X with stride 1 and no padding.

    Args:
        X: Input tensor, indexed as a 2-D matrix.
        pool_size: Pair (p_h, p_w) giving the pooling window's height and
            width.
        mode: "max" for maximum pooling or "mean" for average pooling.

    Returns:
        A tensor created with torch.zeros of shape
        (X.shape[0] - p_h + 1, X.shape[1] - p_w + 1) holding the pooled
        value of every window.

    Raises:
        ValueError: If mode is neither "max" nor "mean".
    """
    # Reject unknown modes up front; otherwise neither branch below would
    # run and the function would silently return an all-zero tensor.
    if mode not in ("max", "mean"):
        raise ValueError(
            f"unsupported pooling mode {mode!r}; expected 'max' or 'mean'"
        )
    p_h, p_w = pool_size
    Y = torch.zeros(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            window = X[i:i + p_h, j:j + p_w]
            Y[i, j] = window.max() if mode == "max" else window.mean()
    return Y

In [25]:
X = torch.tensor([[0.0,1.0,2.0],[3.0,4.0,5.0],[6.0,7.0,8.0]])
pool2d(X,(2,2))

tensor([[4., 5.],
        [7., 8.]])

In [27]:
pool2d(X,(2,2),mode="mean")

tensor([[2., 3.],
        [5., 6.]])

## [填充与步幅]
改变输出形状

In [48]:
# X的样本数和通道数都为1
X = torch.arange(16,dtype=torch.float32).reshape(1,1,4,4)
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]]]])

使用内置的二维最大汇聚层，演示填充与步幅的使用

In [49]:
# Built-in max pooling with a 3x3 window. With no stride given,
# nn.MaxPool2d uses a stride equal to the window size.
# NOTE: this rebinds the name `pool2d`, shadowing the hand-written
# pool2d(X, pool_size, mode) function defined earlier in this notebook;
# re-run that definition before calling the function form again.
pool2d = nn.MaxPool2d(3)
pool2d(X)

tensor([[[[10.]]]])

In [50]:
pool2d = nn.MaxPool2d((2,3),stride=(2,3),padding=(0,1))
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

## [多个通道]

汇聚层对每个通道单独运算，不求和，输出通道数与输入通道数相同

在通道维度上连接X与X+1

In [51]:
X = torch.cat((X,X+1),1)
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])

In [52]:
X.shape

torch.Size([1, 2, 4, 4])

In [53]:
pool2d = nn.MaxPool2d(3,padding=1,stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])