In [1]:
%matplotlib inline
from mxnet import autograd, nd
from mxnet.gluon import nn
from mxnet import gluon
from mxnet import init
import common as comm

#### 5.1 卷积神经网络
卷积神经网络均使用最常见的二维卷积层。它有高和宽两个空间维度，常用来处理图像数据。
###### 二维互相关运算
在二维互相关运算中，卷积窗口从输入数组的最左上方开始，按从左往右、从上往下的顺序，依次在输入数组上滑动。当卷积窗口滑动到某一位置时，窗口中的输入子数组与核数组按元素相乘并求和，得到输出数组中相应位置的元素。

In [2]:
def corr2d(X, K):
    """Compute the 2-D cross-correlation of input ``X`` with kernel ``K``.

    The kernel window slides over ``X`` left-to-right, top-to-bottom; each
    output element is the sum of the element-wise product of the window and
    the kernel.

    Parameters
    ----------
    X : 2-D array of shape (n_h, n_w)
    K : 2-D array of shape (k_h, k_w)

    Returns
    -------
    2-D array of shape (n_h - k_h + 1, n_w - k_w + 1)
    """
    k_rows, k_cols = K.shape
    out_rows = X.shape[0] - k_rows + 1
    out_cols = X.shape[1] - k_cols + 1
    Y = nd.zeros((out_rows, out_cols))
    for r in range(out_rows):
        for c in range(out_cols):
            window = X[r:r + k_rows, c:c + k_cols]
            Y[r, c] = (window * K).sum()
    return Y

In [3]:
# Worked example: a 3x3 input and a 2x2 kernel give a 2x2 cross-correlation output.
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
K = nd.array([[0, 1], [2, 3]])
corr2d(X, K)


[[19. 25.]
 [37. 43.]]
<NDArray 2x2 @cpu(0)>

#### 二维卷积层

In [4]:
class Conv2D(nn.Block):
    """Minimal 2-D convolution layer built on ``corr2d``.

    Holds a ``weight`` parameter of shape ``kernel_size`` and a scalar
    ``bias`` parameter of shape ``(1,)``.

    Parameters
    ----------
    kernel_size : tuple
        Shape of the kernel, passed as the ``shape`` of the weight parameter.
    **kwargs
        Forwarded unchanged to ``nn.Block.__init__``.
    """

    def __init__(self, kernel_size, **kwargs):
        super(Conv2D, self).__init__(**kwargs)
        self.weight = self.params.get('weight', shape=kernel_size)
        self.bias = self.params.get('bias', shape=(1,))

    # Must be named `forward`: that is the method nn.Block invokes when the
    # layer is called. The previous spelling `forwar` was never picked up.
    def forward(self, x):
        """Return the cross-correlation of ``x`` with the weight, plus the bias."""
        return corr2d(x, self.weight.data()) + self.bias.data()

In [5]:
# 6x8 test "image": all ones except columns 2-5, which are set to zero.
X = nd.ones((6, 8))
X[:, 2:6] = 0
X


[[1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]]
<NDArray 6x8 @cpu(0)>

In [6]:
# 1x2 kernel [1, -1]: each output is X[i, j] - X[i, j + 1], so it is zero
# wherever two horizontally adjacent elements are equal.
K = nd.array([[1, -1]])

In [7]:
Y = corr2d(X, K)
Y


[[ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]]
<NDArray 6x7 @cpu(0)>

In [8]:
# Build a 2-D convolution layer with 1 output channel (channels are covered in
# the "multiple input and output channels" section) and a kernel of shape (1, 2).
conv2d = nn.Conv2D(1, kernel_size = (1, 2))
conv2d.initialize()

In [9]:
# Reshape input and target to 4-D arrays with two leading axes of size 1
# (batch size and channel count) in front of the 2-D data.
X = X.reshape(1, 1, 6, 8)
Y = Y.reshape(1, 1, 6, 7)

In [10]:
# Learn the kernel from (X, Y): 10 steps of hand-written gradient descent on the
# element-wise squared error between the layer output and Y.
for i in range(10):
    with autograd.record():
        Y_hat = conv2d(X)
        l = (Y_hat - Y) ** 2
    l.backward()
    # For simplicity the bias is ignored here (only the weight is updated);
    # 3e-2 is the step size.
    conv2d.weight.data()[:] -= 3e-2 * conv2d.weight.grad()
    # Report the summed loss every second iteration.
    if (i + 1) % 2 == 0:
        print('batch %d, loss %.3f' % (i + 1, l.sum().asscalar()))

batch 2, loss 4.949
batch 4, loss 0.831
batch 6, loss 0.140
batch 8, loss 0.024
batch 10, loss 0.004


In [11]:
conv2d.weight.data().reshape((1, 2))


[[ 0.9895    -0.9873705]]
<NDArray 1x2 @cpu(0)>

如果源变量和目标变量的context一致，as_in_context函数
使目标变量和源变量共享源变量的内存或显存。
MXNet可以指定用来存储和计算的设备，如使用内存的CPU或者使用显存的GPU。在默认情况下，MXNet会将数据创建在内存，然后利用CPU来计算。
MXNet要求计算的所有输入数据都在内存或同一块显卡的显存上。

#### 5.2 填充和步幅

假设输入形状是 $n_h\times n_w$，卷积核窗口形状是 $k_h\times k_w$，那么输出形状将会是

$$(n_h-k_h+1)\times(n_w-k_w+1).$$
 
所以卷积层的输出形状由输入形状和卷积核窗口形状决定。

填充（padding）是指在输入高和宽的两侧填充元素（通常是0元素）

在高的两侧一共填充 $p_h$ 行，在宽的两侧一共填充 $p_w$ 列，那么输出形状将会是

$$(n_h-k_h+p_h+1)\times(n_w-k_w+p_w+1),$$

也就是说，输出的高和宽会分别增加 $p_h$ 和 $p_w$。

In [12]:
# 定义一个函数来计算卷积层。它初始化卷积层权重，并对输入和输出做相应的升维和降维
def comp_conv2d(conv2d, X):
    """Initialize ``conv2d``, apply it to the 2-D array ``X`` and return a 2-D result.

    ``X`` is given two leading axes of size 1 (batch size and channel count)
    before the call, and the first two axes of the output are dropped again.

    Parameters
    ----------
    conv2d : layer object providing ``initialize()`` and being callable
    X : 2-D array

    Returns
    -------
    The layer output reshaped to its trailing (height, width) dimensions.
    """
    conv2d.initialize()
    # Prepend batch and channel axes, both of size 1.
    batched = X.reshape((1, 1) + X.shape)
    out = conv2d(batched)
    # Drop the two leading axes we are not interested in: batch and channel.
    spatial_shape = out.shape[2:]
    return out.reshape(spatial_shape)

# Note: padding=1 pads 1 row/column on each side, i.e. 2 rows/columns in total,
# so with a 3x3 kernel the 8x8 input keeps its shape.
conv2d = nn.Conv2D(1, kernel_size=3, padding=1)
X = nd.random.uniform(shape=(8, 8))
comp_conv2d(conv2d, X).shape

(8, 8)

In [13]:
def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    ``X`` and ``K`` are iterated in lockstep along their first axis; each
    pair is cross-correlated with ``corr2d`` and the per-channel results are
    summed element-wise with ``nd.add_n``.
    """
    per_channel = []
    for channel, kernel in zip(X, K):
        per_channel.append(corr2d(channel, kernel))
    return nd.add_n(*per_channel)

In [14]:
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
K = nd.array([[[0, 1], [2, 3]], [[1, 2], [3, 4]]])
corr2d_multi_in(X, K)


[[ 56.  72.]
 [104. 120.]]
<NDArray 2x2 @cpu(0)>

In [15]:
help(nd.stack)

Help on function stack:

stack(*data, **kwargs)
    Join a sequence of arrays along a new axis.
    
    The axis parameter specifies the index of the new axis in the dimensions of the
    result. For example, if axis=0 it will be the first dimension and if axis=-1 it
    will be the last dimension.
    
    Examples::
    
      x = [1, 2]
      y = [3, 4]
    
      stack(x, y) = [[1, 2],
                     [3, 4]]
      stack(x, y, axis=1) = [[1, 3],
                             [2, 4]]
    
    
    Parameters
    ----------
    data : NDArray[]
        List of arrays to stack
    axis : int, optional, default='0'
        The axis in the result array along which the input arrays are stacked.
    
    out : NDArray, optional
        The output NDArray to hold the result.
    
    Returns
    -------
    out : NDArray or list of NDArrays
        The output of this function.



In [16]:
help(nd.add_n)

Help on function add_n:

add_n(*args, **kwargs)
    Adds all input arguments element-wise.
    
    .. math::
       add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n
    
    ``add_n`` is potentially more efficient than calling ``add`` by `n` times.
    
    The storage type of ``add_n`` output depends on storage types of inputs
    
    - add_n(row_sparse, row_sparse, ..) = row_sparse
    - add_n(default, csr, default) = default
    - add_n(any input combinations longer than 4 (>4) with at least one default type) = default
    - otherwise, ``add_n`` falls all inputs back to default storage and generates default storage
    
    
    
    Defined in src/operator/tensor/elemwise_sum.cc:L155
    
    Parameters
    ----------
    args : NDArray[]
        Positional input arguments
    
    out : NDArray, optional
        The output NDArray to hold the result.
    
    Returns
    -------
    out : NDArray or list of NDArrays
        The output of this function.



In [17]:
def corr2d_multi_in_out(X, K):
    """Cross-correlate ``X`` with every kernel along axis 0 of ``K``.

    Each slice of ``K`` along its first axis is applied to the whole input
    ``X`` via ``corr2d_multi_in``; the results are joined along a new leading
    axis with ``nd.stack``.
    """
    outputs = []
    for kernel in K:
        outputs.append(corr2d_multi_in(X, kernel))
    return nd.stack(*outputs)

In [18]:

# Stack K, K + 1 and K + 2 along a new leading axis to build a kernel with
# 3 output channels.
# NOTE: K is rebound to the stacked result, so re-running this cell on its own
# stacks the already-stacked K again; re-create K first when re-executing.
K = nd.stack(K, K + 1, K + 2)
K.shape

(3, 2, 2, 2)

In [19]:
K


[[[[0. 1.]
   [2. 3.]]

  [[1. 2.]
   [3. 4.]]]


 [[[1. 2.]
   [3. 4.]]

  [[2. 3.]
   [4. 5.]]]


 [[[2. 3.]
   [4. 5.]]

  [[3. 4.]
   [5. 6.]]]]
<NDArray 3x2x2x2 @cpu(0)>

In [20]:
corr2d_multi_in_out(X, K)


[[[ 56.  72.]
  [104. 120.]]

 [[ 76. 100.]
  [148. 172.]]

 [[ 96. 128.]
  [192. 224.]]]
<NDArray 3x2x2 @cpu(0)>

In [21]:
def corr2d_multi_in_out_1x1(X, K):
    """Apply a 1x1 multi-channel kernel to ``X`` as a single matrix product.

    Parameters
    ----------
    X : array of shape (c_i, h, w)
    K : array whose first axis has length c_o and which can be reshaped to
        (c_o, c_i)

    Returns
    -------
    Array of shape (c_o, h, w).
    """
    in_channels, height, width = X.shape
    out_channels = K.shape[0]
    # Flatten the spatial axes so each column is one pixel across channels.
    pixels = X.reshape((in_channels, height * width))
    weights = K.reshape((out_channels, in_channels))
    # Same matrix multiplication a fully connected layer performs.
    mixed = nd.dot(weights, pixels)
    return mixed.reshape((out_channels, height, width))

In [22]:
X.shape

(2, 3, 3)

In [23]:
# Check on random data (3 input channels, 2 output channels, 1x1 kernels) that
# the matrix-multiplication version matches the generic cross-correlation.
X = nd.random.uniform(shape=(3, 3, 3))
K = nd.random.uniform(shape=(2, 3, 1, 1))

Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)

# True when the norm of the difference is below 1e-6.
(Y1 - Y2).norm().asscalar() < 1e-6

True

#### 5.4 池化层

同卷积层一样，池化层每次对输入数据的一个固定形状窗口（又称池化窗口）中的元素计算输出。不同于卷积层里计算输入和核的互相关性，池化层直接计算池化窗口内元素的最大值或者平均值。该运算也分别叫做最大池化或平均池化。

In [24]:
def pool2d(X, pool_size, mode = 'max'):
    """Apply 2-D max or average pooling (stride 1, no padding) to ``X``.

    Parameters
    ----------
    X : 2-D array of shape (n_h, n_w)
    pool_size : pair (p_h, p_w)
        Height and width of the pooling window.
    mode : {'max', 'avg'}, default 'max'
        'max' takes the maximum of each window, 'avg' its mean.

    Returns
    -------
    2-D array of shape (n_h - p_h + 1, n_w - p_w + 1).

    Raises
    ------
    ValueError
        If ``mode`` is neither 'max' nor 'avg'. Previously an unknown mode
        silently produced an all-zero result.
    """
    if mode not in ('max', 'avg'):
        raise ValueError("mode must be 'max' or 'avg', got %r" % (mode,))
    p_h, p_w = pool_size
    Y = nd.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            window = X[i:i + p_h, j:j + p_w]
            if mode == 'max':
                Y[i, j] = window.max()
            else:
                # Arrays expose mean(), not avg(); the old .avg() call raised
                # AttributeError as soon as mode='avg' was used.
                Y[i, j] = window.mean()
    return Y

In [25]:
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
pool2d(X, (2, 2))


[[4. 5.]
 [7. 8.]]
<NDArray 2x2 @cpu(0)>

In [26]:
help(nn.MaxPool2D)

Help on class MaxPool2D in module mxnet.gluon.nn.conv_layers:

class MaxPool2D(_Pooling)
 |  Max pooling operation for two dimensional (spatial) data.
 |  
 |  
 |  Parameters
 |  ----------
 |  pool_size: int or list/tuple of 2 ints,
 |      Size of the max pooling windows.
 |  strides: int, list/tuple of 2 ints, or None.
 |      Factor by which to downscale. E.g. 2 will halve the input size.
 |      If `None`, it will default to `pool_size`.
 |  padding: int or list/tuple of 2 ints,
 |      If padding is non-zero, then the input is implicitly
 |      zero-padded on both sides for padding number of points.
 |  layout : str, default 'NCHW'
 |      Dimension ordering of data and out ('NCHW' or 'NHWC').
 |      'N', 'C', 'H', 'W' stands for batch, channel, height, and width
 |      dimensions respectively. padding is applied on 'H' and 'W' dimension.
 |  ceil_mode : bool, default False
 |      When `True`, will use ceil instead of floor to compute the output shape.
 |  
 |  
 |  Inputs:


In [27]:
X = nd.arange(16).reshape((1, 1, 4, 4))

In [28]:
# Bound to `max_pool` rather than `pool2d`: `pool2d` is the hand-written pooling
# function defined earlier in this notebook, and rebinding that name to a layer
# object would break any cell that calls pool2d(X, pool_size) when re-run.
max_pool = nn.MaxPool2D(3, padding=1, strides=2)
max_pool(X)  # pooling layers have no model parameters, so no initialize() call is needed


[[[[ 5.  7.]
   [13. 15.]]]]
<NDArray 1x1x2x2 @cpu(0)>

#### 5.5 LeNet

In [29]:
# LeNet-style network: two sigmoid convolution + max-pooling stages followed by
# three fully connected layers, the last one producing 10 outputs.
net = nn.Sequential()
net.add(nn.Conv2D(channels=16, kernel_size=4, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(channels=32, kernel_size=4, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        # By default Dense converts an input of shape (batch size, channels,
        # height, width) into shape (batch size, channels * height * width)
        nn.Dense(120, activation='sigmoid'),
        nn.Dense(84, activation='sigmoid'),
        nn.Dense(10))

In [30]:
# Push a random (1, 1, 28, 28) input through the network one layer at a time and
# print each layer's output shape. X is overwritten with each layer's output.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv2 output shape:	 (1, 16, 25, 25)
pool1 output shape:	 (1, 16, 12, 12)
conv3 output shape:	 (1, 32, 9, 9)
pool2 output shape:	 (1, 32, 4, 4)
dense0 output shape:	 (1, 120)
dense1 output shape:	 (1, 84)
dense2 output shape:	 (1, 10)


In [31]:
batch_size = 256
train_iter, test_iter = comm.load_data_fashion_mnist(batch_size=batch_size)

In [32]:
import mxnet as mx
from mxnet.gluon import loss as gloss

In [33]:
 def try_gpu():
    """Return ``mx.gpu()`` if an array can be created on it, else ``mx.cpu()``."""
    try:
        gpu_ctx = mx.gpu()
        # Probe the device by creating a 1-element array on it; an MXNetError
        # here is treated as "no usable GPU".
        _probe = nd.zeros((1,), ctx=gpu_ctx)
    except mx.base.MXNetError:
        return mx.cpu()
    return gpu_ctx

ctx = try_gpu()
ctx

gpu(0)

In [34]:
def evaluate_accuracy(data_iter, net, ctx):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Parameters
    ----------
    data_iter : iterable of (features, labels) batches
    net : callable mapping a feature batch to per-class scores
    ctx : context the batches are moved to before evaluation

    Returns
    -------
    Number of correct predictions divided by the number of labels seen.
    """
    correct = nd.array([0], ctx=ctx)
    total = 0
    for features, labels in data_iter:
        # Move the batch to ctx (e.g. GPU memory) before running the network.
        features = features.as_in_context(ctx)
        labels = labels.as_in_context(ctx).astype('float32')
        predicted = net(features).argmax(axis=1)
        correct += (predicted == labels).sum()
        total += labels.size
    return correct.asscalar() / total

In [35]:
def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs):
    """Train ``net`` with softmax cross-entropy loss and print per-epoch statistics.

    For every epoch the function iterates over ``train_iter``, moves each
    batch to ``ctx``, back-propagates the summed batch loss and calls
    ``trainer.step(batch_size)``. After the epoch it evaluates ``net`` on
    ``test_iter`` and prints the average training loss, training accuracy,
    test accuracy and elapsed time.

    Relies on the module-level names ``time``, ``gloss``, ``autograd`` and
    ``evaluate_accuracy`` being defined by the time it is called.

    Parameters
    ----------
    net : network to train
    train_iter, test_iter : iterables of (features, labels) batches
    batch_size : value passed to ``trainer.step``
    trainer : optimizer wrapper exposing ``step``
    ctx : context the batches are moved to
    num_epochs : number of passes over ``train_iter``
    """
    print('training on', ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        loss_total = 0.0
        correct_total = 0.0
        seen = 0
        epoch_start = time.time()
        for X, y in train_iter:
            X = X.as_in_context(ctx)
            y = y.as_in_context(ctx)
            with autograd.record():
                y_hat = net(X)
                batch_loss = loss(y_hat, y).sum()
            batch_loss.backward()
            trainer.step(batch_size)
            # Cast labels to float32 so they compare against argmax output.
            labels = y.astype('float32')
            loss_total += batch_loss.asscalar()
            correct_total += (y_hat.argmax(axis=1) == labels).sum().asscalar()
            seen += labels.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        elapsed = time.time() - epoch_start
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, loss_total / seen, correct_total / seen, test_acc,
                 elapsed))

In [36]:
import time

In [39]:
# Learning rate and number of epochs for the LeNet run.
lr, num_epochs = 0.5, 100
# force_reinit=True because net was already initialized in an earlier cell;
# re-initialize its parameters on ctx with Xavier initialization.
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 2.3195, train acc 0.101, test acc 0.100, time 2.2 sec
epoch 2, loss 1.7528, train acc 0.338, test acc 0.583, time 2.3 sec
epoch 3, loss 0.9802, train acc 0.625, test acc 0.671, time 2.2 sec
epoch 4, loss 0.8344, train acc 0.683, test acc 0.703, time 2.1 sec
epoch 5, loss 0.7425, train acc 0.715, test acc 0.737, time 2.1 sec
epoch 6, loss 0.6924, train acc 0.731, test acc 0.744, time 2.2 sec
epoch 7, loss 0.6553, train acc 0.745, test acc 0.750, time 2.2 sec
epoch 8, loss 0.6275, train acc 0.756, test acc 0.769, time 2.2 sec
epoch 9, loss 0.5978, train acc 0.767, test acc 0.785, time 2.2 sec
epoch 10, loss 0.5734, train acc 0.778, test acc 0.794, time 2.1 sec
epoch 11, loss 0.5493, train acc 0.788, test acc 0.801, time 2.1 sec
epoch 12, loss 0.5327, train acc 0.794, test acc 0.807, time 2.1 sec
epoch 13, loss 0.5126, train acc 0.802, test acc 0.812, time 2.2 sec
epoch 14, loss 0.4957, train acc 0.810, test acc 0.823, time 2.1 sec
epoch 15, loss 0.4851, t

#### 5.6 AlexNet
AlexNet与LeNet的设计理念非常相似，但也有显著的区别。

第一，与相对较小的LeNet相比，AlexNet包含8层变换，其中有5层卷积和2层全连接隐藏层，以及1个全连接输出层。下面我们来详细描述这些层的设计。

AlexNet第一层中的卷积窗口形状是11×11。因为ImageNet中绝大多数图像的高和宽均比MNIST图像的高和宽大10倍以上，ImageNet图像的物体占用更多的像素，所以需要更大的卷积窗口来捕获物体。第二层中的卷积窗口形状减小到5×5，之后全采用3×3。此外，第一、第二和第五个卷积层之后都使用了窗口形状为3×3、步幅为2的最大池化层。而且，AlexNet使用的卷积通道数也大于LeNet中的卷积通道数数十倍。

紧接着最后一个卷积层的是两个输出个数为4096的全连接层。这两个巨大的全连接层带来将近1 GB的模型参数。由于早期显存的限制，最早的AlexNet使用双数据流的设计使一个GPU只需要处理一半模型。幸运的是，显存在过去几年得到了长足的发展，因此通常我们不再需要这样的特别设计了。

第二，AlexNet将sigmoid激活函数改成了更加简单的ReLU激活函数。一方面，ReLU激活函数的计算更简单，例如它并没有sigmoid激活函数中的求幂运算。另一方面，ReLU激活函数在不同的参数初始化方法下使模型更容易训练。这是由于当sigmoid激活函数输出极接近0或1时，这些区域的梯度几乎为0，从而造成反向传播无法继续更新部分模型参数；而ReLU激活函数在正区间的梯度恒为1。因此，若模型参数初始化不当，sigmoid函数可能在正区间得到几乎为0的梯度，从而令模型无法得到有效训练。

第三，AlexNet通过丢弃法（参见“丢弃法”一节）来控制全连接层的模型复杂度。而LeNet并没有使用丢弃法。

第四，AlexNet引入了大量的图像增广，如翻转、裁剪和颜色变化，从而进一步扩大数据集来缓解过拟合。我们将在后面的“图像增广”一节详细介绍这种方法。

In [40]:

net = nn.Sequential()
# Use a large 11 x 11 window to capture objects, together with a stride of 4 to
# shrink the output height and width substantially. The number of output
# channels is also much larger than in LeNet
net.add(nn.Conv2D(96, kernel_size=11, strides=4, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # Smaller convolution window; padding of 2 keeps the output height and
        # width equal to the input's, and the number of output channels grows
        nn.Conv2D(256, kernel_size=5, padding=2, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # Three consecutive convolution layers with an even smaller window.
        # Except for the last one, the number of output channels grows further.
        # No pooling layer follows the first two of them
        nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'),
        nn.Conv2D(384, kernel_size=3, padding=1, activation='relu'),
        nn.Conv2D(256, kernel_size=3, padding=1, activation='relu'),
        nn.MaxPool2D(pool_size=3, strides=2),
        # The fully connected layers have several times more outputs than in
        # LeNet. Dropout layers are used to mitigate overfitting
        nn.Dense(4096, activation="relu"), nn.Dropout(0.5),
        nn.Dense(4096, activation="relu"), nn.Dropout(0.5),
        # Output layer. Fashion-MNIST is used here, so there are 10 classes
        # rather than the 1000 of the paper
        nn.Dense(10))

In [41]:
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv4 output shape:	 (1, 96, 54, 54)
pool3 output shape:	 (1, 96, 26, 26)
conv5 output shape:	 (1, 256, 26, 26)
pool4 output shape:	 (1, 256, 12, 12)
conv6 output shape:	 (1, 384, 12, 12)
conv7 output shape:	 (1, 384, 12, 12)
conv8 output shape:	 (1, 256, 12, 12)
pool5 output shape:	 (1, 256, 5, 5)
dense3 output shape:	 (1, 4096)
dropout0 output shape:	 (1, 4096)
dense4 output shape:	 (1, 4096)
dropout1 output shape:	 (1, 4096)
dense5 output shape:	 (1, 10)


In [43]:
batch_size = 128
# If an "out of memory" error is reported, reduce batch_size or resize
train_iter, test_iter = comm.load_data_fashion_mnist(batch_size, resize=224)

In [46]:
# Hyperparameters for the AlexNet run; the context comes from comm.try_gpu().
lr, num_epochs, ctx = 0.01, 50, comm.try_gpu()
# force_reinit=True because net was already initialized for the shape check.
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
comm.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on gpu(0)
epoch 1, loss 1.3751, train acc 0.495, test acc 0.743, time 30.7 sec
epoch 2, loss 0.6608, train acc 0.751, test acc 0.804, time 30.6 sec
epoch 3, loss 0.5426, train acc 0.800, test acc 0.838, time 30.7 sec
epoch 4, loss 0.4780, train acc 0.823, test acc 0.857, time 30.8 sec
epoch 5, loss 0.4287, train acc 0.843, test acc 0.865, time 30.8 sec
epoch 6, loss 0.3977, train acc 0.854, test acc 0.878, time 30.7 sec
epoch 7, loss 0.3745, train acc 0.864, test acc 0.877, time 30.9 sec
epoch 8, loss 0.3566, train acc 0.870, test acc 0.885, time 30.8 sec
epoch 9, loss 0.3403, train acc 0.875, test acc 0.884, time 30.8 sec
epoch 10, loss 0.3262, train acc 0.881, test acc 0.888, time 30.7 sec
epoch 11, loss 0.3156, train acc 0.885, test acc 0.891, time 30.8 sec
epoch 12, loss 0.3057, train acc 0.888, test acc 0.898, time 30.7 sec
epoch 13, loss 0.2945, train acc 0.892, test acc 0.899, time 30.8 sec
epoch 14, loss 0.2888, train acc 0.895, test acc 0.902, time 30.5 sec
epoch 15, 

#### 5.7 使用重复元素的网络（VGG）
VGG块的组成规律是：连续使用数个相同的填充为1、窗口形状为3×3的卷积层后接上一个步幅为2、窗口形状为2×2的最大池化层。卷积层保持输入的高和宽不变，而池化层则对其减半。我们使用vgg_block函数来实现这个基础的VGG块，它可以指定卷积层的数量num_convs和输出通道数num_channels。

In [47]:
def vgg_block(num_convs, num_channels):
    """Build one VGG block.

    The block is ``num_convs`` ReLU convolution layers (3x3 kernel, padding 1,
    ``num_channels`` output channels each) followed by one 2x2 max-pooling
    layer with stride 2.

    Returns
    -------
    nn.Sequential containing the layers in that order.
    """
    conv_layers = [
        nn.Conv2D(num_channels, kernel_size=3, padding=1, activation='relu')
        for _ in range(num_convs)
    ]
    pool_layer = nn.MaxPool2D(pool_size=2, strides=2)
    blk = nn.Sequential()
    blk.add(*conv_layers)
    blk.add(pool_layer)
    return blk

现在我们构造一个VGG网络。它有5个卷积块，前2块使用单卷积层，而后3块使用双卷积层。

第一块的输出通道是64，之后每次对输出通道数翻倍，直到变为512。

因为这个网络使用了8个卷积层和3个全连接层，所以经常被称为VGG-11。

In [48]:
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))

In [49]:
def vgg(conv_arch):
    """Build a VGG network from ``conv_arch``.

    Parameters
    ----------
    conv_arch : iterable of (num_convs, num_channels) pairs
        One ``vgg_block`` is created per pair, in order.

    Returns
    -------
    nn.Sequential made of the convolution blocks followed by two 4096-unit
    ReLU Dense layers and a 10-unit Dense output layer.
    """
    net = nn.Sequential()
    # Convolutional part: one VGG block per architecture entry.
    conv_blocks = [vgg_block(num_convs, num_channels)
                   for num_convs, num_channels in conv_arch]
    net.add(*conv_blocks)
    # Fully connected part.
    net.add(nn.Dense(4096, activation='relu'),
            nn.Dense(4096, activation='relu'),
            nn.Dense(10))
    return net

In [50]:
net = vgg(conv_arch)

In [52]:
net.initialize(init.Normal(sigma = 0.05))
X = nd.random.uniform(shape=(1, 1, 224, 224))
for blk in net:
    X = blk(X)
    print(blk.name, 'output shape:\t', X.shape)

sequential3 output shape:	 (1, 64, 112, 112)
sequential4 output shape:	 (1, 128, 56, 56)
sequential5 output shape:	 (1, 256, 28, 28)
sequential6 output shape:	 (1, 512, 14, 14)
sequential7 output shape:	 (1, 512, 7, 7)
dense6 output shape:	 (1, 4096)
dense7 output shape:	 (1, 4096)
dense8 output shape:	 (1, 10)


In [53]:
# Build a narrower VGG for training: keep the number of convolution layers per
# block but divide every block's channel count by `ratio`.
ratio = 4
small_conv_arch = [(pair[0], pair[1] // ratio) for pair in conv_arch]
net = vgg(small_conv_arch)

In [54]:
# Hyperparameters for the VGG run; the context comes from comm.try_gpu().
lr, num_epochs, batch_size, ctx = 0.05, 5, 128, comm.try_gpu()
# net was freshly built by vgg(small_conv_arch) in the previous cell, so no
# force_reinit is passed here.
net.initialize(ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = comm.load_data_fashion_mnist(batch_size, resize=224)
comm.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)

training on gpu(0)
epoch 1, loss 0.8026, train acc 0.720, test acc 0.865, time 64.8 sec
epoch 2, loss 0.3489, train acc 0.872, test acc 0.881, time 63.1 sec
epoch 3, loss 0.2832, train acc 0.897, test acc 0.888, time 62.1 sec
epoch 4, loss 0.2385, train acc 0.912, test acc 0.906, time 61.9 sec
epoch 5, loss 0.2013, train acc 0.925, test acc 0.915, time 62.4 sec


#### 5.8 NiN 网络中的网络

NiN块是NiN中的基础块。它由一个卷积层加两个充当全连接层的 1×1 卷积层串联而成。其中第一个卷积层的超参数可以自行设置，而第二和第三个卷积层的超参数一般是固定的。

In [64]:
def nin_block(num_channels, kernel_size, strides, padding):
    """Build one NiN block.

    The block is a ReLU convolution layer with the given ``kernel_size``,
    ``strides`` and ``padding``, followed by two ReLU convolution layers with
    1x1 kernels. All three layers have ``num_channels`` output channels.

    Returns
    -------
    nn.Sequential containing the three convolution layers.
    """
    blk = nn.Sequential()
    blk.add(
        # Configurable convolution layer.
        nn.Conv2D(num_channels, kernel_size=kernel_size, strides=strides,
                  padding=padding, activation='relu'),
        # Two 1x1 convolution layers.
        nn.Conv2D(num_channels, kernel_size=1, activation='relu'),
        nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
    return blk

In [65]:
# NiN network: NiN blocks alternating with max-pooling layers, a dropout layer,
# a final 10-channel NiN block, global average pooling and a flatten layer.
net = nn.Sequential()
net.add(nin_block(96, kernel_size=11, strides=4, padding=0),
        nn.MaxPool2D(pool_size=3, strides=2),
        nin_block(256, kernel_size=5, strides=1, padding=2),
        nn.MaxPool2D(pool_size=3, strides=2),
        nin_block(384, kernel_size=3, strides=1, padding=1),
        nn.MaxPool2D(pool_size=3, strides=2), nn.Dropout(0.5),
        # There are 10 label classes
        nin_block(10, kernel_size=3, strides=1, padding=1),
        # The global average pooling layer automatically sets the window shape
        # to the height and width of its input
        nn.GlobalAvgPool2D(),
        # Convert the 4-D output into a 2-D output of shape (batch size, 10)
        nn.Flatten())

In [66]:
X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

sequential25 output shape:	 (1, 96, 54, 54)
pool24 output shape:	 (1, 96, 26, 26)
sequential26 output shape:	 (1, 256, 26, 26)
pool25 output shape:	 (1, 256, 12, 12)
sequential27 output shape:	 (1, 384, 12, 12)
pool26 output shape:	 (1, 384, 5, 5)
dropout4 output shape:	 (1, 384, 5, 5)
sequential28 output shape:	 (1, 10, 5, 5)
pool27 output shape:	 (1, 10, 1, 1)
flatten2 output shape:	 (1, 10)


In [69]:
net

Sequential(
  (0): Sequential(
    (0): Conv2D(1 -> 96, kernel_size=(11, 11), stride=(4, 4), Activation(relu))
    (1): Conv2D(96 -> 96, kernel_size=(1, 1), stride=(1, 1), Activation(relu))
    (2): Conv2D(96 -> 96, kernel_size=(1, 1), stride=(1, 1), Activation(relu))
  )
  (1): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(0, 0), ceil_mode=False, global_pool=False, pool_type=max, layout=NCHW)
  (2): Sequential(
    (0): Conv2D(96 -> 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), Activation(relu))
    (1): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), Activation(relu))
    (2): Conv2D(256 -> 256, kernel_size=(1, 1), stride=(1, 1), Activation(relu))
  )
  (3): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(0, 0), ceil_mode=False, global_pool=False, pool_type=max, layout=NCHW)
  (4): Sequential(
    (0): Conv2D(256 -> 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), Activation(relu))
    (1): Conv2D(384 -> 384, kernel_size=(1, 1), stride=(1, 1), Activation(relu))

In [72]:
# Hyperparameters for the NiN run; the context comes from comm.try_gpu().
lr, num_epochs, batch_size, ctx = 0.1, 50, 128, comm.try_gpu()
# force_reinit=True because net was already initialized for the shape check.
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
# Images are loaded with resize=128 here, whereas the shape check above used a
# 224x224 input.
train_iter, test_iter = comm.load_data_fashion_mnist(batch_size, resize=128)
comm.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs)

training on gpu(0)
epoch 1, loss 1.7387, train acc 0.320, test acc 0.513, time 17.4 sec
epoch 2, loss 0.7872, train acc 0.696, test acc 0.764, time 15.9 sec
epoch 3, loss 0.5710, train acc 0.785, test acc 0.809, time 16.0 sec
epoch 4, loss 0.5018, train acc 0.811, test acc 0.835, time 16.1 sec
epoch 5, loss 0.4527, train acc 0.832, test acc 0.854, time 16.2 sec
epoch 6, loss 0.4169, train acc 0.845, test acc 0.862, time 16.2 sec
epoch 7, loss 0.3870, train acc 0.856, test acc 0.871, time 16.1 sec
epoch 8, loss 0.3680, train acc 0.863, test acc 0.871, time 16.2 sec
epoch 9, loss 0.3475, train acc 0.871, test acc 0.878, time 16.1 sec
epoch 10, loss 0.3293, train acc 0.876, test acc 0.890, time 16.1 sec
epoch 11, loss 0.3164, train acc 0.883, test acc 0.891, time 16.1 sec
epoch 12, loss 0.3024, train acc 0.890, test acc 0.896, time 16.1 sec
epoch 13, loss 0.2869, train acc 0.894, test acc 0.904, time 16.1 sec
epoch 14, loss 0.2766, train acc 0.898, test acc 0.905, time 16.0 sec
epoch 15, 