In [6]:
import sys
sys.path.insert(0, '..')
import gluonbook as gb
from mxnet import nd, gluon, init
from mxnet.gluon import nn

def nin_block(num_channels, kernel_size, strides, padding):
    """Build one NiN block: a convolution followed by two 1x1 convolutions.

    Args:
        num_channels: Number of output channels, shared by all three
            convolution layers of the block.
        kernel_size: Kernel size of the first convolution.
        strides: Stride of the first convolution.
        padding: Padding of the first convolution.

    Returns:
        An ``nn.Sequential`` containing the three ReLU-activated
        ``nn.Conv2D`` layers, in the order they are applied.
    """
    blk = nn.Sequential()
    # Leading convolution: the only layer that uses the caller-supplied
    # kernel size, stride and padding.
    blk.add(nn.Conv2D(num_channels, kernel_size, strides, padding,
                      activation='relu'))
    # Two 1x1 convolutions with the same channel count; in NiN they stand in
    # for fully connected layers.
    for _ in range(2):
        blk.add(nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
    return blk

In [7]:
net = nn.Sequential()
# Feature extractor: three NiN blocks, each followed by a max pooling layer
# with a window of 3 and a stride of 2.
net.add(nin_block(96, kernel_size=11, strides=4, padding=0),
        nn.MaxPool2D(pool_size=3, strides=2))
net.add(nin_block(256, kernel_size=5, strides=1, padding=2),
        nn.MaxPool2D(pool_size=3, strides=2))
net.add(nin_block(384, kernel_size=3, strides=1, padding=1),
        nn.MaxPool2D(pool_size=3, strides=2))
net.add(nn.Dropout(0.5))
# There are 10 label classes, so the last NiN block outputs 10 channels.
net.add(nin_block(10, kernel_size=3, strides=1, padding=1))
# The global average pooling layer automatically sets its window shape to
# the height and width of the feature map it receives.
net.add(nn.GlobalAvgPool2D())
# Convert the four-dimensional output into a two-dimensional output of
# shape (batch size, 10).
net.add(nn.Flatten())


In [8]:
# Shape check: build one random sample with a single channel and a 224x224
# spatial size, then pass it through the network one layer at a time.
X = nd.random.uniform(shape=(1,1,224,224))

# Default initialization is enough here; the training cell re-initializes the
# parameters with force_reinit=True before training.
net.initialize()

for layer in net:
    # X is reused as the running activation, so each top-level layer receives
    # the previous layer's output.
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

sequential6 output shape:	 (1, 96, 54, 54)
pool4 output shape:	 (1, 96, 26, 26)
sequential7 output shape:	 (1, 256, 26, 26)
pool5 output shape:	 (1, 256, 12, 12)
sequential8 output shape:	 (1, 384, 12, 12)
pool6 output shape:	 (1, 384, 5, 5)
dropout1 output shape:	 (1, 384, 5, 5)
sequential9 output shape:	 (1, 10, 5, 5)
pool7 output shape:	 (1, 10, 1, 1)
flatten1 output shape:	 (1, 10)


In [10]:
# Select the compute context through the gluonbook helper (the recorded run
# below reports gpu(0)).
ctx = gb.try_gpu()
# The network was already initialized for the shape check, so force a
# re-initialization on the chosen context, this time with Xavier init.
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
# Plain SGD with a learning rate of 0.1 over all network parameters.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})

loss = gluon.loss.SoftmaxCrossEntropyLoss()
# Fashion-MNIST in batches of 128; resize=224 presumably scales the images to
# the 224x224 input size used in the shape check -- confirm against gluonbook.
train_data, test_data = gb.load_data_fashion_mnist(batch_size=128, resize=224)
# Train for 15 epochs; the recorded log below prints per-epoch loss, train/test
# accuracy and time.
gb.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=15)

Start training on  gpu(0)
Epoch 0. Loss: 2.043, Train acc 0.25, Test acc 0.62, Time 114.3 sec
Epoch 1. Loss: 0.934, Train acc 0.65, Test acc 0.77, Time 114.6 sec
Epoch 2. Loss: 0.609, Train acc 0.77, Test acc 0.76, Time 114.1 sec
Epoch 3. Loss: 0.498, Train acc 0.82, Test acc 0.85, Time 114.8 sec
Epoch 4. Loss: 0.446, Train acc 0.83, Test acc 0.86, Time 115.5 sec
Epoch 5. Loss: 0.403, Train acc 0.85, Test acc 0.87, Time 114.1 sec
Epoch 6. Loss: 0.594, Train acc 0.79, Test acc 0.85, Time 114.4 sec
Epoch 7. Loss: 0.375, Train acc 0.86, Test acc 0.87, Time 114.8 sec
Epoch 8. Loss: 0.347, Train acc 0.87, Test acc 0.89, Time 114.6 sec
Epoch 9. Loss: 0.330, Train acc 0.88, Test acc 0.89, Time 114.7 sec
Epoch 10. Loss: 0.315, Train acc 0.88, Test acc 0.89, Time 115.2 sec
Epoch 11. Loss: 0.302, Train acc 0.89, Test acc 0.89, Time 115.5 sec
Epoch 12. Loss: 0.297, Train acc 0.89, Test acc 0.90, Time 115.0 sec
Epoch 13. Loss: 0.284, Train acc 0.89, Test acc 0.90, Time 114.9 sec
Epoch 14. Loss: 0.

NiN提供了两个重要的设计思路：

- 重复使用由卷积层和代替全连接层的 $1\times 1$ 卷积层构成的基础块来构建深层网络；
- 去除了容易造成过拟合的全连接层，改用输出通道数等于标签类数的卷积层和全局平均池化层作为输出。

虽然因为精度和收敛速度等问题NiN并没有像本章中介绍的其他网络那么被广泛使用，但NiN的设计思想影响了后面的一系列网络的设计。