卷积神经网络是指主要由卷积层构成的神经网络。
### 卷积层
卷积层跟前面的全连接层类似，但输入和权重不是做简单的矩阵乘法，而是使用每次作用在一个**窗口**上的卷积运算。

In [1]:
from mxnet import nd

# Minimal convolution example: one input channel, one output channel.
# The weight is laid out as (num_filter, in_channels, kernel_h, kernel_w).
w = nd.arange(4).reshape((1, 1, 2, 2))
b = nd.array([1])
# The input is laid out as (batch, channels, height, width).
data = nd.arange(9).reshape((1, 1, 3, 3))
# num_filter is the number of output channels, i.e. the leading weight axis
# (w.shape[0]). w.shape[1] is the input-channel count; it only happens to be
# equal to w.shape[0] for this particular weight.
out = nd.Convolution(data, w, b, kernel=w.shape[2:], num_filter=w.shape[0])

print('input:', data, '\nweight:', w, '\nbias:', b, '\noutput:', out)

input: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 
weight: 
[[[[ 0.  1.]
   [ 2.  3.]]]]
<NDArray 1x1x2x2 @cpu(0)> 
bias: 
[ 1.]
<NDArray 1 @cpu(0)> 
output: 
[[[[ 20.  26.]
   [ 38.  44.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [2]:
# Control how the window moves (stride) and how the input is padded (pad).
# num_filter is the output-channel count, i.e. the leading weight axis
# (w.shape[0]); w.shape[1] is the input-channel count.
out = nd.Convolution(data, w, b, kernel=w.shape[2:], num_filter=w.shape[0], stride=(2, 2), pad=(1, 1))
print('input:', data, '\nweight:', w, '\nbias:', b, '\noutput:', out)

input: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 
weight: 
[[[[ 0.  1.]
   [ 2.  3.]]]]
<NDArray 1x1x2x2 @cpu(0)> 
bias: 
[ 1.]
<NDArray 1 @cpu(0)> 
output: 
[[[[  1.   9.]
   [ 22.  44.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [3]:
# Multi-channel input: the weight's second axis (2) matches the two channels
# of the input, and its leading axis (1) is passed as num_filter.
w = nd.arange(8).reshape((1, 2, 2, 2))
data = nd.arange(18).reshape((1, 2, 3, 3))

# The bias b is not redefined in this cell; the existing one-element bias is used.
out = nd.Convolution(
    data=data,
    weight=w,
    bias=b,
    kernel=w.shape[2:],
    num_filter=w.shape[0],
)

print('input:', data, '\nweight:', w, '\nbias:', b, '\noutput:', out)

input: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 
weight: 
[[[[ 0.  1.]
   [ 2.  3.]]

  [[ 4.  5.]
   [ 6.  7.]]]]
<NDArray 1x2x2x2 @cpu(0)> 
bias: 
[ 1.]
<NDArray 1 @cpu(0)> 
output: 
[[[[ 269.  297.]
   [ 353.  381.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [4]:
# Show the two values derived from the weight shape in the convolution calls:
# the trailing two axes (passed as kernel) and the leading axis (passed as
# num_filter). Each value is printed on its own line.
print(w.shape[2:], w.shape[0], sep='\n')

(2, 2)
1


In [5]:
# Multi-channel output: the leading weight axis is 2, so two filters are
# applied and the bias needs one entry per filter.
w = nd.arange(16).reshape((2, 2, 2, 2))
data = nd.arange(18).reshape((1, 2, 3, 3))
b = nd.array([1, 2])

out = nd.Convolution(
    data=data,
    weight=w,
    bias=b,
    kernel=w.shape[2:],
    num_filter=w.shape[0],
)

print('input:', data, '\nweight:', w, '\nbias:', b, '\noutput:', out)

input: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 
weight: 
[[[[  0.   1.]
   [  2.   3.]]

  [[  4.   5.]
   [  6.   7.]]]


 [[[  8.   9.]
   [ 10.  11.]]

  [[ 12.  13.]
   [ 14.  15.]]]]
<NDArray 2x2x2x2 @cpu(0)> 
bias: 
[ 1.  2.]
<NDArray 2 @cpu(0)> 
output: 
[[[[  269.   297.]
   [  353.   381.]]

  [[  686.   778.]
   [  962.  1054.]]]]
<NDArray 1x2x2x2 @cpu(0)>


### 池化层
每次看一个小窗口，然后选出窗口里面最大的元素，或者平均元素作为输出。

In [6]:
data = nd.arange(18).reshape((1, 2, 3, 3))

# Apply both pooling types over the same 2x2 window: 'max' first, then 'avg'.
max_pool, avg_pool = (
    nd.Pooling(data=data, pool_type=kind, kernel=(2, 2))
    for kind in ('max', 'avg')
)

print('data:', data, '\nmax_pooling:', max_pool, '\navg_pooling:', avg_pool)

data: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 
max_pooling: 
[[[[  4.   5.]
   [  7.   8.]]

  [[ 13.  14.]
   [ 16.  17.]]]]
<NDArray 1x2x2x2 @cpu(0)> 
avg_pooling: 
[[[[  2.   3.]
   [  5.   6.]]

  [[ 11.  12.]
   [ 14.  15.]]]]
<NDArray 1x2x2x2 @cpu(0)>


### 使用卷积层和池化层构建模型

##### 计算方法：https://discuss.gluon.ai/t/topic/736/48
>输入数据： 
shape(256, 1, 28, 28), W1(20, 1, 5, 5)

>h1_conv:    shape(256, 20, 28-5+1, 28-5+1)=(256, 20, 24, 24)

>h1: shape(256, 20, 24/2, 24/2) = (256, 20, 12, 12), W2(50, 20, 3, 3)

>h2_conv: (256, 50, 12 - 3 + 1,12 - 3 + 1)=(256, 50, 10, 10)

>h2: (256, 50, 10/2, 10/2) =  (256, 50, 5, 5)

>flatten (256, 50*5*5)= (256, 1250)

In [7]:
import sys
sys.path.append('..')
from utils import load_data_fashion_mnist

# Mini-batch size; also used later to scale the SGD step size.
batch_size = 256
# load_data_fashion_mnist is imported from the project's utils module.
# NOTE(review): presumably it returns iterable train/test batch loaders that
# yield (data, label) pairs — confirm against utils.
train_data, test_data = load_data_fashion_mnist(batch_size)

In [8]:
import mxnet as mx

# Prefer the GPU when it is usable; otherwise fall back to the CPU.
try:
    ctx = mx.gpu()
    # Allocating a tiny array on the context makes MXNet actually use the
    # device, so an unusable GPU raises here rather than later in training.
    _ = nd.zeros((1,), ctx=ctx)
except mx.base.MXNetError:
    # Catch only MXNet's own error: a bare ``except:`` would also swallow
    # KeyboardInterrupt and unrelated programming errors.
    ctx = mx.cpu()
ctx

gpu(0)

In [9]:
# LeNet model parameters.
weight_scale = .01


def _init_weight(shape):
    """Return a random-normal weight of `shape` on ctx with scale=weight_scale."""
    return nd.random_normal(shape=shape, scale=weight_scale, ctx=ctx)


def _init_bias(size):
    """Return an all-zero bias of length `size` on ctx."""
    return nd.zeros(size, ctx=ctx)


# Convolution 1: 20 output channels, 5x5 kernel.
W1 = _init_weight((20, 1, 5, 5))
b1 = _init_bias(W1.shape[0])

# Convolution 2: 50 output channels, 3x3 kernel.
W2 = _init_weight((50, 20, 3, 3))
b2 = _init_bias(W2.shape[0])

# Dense 1: 1250 flattened inputs -> 128 outputs.
W3 = _init_weight((1250, 128))
b3 = _init_bias(W3.shape[1])

# Dense 2: 128 inputs -> 10 outputs.
W4 = _init_weight((W3.shape[1], 10))
b4 = _init_bias(W4.shape[1])

# Every parameter needs a gradient buffer before autograd can fill it in.
params = [W1, b1, W2, b2, W3, b3, W4, b4]
for param in params:
    param.attach_grad()

In [10]:
# Network definition.
def net(X, verboss=False):
    """Run the forward pass of the LeNet-style model.

    Two convolution blocks (convolution -> ReLU -> 2x2 max pooling with
    stride 2) are followed by a flatten and two dense layers. ReLU is applied
    after the first dense layer only.

    Args:
        X: Input batch; it is placed on the same context as W1 before use.
        verboss: When true, print the shape after each stage and the final
            output.

    Returns:
        The output of the second dense layer (no softmax is applied here).
    """
    X = X.as_in_context(W1.context)

    def conv_block(inputs, weight, bias):
        # Kernel size and filter count are read off the weight's own shape.
        feature_map = nd.Convolution(data=inputs, weight=weight, bias=bias,
                                     kernel=weight.shape[2:], num_filter=weight.shape[0])
        return nd.Pooling(data=nd.relu(feature_map), pool_type='max',
                          kernel=(2, 2), stride=(2, 2))

    # Convolution blocks; the second one is flattened for the dense layers.
    h1 = conv_block(X, W1, b1)
    h2 = nd.flatten(conv_block(h1, W2, b2))

    # Dense layers.
    h3 = nd.relu(nd.dot(h2, W3) + b3)
    logits = nd.dot(h3, W4) + b4

    if verboss:
        stage_shapes = (
            ('1st conv block:', h1.shape),
            ('2nd conv block:', h2.shape),
            ('1st dense:', h3.shape),
            ('2nd dense:', logits.shape),
        )
        for title, shape in stage_shapes:
            print(title, shape)
        print('output:', logits)
    return logits

In [11]:
# Run the network on the first training batch only, with verbose printing of
# the per-stage shapes and the output.
for data, _ in train_data:
    net(data, verboss=True)
    # One batch is enough for this check, so stop immediately.
    break

1st conv block: (256, 20, 12, 12)
2nd conv block: (256, 1250)
1st dense: (256, 128)
2nd dense: (256, 10)
output: 
[[ -2.02656065e-05   1.79427698e-05   8.49295611e-06 ...,   8.63793830e-05
    5.56598097e-05  -3.88981316e-05]
 [  1.40335360e-05   1.95785396e-05   1.35214996e-05 ...,   4.92718173e-05
    1.29033542e-05  -3.51454728e-05]
 [  6.67927634e-06  -8.64464891e-06   5.85202615e-05 ...,   1.39381722e-04
   -2.49788827e-05   2.11280440e-06]
 ..., 
 [ -3.26744048e-05  -4.21523218e-05   5.03123920e-05 ...,   2.46529489e-05
    1.62451051e-05   4.58401519e-05]
 [ -3.23488312e-05   1.34215225e-05   6.32679585e-05 ...,   1.08290595e-04
    1.26568812e-05   1.55653470e-06]
 [ -3.58920806e-06  -8.62017259e-06   5.09955462e-05 ...,   1.06604697e-04
   -2.80516906e-06  -5.09478832e-06]]
<NDArray 256x10 @gpu(0)>


### 训练

In [12]:
from mxnet import autograd
from utils import SGD, accuracy, evaluate_accuracy
from mxnet import gluon

softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

learning_rate = .2

# Train for 5 epochs, reporting the mean batch loss and accuracy per epoch.
for epoch in range(5):
    train_loss, train_acc = 0., 0.
    for data, label in train_data:
        # net() moves the data itself; the label must be moved here.
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # The step size is the learning rate divided by the batch size.
        SGD(params, learning_rate / batch_size)

        train_loss += loss.mean().asscalar()
        train_acc += accuracy(output, label)

    test_acc = evaluate_accuracy(test_data, net, ctx)
    num_batches = len(train_data)
    summary = (epoch, train_loss / num_batches, train_acc / num_batches, test_acc)
    print('epoch %d. loss: %f, train acc %f, test acc %f' % summary)

epoch 0. loss: 2.302451, train acc 0.102781, test acc 0.099960
epoch 1. loss: 1.484592, train acc 0.442074, test acc 0.686198
epoch 2. loss: 0.672021, train acc 0.738181, test acc 0.781751
epoch 3. loss: 0.530988, train acc 0.798160, test acc 0.814503
epoch 4. loss: 0.461873, train acc 0.828893, test acc 0.843450
