## MINST数据集的神经网络

### 神经网络术语约定

* Input Layer为第一层，Output Layer为最后一层，hidden layer为中间层。层的集合为`L`。
* `W`为weights的缩写,`b`为bias的缩写。
* \\(a^i\\)为\\(L^i\\)层的输出，公式为：\\(a^i = \sigma(W^i*a^{i-1} + b^i)\\)。约定\\(a^1\\)为输入数据
* 前向传播即为计算各神经层的输出的过程：\\(a^1,a^2,...,a^m\\)

In [16]:
import numpy as np
from functools import reduce

class Network:
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.weights = [np.random.rand(L_2, L_1) for L_2, L_1 in zip(sizes[1:], sizes[:-1])]
        self.biases = [np.random.rand(L_2, 1) for L_2 in sizes[1:]]
    
    def parameter_size(self):
        """Total parameters of the network"""
        sizes = 0
        for w,b in zip(self.weights, self.biases):
            sizes += reduce(lambda x,y: x*y, w.shape)
            sizes += reduce(lambda x,y: x*y, b.shape)
        return sizes
    
    def sigmoid(self, z):
        return 1.0/(1.0+np.exp(-z))
    
    def sigmoid_derivative(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))
    
    def feedforward(self, a):
        """
        formula: f(Wa+b)
        """
        for W, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(W, a) + b)
        
        return a

    def mini_batch(self, m_batch, learning_rate):
        m = len(m_batch)
        nabla_b = [np.zeros(bias.shape) for bias in self.biases]
        nabla_w = [np.zeros(weight.shape) for weight in self.weights]
        
        for x, y in m_batch:
            d_w, d_b = self.backprop(x, y)
            nabla_w = [nw+dw for nw, dw in zip(nabla_w, d_w)]
            nabla_b = [nb+db for nb, db in zip(nabla_b, d_b)]
        
        self.weights = [w-(learning_rate/m)*dw for w, dw in zip(self.weights, nabla_w)]
        self.biases = [b-(learning_rate/m)*db for b, db in zip(self.biases, nabla_b)]
        
    def backprop(self, x, y):
        nabla_b = [np.zeros(bias.shape) for bias in self.biases]
        nabla_w = [np.zeros(weight.shape) for weight in self.weights]
        
        activation = x
        activationx = [x]
        activation_sd = []
        
        zs = []
        
        for l in range(self.num_layers):
            z = np.dot(self.weights[l], activation) + self.biases[l]
            zs.append(z)
            activation = self.sigmoid(z)
            activationx.append(activation)
            activation_sd.append(self.sigmoid_derivative(z))
        
        delta = self.cost_derivative(y, activationx[-1]) * activation_sd[-1]
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activationx[-2].transpose())
        
        for i in range(2, self.num_layers):
            z = zs[-i]
            delta = np.dot(self.weights[-i + 1].transpose(), delta) * self.sigmoid_derivative(z)
            nabla_b[-i] = delta
            nabla_w[-i] = np.dot(delta, activationx[-i - 1].transpose())
        
        return nabla_w, nabla_b
        
        
    def cost_derivative(self, y, a):
        """return partial derivatives of C_x"""
        return (a - y)

In [18]:
network = Network(sizes=[784,30,10])
print(network.parameter_size())

23860


In [29]:
#-*-coding:utf8-*-
import _pickle as cPickle
import gzip

# Third-party libraries
import numpy as np

def load_data_from_pickle():
    f = gzip.open('data/mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = cPickle.load(f)
    f.close()
    return (training_data, validation_data, test_data)

In [30]:
print(load_data_from_pickle())

UnicodeDecodeError: 'ascii' codec can't decode byte 0x90 in position 614: ordinal not in range(128)