This notebook will build the neural network that Mark Neilson designs in Chapter 1 of [Neural Networks and Deep Learning](http://neuralnetworksanddeeplearning.com).

In [1]:
import numpy as np
from tqdm import tqdm_notebook
import mnist_loader

In [15]:
class Network(object):
    '''Initializes a network with biases and weights drawn from a Gaussian distribution with mean 0
    and standard deviation 1.'''
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes # number of neurons per layer
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[1:], sizes[1:])]
        
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        '''Train the neural network using mini-batch stochastic gradient descent.

        Parameters
        ----------
        training_data : a list of tuples (input, desired output)
        epochs : number of mini-batches to evaluate
        mini_batch_size : size of mini-batches
        eta : learning rate
        test_data : a list of inputs without known outputs'''

        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in tqdm_notebook(xrange(epochs)):
            np.random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]

            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print('Epoch {0}: {1}/{2}'.format(j, self.evaluate(test_data)), n_test)
            else: print('Epoch {0} complete'.format(j))
    
    def update_mini_batch(self, mini_batch, eta):
        '''Update the network's weights and biases by applying gradient descent 
        using backpropogation to a single mini batch.

        Parameters
        ----------
        mini_batch : list of tuples (x, y)
        eta : learning rate'''
        del_b = [np.zeros(b.shape) for b in self.biases]
        del_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_del_b, delta_del_w = self.backprop(x, y)
            del_b = [db+ddb for db, ddb in zip(del_b, delta_del_b)]
            del_w = [dw+ddw for dw, ddw in zip(del_w, delta_del_w)]

        self.biases = [b-(eta/len(mini_batch))*db for b, db in zip(self.biases, del_b)]
        self.weights = [w-(eta/len(mini_batch))*dw for w, dw in zip(self.weights, del_w)]
    
    def backprop(self, x, y):
        '''Return a tuple (del_b, del_w) representing the gradient for the cost function

        Paramters
        ---------
        x : inputs
        y : desired outputs'''
        # Initialize arrays 
        del_b = [np.zeros(b.shape) for b in self.biases]
        del_w = [np.zeros(w.shape) for w in self.weights]

        # Feed forward
        activation = x
        activations = [x]
        zs = []

        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass
        delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(zs[-1])
        del_b[-1] = delta
        del_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in xrange(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta)*sp
            del_b[-l] = delta
            del_w[-l] = np.dot(delta, activations[-l-1].transpose())

        return (del_b, del_w)

    def feedforward(self, a):
        '''Return the output of the network if "a" is input.

        Parameters
        ----------
        a : an n x 1 ndarray
        ''' 
        a = [sigmoid(np.dot(w, a)+b) for b, w in zip(self.biases, self.weights)]
        return a

    def evaluate(self, test_data):
        '''Return the number of test inputs for which the neural network outputs the correct result.
        Note that the neural network's output is assumed to be the index of whichever neuron in the 
        final layer has the highest activaion

        Parameters
        ----------
        test_data : a list of inputs without known outputs'''
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
    
        return test_results

    def sigmoid(z):
        return 1./(1.+np.exp(-z))
    
    def sigmoid_prime(z):
        '''Derivative of the sigmoid function'''
        return sigmoid(z)*(1-sigmoid(z))
    
    def cost_derivative(self, output_activations, y):
        '''Return the vector of partial derivatives for the output activations'''
        return (output_activations-y)

With all our function defined we can now load the MNIST data and partition it into training, validation, and testing sets. We'll use the `mnist_loader` script writting by Mark Nielson, which I modified to accept different paths.

In [16]:
training_data, validation_data, testing_data = mnist_loader.load_data_wrapper(path='./mnist.pkl.gz')

We'll begin with a network with a single layer of 30 hidden neurons and use stochastic gradient descent (SGD) with mini-batch size 10 for 30 epochs with a learning rate of $\eta=3.0$.

In [17]:
net = Network([748, 30, 10])

In [18]:
net.SGD(training_data=training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=None)




ValueError: shapes (30,30) and (784,1) not aligned: 30 (dim 1) != 784 (dim 0)