In [1]:
import random


import numpy as np

In [2]:
import mnist_loader

# Load the training / validation / test splits from the project-local
# mnist_loader module.
# NOTE(review): later cells slice training_data and call len() on these
# objects, so the loader presumably returns lists rather than one-shot
# iterators -- confirm against mnist_loader.load_data_wrapper.
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [22]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise.

    Works on scalars and NumPy arrays alike; the result has the same
    shape as ``z``.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator


def sigmoid_prime(z):
    """Derivative of the sigmoid function, evaluated element-wise at ``z``.

    Uses the identity sigma'(z) = sigma(z) * (1 - sigma(z)).
    """
    activation = sigmoid(z)
    return activation * (1 - activation)


class Network:
    """Fully connected feed-forward network of sigmoid neurons.

    The network is trained with mini-batch stochastic gradient descent; the
    gradient for a whole mini-batch is computed in one pass by stacking the
    examples as the columns of a matrix.

    Attributes:
        num_layers: number of layers, including the input layer.
        sizes: the list of layer widths passed to the constructor.
        biases: one ``(n_out, 1)`` column vector per non-input layer.
        weights: one ``(n_out, n_in)`` matrix per non-input layer.
        deltas: maps a cost name (``"mse"`` or ``"xentropy"``) to a function
            ``(a, y, z) -> output-layer error``.
    """

    def __init__(self, sizes):
        """Create a network whose i-th layer has ``sizes[i]`` neurons.

        Biases are drawn from a standard normal distribution.  Weights are
        drawn from a normal distribution scaled by ``1 / sqrt(n_in)``, where
        ``n_in`` is the number of inputs feeding the neuron (fan-in), so the
        weighted input of a neuron has roughly unit variance no matter how
        wide the previous layer is.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(n_out, 1) for n_out in sizes[1:]]
        # Scale by the fan-in (columns of the matrix), not by the fan-out.
        self.weights = [np.random.randn(n_out, n_in) / np.sqrt(n_in)
                        for n_in, n_out in zip(sizes[:-1], sizes[1:])]
        # Output-layer error dC/dz for each supported cost function.
        self.deltas = {
            "mse": lambda a, y, z: (a - y) * sigmoid_prime(z),
            # For cross-entropy the sigmoid_prime factor cancels out.
            "xentropy": lambda a, y, _z: (a - y),
        }

    def feedforward(self, a):
        """Return the network output for input ``a``.

        ``a`` may be a single column vector or a matrix with one example per
        column; the biases broadcast across columns.
        """
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, lmbda,
            test_data=None, cost="xentropy"):
        """Train the network with mini-batch stochastic gradient descent.

        Args:
            training_data: iterable of ``(x, y)`` pairs of column vectors.
            epochs: number of full passes over ``training_data``.
            mini_batch_size: number of examples per gradient step.
            eta: learning rate.
            lmbda: L2 regularization strength (``0.0`` disables it).
            test_data: optional iterable of ``(x, label)`` pairs; when given
                and non-empty, the number of correct predictions is printed
                after every epoch.
            cost: key into ``self.deltas`` selecting the cost function.
        """
        # Work on private copies: shuffling must not reorder the caller's
        # list, and one-shot iterables (e.g. zip objects) must survive
        # several epochs.
        training_data = list(training_data)
        if test_data is not None:
            test_data = list(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            for k in range(0, n, mini_batch_size):
                self.update_mini_batch_matrix(
                    training_data[k:k + mini_batch_size], eta, lmbda, cost, n)
            if test_data:
                n_correct = self.evaluate(test_data, cost)
                print("Epoch {}: {} / {}".format(
                    j, n_correct, len(test_data)))
            else:
                print("Epoch {} complete.".format(j))

    def update_mini_batch_matrix(self, mini_batch, eta, lmbda, cost, n):
        """Apply one gradient-descent step computed from ``mini_batch``.

        Args:
            mini_batch: list of ``(x, y)`` pairs of column vectors.
            eta: learning rate.
            lmbda: L2 regularization strength.
            cost: key into ``self.deltas``.
            n: total number of training examples; the weight-decay factor is
                ``1 - eta * lmbda / n``.
        """
        if not mini_batch:
            return
        # Stack the examples column-wise.  Building the matrices directly
        # avoids np.array() on a ragged list of (x, y) pairs, which NumPy
        # >= 1.24 rejects when x and y have different shapes.
        x_mat = np.hstack([x for x, _ in mini_batch])
        y_mat = np.hstack([y for _, y in mini_batch])
        # nabla_b / nabla_w are already summed over the mini-batch.
        nabla_b, nabla_w = self.backprop_matrix(x_mat, y_mat, cost)
        step = eta / float(len(mini_batch))
        # Weight decay is scaled by the training-set size n, not by the
        # mini-batch size, so its strength is independent of the batch size.
        decay = 1.0 - eta * lmbda / n
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]
        self.weights = [decay * w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]

    def backprop_matrix(self, x_mat, y_mat, cost):
        """Back-propagate a whole mini-batch at once.

        Args:
            x_mat: inputs, one example per column.
            y_mat: desired outputs, one example per column.
            cost: key into ``self.deltas``.

        Returns:
            ``(nabla_b, nabla_w)``: per-layer lists shaped like
            ``self.biases`` and ``self.weights``, holding the gradient of
            the unregularized cost summed over all columns of the batch.
        """
        # Forward pass, remembering every weighted input and activation.
        activations = [x_mat]
        zs = []
        for b, w in zip(self.biases, self.weights):
            zs.append(np.matmul(w, activations[-1]) + b)
            activations.append(sigmoid(zs[-1]))

        # layer_errors[i] holds dC/dz for layer i, one column per example.
        layer_errors = [None] * len(self.biases)
        nabla_w = [None] * len(self.weights)

        layer_errors[-1] = self.deltas[cost](activations[-1], y_mat, zs[-1])
        # The matrix product sums the per-example outer products.
        nabla_w[-1] = np.matmul(layer_errors[-1], activations[-2].transpose())

        # Walk backwards; l counts layers from the end (l=2 is the last
        # hidden layer).  Each layer's error comes from the layer right
        # after it (-l+1), not from the output layer.
        for l in range(2, self.num_layers):
            sp = sigmoid_prime(zs[-l])
            layer_errors[-l] = np.matmul(
                self.weights[-l + 1].transpose(), layer_errors[-l + 1]) * sp
            nabla_w[-l] = np.matmul(
                layer_errors[-l], activations[-l - 1].transpose())

        nabla_b = [err.sum(axis=1, keepdims=True) for err in layer_errors]
        return nabla_b, nabla_w

    def evaluate(self, test_data, cost):
        """Return the number of test inputs classified correctly.

        The prediction is the index of the most active output neuron, which
        is compared with the label ``y`` of each ``(x, y)`` pair.  ``cost``
        is accepted for call-site symmetry but is not used.
        """
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for x, y in test_results)

    def cost_derivative(self, output_activations, y):
        """Return ``output_activations - y``.

        Not called by the other methods of this class.
        """
        return (output_activations - y)



In [None]:
# Seed both random number generators so the run is reproducible on
# Restart & Run All: Network draws its initial parameters from np.random and
# SGD shuffles the training set with the stdlib random module.
random.seed(0)
np.random.seed(0)

# 784 inputs, one hidden layer of 30 sigmoid neurons, 10 outputs.
net = Network([784, 30, 10])

# Train on the first 1000 examples only; accuracy on test_data is printed
# after every epoch.  lmbda=0.0 disables L2 regularization.
net.SGD(training_data[:1000], epochs=20, mini_batch_size=10, eta=0.5,
        lmbda=0.0, test_data=test_data, cost="xentropy")

Epoch 0: 7499 / 10000
Epoch 1: 8241 / 10000
Epoch 2: 8398 / 10000
Epoch 3: 8591 / 10000
Epoch 4: 8701 / 10000
Epoch 5: 8653 / 10000
Epoch 6: 8666 / 10000
Epoch 7: 8776 / 10000
Epoch 8: 8761 / 10000
Epoch 9: 8722 / 10000


In [13]:
# To see the effect of overfitting, train for many more epochs on the same small subset:
#net = Network([784, 30, 10])
#net.SGD(training_data[:1000], 100, 10, 0.5, 0.0, test_data=test_data)