In [None]:
# Using linear activation function just for last layer sigmoid for rest
# This is supposed to solve the problem of neurons getting saturated at wrong outputs when weights and biases are set randomly.
# However, it does not give good results with digit recognition

In [1]:
import numpy as np
import pandas as pd
import random

In [5]:
class Network():
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes 
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]  # y = output layer, x = input layer
        self.biases = [np.random.randn(y, 1) for y in sizes[1: ]]
    
    # Calculating output of the network = activations of last layer
    def feedforward(self, a):                # a is a (n, 1) ndnumpy array not an (n, ) vector
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)             
        return a 

    # training_data is a list of tuples (x, y),  x = column array of pixel grayscale value , y = column vector of correct output activations 
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        if test_data : n_test = len(test_data)
        n = len(training_data)
        for i in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[j : j + mini_batch_size] for j in range(0, n, mini_batch_size)]

            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)

            if test_data:
                print(f"Epoch {i} : {self.evaluate(test_data)}/{len(test_data)}")
            else:
                print(f"Epoch {i} completed")

    def update_mini_batch(self, mini_batch, eta):
        gsum_w = [np.zeros(w.shape) for w in self.weights]
        gsum_b = [np.zeros(b.shape) for b in self.biases]

        for (x, y) in mini_batch:
            grad_w, grad_b = self.backprop(x, y)
            gsum_w = [gsw + gw for gsw, gw in zip(gsum_w, grad_w)]
            gsum_b = [gsb + gb for gsb, gb in zip(gsum_b, grad_b)]

        self.weights = [w - (eta/len(mini_batch))*gsw for w, gsw in zip(self.weights, gsum_w)]
        self.biases = [b - (eta/len(mini_batch))*gsb for b, gsb in zip(self.biases, gsum_b)]

    def backprop(self, x, y):
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_b = [np.zeros(b.shape) for b in self.biases]

        activation = x
        activations = [x]
        z_list = []

        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(w, activation) + b
            z_list.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        z = np.dot(self.weights[-1], activation) + self.biases[-1]
        z_list.append(z)
        activation = z[-1]
        activations.append(activation)

        delta = self.cost_derivative(activations[-1], y)*sigmoid_prime(z_list[-1])
        gradient_w[-1] = np.dot(delta, activations[-2].T)
        gradient_b[-1] = delta

        for l in range(2, self.num_layers):
            sp = sigmoid_prime(z_list[-l])
            delta = np.dot(self.weights[-l + 1].T, delta)*sp
            gradient_w[-l] = np.dot(delta, activations[-l -1].T)
            gradient_b[-l] = delta
        return (gradient_w, gradient_b)

    def cost_derivative(self, output_activations, true_activations):
        return (output_activations - true_activations)

    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        sum = 0
        for (x, y) in test_results:
            if x == y: sum += 1
        return sum

def sigmoid_prime(z):
    return sigmoid(z)*(1.0 - sigmoid(z))

def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))
        

In [6]:
import mnist_loader
training_data, validation_data, testing_data = mnist_loader.load_data_wrapper()

In [7]:
net = Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3.0, test_data = testing_data)

Epoch 0 : 3680/10000
Epoch 1 : 1881/10000
Epoch 2 : 1839/10000
Epoch 3 : 1939/10000
Epoch 4 : 2758/10000
Epoch 5 : 2790/10000
Epoch 6 : 1780/10000


  return 1.0/(1.0 + np.exp(-z))


Epoch 7 : 2651/10000
Epoch 8 : 2081/10000
Epoch 9 : 2331/10000
Epoch 10 : 2739/10000
Epoch 11 : 1757/10000
Epoch 12 : 2688/10000
Epoch 13 : 2570/10000
Epoch 14 : 2290/10000
Epoch 15 : 2338/10000
Epoch 16 : 2338/10000
Epoch 17 : 2128/10000
Epoch 18 : 1971/10000
Epoch 19 : 2140/10000
Epoch 20 : 2286/10000
Epoch 21 : 1655/10000
Epoch 22 : 2221/10000
Epoch 23 : 2108/10000
Epoch 24 : 2362/10000
Epoch 25 : 2561/10000
Epoch 26 : 2774/10000
Epoch 27 : 2779/10000
Epoch 28 : 3222/10000
Epoch 29 : 3007/10000
