In [None]:
# Digit Recognition with Neural Nets and SVM

In [1]:
import numpy as np
import random

In [2]:
class Network(object):
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(x, 1) for x in sizes[1: ]]
        self.weights = [np.random.randn(x, y) for (x, y) in zip(sizes[1: ], sizes[ :-1])]

    # Applying Stochastic Gradient Descent
    # training data is a list of 50,000 tuples each tuple is of the form (x, y) where x is a 1-D column array of 784 values 
    # representing the grayscale value of each pixel of an image and 
    # y is a 1-D column array of 10 values each equal to 0 besides the one corresponding to the correct digit which is equal to 1.
    # For ex: if the digit is 4,  y = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0].transpose() 
    # epochs = No. of times the complete data is to be trained
    # test_data may or may not be provided, if provided after each mini batch is trained, results will be evaluated for the test_data
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for i in range(epochs):
            random.shuffle(training_data)
            # Divide data into mini - batches 
            mini_batches = [training_data[j:j + mini_batch_size] for j in range(0, n, mini_batch_size)]
            # update weights and biases for each mini batch - this is the function doing all the heavy lifting
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data: 
                print("Epoch {}: {}/{}".format(i, self.evaluate(test_data), n_test))
            else: 
                print("Epoch {} completed".format(i))

    # update_mini_batch
    def update_mini_batch(self, mini_batch, eta):
        # Initialising arrays to store sum of gradients over image for the current mini_batch
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # calculating gradient - backprop function calculates gradient
        for (x, y) in mini_batch: 
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # Updating biases
        self.biases = [b - (eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
        # Updating weights 
        self.weights = [w - (eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]

    # backpropogation - calculating gradient matrix for an image
    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Calculating activations for each layer with current values of weights and biases
        activation = x
        activations = [x]
        zs = []

        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backpropogation
        delta = self.cost_derivative(activations[-1], y)              # derivative of cost function w.r.t y at last layer of activations 
        sigmoid_prime(zs[-1])                                     # Derivative of sigmoid function, y = sigmoid(w*activation + b)
        nabla_b[-1] = delta                                        # Final derivative of cost function w.r.t b = cost_derivative*sigmoid_prime
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())      # Final derivative of cost function w.r.t w = cost_derivative*sigmoid_prime*a
                                                                    # Last layer doesn't have any weights, therefore sigmoid_prime isn't multiplied

        
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = (np.dot(self.weights[-l + 1].transpose(), delta))*sp  # delta = y - y_actual
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l -1].transpose())
        return (nabla_b, nabla_w)


    def cost_derivative(self, output_activations, y):
        return (output_activations - y)                 # Cost function = 1/2n*((y - y_actual)^2), derivative = 1/n(y - y_actual)*derivative of y 
                                                        # 1/n has been accounted for when updating weights and biases.
                                                        # y = sigmoid(w*a + b)
                                                        # dy/db = sigmoid_prime(w*a + b)*1
                                                        # dy/dw = sigmoid_prime(w*a + b)*a
     
    
    def feedforward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)            # after completion of loop, a will be a column array of last layer's activations 
        return a
    
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]    # np.argmax returns index of maximum value in an array
        return sum(int(x==y) for (x, y) in test_results)                               # y for test data stores the actual digit value not an iterable 
                                                                                        # of correct output activations

# Sigmoid and sigmoid prime
def sigmoid(z):
    return (1.0/(1.0 + np.exp(-z)))

def sigmoid_prime(z):
    return sigmoid(z)*(1 - sigmoid(z))        

In [3]:
# How exactly is the model learning ?
# Reduction in the cost function. Cost function = 1/2n * sigma((y - y_actual)^2)
# The cost function is like telling the computer how far away it is from the actual result, if the value of the cost function is large, it is pretty far
# but if it's low, the result is close to the actual value
# Since y = sigmoid(wx + b), the cost function is dependent on the weights and biases, tweaking the weights and biases to minimise the cost function would result
# in y getting closer to y_actual.
# Also, the cost function is taken over the mean of all values, therefore reducing it affects in improvement of all weights and biases.

# update_mini_batch
    # Gradient Descent - An easier way to find the local minima of a function with so many variables/parameters(weights and biases) without 
    # having to calculate a painful amount of first, second, third and so on derivatives
    # We start at some random values for the weights and biases and compute the gradient vector, that is to say the gradient is a multidimensional
    # array, there is a specific value of the gradient with respect to each weight and bias. This gradient determines the steepness of ascent for
    # the cost function, that is how quickly the cost function can increase with respect to each of the weights and biases. If we were talking
    # in one dimension this would be the job of the slope.
    # Taking a step in the opposite direction to that gradient would cause the cost function to decrease.
    # For ex: let's say w(3,2) that is,  the weight of the 2nd neuron in the this layer with respect to the 3rd in the next
    # has a gradient of 0.5, if we subtract 0.5*learning_rate from that weight the function would decrease, 
    # if the gradient had been -0.5, adding 0.5*learning_rate to that weight, the function decreases
    # learning_rate represents the size of the step, multiplying it with the gradient ensures that the overall step taken decreases as 
    # the parameters are closer to the minima
    # We do this repetetively until a local minima is reached. The exact local minima reached is determined by the random values we began at
    # which is why it's relatively much much easier to reach a local minima than a global one. 
    # This repetitions can be adjusted by the hyperparameter mini_batch_size.

In [None]:
# SGD - dividing into mini batches

# Wrong way
#for j in range(0, n, mini_batch_size):
#    mini_batches = [training_data[j: j + mini_batch_size]]
# Every iteration of the loop overwrites the entire mini_batches list with a new list containing only one mini-batch.


# First way
# mini_batches = []
# for j in range(0, n, mini_batch_size):
#    mini_batches.append(training_data[j: j + mini_batch_size])

# Second way
# mini_batches = [training_data[j: j + mini_batch_size] for j in range(0, n, mini_batch_size)]

In [4]:
import mnist_loader
training_data, validation_data, testing_data = mnist_loader.load_data_wrapper()

In [5]:
net = Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3.0, test_data=testing_data)

Epoch 0: 8975/10000
Epoch 1: 9199/10000
Epoch 2: 9145/10000
Epoch 3: 9229/10000
Epoch 4: 9339/10000
Epoch 5: 9252/10000
Epoch 6: 9359/10000
Epoch 7: 9337/10000
Epoch 8: 9336/10000
Epoch 9: 9365/10000
Epoch 10: 9351/10000
Epoch 11: 9407/10000
Epoch 12: 9376/10000
Epoch 13: 9410/10000
Epoch 14: 9439/10000
Epoch 15: 9352/10000
Epoch 16: 9406/10000
Epoch 17: 9435/10000
Epoch 18: 9425/10000
Epoch 19: 9455/10000
Epoch 20: 9421/10000
Epoch 21: 9444/10000
Epoch 22: 9420/10000
Epoch 23: 9461/10000
Epoch 24: 9452/10000
Epoch 25: 9453/10000
Epoch 26: 9454/10000
Epoch 27: 9507/10000
Epoch 28: 9516/10000
Epoch 29: 9469/10000


In [6]:
net1 = Network([784, 100, 10])
net1.SGD(training_data, 30, 10, 3.0, testing_data)

Epoch 0: 9240/10000
Epoch 1: 9323/10000
Epoch 2: 9434/10000
Epoch 3: 9446/10000
Epoch 4: 9512/10000
Epoch 5: 9378/10000
Epoch 6: 9547/10000
Epoch 7: 9574/10000
Epoch 8: 9569/10000
Epoch 9: 9572/10000
Epoch 10: 9632/10000
Epoch 11: 9586/10000
Epoch 12: 9641/10000
Epoch 13: 9630/10000
Epoch 14: 9596/10000
Epoch 15: 9615/10000
Epoch 16: 9569/10000
Epoch 17: 9633/10000
Epoch 18: 9613/10000
Epoch 19: 9653/10000
Epoch 20: 9624/10000
Epoch 21: 9655/10000
Epoch 22: 9626/10000
Epoch 23: 9647/10000
Epoch 24: 9597/10000
Epoch 25: 9648/10000
Epoch 26: 9637/10000
Epoch 27: 9645/10000
Epoch 28: 9660/10000
Epoch 29: 9655/10000


In [7]:
net1.SGD(training_data, 30, 10, 0.001, testing_data)

Epoch 0: 9657/10000
Epoch 1: 9659/10000
Epoch 2: 9658/10000
Epoch 3: 9662/10000
Epoch 4: 9662/10000
Epoch 5: 9663/10000
Epoch 6: 9663/10000
Epoch 7: 9663/10000
Epoch 8: 9665/10000
Epoch 9: 9668/10000
Epoch 10: 9668/10000
Epoch 11: 9671/10000
Epoch 12: 9673/10000
Epoch 13: 9674/10000
Epoch 14: 9675/10000
Epoch 15: 9674/10000
Epoch 16: 9674/10000
Epoch 17: 9675/10000
Epoch 18: 9676/10000
Epoch 19: 9676/10000
Epoch 20: 9675/10000
Epoch 21: 9675/10000
Epoch 22: 9676/10000
Epoch 23: 9678/10000
Epoch 24: 9679/10000
Epoch 25: 9679/10000
Epoch 26: 9679/10000
Epoch 27: 9679/10000
Epoch 28: 9680/10000
Epoch 29: 9681/10000


In [8]:
net2 = Network([784, 10])
net2.SGD(training_data, 30, 10, 3.0, testing_data)

Epoch 0: 8678/10000
Epoch 1: 8878/10000
Epoch 2: 8938/10000
Epoch 3: 8712/10000
Epoch 4: 8520/10000
Epoch 5: 8749/10000
Epoch 6: 8670/10000
Epoch 7: 8896/10000
Epoch 8: 8721/10000
Epoch 9: 8870/10000
Epoch 10: 8604/10000
Epoch 11: 8712/10000
Epoch 12: 8430/10000
Epoch 13: 8886/10000
Epoch 14: 8897/10000
Epoch 15: 8794/10000
Epoch 16: 8782/10000
Epoch 17: 8554/10000
Epoch 18: 8802/10000
Epoch 19: 8838/10000
Epoch 20: 8880/10000
Epoch 21: 8639/10000
Epoch 22: 8511/10000
Epoch 23: 8461/10000
Epoch 24: 8832/10000
Epoch 25: 8692/10000
Epoch 26: 8308/10000
Epoch 27: 8613/10000
Epoch 28: 8615/10000
Epoch 29: 8749/10000


In [9]:
net2.SGD(training_data, 30, 20, 3.0, testing_data)

Epoch 0: 8902/10000
Epoch 1: 8706/10000
Epoch 2: 8605/10000
Epoch 3: 8846/10000
Epoch 4: 8743/10000
Epoch 5: 8869/10000
Epoch 6: 8758/10000
Epoch 7: 8286/10000
Epoch 8: 8681/10000
Epoch 9: 8824/10000
Epoch 10: 8712/10000
Epoch 11: 9009/10000
Epoch 12: 8967/10000
Epoch 13: 8824/10000
Epoch 14: 8892/10000
Epoch 15: 8684/10000
Epoch 16: 8849/10000
Epoch 17: 9009/10000
Epoch 18: 8914/10000
Epoch 19: 8654/10000
Epoch 20: 8907/10000
Epoch 21: 8741/10000
Epoch 22: 8892/10000
Epoch 23: 8490/10000
Epoch 24: 8949/10000
Epoch 25: 8912/10000
Epoch 26: 8716/10000
Epoch 27: 8750/10000
Epoch 28: 8685/10000
Epoch 29: 8887/10000


In [10]:
net2.SGD(training_data, 40, 10, 3.0, testing_data)

Epoch 0: 8743/10000
Epoch 1: 8908/10000
Epoch 2: 8916/10000
Epoch 3: 8930/10000
Epoch 4: 8795/10000
Epoch 5: 8965/10000
Epoch 6: 8932/10000
Epoch 7: 8425/10000
Epoch 8: 8798/10000
Epoch 9: 8484/10000
Epoch 10: 8822/10000
Epoch 11: 8806/10000
Epoch 12: 8795/10000
Epoch 13: 8728/10000
Epoch 14: 8884/10000
Epoch 15: 8647/10000
Epoch 16: 8971/10000
Epoch 17: 8923/10000
Epoch 18: 7833/10000
Epoch 19: 8746/10000
Epoch 20: 8468/10000
Epoch 21: 8741/10000
Epoch 22: 8491/10000
Epoch 23: 8894/10000
Epoch 24: 8690/10000
Epoch 25: 8534/10000
Epoch 26: 8644/10000
Epoch 27: 8830/10000
Epoch 28: 8826/10000
Epoch 29: 8833/10000
Epoch 30: 8909/10000
Epoch 31: 8935/10000
Epoch 32: 8910/10000
Epoch 33: 8958/10000
Epoch 34: 8897/10000
Epoch 35: 8914/10000
Epoch 36: 8680/10000
Epoch 37: 8838/10000
Epoch 38: 8669/10000
Epoch 39: 8846/10000


In [12]:
net2.SGD(training_data, 30, 10, 4.0, testing_data)

  return (1.0/(1.0 + np.exp(-z)))


Epoch 0: 8986/10000
Epoch 1: 8849/10000
Epoch 2: 8901/10000
Epoch 3: 8893/10000
Epoch 4: 8828/10000
Epoch 5: 8970/10000
Epoch 6: 8874/10000
Epoch 7: 8634/10000
Epoch 8: 8791/10000
Epoch 9: 8924/10000
Epoch 10: 8836/10000
Epoch 11: 8915/10000
Epoch 12: 8895/10000
Epoch 13: 8587/10000
Epoch 14: 8811/10000
Epoch 15: 8941/10000
Epoch 16: 8667/10000
Epoch 17: 8905/10000
Epoch 18: 8845/10000
Epoch 19: 8673/10000
Epoch 20: 8740/10000
Epoch 21: 8910/10000
Epoch 22: 8969/10000
Epoch 23: 8738/10000
Epoch 24: 8967/10000
Epoch 25: 8581/10000
Epoch 26: 8697/10000
Epoch 27: 8615/10000
Epoch 28: 8793/10000
Epoch 29: 8970/10000


In [13]:
net2.SGD(training_data, 30, 5, 3.0, testing_data)

  return (1.0/(1.0 + np.exp(-z)))


Epoch 0: 8737/10000
Epoch 1: 8656/10000
Epoch 2: 8853/10000
Epoch 3: 8802/10000
Epoch 4: 8917/10000
Epoch 5: 8881/10000
Epoch 6: 8874/10000
Epoch 7: 8777/10000
Epoch 8: 8742/10000
Epoch 9: 8918/10000
Epoch 10: 8823/10000
Epoch 11: 8784/10000
Epoch 12: 8805/10000
Epoch 13: 8702/10000
Epoch 14: 8596/10000
Epoch 15: 8873/10000
Epoch 16: 8795/10000
Epoch 17: 8798/10000
Epoch 18: 8590/10000
Epoch 19: 8878/10000
Epoch 20: 8933/10000
Epoch 21: 8928/10000
Epoch 22: 8936/10000
Epoch 23: 8750/10000
Epoch 24: 8664/10000
Epoch 25: 8823/10000
Epoch 26: 8773/10000
Epoch 27: 8259/10000
Epoch 28: 8767/10000
Epoch 29: 8852/10000


In [14]:
net2.SGD(training_data, 30, 10, 2.0, testing_data)

  return (1.0/(1.0 + np.exp(-z)))


Epoch 0: 8991/10000
Epoch 1: 8886/10000
Epoch 2: 8865/10000
Epoch 3: 8935/10000
Epoch 4: 8711/10000
Epoch 5: 8784/10000
Epoch 6: 8940/10000
Epoch 7: 8850/10000
Epoch 8: 8894/10000
Epoch 9: 8799/10000
Epoch 10: 8921/10000
Epoch 11: 8841/10000
Epoch 12: 8706/10000
Epoch 13: 8743/10000
Epoch 14: 8839/10000
Epoch 15: 8953/10000
Epoch 16: 8980/10000
Epoch 17: 8451/10000
Epoch 18: 8834/10000
Epoch 19: 8970/10000
Epoch 20: 8813/10000
Epoch 21: 8673/10000
Epoch 22: 8840/10000
Epoch 23: 8849/10000
Epoch 24: 8989/10000
Epoch 25: 8711/10000
Epoch 26: 8940/10000
Epoch 27: 7939/10000
Epoch 28: 8938/10000
Epoch 29: 8898/10000


In [1]:
import mnist_loader
from sklearn import svm

def svm_baseline():
    training_data, validation_data, testing_data = mnist_loader.load_data()

    clf = svm.SVC()

    x_train = training_data[0]
    y_train = training_data[1]
    x_test = testing_data[0]
    y_test = testing_data[1]
    
    clf.fit(x_train, y_train)

    predictions = [int(a) for a in clf.predict(x_test)]
    num_correct = sum(int(a == y) for a, y in zip(predictions, y_test))
    print("Baseline classifier using an SVM")
    print(f"{num_correct} of {len(testing_data[1])} values correct")

svm_baseline()

Baseline classifier using an SVM
9785 of 10000 values correct


In [None]:
import mnist_loader
training_data, validation_data, testing_data 