In [1]:
import pickle
import gzip
import random

from tqdm import tqdm
import numpy as np

## 1. Dataset
In order to train the model we will use the MNIST data set.
The first step is to load the input data.

In [2]:
def load_data(path):
    """Load the three dataset splits from a gzip-compressed pickle file.

    Args:
        path: Location of the gzip-compressed pickle file.

    Returns:
        A tuple ``(training_data, validation_data, test_data)`` exactly as
        stored in the file.

    Raises:
        ValueError: If the unpickled object does not unpack into exactly
            three items.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load archives obtained from a trusted source.
    # The context manager guarantees the handle is closed even when
    # unpickling or unpacking raises.
    with gzip.open(path) as f:
        training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    return (training_data, validation_data, test_data)

# Unpack the three splits returned by load_data; the relative path means
# 'mnist.pkl.gz' is looked up in the current working directory.
training_data, validation_data, test_data = load_data('mnist.pkl.gz')

## 2. Data processing and batch generation

The MNIST dataset requires minimal processing. We will transform the labels into one-hot vectors for convenient comparison with the model output. We will also define a function that produces random batches of training data; this will be important later for implementing stochastic gradient descent.

In [3]:
def one_hot(j):
    """Encode class index ``j`` as a one-hot column vector.

    Args:
        j: Index of the entry to switch on (0-9 for the ten digit classes).

    Returns:
        A float ``numpy`` array of shape ``(10, 1)`` that is zero everywhere
        except for a ``1`` in row ``j``.
    """
    encoded = np.zeros(shape=(10, 1))
    encoded[j, 0] = 1.0
    return encoded

def get_batch(input_data, output_data, batch_size):
    """Draw a random mini-batch of inputs and one-hot encoded labels.

    Indices come from ``np.random.choice(n, batch_size)``, which samples with
    replacement by default, so the same example may appear more than once
    in a batch.

    Args:
        input_data: Indexable collection of samples; each sample must be
            reshapeable to 784 values.
        output_data: Indexable collection of integer class labels aligned
            with ``input_data``.
        batch_size: Number of samples to draw.

    Returns:
        A pair ``(x_batch, y_batch)`` with shapes ``(batch_size, 784)`` and
        ``(batch_size, 10)``.
    """
    picks = np.random.choice(len(input_data), batch_size)

    samples = []
    targets = []
    for i in picks:
        samples.append(np.reshape(input_data[i], (784, 1)))
        targets.append(one_hot(output_data[i]))

    # Collapse the per-sample column vectors into one row per sample.
    x_batch = np.reshape(samples, [batch_size, 784])
    y_batch = np.reshape(targets, [batch_size, 10])
    return x_batch, y_batch


## 3. Model definition
Now we are ready to start building the neural network used for digit recognition. We will first define the model class that will hold information about the layer arrangement, weights and biases. The weights and biases will be initialized with random numbers on model creation.

In [4]:
class Model:
    """Container for the parameters of a fully connected network.

    Attributes are populated on construction:
        layers: The list of layer sizes passed in.
        layers_count: Number of entries in ``layers``.
        biases: One ``(1, size)`` array per layer after the first.
        weights: One ``(size_out, size_in)`` array per pair of adjacent layers.
    """

    def __init__(self, layers):
        """Create the model and draw all parameters from ``np.random.randn``.

        Args:
            layers: Sequence of layer sizes, input layer first.
        """
        self.layers = layers
        self.layers_count = len(layers)

        fan_in = layers[:-1]
        fan_out = layers[1:]

        # Biases are drawn before weights so the sequence of random draws
        # stays the same for a given global NumPy seed.
        self.biases = []
        for size in fan_out:
            self.biases.append(np.random.randn(1, size))

        self.weights = []
        for n_in, n_out in zip(fan_in, fan_out):
            self.weights.append(np.random.randn(n_out, n_in))


In order to define forward propagation and backpropagation we will also need an activation function. The sigmoid function was chosen for this purpose.

In [5]:
def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))``.

    The direct formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. Here the exponential is only ever taken of a
    non-positive number, so it cannot overflow:

        z >= 0:  1 / (1 + exp(-|z|))
        z <  0:  exp(-|z|) / (1 + exp(-|z|))

    Args:
        z: Scalar or array-like of pre-activations.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``z`` (a NumPy scalar
        for scalar input).
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a NumPy scalar and is
    # a no-op for arrays with one or more dimensions.
    return result[()]

def sigmoid_prime(z):
    """Derivative of the sigmoid, ``sigmoid(z) * (1 - sigmoid(z))``.

    Args:
        z: Scalar or array of pre-activations.

    Returns:
        The derivative evaluated element-wise at ``z``.
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s * (1 - s)


Finally, we need to define a cost function. We will use the mean squared error. Its derivative will be used for the gradient computation:

In [6]:
def cost_prime(output_activations, y):
    """Return the difference between the network output and the target.

    This is the derivative of the quadratic cost ``0.5 * (a - y)**2`` with
    respect to the output activation ``a``.

    Args:
        output_activations: Output of the last layer.
        y: Desired output, broadcast-compatible with ``output_activations``.

    Returns:
        ``output_activations - y``.
    """
    error = output_activations - y
    return error

## 4. Forward pass

Now we are ready to define the forward pass:

In [7]:
def call(model, x):
    """Run a forward pass through every layer of ``model``.

    Each layer computes ``sigmoid(np.dot(a, W.T) + b)`` where ``a`` is the
    previous layer's output.

    Args:
        model: Object exposing aligned ``weights`` and ``biases`` lists.
        x: Network input; presumably one sample per row -- confirm with
            the caller.

    Returns:
        The activations of the final layer.
    """
    activation = x
    for layer_weights, layer_bias in zip(model.weights, model.biases):
        pre_activation = np.dot(activation, layer_weights.T) + layer_bias
        activation = sigmoid(pre_activation)
    return activation


## 5. Training definition

In order to obtain meaningful results, we first need to train the model. Here we define a single training step. The function is called with a batch of inputs and labels. It then uses backpropagation to compute the gradient and updates the weights and biases accordingly.

In [8]:
def training_step(model, input_batch, output_batch, learning_rate):
    """Apply one gradient-descent update to ``model`` in place.

    The gradients returned by ``backprop`` are scaled by
    ``learning_rate / len(input_batch)`` and subtracted from the current
    parameters. ``model.weights`` and ``model.biases`` are rebound to new
    lists of new arrays.

    Args:
        model: Model whose ``weights`` and ``biases`` are updated.
        input_batch: Batch of inputs; its length is used as the batch size.
        output_batch: Target outputs aligned with ``input_batch``.
        learning_rate: Step size of the update.
    """
    grad_b, grad_w = backprop(model, input_batch, output_batch)

    # Dividing by the batch length scales the gradients returned by backprop.
    step = learning_rate / len(input_batch)

    new_weights = []
    for w, g in zip(model.weights, grad_w):
        new_weights.append(w - step * g)
    model.weights = new_weights

    new_biases = []
    for b, g in zip(model.biases, grad_b):
        new_biases.append(b - step * g)
    model.biases = new_biases


The core of the training_step function is the backpropagation algorithm, which is implemented by the backprop function.
First we run the forward pass through the network, saving the resulting activations. Then, in reverse order, we compute the partial derivatives and store them in the gradient vector.

In [9]:
def backprop(model, x, y):
    """Compute the cost gradient for a batch via backpropagation.

    The forward pass records every pre-activation ``z`` and activation; the
    backward pass then walks the layers from last to first, propagating the
    error term ``delta`` and summing per-sample contributions over axis 0.

    Args:
        model: Object exposing ``weights``, ``biases`` and ``layers_count``.
        x: Batch of inputs; assumed to hold one sample per row -- confirm
            with the caller.
        y: Target outputs aligned with ``x``.

    Returns:
        A tuple ``(nabla_b, nabla_w)`` of lists ordered like
        ``model.biases`` and ``model.weights``. Each ``nabla_w`` entry has
        the shape of the matching weight matrix. Each ``nabla_b`` entry is
        ``delta`` summed over axis 0, so for 2-D input it is 1-D rather
        than ``(1, n)`` like the bias itself.
    """
    # Every slot is assigned in the backward pass, so placeholders are
    # enough; zero-filled arrays as large as the whole parameter set would
    # be allocated and discarded on every call.
    nabla_b = [None] * len(model.biases)
    nabla_w = [None] * len(model.weights)

    # Forward pass: keep every z and activation for the backward pass.
    activation = x
    activations = [x]
    zs = []
    for w, b in zip(model.weights, model.biases):
        z = np.dot(activation, w.T) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)

    # Backward pass, output layer first.
    delta = cost_prime(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = np.sum(delta, axis=0)
    nabla_w[-1] = np.dot(delta.T, activations[-2])

    # l counts layers from the end: l = 2 is the second-to-last layer.
    for l in range(2, model.layers_count):
        z = zs[-l]
        # Propagate the error back through the weights of the layer after
        # this one (index -l + 1).
        delta = np.dot(delta, model.weights[-l + 1]) * sigmoid_prime(z)
        nabla_b[-l] = np.sum(delta, axis=0)
        nabla_w[-l] = np.dot(delta.T, activations[-l - 1])

    return (nabla_b, nabla_w)


# Training

Now we are ready to start training our model. We need to specify the batch size, number of iterations and the learning rate, and launch the training.

In [12]:
# Training hyperparameters.
batch_size = 30
iterations = 20000
learning_rate = 2.0

# 784 inputs, hidden layers of 300 and 100 units, 10 outputs.
model = Model([784, 300, 100, 10])

# Empty tqdm's registry of bar instances when it exists (presumably to drop
# bars left over from an earlier interrupted run -- confirm).
if hasattr(tqdm, '_instances'):
    tqdm._instances.clear()

# One randomly drawn mini-batch and one parameter update per iteration.
for _ in tqdm(range(iterations)):
    x, y = get_batch(training_data[0], training_data[1], batch_size)
    training_step(model, x, y, learning_rate)


100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:41<00:00, 478.75it/s]


## Testing
After the training is completed, we can use the test set in order to check how good our model has become at recognizing digits.

In [13]:
# Present every test sample to the network as a (1, 784) row.
test_inputs = [np.reshape(image, (1, 784)) for image in test_data[0]]
test_outputs = test_data[1]

# A prediction is the index of the largest output activation; count how
# many predictions equal the stored label.
counter = 0
for i, test_input in enumerate(test_inputs):
    x = call(model, test_input)
    predicted_id = np.argmax(x)
    counter += int(predicted_id == test_outputs[i])
print(f'The model classified correctly {counter} out of {len(test_inputs)} test examples')


The model classified correctly 9441 out of 10000 test examples
