In [123]:
import random
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the MATLAB file 'SVHN.mat' from the working directory; the result is
# indexed by variable name in the cells below (e.g. data['train_x']).
data = sio.loadmat('SVHN.mat')

In [3]:
# Pull the four arrays out of the loaded file and cast each one to float
# (features and labels alike) so the later arithmetic is done in floating point.
X_train, y_train, X_test, y_test = (
    data[key].astype('float')
    for key in ('train_x', 'train_label', 'test_x', 'test_label')
)

In [4]:
""" Utils """
def onehot_to_label(onehot):
    """Convert one-hot rows into integer class labels.

    Returns the column index of every entry of `onehot` that equals 1, in
    row-major order. For a matrix with exactly one 1 per row that is one
    label per row.
    """
    # np.nonzero yields one index array per axis; element 1 is the column axis.
    return np.nonzero(onehot == 1)[1]

In [175]:
""" Cost functions """

""" Reference 
- http://peterroelants.github.io/posts/neural_network_implementation_intermezzo02/
- http://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
"""
# This is actually an activation function.
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        z: array of scores; normalisation runs along axis 1.

    Returns:
        (probs, cache): the probabilities (each row sums to 1) and the
        input `z` itself, returned unchanged as the cache.
    """
    # Subtracting each row's maximum does not change the result but keeps
    # exp() from overflowing to inf when the scores are large.
    row_max = np.max(z, axis=1, keepdims=True)
    numer = np.exp(z - row_max)
    denom = np.sum(numer, axis=1, keepdims=True)
    return numer / denom, z

def softmax_cross_entropy_loss(z, y):
    """Mean softmax cross-entropy loss and its gradient w.r.t. the scores.

    Args:
        z: 2-D array of raw class scores, one row per sample.
        y: array of target distributions with the same shape as `z`
           (e.g. one-hot rows).

    Returns:
        (loss, d_loss): the cross-entropy averaged over the rows of `z`, and
        the gradient of that loss with respect to `z` (same shape as `z`).
    """
    N = z.shape[0]
    # Work in log space: log(softmax(z)) = shifted - log(sum(exp(shifted))).
    # Taking np.log of the probabilities instead yields -inf as soon as one
    # probability underflows to 0, and 0 * -inf = nan would then poison the
    # loss even for classes the target does not select.
    shifted = z - np.max(z, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    # empirical error
    loss = -np.sum(y * log_probs) / N
    # Softmax followed by cross-entropy has the compact gradient
    # (softmax(z) - y) with respect to the scores z (not the probabilities).
    d_loss = (np.exp(log_probs) - y) / N
    return loss, d_loss

In [180]:
""" Activations """
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    Args:
        z: scalar or array of pre-activations.

    Returns:
        Values in [0, 1] with the same shape as `z`.
    """
    # sigmoid(z) = exp(-log(1 + exp(-z))). np.logaddexp(0, -z) evaluates
    # log(exp(0) + exp(-z)) without ever forming exp(-z) on its own, so a
    # large negative z no longer overflows to inf on its way to 0.
    return np.exp(-np.logaddexp(0.0, -z))

def sigmoid_prime(z):
    """Derivative of the logistic sigmoid: s * (1 - s) with s = sigmoid(z)."""
    # Evaluate the sigmoid a single time and reuse it for both factors.
    s = sigmoid(z)
    return s * (1 - s)

def affine_forward(x, W, b):
    """Forward pass of an affine layer: np.dot(x, W) + b.

    Returns:
        (z, cache): the layer output and the tuple (x, W, b) of its inputs.
    """
    return np.dot(x, W) + b, (x, W, b)

def affine_backward(dout, cache):
    """Backward pass of the affine layer.

    Args:
        dout: upstream gradient with respect to the layer output.
        cache: the (x, W, b) tuple produced by the forward pass.

    Returns:
        (dx, dW, db): gradients with respect to the input, the weights and
        the bias; dx is reshaped to the shape of the cached input.
    """
    inputs, weights, _ = cache  # the bias value itself is not needed here
    grad_W = np.dot(inputs.T, dout)
    grad_b = np.sum(dout, axis=0)
    grad_x = np.dot(dout, weights.T).reshape(inputs.shape)
    return grad_x, grad_W, grad_b

def relu_forward(z):
    """ReLU forward pass: element-wise max(0, z).

    Returns:
        (a, cache): the activations and the input `z`, returned as the cache.
    """
    return np.maximum(0, z), z

def relu_backward(dout, cache):
    """ReLU backward pass.

    Args:
        dout: upstream gradient with respect to the ReLU output.
        cache: the pre-activation `z` saved by the forward pass.

    Returns:
        A copy of `dout` with entries zeroed wherever `z <= 0`; `dout`
        itself is left untouched.
    """
    grad = np.copy(dout)
    inactive = cache <= 0
    grad[inactive] = 0
    return grad

def affine_relu_forward(x, W, b):
    """Affine layer followed by ReLU.

    Returns:
        (a, cache): the ReLU activations and the pair
        (affine_cache, relu_cache) of the two sub-layers' caches.
    """
    pre_activation, fc_cache = affine_forward(x, W, b)
    activation, nonlin_cache = relu_forward(pre_activation)
    return activation, (fc_cache, nonlin_cache)

def relu_affine_backward(dout, cache):
    """Backward pass through ReLU, then through the affine layer.

    Args:
        dout: upstream gradient with respect to the ReLU output.
        cache: the (affine_cache, relu_cache) pair from the forward pass.

    Returns:
        (dx, dW, db) as produced by `affine_backward`.
    """
    fc_cache, nonlin_cache = cache
    grad_pre_activation = relu_backward(dout, nonlin_cache)
    return affine_backward(grad_pre_activation, fc_cache)

In [215]:
class TwoLayerNet(object):
    """Two-layer neural network: affine -> sigmoid -> affine -> softmax.

    Trained with minibatch gradient descent on a cross-entropy loss; an L2
    term (`reg_lambda * W`) is added to the weight gradients.
    """

    def __init__(self, data, num_epoch=100, batch_size=100, input_dim=784, hidden_dim=100, output_dim=10,
                 learning_rate=0.001, reg_lambda=0.05):
        """Store the data splits and hyperparameters, then initialise the weights.

        Args:
            data: dict with the keys 'X_train', 'y_train', 'X_test', 'y_test'.
            num_epoch: number of passes of the outer training loop.
            batch_size: samples per minibatch (full batch: X_train.shape[0]).
            input_dim: number of input features (X_train.shape[1]).
            hidden_dim: number of hidden units.
            output_dim: number of classes (y_train.shape[1]).
            learning_rate: step size of the gradient descent update.
            reg_lambda: coefficient of the L2 term added to the weight gradients.
        """
        # Unpack data
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_test = data['X_test']
        self.y_test = data['y_test']

        # Hyperparameters
        self.num_epoch = num_epoch
        self.batch_size = batch_size
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda

        # Model parameters. Each weight matrix is scaled by 1/sqrt(fan_in) so
        # the initial pre-activations have comparable variance in both layers.
        # W1's fan-in is input_dim; the batch size plays no role here.
        self.params = {
            'W1': np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim),
            'W2': np.random.randn(hidden_dim, output_dim) / np.sqrt(hidden_dim),
            'b1': np.zeros(hidden_dim),
            'b2': np.zeros(output_dim),
        }

    @staticmethod
    def _log_softmax(scores):
        """Row-wise log-softmax, shifted by the row maximum so exp() cannot overflow."""
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    def _forward(self, X):
        """Run both layers on X; return (hidden activations a1, output scores z2)."""
        a1 = sigmoid(X.dot(self.params['W1']) + self.params['b1'])
        z2 = a1.dot(self.params['W2']) + self.params['b2']
        return a1, z2

    def update_parameters(self, grads):
        """Perform one gradient descent step.

        Args:
            grads: dict holding a gradient for every key of `self.params`.
        """
        for name in self.params:
            # Gradient descent update rule (applied in place to the stored array)
            self.params[name] -= self.learning_rate * grads[name]

    def check_accuracy(self, X, y):
        """Return the fraction of rows of X whose predicted class matches y.

        Args:
            X: input samples, one per row.
            y: one-hot labels for the rows of X.
        """
        _, scores = self._forward(X)
        # Softmax is monotonic, so the argmax of the raw scores is the argmax
        # of the probabilities. Skipping exp() avoids overflow to inf/nan,
        # which would corrupt the predictions.
        label_pred = np.argmax(scores, axis=1)
        label_y = onehot_to_label(y)
        return np.mean(label_pred == label_y)

    def train(self):
        """Train with minibatch gradient descent, printing loss and accuracy.

        Every epoch runs `train_size // batch_size` iterations and ends with
        an accuracy report on the full training and test sets.
        """
        train_size = self.X_train.shape[0]
        num_iteration_per_epoch = train_size // self.batch_size

        for epoch in range(self.num_epoch):

            for iteration in range(num_iteration_per_epoch):
                # Draw a random minibatch. np.random.choice samples with
                # replacement by default, so an index may repeat.
                batch_indexs = np.random.choice(train_size, self.batch_size)
                X_batch = self.X_train[batch_indexs]
                y_batch = self.y_train[batch_indexs]

                W1, W2 = self.params['W1'], self.params['W2']

                # Forward. Probabilities are derived from the log-softmax so
                # that neither exp() overflow nor log(0) can produce inf/nan.
                a1, z2 = self._forward(X_batch)
                log_probs = self._log_softmax(z2)
                y_pred = np.exp(log_probs)

                # Mean cross-entropy over the minibatch (reported only)
                loss = -np.sum(y_batch * log_probs) / self.batch_size

                # Backward
                # http://neuralnetworksanddeeplearning.com/chap2.html
                # Note: the deltas below are summed over the minibatch, not
                # averaged, so the step is batch_size times the gradient of
                # the mean loss printed above.
                # output layer
                delta_output = y_pred - y_batch
                dW2 = a1.T.dot(delta_output)
                db2 = np.sum(delta_output, axis=0)
                # hidden layer: sigmoid'(z1) = a1 * (1 - a1), reusing the
                # activations instead of evaluating the sigmoid again
                delta_hidden = delta_output.dot(W2.T) * (a1 * (1 - a1))
                dW1 = np.dot(X_batch.T, delta_hidden)
                db1 = np.sum(delta_hidden, axis=0)

                # Add L2 regularization term for weight gradients (biases don't need reg. terms)
                dW2 += self.reg_lambda * W2
                dW1 += self.reg_lambda * W1
                grads = {'W1': dW1, 'W2': dW2, 'b1': db1, 'b2': db2}

                # Update parameters by gradient descent
                self.update_parameters(grads)

                # Print the loss every `batch_size` iterations (the batch size
                # doubles as the logging interval) and on the last iteration.
                if iteration % self.batch_size == 0 or iteration == num_iteration_per_epoch - 1:
                    print('Epoch: {}/{}, Iteration: {}/{}, Loss: {}'.format(epoch+1, self.num_epoch, iteration+1, num_iteration_per_epoch, loss))

            # Evaluate accuracy at the end of every epoch
            train_acc = self.check_accuracy(self.X_train, self.y_train)
            test_acc = self.check_accuracy(self.X_test, self.y_test)
            print('-'*100)
            print('Epoch: {}/{}, Train accuracy: {}, Test accuracy: {}'.format(epoch+1, self.num_epoch, train_acc, test_acc))
            print('-'*100)

In [216]:
# Hand the four splits to the network (all other hyperparameters keep their
# defaults) and run the full training loop.
model = TwoLayerNet(data=dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test))
model.train()

Epoch: 1/100, Iteration: 1/450, Loss: 2.427123108381142
Epoch: 1/100, Iteration: 101/450, Loss: 2.2998353232156026
Epoch: 1/100, Iteration: 201/450, Loss: 2.3195042562862223
Epoch: 1/100, Iteration: 301/450, Loss: 2.3073316113820126
Epoch: 1/100, Iteration: 401/450, Loss: 2.2922773268783825
Epoch: 1/100, Iteration: 450/450, Loss: 2.298665572551617
----------------------------------------------------------------------------------------------------
Epoch: 1/100, Train accuracy: 0.10095555555555556, Test accuracy: 0.10233333333333333
----------------------------------------------------------------------------------------------------
Epoch: 2/100, Iteration: 1/450, Loss: 2.3151929275528262
Epoch: 2/100, Iteration: 101/450, Loss: 2.3278845691565873
Epoch: 2/100, Iteration: 201/450, Loss: 2.2993400128319403
Epoch: 2/100, Iteration: 301/450, Loss: 2.2911113382094648
Epoch: 2/100, Iteration: 401/450, Loss: 2.2757056506132813
Epoch: 2/100, Iteration: 450/450, Loss: 2.3118204379846055
----------