Implementation of a neural network in Python, following the same structure as the C++ code and using NumPy for efficient calculations.

In [107]:
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time

# Download the 'mnist_784' dataset (version 1) from OpenML as features and labels.
x, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Scale the features by 1/255 (presumably mapping 0-255 pixel intensities into
# [0, 1] -- confirm), store them as float32 and switch to a plain NumPy array.
x = (x / 255).astype('float32').to_numpy()

# Hold out 15% of the samples for validation; the fixed seed keeps the split
# reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.15, random_state=42
)

# Categorical (one-hot) encodings of the labels, built before the labels
# themselves are converted to integers.
y_train_oh, y_val_oh = to_categorical(y_train), to_categorical(y_val)

# Integer class labels, used later as array indices.
y_train, y_val = y_train.astype('int'), y_val.astype('int')

In [108]:
from enum import Enum
class ActivationFunctionId(Enum):
    """Identifiers used to select the activation function of a layer.

    The numeric values are 1 (RELU), 2 (SIGMOID) and 3 (SOFTMAX).
    """

    RELU, SIGMOID, SOFTMAX = 1, 2, 3

In [109]:
def activate(inputs, fxnId):
    """Apply the activation function selected by ``fxnId`` to ``inputs``.

    Args:
        inputs: NumPy array of pre-activation values. SOFTMAX normalises
            along axis 0, so every column of a 2-D array is normalised
            independently.
        fxnId: Member of ``ActivationFunctionId`` choosing the function.

    Returns:
        A new array with the same shape as ``inputs``.

    Raises:
        ValueError: If ``fxnId`` is not a supported activation (previously
            the function silently returned ``None`` in that case).
    """
    if fxnId == ActivationFunctionId.SIGMOID:
        return 1 / (1 + np.exp(-inputs))

    if fxnId == ActivationFunctionId.RELU:
        # Element-wise max(x, 0).
        return np.maximum(inputs, 0)

    if fxnId == ActivationFunctionId.SOFTMAX:
        # Shift each column by its own maximum before exponentiating. The
        # result is mathematically unchanged, but a column that lies far below
        # the global maximum can no longer underflow to all zeros (0 / 0).
        shifted = inputs - np.max(inputs, axis=0, keepdims=True)
        e_x = np.exp(shifted)
        return e_x / np.sum(e_x, axis=0, keepdims=True)

    raise ValueError(f"unsupported activation function: {fxnId!r}")

In [110]:
class Layer:
    """Fully connected layer computing ``activate(weights @ input)``.

    ``weights`` has shape ``(outputFeatures, inputFeatures)``. A bias vector
    is allocated but is not added in ``forward`` and is never updated by this
    class.
    """

    # Class-level defaults; __init__ replaces the ones the layer uses.
    weights = None
    biases = None  # biases.shape has to be (numOutputFeatures, 1)
    deltaWeights = None
    deltaBiases = None
    previousOutput = None
    activationFunction = None
    nodeLocalGradients = None

    def __init__(self, inputFeatures, outputFeatures, activationFunctionId):
        """Create a layer with random weights and zero biases.

        Args:
            inputFeatures: Number of values each input sample has.
            outputFeatures: Number of values the layer produces per sample.
            activationFunctionId: Identifier passed to ``activate`` in
                ``forward``.
        """
        # Standard-normal weights scaled by sqrt(1 / outputFeatures).
        self.weights = np.random.randn(outputFeatures, inputFeatures) * np.sqrt(1 / outputFeatures)
        self.biases = np.zeros((outputFeatures, 1))
        self.activationFunction = activationFunctionId
        # Start the gradient accumulators at zero so that ``backward`` can be
        # called even if nobody has reset them from outside yet.
        self.deltaWeights = np.zeros_like(self.weights)
        self.deltaBiases = np.zeros_like(self.biases)

    def forward(self, input):
        """Compute, cache and return the activated output for ``input``.

        Args:
            input: Array whose first dimension is ``inputFeatures``; with a
                2-D array each column is one sample.

        Returns:
            The activated output, also stored in ``previousOutput``.
        """
        # NOTE: the bias term is intentionally not added here; ``biases``
        # stays at its initial zeros because nothing trains it.
        unactivated = np.matmul(self.weights, input)
        self.previousOutput = activate(unactivated, self.activationFunction)
        return self.previousOutput

    def backward(self, leftLayer, sampleIdx):
        """Accumulate this layer's weight gradient for one sample.

        Reads ``self.nodeLocalGradients`` (1-D, one entry per output node),
        which the caller must have set for the sample in question.

        Args:
            leftLayer: The layer feeding this one; its cached
                ``previousOutput`` column ``sampleIdx`` is this layer's input.
            sampleIdx: Column index of the sample inside the batch.

        Returns:
            1-D array with the local gradients of ``leftLayer``'s nodes.
        """
        curSample = leftLayer.previousOutput[:, sampleIdx]  # 1d np array
        self.deltaWeights += np.outer(self.nodeLocalGradients, curSample)

        # Gradient with respect to leftLayer's activated output.
        propagated = np.matmul(np.transpose(self.weights), self.nodeLocalGradients)

        # Multiply by the derivative of leftLayer's activation, expressed in
        # terms of its output ``a``.
        if leftLayer.activationFunction == ActivationFunctionId.RELU:
            # a = max(z, 0): the slope is 1 where a > 0 and 0 elsewhere.
            derivative = (curSample > 0).astype(curSample.dtype)
        else:
            # Sigmoid derivative a * (1 - a). Kept as the fallback for every
            # non-RELU activation, exactly as before; it is exact only for
            # sigmoid layers.
            derivative = curSample * (1 - curSample)
        return propagated * derivative



In [114]:
class Network:
    """Ordered stack of layers trained with mini-batch gradient descent.

    Each element of ``layers`` must provide ``forward(input)``,
    ``backward(leftLayer, sampleIdx)`` and the attributes ``weights``,
    ``deltaWeights``, ``previousOutput`` and ``nodeLocalGradients``.
    """

    learningRate = 1e-1

    def __init__(self, layers=None):
        """Create a network.

        Args:
            layers: Optional iterable of layers, first layer first. When
                omitted the network starts empty and ``layers`` can be
                assigned afterwards.
        """
        # Per-instance list: a class-level ``layers = []`` would be one list
        # object shared (and mutated) by every Network instance.
        self.layers = [] if layers is None else list(layers)

    def forward(self, input):
        """Run ``input`` through every layer and return the last output.

        Args:
            input: Batch with one sample per column.

        Returns:
            ``previousOutput`` of the last layer.
        """
        assert len(self.layers) > 0
        self.layers[0].forward(input)
        # Every later layer consumes the cached output of the one before it.
        for previous, current in zip(self.layers, self.layers[1:]):
            current.forward(previous.previousOutput)
        return self.layers[-1].previousOutput

    def backpropagation(self, inputBatch, labels):
        """Update all weights from one batch.

        ``forward(inputBatch)`` must have been called immediately before,
        because this method reads the outputs cached on the layers.

        Args:
            inputBatch: Batch with one sample per column.
            labels: Integer class index of every sample in the batch.
        """
        labels = np.asarray(labels)
        numSamples = inputBatch.shape[1]

        # initialize the weight update values for a batch
        for layer in self.layers:
            layer.deltaWeights = np.zeros(layer.weights.shape)

        # Output deltas: (output - one_hot(labels)) * 2 / number_of_outputs.
        outputError = self.layers[-1].previousOutput.copy()
        outputError[labels, np.arange(outputError.shape[1])] -= 1
        batchOutputDeltas = 2 * outputError / outputError.shape[0]

        for i in range(numSamples):
            self.layers[-1].nodeLocalGradients = batchOutputDeltas[:, i].copy()  # 1d numpy array
            # Walk from the last layer to the second one; each call adds that
            # layer's weight gradient and yields the gradients of the layer to
            # its left.
            for l in range(len(self.layers) - 1, 0, -1):
                self.layers[l - 1].nodeLocalGradients = self.layers[l].backward(self.layers[l - 1], i)
            # The first layer has no left neighbour: its input is the sample.
            self.layers[0].deltaWeights += np.outer(self.layers[0].nodeLocalGradients, inputBatch[:, i])

        # Gradient-descent step with the gradient averaged over the batch.
        for layer in self.layers:
            layer.weights -= self.learningRate * layer.deltaWeights / numSamples

    def get_accuracy(self, batch, labels):
        """Return the fraction of samples whose arg-max output equals its label."""
        output = self.forward(batch)
        return np.mean(np.argmax(output, axis=0) == np.asarray(labels))

    def get_loss(self, batch, labels, eps=1e-7):
        """Return the mean negative log of the output at each true label.

        Args:
            batch: Batch with one sample per column.
            labels: Integer class index of every sample.
            eps: Outputs are clipped to ``[eps, 1 - eps]`` before the log so
                that an output of exactly 0 cannot produce infinity.
        """
        labels = np.asarray(labels)
        output = self.forward(batch)
        clipped = np.clip(output, eps, 1 - eps)[labels, np.arange(len(labels))]
        neg_log = -np.log(clipped)
        return np.sum(neg_log) / len(labels)

In [115]:
class Trainer:
    """Trains a network on a fixed dataset in consecutive mini-batches."""

    network = None
    dataX = None
    dataY = None
    batchesX = None
    batchesY = None

    def __init__(self, network, dataX, dataY):
        """Store the network and the training data.

        Args:
            network: Object providing ``forward``, ``backpropagation``,
                ``get_accuracy`` and ``get_loss``.
            dataX: Training samples, one sample per row.
            dataY: Integer class label of every row of ``dataX``.
        """
        self.network = network
        self.dataX, self.dataY = (dataX, dataY)

    def train(self, batchSize, numEpochs, valX=None, valY=None):
        """Train for ``numEpochs`` passes and print validation metrics.

        Args:
            batchSize: Number of samples per mini-batch. If it does not
                divide the dataset evenly, the last batch is smaller.
            numEpochs: Number of full passes over the training data.
            valX: Validation samples, one per row. When omitted (together
                with ``valY``) the module-level ``x_val`` / ``y_val`` are
                used, which is what the method always did before.
            valY: Integer labels of the validation samples.

        Raises:
            ValueError: If ``batchSize`` is not positive.
        """
        if batchSize <= 0:
            raise ValueError("batchSize must be a positive integer")
        if valX is None or valY is None:
            # Backward-compatible fallback to the notebook-level globals.
            valX, valY = x_val, y_val

        dataX = np.asarray(self.dataX)
        dataY = np.asarray(self.dataY)
        # Slice instead of np.split(..., len / batchSize): that passed a float
        # section count, which silently produced unequal batches when the size
        # did not divide evenly and failed when batchSize exceeded the data.
        starts = range(0, len(dataX), batchSize)
        self.batchesX = [dataX[start:start + batchSize] for start in starts]
        self.batchesY = [dataY[start:start + batchSize] for start in starts]

        # The network expects one sample per column.
        valBatch = np.transpose(valX)
        for epoch in range(numEpochs):
            for rows, labels in zip(self.batchesX, self.batchesY):
                batch = np.transpose(rows)
                # Use the network handed to this trainer, not a global one.
                self.network.forward(batch)
                self.network.backpropagation(batch, labels)
            val_acc = self.network.get_accuracy(valBatch, valY)
            val_loss = self.network.get_loss(valBatch, valY)
            print(f"end epoch {epoch} with validation accuracy {val_acc} and loss {val_loss}")


In [118]:
# Three fully connected layers, 784 -> 128 -> 64 -> 10: sigmoid on the two
# hidden layers and softmax on the output. The layers are created in this
# order, which also fixes the order in which random weights are drawn.
network = Network()
network.layers = [
    Layer(inputFeatures, outputFeatures, activationFunctionId)
    for inputFeatures, outputFeatures, activationFunctionId in (
        (784, 128, ActivationFunctionId.SIGMOID),
        (128, 64, ActivationFunctionId.SIGMOID),
        (64, 10, ActivationFunctionId.SOFTMAX),
    )
]

# Train on the training split: mini-batches of 10 samples for 3 epochs.
t = Trainer(network, x_train, y_train)
t.train(batchSize=10, numEpochs=3)

end epoch 0 with validation accuracy 0.8923809523809524 and loss 0.3972930986634044
end epoch 1 with validation accuracy 0.9108571428571428 and loss 0.31636767138263844
end epoch 2 with validation accuracy 0.9204761904761904 and loss 0.2805386971509729
