In [31]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

# Dense Layer
class Layer_Dense:

    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        self.inputs = inputs # store the inputs for gradient calculation later
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        # Calculate gradients on parameters
        self.dweights = np.dot(self.inputs.T,
                               dvalues)
        self.dbiases = np.sum(dvalues,
                             axis=0,
                             keepdims=True)
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)


# ReLU Activation
class Activation_ReLU:

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        # Since we need to modify original variables, let's make a copy of values first
        self.dinputs = dvalues.copy()

        # Zero gradients where input values were negative or zero
        self.dinputs[self.inputs <= 0] = 0


# Softmax Activation
class Activation_Softmax:

    def forward(self, inputs):
        self.inputs = inputs

        # Get unnormalzed probabilities
        exp_values = np.exp(inputs - np.max(inputs,
                                            axis=1,
                                            keepdims=True))

        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values,
                                            axis=1,
                                            keepdims=True)

        self.output = probabilities

    def backward(self, dvalues):

        self.dinputs = np.empty_like(dvalues)

        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):

            # Flatten output array
            single_output = single_output.reshape(-1, 1)

            # Calculate Jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            # Calculate sample-wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalues)


# Common Loss class
class Loss:

    # Calculate the data and regularization losses
    # given model output and ground truth values
    def calculate(self, output, y):

        # Calculate sample losses
        sample_losses = self.forward(output, y)

        data_loss = np.mean(sample_losses)

        return data_loss

# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):

    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean toward any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        # Mask values - only if one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)

        return negative_log_likelihoods

    def backward(self, dvalues, y_true):

        # number of samples
        samples = len(dvalues)

        # Number of labels in every sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradients
        self.dinputs = -y_true / dvalues

        # Normalize gradient
        self.dinputs = self.dinputs / samples


# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:

    # Create activation and loss function objects
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):

        # Output layer's activation function
        self.activation.forward(inputs)

        # set the output
        self.output = self.activation.output

        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):

        # Number of samples
        samples = len(dvalues)

        # If labels are one-hot encode, turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true,
                               axis=1)

        # Copy so we can safely modify
        self.dinputs = dvalues.copy()

        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1

        # Normalize gradient
        self.dinputs = self.dinputs / samples

In [32]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 3)
dense2 = Layer_Dense(3, 3)
activation1 = Activation_ReLU()
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss_activation.forward(dense2.output, y)

# Calculate accuracy from output of loss_activation and targets along first axis
predictions = np.argmax(loss_activation.output,
                        axis=1)

if len(y.shape) == 2:
    y = np.argmax(y, axis=1)

accuracy = np.mean(predictions==y)

# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

In [33]:
class Optimizer_SGD:

    # Initialize optimizer - set settings,
    # learning rate of 1. is default for this optimizer
    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate

    # Update parameters
    def update_params(self, layer):
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

In [34]:
# Create an optimizer object
optimizer = Optimizer_SGD()

# Update our network layer's parameters after calculating the gradient using:
optimizer.update_params(dense1)
optimizer.update_params(dense2)

The layer object contains its parameters (weights and biases) and also, at this stage, the gradient that is calculated during backpropagation. We store these in the layer's properties so that the optimizer can make use of them. In our main neural network code, we'd bring the optimization in after backpropagation. Let's make a 1x64 densely connected neural network (1 hidden layer with 64 neurons) and use the same dataset as before:


In [35]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)

activation1 = Activation_ReLU()

dense2 = Layer_Dense(64, 3)

loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD()

dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y)

print(f'loss: {loss}')

# Calculate accuracy
prediction = np.argmax(loss_activation.output, axis=1)

if len(y.shape) == 2:
    y = np.argmax(y, axis=1)

accuracy = np.mean(predictions==y)

print(f'acc: {accuracy}')

# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Use optimizer to update weights and biases
optimizer.update_params(dense1)
optimizer.update_params(dense2)


loss: 1.098605751991272
acc: 0.34


This is everything we need to train out model. We will repeatedly perform a forward pass, backward pass, and optimization until we reach some stopping point. Each full pass through all of the training data is called an **epoch**. In most deep learning tasks, a neural network will be trained for multiple epochs, though the ideal scenario would be to have a perfect model with ideal weights and biases afer only one epoch.

In [37]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD()

for epoch in range(10001):

    dense1.forward(X)

    activation1.forward(dense1.output)

    dense2.forward(activation1.output)

    loss = loss_activation.forward(dense2.output, y)

    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update weights and biases
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)


epoch: 0, acc: 0.350, loss: 1.099
epoch: 100, acc: 0.390, loss: 1.083
epoch: 200, acc: 0.417, loss: 1.072
epoch: 300, acc: 0.410, loss: 1.071
epoch: 400, acc: 0.417, loss: 1.070
epoch: 500, acc: 0.407, loss: 1.069
epoch: 600, acc: 0.393, loss: 1.067
epoch: 700, acc: 0.393, loss: 1.064
epoch: 800, acc: 0.397, loss: 1.056
epoch: 900, acc: 0.400, loss: 1.056
epoch: 1000, acc: 0.397, loss: 1.049
epoch: 1100, acc: 0.417, loss: 1.041
epoch: 1200, acc: 0.443, loss: 1.035
epoch: 1300, acc: 0.473, loss: 1.030
epoch: 1400, acc: 0.500, loss: 1.023
epoch: 1500, acc: 0.503, loss: 1.020
epoch: 1600, acc: 0.380, loss: 1.012
epoch: 1700, acc: 0.380, loss: 1.008
epoch: 1800, acc: 0.453, loss: 1.006
epoch: 1900, acc: 0.503, loss: 1.020
epoch: 2000, acc: 0.493, loss: 1.026
epoch: 2100, acc: 0.403, loss: 0.991
epoch: 2200, acc: 0.423, loss: 1.038
epoch: 2300, acc: 0.527, loss: 0.995
epoch: 2400, acc: 0.457, loss: 0.981
epoch: 2500, acc: 0.393, loss: 0.977
epoch: 2600, acc: 0.523, loss: 0.984
epoch: 2700, 

## Learning Rate


In [38]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loos_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD(learning_rate=0.85)

for epoch in range(10001):
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)

    loss = loss_activation.forward(dense2.output, y)

    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy}, loss: {loss}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

epoch: 0, acc: 0.34, loss: 1.0986042022705078
epoch: 100, acc: 0.42333333333333334, loss: 1.0861332416534424
epoch: 200, acc: 0.41, loss: 1.0704448223114014
epoch: 300, acc: 0.43, loss: 1.0670863389968872
epoch: 400, acc: 0.42333333333333334, loss: 1.065380573272705
epoch: 500, acc: 0.43, loss: 1.0625945329666138
epoch: 600, acc: 0.44, loss: 1.0569745302200317
epoch: 700, acc: 0.43, loss: 1.0473774671554565
epoch: 800, acc: 0.43333333333333335, loss: 1.0337408781051636
epoch: 900, acc: 0.3933333333333333, loss: 1.034419298171997
epoch: 1000, acc: 0.39666666666666667, loss: 1.0229823589324951
epoch: 1100, acc: 0.4166666666666667, loss: 1.0165765285491943
epoch: 1200, acc: 0.43666666666666665, loss: 1.0082749128341675
epoch: 1300, acc: 0.45666666666666667, loss: 1.000727891921997
epoch: 1400, acc: 0.4666666666666667, loss: 0.9951295852661133
epoch: 1500, acc: 0.48333333333333334, loss: 0.9894167184829712
epoch: 1600, acc: 0.5, loss: 0.9815554022789001
epoch: 1700, acc: 0.4633333333333333

As can be seen, the neural network did slightly better in terms of accuracy, and it achieved a lower loss; lower loss is not always associated with higher accuracy. Even if we desire the best accuracy out of our model, the optimizer's task is to decrease loss, not raise accuracy directly. Loss is the mean value of all of the smaple losses, and some of them could drop significantly, while others might rise just slightly, chaning the prediction for them from a correct to an incorrect class at the same time. This would cause a lower mean loss in general, but also more incorrectly predicted samples, which will, at the same time, lower the accuracy. A likely reason for this model's lower accuracy is that it found andother local minimum by chance - the descent path has changed, due to smaller steps. In a direct comparison of these two models in training, different learning rates did not show that the lower this learning rate value is, the better. In most cases, we want to start with a larger learning rate and decrease the learning rate over time/steps.

A commonly-used solution to keep initial updates large and explore various learning rates during training is to implement a **learning rate decay**.


## Learning Rate Decay

The idea of a **learning rate decay** is to start with large learning rate, say 1.0 in our case, and then decrease it during training. There are a few methods for doing this. One is to decrease the learning rate in response to the loss across epochs - for example, if th eloss begins to level out/ plateau or starts "jumping" over large deltas. We can either program this behavior-monitoring logically or simply track our loss over time and manually decrease the learning rate when we deem it appropriate. Another option, which we will implement, is to program a **Decay Rate**, which steadily decays the learning rate per batch or epoch.

Let's plan to decay per step. This can also be referred to as **1/t decaying** or **exponential decaying**. Basically, we're going to update the learning rate each step by the reciprocal of the step count fraction. This fraction is a new hyper-parameter that we'll add to the optimizer, called the **learning rate decay**. How this decaying works is it takes the step and the decaying ratio and multiplies them. The further in training, the bigger the step is, and the bigger result of this multiplication is. We then take its reciprocal (the firther in training, the lower the value) and multiply the initial learning rate by it. The added *1* makes sure that the resulting algorithm never raises the learning rate. for example, for the first step, we might divide 1 by the learning rate, *0.001* for example, which will result in a current learning rate of *1000*. That's definitely not what we wanted. 1 divided by the 1+fraction ensures that the result, a fraction of the starting learning rate, will always be less than or equals to 1, decreasing over time. That's the desired result -- start with the current learning rate and make smaller with time.

In [39]:
starting_learning_rate = 1.
learning_rate_decay = 0.1
step = 1

learning_rate = starting_learning_rate * (1. / (1. + learning_rate_decay * step))
learning_rate

0.9090909090909091

In [40]:
for step in range(20):
    learning_rate = starting_learning_rate * (1. / (1. + learning_rate_decay * step))
    print(learning_rate)

1.0
0.9090909090909091
0.8333333333333334
0.7692307692307692
0.7142857142857143
0.6666666666666666
0.625
0.588235294117647
0.5555555555555556
0.5263157894736842
0.5
0.47619047619047616
0.45454545454545453
0.4347826086956522
0.41666666666666663
0.4
0.3846153846153846
0.37037037037037035
0.35714285714285715
0.3448275862068965


This learning rate decay scheme lowers the learning rate each step using the mentioned formula. Initially, the learning rate drops fast, but the change in the learning rate lowers each step, letting the model sit as close as possible to the minimum. The model needs small updates near the end of the training to be able to get as close to this point as possible. We can now update our SGD optimizer class to allow for the learning rate decay:

In [44]:
class Optimizer_SGD:

    # initialize optimizer - set settings, learning rate of 1. is default for this optimizer
    def __init__(self, learning_rate=1., decay=0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iteration = 0

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iteration))

    # Update parameters
    def update_params(self, layer):
        layer.weights += -self.current_learning_rate * layer.dweights
        layer.biases += -self.current_learning_rate * layer.dbiases

    # call once after any parameter updates
    def post_update_params(self):
        self.iteration += 1

We've updated a few things in the SGD class. First, in the `__init__` method, we added handling for the current learning rate, and `self.learning_rate` is now the initial leaning rate. We also added attributes to track the decay rate and the number of iterations that the optimizer has gone through. Next, we added a new method called `pre_update_params`. This method, if we have a decay rate other than 0, will update our `self.current_learning_rate` using the prior formula. The `update_params` method remains unchanged, but we do have a new `post_update_params` method that will add to our `self.iterations` tracking. With out updated SGD optimizer class, we've added printing the current learning rate, and added pre and post optimizer method class. Let's use a decay rate of 1e-2 (0.01) and train our model again:

In [48]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD(decay=1e-2)

# Training loop
for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, loss: {loss:.3f}, acc: {accuracy:.3f}, lr: {optimizer.current_learning_rate:}')

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()



epoch: 0, loss: 1.099, acc: 0.317, lr: 1.0
epoch: 100, loss: 1.097, acc: 0.383, lr: 0.5025125628140703
epoch: 200, loss: 1.091, acc: 0.380, lr: 0.33444816053511706
epoch: 300, loss: 1.085, acc: 0.383, lr: 0.2506265664160401
epoch: 400, loss: 1.083, acc: 0.380, lr: 0.2004008016032064
epoch: 500, loss: 1.082, acc: 0.380, lr: 0.1669449081803005
epoch: 600, loss: 1.081, acc: 0.383, lr: 0.14306151645207438
epoch: 700, loss: 1.081, acc: 0.380, lr: 0.1251564455569462
epoch: 800, loss: 1.081, acc: 0.380, lr: 0.11123470522803114
epoch: 900, loss: 1.081, acc: 0.380, lr: 0.10010010010010009
epoch: 1000, loss: 1.081, acc: 0.380, lr: 0.09099181073703366
epoch: 1100, loss: 1.080, acc: 0.383, lr: 0.08340283569641367
epoch: 1200, loss: 1.080, acc: 0.383, lr: 0.07698229407236336
epoch: 1300, loss: 1.080, acc: 0.383, lr: 0.07147962830593281
epoch: 1400, loss: 1.080, acc: 0.380, lr: 0.066711140760507
epoch: 1500, loss: 1.080, acc: 0.380, lr: 0.06253908692933083
epoch: 1600, loss: 1.080, acc: 0.380, lr: 0

This model definitely got stuck, and the reason is almost certainly because the learning rate decayed far too quickly and became too small, trapping the model in some local minimum. This is most likely why, rather than wiggling, our accuracy and loss stopped chaning *at all*..

We can, instead, try to decay a bit slower by making our decay a smaller number, say 1e-3:

In [50]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD(decay=1e-3)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.317, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.443, loss: 1.081, lr: 0.9099181073703367
epoch: 200, acc: 0.460, loss: 1.066, lr: 0.8340283569641367
epoch: 300, acc: 0.460, loss: 1.066, lr: 0.7698229407236336
epoch: 400, acc: 0.460, loss: 1.065, lr: 0.7147962830593281
epoch: 500, acc: 0.453, loss: 1.065, lr: 0.66711140760507
epoch: 600, acc: 0.453, loss: 1.065, lr: 0.6253908692933083
epoch: 700, acc: 0.453, loss: 1.064, lr: 0.5885815185403178
epoch: 800, acc: 0.453, loss: 1.064, lr: 0.5558643690939411
epoch: 900, acc: 0.453, loss: 1.064, lr: 0.526592943654555
epoch: 1000, acc: 0.453, loss: 1.063, lr: 0.5002501250625312
epoch: 1100, acc: 0.453, loss: 1.062, lr: 0.4764173415912339
epoch: 1200, acc: 0.457, loss: 1.061, lr: 0.45475216007276037
epoch: 1300, acc: 0.457, loss: 1.060, lr: 0.43497172683775553
epoch: 1400, acc: 0.460, loss: 1.058, lr: 0.4168403501458941
epoch: 1500, acc: 0.453, loss: 1.056, lr: 0.4001600640256102
epoch: 1600, acc: 0.447, loss: 1.053, lr: 0.38476337

Stochastic Gradient Descent with learning rate decay can do fairly well but is still a fairly basic optimization method that only follows a gradient without any additiional logic that could potentially help the model find the global minimum to the loss function. One option for improving the SGD optimizer is to introduce **momentum**.

# Stochastic Gradient Descent with Momentum

**Momentum** creates a rolling average of gradients over some number of updates and uses this average with the unique gradient at each step. Another way of undestanding this is to imagine a ball going down a hill - even if it finds a small hole or hill, momentum will let it go straight through it towards a lower minimum - the bottom of this hill. This can help in cases where we're stuck in some local minimum (a hole), bouncing back and forth. With momentum, a model is more likely to pass through local minimum, further decreasing loss. Simply put, momentum may still point towards the global descent direction.

With regular updates, the SGD optimizer might determine that the next best step is one that keeps the model in a local minimum. Remember that the gradient points toward the current steepest loss ascent for that step - taking the negative of the gradient vector flips it toward the current steepes descent, which may not necessarily follow descent toward the global minimum - the current steepest descent may point toward a local minimum. So this step may decrease loss for that update but might not get us out of the local minimum. We might wind up with a gradient that points in one direction and then the opposite direction in the next update; the gradient could continue to bounce back and forth around the local minimum like this, keeping the optimization of the loss stuck. Instead, momentum uses the previous update's direction to influence the next update's direction, minimizing the changes of bouncing around and getting stuck.

We utilize momentum by setting a parameter between 0 and 1, representing the fraction of the previous parameter update to retain, and subtracting (adding the negative) our actual gradient, multiplied by the learning rate (like before), from it. The update contains a portion of the gradient from preceding steps as our momentum (direction of previous changes) and only a portion of the current gradient; together, these portions form the actual change to our parameters and the bigger the role that momentum takes in the update, the slower the update can change the direction. When we set the momentum fraction too hight, the model might stop learning at all since the direction of the updates won't be able to follow the global gradient descent.

<code>

weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights

</code>

The hyperparameter, `self.momentum` is chosen at the start and the `layer.weight_momentums` starts as all zeros but are altered during training as:

<code>

layer.weight_momentums = weight_updates

</code>



This means that the momentum is always the previous update to the parameters. We will perform the same operations as the above with the biases. We can then update our SGD optimizer class' `update_params` method with the momentum calculation, applying with the parameters, and retaining them for the next steps as an alternative chain of operations to the current code. The difference is that we only calculate the updates and we add these updates with the common code:

In [51]:
def update_params(self, layer):

    # If we use momentum
    if self.momentum:

        # If layer does not contain momentum arrays, create them
        # filled with zeros
        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)

            # If there is no momentum array for weights
            # The array doesn't exist for biases yet either
            layer.bias_momentums = np.zeros_like(layer.biases)

        # Build weight updates with momentum - take previous updates
        # multiplied by retained factor and update with current gradients
        weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights

        layer.weight_momentums = weight_updates

        # build bias updates
        bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
        layer.bias_momentums = bias_updates

    # Vanilla SGD updates (as before momentum update)
    else:
        weight_updates = -self.current_learning_rate * layer.dweights
        bias_updates = -self.current_learning_rate * layer.dbiases

    # Update weights and biases using either vanilla or momentum updates
    layer.weights += weight_updates
    layer.biases += bias_updates


In [60]:
class Optimizer_SGD:

    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1 / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If we use momentum
        if self.momentum:

            # If layer does not contain momentum arrays, create them filled with zeros
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                # If there is no momentum array for weights
                # The array doesn't exist for biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Build weight updates with momentum - take previous
            # updates multiplied by retain factor and update with current gradients
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            # Build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentum = bias_updates

        # Vanilla SGD updates (as before momentum update)
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        # Update weights and biases using either vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


In [61]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_SGD(decay=1e-3, momentum=0.5)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}. lr: {optimizer.current_learning_rate}, momentum: {optimizer.momentum}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.363, loss: 1.099. lr: 1.0, momentum: 0.5
epoch: 100, acc: 0.430, loss: 1.077. lr: 0.9099181073703367, momentum: 0.5
epoch: 200, acc: 0.410, loss: 1.073. lr: 0.8340283569641367, momentum: 0.5
epoch: 300, acc: 0.423, loss: 1.069. lr: 0.7698229407236336, momentum: 0.5
epoch: 400, acc: 0.413, loss: 1.067. lr: 0.7147962830593281, momentum: 0.5
epoch: 500, acc: 0.420, loss: 1.064. lr: 0.66711140760507, momentum: 0.5
epoch: 600, acc: 0.433, loss: 1.058. lr: 0.6253908692933083, momentum: 0.5
epoch: 700, acc: 0.420, loss: 1.048. lr: 0.5885815185403178, momentum: 0.5
epoch: 800, acc: 0.470, loss: 1.035. lr: 0.5558643690939411, momentum: 0.5
epoch: 900, acc: 0.497, loss: 1.023. lr: 0.526592943654555, momentum: 0.5
epoch: 1000, acc: 0.477, loss: 1.009. lr: 0.5002501250625312, momentum: 0.5
epoch: 1100, acc: 0.500, loss: 0.995. lr: 0.4764173415912339, momentum: 0.5
epoch: 1200, acc: 0.520, loss: 0.979. lr: 0.45475216007276037, momentum: 0.5
epoch: 1300, acc: 0.540, loss: 0.960. lr:

In [66]:
# Try momentum is 0.9
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD(decay=1e-3, momentum=0.9)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}")

    # Bacward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


epoch: 0, acc: 0.323, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.503, loss: 1.039, lr: 0.9099181073703367
epoch: 200, acc: 0.497, loss: 0.982, lr: 0.8340283569641367
epoch: 300, acc: 0.597, loss: 0.951, lr: 0.7698229407236336
epoch: 400, acc: 0.537, loss: 1.004, lr: 0.7147962830593281
epoch: 500, acc: 0.560, loss: 0.896, lr: 0.66711140760507
epoch: 600, acc: 0.603, loss: 0.844, lr: 0.6253908692933083
epoch: 700, acc: 0.667, loss: 0.803, lr: 0.5885815185403178
epoch: 800, acc: 0.637, loss: 0.880, lr: 0.5558643690939411
epoch: 900, acc: 0.557, loss: 0.909, lr: 0.526592943654555
epoch: 1000, acc: 0.540, loss: 0.938, lr: 0.5002501250625312
epoch: 1100, acc: 0.573, loss: 0.944, lr: 0.4764173415912339
epoch: 1200, acc: 0.570, loss: 0.852, lr: 0.45475216007276037
epoch: 1300, acc: 0.587, loss: 0.866, lr: 0.43497172683775553
epoch: 1400, acc: 0.570, loss: 0.949, lr: 0.4168403501458941
epoch: 1500, acc: 0.693, loss: 0.750, lr: 0.4001600640256102
epoch: 1600, acc: 0.663, loss: 0.804, lr: 0.38476337

## AdaGrad

Short **adaptive gradient**, institutes a per-parameter learning rate rather than a globally-shared rate. The idea here is to normalize updates made to the features. During the training process, some weights can rise significantly, while others tend to not change by much. It is usually better for weights to not rise too high compared to the other weights, which will be discusses with regularization techniques. Adagrad provides a way to normalize parameter updates by keepong a history of previous updates - the bigger the sum of the updates is, in either direction (positive or negative), the smaller updates are made further in training. This leets less-frequently updated parameters to keep up with changes, effectively utilizing more neurons for training. The concept of AdaGrad can be contained in the following two lines of code:

<code>

cache += param_gradient ** 2
param_updates = learning_rate * param_gradient / (sqrt(cache) + eps)

</code>

The `cache` holds a history of squared gradients, and the `param_updates` is a function of the learning rate multiplied by the gradient (basic SGD so far) and then is divided by the square root of the cache plus some **epsilon** value. The division operation performed with a constantly rising cache might also cause the learning to tall as updates become smaller with time, due to the monotimic nature of updates. That's why this optimizer is not widely used, except for some specific application. The epsilon is hyperparameter (pre-training control knob setting) preventing division by 0. We also sum the squared values and taking the square root.

The resulting cache value grows slower, and in a different way, taking care of the negative numbers (we would not want to divide the update by the negative number and flip its sign). Overall, the impact is the learning rates for parameters with smaller gradients are decreased slowly, while the parameters with larger gradients have their learning rates decreased faster.

In [68]:
# AdaGrad optimizer
class Optimizer_Adagrad:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    # call once befero any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # Vanilla SGD parameter update + normalization with squared rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    # Call once after any paramter updates
    def post_update_params(self):
        self.iterations += 1

Testing this optimizer with decaying set to *1e-4* as well as *1e-5* works better than *1e-3*. This optimizer with our dataset works better with lesser decaying:

In [72]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adagrad(decay=1e-4)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.327, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.497, loss: 0.966, lr: 0.9901970492127933
epoch: 200, acc: 0.537, loss: 0.922, lr: 0.9804882831650161
epoch: 300, acc: 0.563, loss: 0.880, lr: 0.9709680551509855
epoch: 400, acc: 0.607, loss: 0.845, lr: 0.9616309260505818
epoch: 500, acc: 0.640, loss: 0.808, lr: 0.9524716639679969
epoch: 600, acc: 0.603, loss: 0.777, lr: 0.9434852344560807
epoch: 700, acc: 0.660, loss: 0.780, lr: 0.9346667912889054
epoch: 800, acc: 0.677, loss: 0.711, lr: 0.9260116677470135
epoch: 900, acc: 0.663, loss: 0.679, lr: 0.9175153683824203
epoch: 1000, acc: 0.683, loss: 0.648, lr: 0.9091735612328392
epoch: 1100, acc: 0.687, loss: 0.628, lr: 0.9009820704567978
epoch: 1200, acc: 0.720, loss: 0.608, lr: 0.892936869363336
epoch: 1300, acc: 0.723, loss: 0.595, lr: 0.8850340738118416
epoch: 1400, acc: 0.757, loss: 0.572, lr: 0.8772699359592947
epoch: 1500, acc: 0.770, loss: 0.562, lr: 0.8696408383337683
epoch: 1600, acc: 0.790, loss: 0.551, lr: 0.86214328

AdaGrad worked quite well here, but not as good as SGD with momentum, and we can see that loss consistently fell through the enture training process.

## RMSProp

Short for **Root Mean Square Propagation**. Similar to AdaGrad, RMSProp calculates an adaptive learning rate per parameter, it's just calculated in a different way than AdaGrad.

<code>

cache = rho * cache + (1 - rho) * gradient ** 2

</code>

This is similar to both momentum with the SGD optimizer and cache with the AdaGrad. RMSProp adds a mechanism similar to momentum but also adds a per-parameter adaptive learning rate, so the learning rate changes are smoother. This helps to retain the global direction of changes and slows changes in direction. Instead of continually adding squred gradients to a cache (like in AdaGrad), it uses a moving average of the cache. Each update to the cache retains a part of the cache and updates it with a fraction of the new, squared gradients. In this way, cache contents "move" with data in time, and learning does not stall. In the case of this optimizer, the per-parameter learning rate can either fall or rise, depending on the last updates and current gradient. RMSProp applies the cache in the same way as AdaGrad does.

The new hyperparameter here is *rho*. *rho* is the cache memory decay rate. Because this optimizer, with default values, carries over so much momentum of gradient and the adaptive learning rate updates, even small gradient updates are enough to keep it going; therefore, a default learning rate of *1* is far too large and causes instant model instability. A learning rate that becomes stable again and gives fast enough updates is around *0.001* (that's also the default value for this optimizer used in well-known machine learning frameworks). That's waht we'll use as default from now on too.

In [82]:
# RMSProp optimizer
class Optimizer_RMSProp:

    # initialize optimizer - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        # If layer does not contain cache arrays
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradient
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # Vanilla SGD parameter update + normalization with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [85]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_RMSProp(decay=1e-4)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}. loss: {loss:.3f}, lr: {optimizer.current_learning_rate:.9f}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360. loss: 1.099, lr: 0.001000000
epoch: 100, acc: 0.403. loss: 1.071, lr: 0.000990197
epoch: 200, acc: 0.410. loss: 1.063, lr: 0.000980488
epoch: 300, acc: 0.437. loss: 1.058, lr: 0.000970968
epoch: 400, acc: 0.483. loss: 1.054, lr: 0.000961631
epoch: 500, acc: 0.500. loss: 1.049, lr: 0.000952472
epoch: 600, acc: 0.493. loss: 1.043, lr: 0.000943485
epoch: 700, acc: 0.487. loss: 1.036, lr: 0.000934667
epoch: 800, acc: 0.483. loss: 1.027, lr: 0.000926012
epoch: 900, acc: 0.493. loss: 1.018, lr: 0.000917515
epoch: 1000, acc: 0.507. loss: 1.009, lr: 0.000909174
epoch: 1100, acc: 0.513. loss: 1.001, lr: 0.000900982
epoch: 1200, acc: 0.523. loss: 0.992, lr: 0.000892937
epoch: 1300, acc: 0.537. loss: 0.984, lr: 0.000885034
epoch: 1400, acc: 0.513. loss: 0.977, lr: 0.000877270
epoch: 1500, acc: 0.520. loss: 0.970, lr: 0.000869641
epoch: 1600, acc: 0.537. loss: 0.963, lr: 0.000862143
epoch: 1700, acc: 0.557. loss: 0.957, lr: 0.000854774
epoch: 1800, acc: 0.557. loss: 0.950, lr

## Adam

Short for **Adaptive Momentum**, is currently the most widely-used optimizer and is built atop RMSProp, with the momentum concept from SGD added back in. This means that, instead of applying current gradients, we're going to apply momentums like in the SGD optimizer with momentum, then apply a per-weight adaptive learning rate with the cache as done in RMSProp.

The Adam optimizer additionally adds a bias correction mechanism. This is different from the layer's bias. The bias correction mechanism is applied to the cache and momentum, compensating for the initial zeroed values before they warm up with initial steps. To achieve this correction, both momentum and caches are divided by *1-beta^step*. As step raises, *beta^step* approaches *0* ( a fraction to the power of a rising value decreases), solving this whole expression to a fraction during the first steps and approaching *1* as training progreses. For example, *beta 1*, a fraction of momentum to apply, defaults to 0.9. this means that, during the first step, the correction value equals:

<code>

1 - 0.9**1 = 1 - 0.9 = 0.1

</code>

With training progression, a step count rises:

<code>

1 - lim_step->inf (0.9**1) = 1 - 0 = 1

The same applies to the cache and the *beta 2* - in this case, the starting value is 0.001 and also approaches *1*. These values divide the momentums and the cache, respectively. Division by a fraction causes them to be multiple times bigger, significantly speeding up training in the initial stages before both tables warm up during multiple initial steps. We also previously mentioned that both of these bias-correcting coefficients go towards a value of *1* as training progresses and return parameter updates to their typical values for the later training steps. To get parameter updates, we divide the scaled momentum by the scaled square-rooted cache.

The code for the Adam Optimizer is based on the RMSProp optimizer. It adds the cache seen from the SGD along with the *beta 1* hyperparameter. Next, it introduces the bias correction mechanism for both the momentum and the cache. We've also modified the way the parameter updates are calculated - using corrected momentums and corrected caches, instead of gradients and caches.

In [106]:
class Optimizer_Adam:

    # Initialiize optimzier - set settings
    def __init__(self,
                 learning_rate=0.001,
                 decay=0.,
                 epsilon=1e-7,
                 beta_1=0.9,
                 beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros,
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            layer.bias_momentums = np.zeros_like(layer.biases)

        # Update momentum with current gradient
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Get corrected momentums
        # self.iterations is 0 at first pass and we need to start with 1
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        # Get corrected cache
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        # Vanilla SGD parameter update + normalization
        # with squared rooted cache
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

The following changes were made from copying the RAMSProp class code

* renamed the *rho* hyperparameter and property to `beta_2` in `__init__`
* added `beta_1` hyperparameter and property in `__init__`
* added *momentum* array creation in `update_params()`
* added *momentum* calculation
* renamed `self.hro` to `self.beta_2` with cache calculation code in `update_params()`
* added `*_corrected` variables as corrected momentums and weights
* replaced `layer.dweights`, `layer.dbiases`, `layer.weight_cache`, and `layer.bias_cache` with corrected arrays of values in parameter updates with momentum arrays

In [110]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.347, loss: 1.123, lr: 0.05
epoch: 100, acc: 0.670, loss: 0.811, lr: 0.04999752512250644
epoch: 200, acc: 0.780, loss: 0.561, lr: 0.04999502549496326
epoch: 300, acc: 0.783, loss: 0.480, lr: 0.049992526117345455
epoch: 400, acc: 0.800, loss: 0.430, lr: 0.04999002698961558
epoch: 500, acc: 0.833, loss: 0.389, lr: 0.049987528111736124
epoch: 600, acc: 0.843, loss: 0.360, lr: 0.049985029483669646
epoch: 700, acc: 0.883, loss: 0.323, lr: 0.049982531105378675
epoch: 800, acc: 0.880, loss: 0.301, lr: 0.04998003297682575
epoch: 900, acc: 0.890, loss: 0.278, lr: 0.049977535097973466
epoch: 1000, acc: 0.907, loss: 0.256, lr: 0.049975037468784345
epoch: 1100, acc: 0.920, loss: 0.232, lr: 0.049972540089220974
epoch: 1200, acc: 0.920, loss: 0.217, lr: 0.04997004295924593
epoch: 1300, acc: 0.910, loss: 0.240, lr: 0.04996754607882181
epoch: 1400, acc: 0.940, loss: 0.187, lr: 0.049965049447911185
epoch: 1500, acc: 0.940, loss: 0.178, lr: 0.04996255306647668
epoch: 1600, acc: 0.940, lo

# Full code up to this point

In [103]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

# Dense Layer
class Layer_Dense:

    # initalize layer
    def __init__(self, n_inputs, n_neurons):
        self.weights = np.random.rand(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    # Forward pass
    def forward(self, inputs):
        # Remmeber input values
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradients on values
        self.dinputs = np.dot(dvalues, self.weights.T)


# ReLU Activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        # Since we need to modify original variables,
        # let's make a copy of values first
        self.dinputs = dvalues.copy()

        # Zero gradients where input values are negative or zeros
        self.dinputs[self.inputs <= 0] = 0


# Softmax activation
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs):
        self.inputs = inputs

        # Get unnormalzied probabilities
        exp_values = np.exp(inputs - np.max(inputs,
                                            axis=1,
                                            keepdims=True))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values,
                                            axis=1,
                                            keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):

        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)

        # Enumberate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1, 1)

            # Calculate Jacobian matrix
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            # Calculate sample-wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalues)


# SGD Optimizer
class Optimizer_SGD:

    # initialize optimizer - set settings
    # learning rate of 1. is default for this optimizer
    def __init__(self,
                 learning=1.,
                 decay=0,
                 momentum=0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        # If we use momentum
        if self.momentum:

            # If layer does not contain momentum arrays
            # create them fillted with zeros
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Build weight updates with momentum - take previous updates
            # multipled by retain factor and update with current gradients
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            # Build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        # Vanilla SGD updates (as before momentum update
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        # Update weights and biases using either vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1


# AdaGrad Optimizer:
class Optimizer_Adagrad:

    def __init__(self,
                 learning_rate=1.,
                 decay=0.,
                 epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. / + self.decay * self.iterations))

    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # update caches with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # Vanilla SGD parameter update + normalization with square rotted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


# RMSProp optimzier
class Optimizer_RMSProp:

    def __init__(self,
                 learning_rate=0.001,
                 decay=0.,
                 epsilon=1e-7,
                 rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them fillted with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # Vanilla SGD parameter update + normalization with squared rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


# Adam optimizer
class Optimizer_Adam:

    def __init__(self,
                 learning_rate=0.001,
                 decay=0.,
                 epsilon=1e-7,
                 beta_1=0.9,
                 beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            layer.bias_momentums = np.zeros_like(layer.biases)

        # Update momentum with current gradient
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        # Get corrected momentum
        # self.iterations is 0 at first pass and we need to start with 1
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        # Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

       # Get corrected cache
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


# Common Loss class
class Loss:

    # Calculate the dta and regularization losses
    # given model output and ground truth values
    def calculate(self, output, y):

        # Calculate sample losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        return data_loss

# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):

    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent division by 0
        # Clip both sides to not drag mean toward any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):

        # Numer of samples
        samples = len(dvalues)

        # Number of lables in every sample
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculate gradient
        self.dinputs = -y_true / dvalues

        # Normalize gradient
        self.dinputs = self.dinputs / samples


# Softmax classifier - combined Softmax activation
# and crossentropy loss for faster back propagation
class Activation_Softmax_Loss_CategoricalCrossentropy:

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        self.activation.forward(inputs)
        self.output = self.activation.output

        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):

        # Number of samples
        samples = len(dvalues)

        # If labels are one-hot encoded turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        # Copy so we can safely modify
        self.dinputs = dvalues.copy()

        # Calculate gradient
        self.dinputs[range(samples), y_true] -= 1

        # Normalize gradient
        self.dinputs = self.dinputs / samples


In [108]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f} lr: {optimizer.current_learning_rate}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.297, loss: 1.306 lr: 0.05
epoch: 100, acc: 0.650, loss: 0.856 lr: 0.04999752512250644
epoch: 200, acc: 0.697, loss: 0.627 lr: 0.04999502549496326
epoch: 300, acc: 0.800, loss: 0.500 lr: 0.049992526117345455
epoch: 400, acc: 0.860, loss: 0.385 lr: 0.04999002698961558
epoch: 500, acc: 0.893, loss: 0.327 lr: 0.049987528111736124
epoch: 600, acc: 0.913, loss: 0.273 lr: 0.049985029483669646
epoch: 700, acc: 0.927, loss: 0.240 lr: 0.049982531105378675
epoch: 800, acc: 0.933, loss: 0.210 lr: 0.04998003297682575
epoch: 900, acc: 0.937, loss: 0.193 lr: 0.049977535097973466
epoch: 1000, acc: 0.937, loss: 0.179 lr: 0.049975037468784345
epoch: 1100, acc: 0.937, loss: 0.168 lr: 0.049972540089220974
epoch: 1200, acc: 0.940, loss: 0.158 lr: 0.04997004295924593
epoch: 1300, acc: 0.943, loss: 0.152 lr: 0.04996754607882181
epoch: 1400, acc: 0.947, loss: 0.148 lr: 0.049965049447911185
epoch: 1500, acc: 0.950, loss: 0.142 lr: 0.04996255306647668
epoch: 1600, acc: 0.950, loss: 0.141 lr: 0.