**Overfitting** is effectively just memorizing the data without any understanding of it. An overfit model will do very well predicting the data it has already seen, but often significantly worse on unseen data.

**Training** data should only be used to train the model. The **testing**, or **out-of-sample** data, should only be sued to validate a model's performance after training. The idea is that some data are reserved and withheld from the training data for testing the model's performance.

In many cases, one can take a random sampling of available data to train with and make the remaining data the testing dataset. We still need to be very careful about information leaking throught. One common area where this can be problematic is in time-series data. Consider a scenario where we have data from sensors collected every second. We might have millioons of observations collected, and randomly selecting our data for the testing data might result in samples in our testing dataset that are only a second in time apart from our training data, thus are very similar. This means overfitting can spill into our testing data, and the mdeol can achieve good results on both the training and the testing data, which won't mean it generalized well. Randomly allocating time-series data as testing data may be very similar to training data. Both datasets must differ enough to preove the model's ability to generalize. In time-series data, a beter approach is to take multiple slices of our data, entire blocks of time, and reserve those for testing.

In our case, we can use our data-generating function to create new data that will serve as out-of-sample/testing data:

In [12]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

X_test, y_test = spiral_data(samples=100, classes=3)

In [35]:
class Layer_Dense:

    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        self.dweights = np.dot(self.inputs.T,
                               dvalues)
        self.dbiases = np.sum(dvalues,
                              axis=0,
                              keepdims=True)
        self.dinputs = np.dot(dvalues,
                              self.weights.T)


In [36]:

class Activation_ReLU:

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0


class Activation_Softmax:

    def forward(self, inputs):
        self.inputs = inputs

        # Unnormalized probs
        exp_values = np.exp(inputs - np.max(inputs,
                                            axis=1,
                                            keepdims=True))
        probabilities = exp_values / np.sum(exp_values,
                                            axis=1,
                                            keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Flatten output array
            single_output = single_output.reshape(-1, 1)

            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalues)

In [27]:
class Optimzier_SGD:

    def __init__(self,
                 learning_rate=1.,
                 decay=0.,
                 momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self, layer):

        if self.momentum:
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1

In [28]:
class Optimizer_Adagrad:

    def __init__(self,
                 learning_rate=1.,
                 decay=0.,
                 epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_parms(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # update cache with squared current gradients
        layer.weight_cache = layer.dweights**2
        layer.bias_cache = layer.dbiases**2

        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [29]:
class Optimzier_RMSProp:

    def __init(self,
               learning_rate=0.001,
               decay=0.,
               epsilon=1e-7,
               rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        layer.weights += -self.current_learning_rate * layer.weights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.biases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


In [47]:
class Optimizer_Adam:

    def __init__(self,
                 learning_rate=0.001,
                 decay=0.,
                 epsilon=1e-7,
                 beta_1=0.9,
                 beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [44]:
class Loss:

    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss

class Loss_CategoricalCrossentropy(Loss):

    def forward(self, y_pred, y_true):
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        negative_log_likelihood = -np.log(correct_confidences)
        return negative_log_likelihood

    def backward(self, dvalues, y_true):

        samples = len(dvalues)
        labels = len(dvalues[0])

        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        self.dinputs = -y_true / dvalues
        self.dinputs = self.dinputs / samples

In [45]:
class Activation_Softmax_Loss_CategoricalCrossentropy:

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        self.activation.forward(inputs)
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):

        samples = len(dvalues)

        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true,
                               axis=1)

        self.dinputs = dvalues.copy()
        self.dinputs[range(samples), y_true] -= 1
        self.dinputs = self.dinputs / samples

In [53]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}")

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.297, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.757, loss: 0.607, lr: 0.04999752512250644
epoch: 200, acc: 0.847, loss: 0.402, lr: 0.04999502549496326
epoch: 300, acc: 0.873, loss: 0.282, lr: 0.049992526117345455
epoch: 400, acc: 0.910, loss: 0.231, lr: 0.04999002698961558
epoch: 500, acc: 0.923, loss: 0.202, lr: 0.049987528111736124
epoch: 600, acc: 0.940, loss: 0.180, lr: 0.049985029483669646
epoch: 700, acc: 0.933, loss: 0.169, lr: 0.049982531105378675
epoch: 800, acc: 0.957, loss: 0.155, lr: 0.04998003297682575
epoch: 900, acc: 0.950, loss: 0.148, lr: 0.049977535097973466
epoch: 1000, acc: 0.947, loss: 0.144, lr: 0.049975037468784345
epoch: 1100, acc: 0.950, loss: 0.139, lr: 0.049972540089220974
epoch: 1200, acc: 0.957, loss: 0.132, lr: 0.04997004295924593
epoch: 1300, acc: 0.940, loss: 0.136, lr: 0.04996754607882181
epoch: 1400, acc: 0.940, loss: 0.133, lr: 0.049965049447911185
epoch: 1500, acc: 0.947, loss: 0.129, lr: 0.04996255306647668
epoch: 1600, acc: 0.953, lo

In [55]:
X_test, y_test = spiral_data(samples=100, classes=3)

for epoch in range(10001):

    # Forward pass
    dense1.forward(X_test)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y_test)

    # Accuracy
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y_test.shape) == 2:
        y = np.argmax(y_test, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f} lr: {optimizer.current_learning_rate}")

    # Backward pass
    loss_activation.backward(loss_activation.output, y_test)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Update parameters
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.767, loss: 1.535 lr: 0.0495049259876604
epoch: 100, acc: 0.913, loss: 0.273 lr: 0.04950247537128094
epoch: 200, acc: 0.907, loss: 0.238 lr: 0.049500024997512625
epoch: 300, acc: 0.923, loss: 0.200 lr: 0.04949757486631943
epoch: 400, acc: 0.927, loss: 0.190 lr: 0.049495124977665325
epoch: 500, acc: 0.927, loss: 0.170 lr: 0.049492675331514316
epoch: 600, acc: 0.927, loss: 0.161 lr: 0.04949022592783039
epoch: 700, acc: 0.927, loss: 0.154 lr: 0.04948777676657754
epoch: 800, acc: 0.930, loss: 0.143 lr: 0.04948532784771979
epoch: 900, acc: 0.930, loss: 0.136 lr: 0.049482879171221156
epoch: 1000, acc: 0.937, loss: 0.133 lr: 0.04948043073704565
epoch: 1100, acc: 0.940, loss: 0.131 lr: 0.049477982545157326
epoch: 1200, acc: 0.940, loss: 0.130 lr: 0.04947553459552019
epoch: 1300, acc: 0.937, loss: 0.136 lr: 0.04947308688809832
epoch: 1400, acc: 0.940, loss: 0.123 lr: 0.04947063942285573
epoch: 1500, acc: 0.930, loss: 0.133 lr: 0.04946819219975651
epoch: 1600, acc: 0.950, loss: 0

# L1 and L2 Regularization

**Regularization methods** are those which reduce generalization error. The first forms of regularization that we'll address are **L1** and **L2 regularization**. L1 and L2 regularization are used ot calculate a number (called a **penalty**) added to the loss value to penalize the model for large weights and biases. Large weights might indicate that a neuron attempting to memorize a data element; generally, it is believed that it would be better to have many neurons contributing to a model's output, rather than a select few.

## Forward Pass

L1 regularization's penalty is the sum of all the absolute values for the weights and biases. This is a linear penalty as regularziation loss returned by this function is directly proportionaly to parameter values. L2 regularization's penalty is the sum of the squared weights and biases. This non-linear approach penalizes larger weights and biases more than smaller ones because of the square function used to calculate the result. In other words, L2 regularization is commonly used as it does not affect small parameter values substantially and does not allow the model to grow weights too large by heavily penalizing relatively big values. L1 regularization, because of its linear nature, penalizes small weights more than L2 regularization, causing the model to start being invariant to small inputs and variant only to the bigger ones. That's why L1 regularization is rarely used alone and usually combined with L2 regularization if it's even used at all. Regularization functions of this type can drive the sum of weights and the sum of parameters towards *0*, which can help in cases of exploding gradients (model instability, which might cause weights to become very large values). Beyond this, we also want to dictate how much of an impact we want this regularization penalty to carry. We use a value referred to as **lamda** in this equation -- where a higher value means a moer significant penalty.

<code>

l1w = lambda_l1w * sum(abs(weights))
l1b = lambda_l1b * sum(abs(biases))
l2w = lambda_l2w * sum(weights**2)
l2b = lambda_l2b * sum(biases**2)
loss = data_loss + l1w + l1b + l2w + l2b

</code>

Regularization losses are calculated separately, then summed witht the data loss, to form the overall loss. Parameter *m* is an arbitrary iterator over all of the weights in a model, parameter *n* is the bias equivalent of this iterator, *w_m* is the given weight, and *b_n* is the given bias.

To implement regularization in our neural network code, we'll start with the `__init__` method of the `Dense` layer's class. which will house the **lambda** regularization strength parameters, since these can be set separately for every layer.






In [56]:
def __init__(self,
             n_inputs, n_neurons,
             weight_regularizer_L1=0, weight_regularizer_L2=0,
             bias_regularizer_L1=0, bias_regularizer_L2=0):
    self.weights = np.random.randn(n_inputs, n_neurons)
    self.biases = np.zeros((1, n_neurons))

    # Set regularization strength
    self.weight_regularizer_L1 = weight_regularizer_L1
    self.weight_regularizer_L2 = weight_regularizer_L2
    self.bias_regularizer_L1 = bias_regularizer_L1
    self.bias_regularizer_L2 = bias_regularizer_L2

This method sets the lambda hyperparameters. Now we update our loss class to include the additional penalty if we choose to set the lambda hyperparameter for any of the regularizers in the layer's initialization. We will implement this code into the `Loss` class as it is common for the hidden layers. What's more, the regularization calculation is the same, regardless of the type of loss used. It's only a penalty that is summed with the data loss value resulting in a final, overall loss value. For this reason, we're going to add a new method to a general loss class, which is inherited by all of our specific loss functions (such as our existing `Loss_CategoricalCrossentropy). For the code of this method, we'll create the layer's regularization loss variable. We'll add to it each of the atomic regularization losses if its corresponding lambda value is greater than *0*. to perform these calculations, we read the lambda hyperparameters, weights, and biases from the passed-in layer object.

In [57]:
def regularization_loss(self, layer):

    # 0 by default
    regularization_loss = 0

    # L1 regularization - weights
    # calculate only when factor greater than 0
    if layer.weight_regularizer_L1 > 0:
        regularization_loss += layer.weight_regularizer_L1 * np.sum(np.abs(layer.weights))

    # L2 regularization - weights
    if layer.weight_regularizer_L2 > 0:
        regularization_loss += layer.weight_regularizer_L2 * np.sum(layer.weights * layer.weights)

    # L1 regularization - biases
    # calculate only when factor greater than 0
    if layer.bias_regularizer_L1 > 0:
        regularization_loss += layer.bias_regularizer_L1 * np.sum(np.abs(layer.biases))

    # L2 regularization - biases
    if layer.bias_regularizer_L2 > 0:
        regularization_loss += layer.bias_regularizer_L2 * np.sum(layer.biases * layer.biases)

    return regularization_loss

Then we'll calculate the regularization loss and add it to our calculated loss in the training loop:

In [None]:

# Calculate loss from output of activation2
data_loss = loss_function.forward(activation2.output, y)

# Calculate regularization penalty
regularization_loss = loss_function.regularization_loss(dense1) + loss_function.regularization_loss(dense2)

# Calculate overall loss
loss = data_loss + regularization_loss


We created a new `regularization_loss` variable and added all layer's regularization losses to it This completes the forward pass for regularization, but this also means our overall loss has changed since part of the calculation can include regularization, which must be accounted for in the backpropagation of the gradients. Thus, we will now cover the partial derivatives for both L1 and L2 regularization.

## Backward pass

### L2 Regularization Partial Derivative

$$
\begin{aligned}
L_2(w) &= \lambda \sum_{m} w_m^2 \\
\frac{\partial L_2(w)}{\partial w_j} &= \frac{\partial}{\partial w_j} \left( \lambda \sum_{m} w_m^2 \right) \\
&= \lambda \sum_{m} \frac{\partial}{\partial w_j} (w_m^2) \\
&= \lambda \left( \sum_{m \neq j} \frac{\partial}{\partial w_j} (w_m^2) + \frac{\partial}{\partial w_j} (w_j^2) \right) \\
&= \lambda \left( \sum_{m \neq j} 0 + 2w_j \right) \\
\frac{\partial L_2(w)}{\partial w_j} &= 2\lambda w_j \\
\frac{\partial J}{\partial w_j} &= \frac{\partial J_{data}}{\partial w_j} + 2\lambda w_j
\end{aligned}
$$

Lambda is a constant, so we can move it outside of the derivative term. We can remove the sum operator since we calculate the partial derivative with respect to the given parameter only, and the sum of one element equals this element. So, we only need to calculate the derivative of $$w^2$$ which we know is 2w. From the coding perspective, we will multiply all of the weights by $$2\lambda$$. We'll implement this with NumPy directly as it's just a simple multiplication operation.

L1 regularization's derivative, on the other hand, requires more explanation. In the case of L1 regularization, we must calculate the derivatie of the absolute value piecewise function, which effectively multiplies a value by -1 if it is less than 0, otherwise, it's multiplied by 1. This is because the absolute value function is linear for positive values, and we know that a linear functions's derivative is **1**.

For negative values, it negates the sign of the value to make it positive. In other words, it multiplies values by -1, so derivative is **-1**.

$$
\begin{aligned}
L_1(w) &= \lambda \sum_{m} |w_m| \\
\frac{\partial L_1(w)}{\partial w_j} &= \lambda \sum_{m} \frac{\partial}{\partial w_j} |w_m| \\
&= \lambda \left( \sum_{m \neq j} 0 + \frac{\partial}{\partial w_j} |w_j| \right) \\
\frac{\partial L_1(w)}{\partial w_j} &= \lambda \cdot
\begin{cases}
1 & \text{if } w_j > 0 \\
-1 & \text{if } w_j < 0 \\
\text{undefined} & \text{if } w_j = 0
\end{cases} \\
\frac{\partial J}{\partial w_j} &= \frac{\partial J_{data}}{\partial w_j} + \lambda \cdot
\begin{cases}
1 & \text{if } w_j > 0 \\
-1 & \text{if } w_j < 0 \\
\end{cases}
\end{aligned}
$$



Like L2 regularization, lambda is a constant, and we can calculate the partial derivative of this regularization with respect to the specific input. The partial derivative, in this case, equals 1 or -1 depending on the $$w_m$$ (weight) value.

We are calculating this derivative with respect to weights, and the resulting gradient, which has the same shape as the weights, is what we'll use to update the weights.

In [60]:
weights = [0.2, 0.8, -0.5]
dL1 = []
for weight in weights:
    if weight >= 0:
        dL1.append(1)
    else:
        dL1.append(-1)
print(dL1)

[1, 1, -1]


We're using `>= 0` in the code where the equation above  clearly depects `> 0`. If we picture the `np.abs` function, it's a line going down and "bouncing" at the value *0*, like a saw tooth. At the point end (i.e, the value of *0*), the derivative of the `np.abs` funciton is undefined, but we cannot code it this way, so we need to hanle this situation and break this rule a bit.

Now we modify this L1 derivative to work with multiple neurons in a layer:

In [61]:
weights = [[0.2, 0.8, -0.5, 1], # now we have 3 sets of weights
            [0.5, -0.91, 0.26, -0.5],
            [-0.26, -0.27, 0.17, 0.87]]

dL1 = []

for neuron in weights:
    neuron_dL1 = []

    for weight in neuron:
        if weight >= 0:
            neuron_dL1.append(1)
        else:
            neuron_dL1.append(-1)
    dL1.append(neuron_dL1)

print(dL1)

[[1, 1, -1, 1], [1, -1, 1, -1], [-1, -1, 1, 1]]


With NumPy, we're going to use conditions and binary masks. We'll create the gradient as an array filled with values of *1* and shaped like weights, using `np.ones_like(weights)`. Next, the condition `weights < 0` returns an array of the same shape as `dL1`, containing *0* where the condition is false and `1` where it's true. We're usoing this as a binary mask to `dL1` to set values to *-1* only where the condition is true (where weight and values are less than 0)

In [62]:
import numpy as np

weights = np.array([[0.2, 0.8, -0.5, 1],
                    [0.5, -0.91, 0.26, -0.5],
                    [-0.26, -0.27, 0.17, 0.87]])

dL1 = np.ones_like(weights)

dL1[weights < 0] = -1

print(dL1)

[[ 1.  1. -1.  1.]
 [ 1. -1.  1. -1.]
 [-1. -1.  1.  1.]]


This returned an array of the same shape containing values of 1 and -1 -- the partial gradient of the `np.abs` function ( we still have to multiply it by the lambda hyperparameter). We can now take these and update the backward pass method for the dense layer object. For L1 regularization, we'll take the code above and multiply it by $$\lambda$$ for weights and perform the same operation for biases. For L2 regularization, as discussed at the beginning of this chapter, all we need to do is take the weights/biases, multiply them by $$2\lambda$$, and add that product to the gradients

In [63]:
class Layer_Dense:

    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1=0, bias_regularizer_l1=0,
                 weight_regularizer_l2=0, bias_regularizer_l2=0):
        self.weights = np.random.rand(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T,
                               dvalues)
        self.dbiases = np.sum(dvalues,
                             axis=0,
                             keepdims=True)

        # Gradients on regularization
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.weights += self.weight_regularizer_l1 * dL1

        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l2 * dL1

        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        # Gradients on values
        self.dinputs = np.dot(dvalues, self.weights.T)

In [64]:
class Activation_ReLU:

    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0


In [65]:
class Activation_Softmax:

    def forward(self, inputs):
        self.inputs = inputs

        exp_values = np.exp(inputs - np.max(inputs,
                                            axis=1,
                                            keepdims=True))
        probabilities = exp_values / np.sum(exp_values,
                                            axis=1,
                                            keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):
        self.dinputs = np.empty_like(dvalues)

        for index, (single_output, single_dvalue) in enumerate(zip(self.output, dvalues)):
            single_output = single_output.reshape(-1, 1)
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            self.dinputs[index] = np.dot(jacobian_matrix,
                                         single_dvalue)


In [66]:
class Optimizer_SGD:

    def __init__(self,
                 learning_rate=1.,
                 decay=0.,
                 momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        if self.momentum:
            if not hasattr(layer, 'weight_momentum'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            self.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            self.bias_momentums = bias_updates

        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1

In [67]:
class Optimizer_Adagrad:

    def __init__(self,
                 learning_rate=1.,
                 decay=0.,
                 epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dweights ** 2

        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


In [68]:
class Optimizer_RMSProp:

    def __init__(self,
                 learning_rate=1e-3,
                 decay=0.,
                 epsilon=1e-7,
                 rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


In [86]:
class Optimizer_Adam:

    def __init__(self,
                 learning_rate=0.001,
                 decay=0.,
                 epsilon=1e-7,
                 beta_1=0.9,
                 beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            layer.bias_momentums = np.zeros_like(layer.biases)

        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations +1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


In [87]:
class Loss:

    def regularization_loss(self, layer):

        regularization_loss = 0

        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)

        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss

    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss


In [88]:
class Loss_CategoricalCrossentropy(Loss):

    def forward(self, y_pred, y_true):

        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples),
                y_true
            ]

        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis=1
            )

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):

        samples = len(dvalues)
        labels = len(dvalues[0])

        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        self.dinputs = -y_true / dvalues
        self.dinputs = self.dinputs / samples

In [89]:
class Activation_Softmax_Loss_CategoricalCrossentropy:

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        self.activation.forward(inputs)
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        samples = len(dvalues)

        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        self.dinputs = dvalues.copy()
        self.dinputs[range(samples), y_true] -= 1
        self.dinputs = self.dinputs / samples


In [104]:
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 64,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.02,
                          decay=5e-7)

for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.257, (data_loss: 1.236 reg_loss: 0.021), lr: 0.02
epoch: 100, acc: 0.483, loss: 1.034, (data_loss: 1.021 reg_loss: 0.012), lr: 0.019999010049002574
epoch: 200, acc: 0.610, loss: 0.894, (data_loss: 0.852 reg_loss: 0.042), lr: 0.019998010197985302
epoch: 300, acc: 0.647, loss: 0.806, (data_loss: 0.745 reg_loss: 0.061), lr: 0.019997010446938183
epoch: 400, acc: 0.687, loss: 0.748, (data_loss: 0.680 reg_loss: 0.068), lr: 0.01999601079584623
epoch: 500, acc: 0.750, loss: 0.690, (data_loss: 0.615 reg_loss: 0.076), lr: 0.01999501124469445
epoch: 600, acc: 0.777, loss: 0.645, (data_loss: 0.565 reg_loss: 0.080), lr: 0.01999401179346786
epoch: 700, acc: 0.787, loss: 0.614, (data_loss: 0.532 reg_loss: 0.082), lr: 0.01999301244215147
epoch: 800, acc: 0.783, loss: 0.595, (data_loss: 0.514 reg_loss: 0.081), lr: 0.0199920131907303
epoch: 900, acc: 0.797, loss: 0.576, (data_loss: 0.496 reg_loss: 0.080), lr: 0.019991014039189386
epoch: 1000, acc: 0.800, loss: 0.563, (data_

In [105]:
X_test, y_test = spiral_data(samples=100, classes=3)

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.817, loss: 0.576


#### Increase the number of samples

In [106]:
X, y = spiral_data(samples=1000, classes=3)

dense1 = Layer_Dense(2, 64,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.02,
                          decay=5e-7)

for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.329, loss: 1.237, (data_loss: 1.217 reg_loss: 0.021), lr: 0.02
epoch: 100, acc: 0.447, loss: 1.059, (data_loss: 1.050 reg_loss: 0.010), lr: 0.019999010049002574
epoch: 200, acc: 0.572, loss: 0.965, (data_loss: 0.942 reg_loss: 0.023), lr: 0.019998010197985302
epoch: 300, acc: 0.690, loss: 0.837, (data_loss: 0.777 reg_loss: 0.060), lr: 0.019997010446938183
epoch: 400, acc: 0.743, loss: 0.761, (data_loss: 0.685 reg_loss: 0.076), lr: 0.01999601079584623
epoch: 500, acc: 0.781, loss: 0.711, (data_loss: 0.627 reg_loss: 0.084), lr: 0.01999501124469445
epoch: 600, acc: 0.801, loss: 0.671, (data_loss: 0.583 reg_loss: 0.088), lr: 0.01999401179346786
epoch: 700, acc: 0.820, loss: 0.643, (data_loss: 0.553 reg_loss: 0.091), lr: 0.01999301244215147
epoch: 800, acc: 0.830, loss: 0.622, (data_loss: 0.530 reg_loss: 0.091), lr: 0.0199920131907303
epoch: 900, acc: 0.848, loss: 0.596, (data_loss: 0.504 reg_loss: 0.092), lr: 0.019991014039189386
epoch: 1000, acc: 0.854, loss: 0.574, (data_

In [107]:
X_test, y_test = spiral_data(samples=1000, classes=3)

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.886, loss: 0.316


We can see that this change alone also had a considerable impact on both validation accuracy in general, as well as the delta between the validation and training accuracies -- lower accuracy and higher training loss suggest that the capacity of the model might be too low. A large delta earlier and a small one now suggests that the model was most likely overfitting previously. In theory, this regularization allows us to create much larger models without fear of overfitting (or memorization). We can test this by increasing the number of neurons per layer. Going with 128 or 256 neurons per layer helps with the training accuracy but not that much with the validation accuracy:

In [111]:
X, y = spiral_data(samples=1000, classes=3)

dense1 = Layer_Dense(2, 256,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(256, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.02,
                          decay=5e-7)

for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.309, loss: 1.675, (data_loss: 1.587 reg_loss: 0.089), lr: 0.02
epoch: 100, acc: 0.510, loss: 1.067, (data_loss: 1.016 reg_loss: 0.050), lr: 0.019999010049002574
epoch: 200, acc: 0.648, loss: 0.932, (data_loss: 0.882 reg_loss: 0.051), lr: 0.019998010197985302
epoch: 300, acc: 0.716, loss: 0.804, (data_loss: 0.729 reg_loss: 0.074), lr: 0.019997010446938183
epoch: 400, acc: 0.783, loss: 0.723, (data_loss: 0.635 reg_loss: 0.088), lr: 0.01999601079584623
epoch: 500, acc: 0.818, loss: 0.671, (data_loss: 0.577 reg_loss: 0.094), lr: 0.01999501124469445
epoch: 600, acc: 0.829, loss: 0.635, (data_loss: 0.538 reg_loss: 0.097), lr: 0.01999401179346786
epoch: 700, acc: 0.816, loss: 0.627, (data_loss: 0.528 reg_loss: 0.099), lr: 0.01999301244215147
epoch: 800, acc: 0.855, loss: 0.575, (data_loss: 0.473 reg_loss: 0.102), lr: 0.0199920131907303
epoch: 900, acc: 0.850, loss: 0.562, (data_loss: 0.459 reg_loss: 0.103), lr: 0.019991014039189386
epoch: 1000, acc: 0.863, loss: 0.543, (data_

In [120]:
X_test, y_test = spiral_data(samples=1000, classes=3)

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.895, loss: 0.263


This didn't produce much of a change in results, but raising this number again to 512 did improve validation accuracy and loss as well:

In [113]:
X, y = spiral_data(samples=1000, classes=3)

dense1 = Layer_Dense(2, 512,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(512, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.02,
                          decay=5e-7)

for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    data_loss = loss_activation.forward(dense2.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.335, loss: 2.194, (data_loss: 2.023 reg_loss: 0.171), lr: 0.02
epoch: 100, acc: 0.471, loss: 1.087, (data_loss: 0.985 reg_loss: 0.102), lr: 0.019999010049002574
epoch: 200, acc: 0.729, loss: 0.895, (data_loss: 0.810 reg_loss: 0.085), lr: 0.019998010197985302
epoch: 300, acc: 0.832, loss: 0.745, (data_loss: 0.648 reg_loss: 0.097), lr: 0.019997010446938183
epoch: 400, acc: 0.837, loss: 0.659, (data_loss: 0.551 reg_loss: 0.108), lr: 0.01999601079584623
epoch: 500, acc: 0.851, loss: 0.603, (data_loss: 0.490 reg_loss: 0.113), lr: 0.01999501124469445
epoch: 600, acc: 0.866, loss: 0.561, (data_loss: 0.446 reg_loss: 0.114), lr: 0.01999401179346786
epoch: 700, acc: 0.871, loss: 0.532, (data_loss: 0.419 reg_loss: 0.113), lr: 0.01999301244215147
epoch: 800, acc: 0.874, loss: 0.509, (data_loss: 0.397 reg_loss: 0.112), lr: 0.0199920131907303
epoch: 900, acc: 0.877, loss: 0.487, (data_loss: 0.378 reg_loss: 0.109), lr: 0.019991014039189386
epoch: 1000, acc: 0.885, loss: 0.465, (data_

In [119]:
X_test, y_test = spiral_data(samples=1000, classes=3)

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.889, loss: 0.308


Or not really!!
Let's try more layers

In [121]:
dense1 = Layer_Dense(2, 512,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(512, 128)
activation2 = Activation_ReLU()
dense3 = Layer_Dense(128, 64)
activation3 = Activation_ReLU()
dense4 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.02,
                          decay=5e-7)

for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    activation3.forward(dense3.output)
    dense4.forward(activation3.output)
    data_loss = loss_activation.forward(dense4.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2) + \
        loss_activation.loss.regularization_loss(dense3) + \
        loss_activation.loss.regularization_loss(dense4)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense4.backward(loss_activation.dinputs)
    activation3.backward(dense4.dinputs)
    dense3.backward(activation3.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.update_params(dense4)
    optimizer.post_update_params()

epoch: 0, acc: 0.299, loss: 8.444, (data_loss: 8.268 reg_loss: 0.176), lr: 0.02
epoch: 100, acc: 0.334, loss: 1.127, (data_loss: 1.100 reg_loss: 0.027), lr: 0.019999010049002574
epoch: 200, acc: 0.334, loss: 1.125, (data_loss: 1.098 reg_loss: 0.026), lr: 0.019998010197985302
epoch: 300, acc: 0.335, loss: 1.125, (data_loss: 1.098 reg_loss: 0.026), lr: 0.019997010446938183
epoch: 400, acc: 0.335, loss: 1.124, (data_loss: 1.098 reg_loss: 0.026), lr: 0.01999601079584623
epoch: 500, acc: 0.335, loss: 1.124, (data_loss: 1.098 reg_loss: 0.026), lr: 0.01999501124469445
epoch: 600, acc: 0.336, loss: 1.124, (data_loss: 1.098 reg_loss: 0.026), lr: 0.01999401179346786
epoch: 700, acc: 0.336, loss: 1.123, (data_loss: 1.098 reg_loss: 0.026), lr: 0.01999301244215147
epoch: 800, acc: 0.336, loss: 1.123, (data_loss: 1.098 reg_loss: 0.025), lr: 0.0199920131907303
epoch: 900, acc: 0.336, loss: 1.123, (data_loss: 1.098 reg_loss: 0.025), lr: 0.019991014039189386
epoch: 1000, acc: 0.336, loss: 1.122, (data_

In [147]:

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)
dense3.forward(activation2.output)
activation3.forward(dense3.output)
dense4.forward(activation3.output)
loss = loss_activation.forward(dense4.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.909, loss: 0.238


In [157]:
dense1 = Layer_Dense(2, 256,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(256, 128)
activation2 = Activation_ReLU()
dense3 = Layer_Dense(128, 128)
activation3 = Activation_ReLU()
dense4 = Layer_Dense(128, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.001,
                          decay=5e-4)

for epoch in range(100001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    activation3.forward(dense3.output)
    dense4.forward(activation3.output)
    data_loss = loss_activation.forward(dense4.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2) + \
        loss_activation.loss.regularization_loss(dense3) + \
        loss_activation.loss.regularization_loss(dense4)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense4.backward(loss_activation.dinputs)
    activation3.backward(dense4.dinputs)
    dense3.backward(activation3.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.update_params(dense4)
    optimizer.post_update_params()

epoch: 0, acc: 0.319, loss: 8.073, (data_loss: 7.991 reg_loss: 0.081), lr: 0.001
epoch: 100, acc: 0.428, loss: 4.129, (data_loss: 4.054 reg_loss: 0.075), lr: 0.0009528346831824678
epoch: 200, acc: 0.408, loss: 3.788, (data_loss: 3.714 reg_loss: 0.074), lr: 0.0009095043201455208
epoch: 300, acc: 0.424, loss: 4.064, (data_loss: 3.992 reg_loss: 0.072), lr: 0.0008699434536755111
epoch: 400, acc: 0.388, loss: 4.244, (data_loss: 4.173 reg_loss: 0.071), lr: 0.0008336807002917883
epoch: 500, acc: 0.340, loss: 5.166, (data_loss: 5.096 reg_loss: 0.069), lr: 0.0008003201280512204
epoch: 600, acc: 0.384, loss: 3.758, (data_loss: 3.690 reg_loss: 0.068), lr: 0.0007695267410542516
epoch: 700, acc: 0.391, loss: 4.566, (data_loss: 4.499 reg_loss: 0.067), lr: 0.0007410151908114116
epoch: 800, acc: 0.402, loss: 3.497, (data_loss: 3.431 reg_loss: 0.065), lr: 0.0007145409074669525
epoch: 900, acc: 0.390, loss: 3.012, (data_loss: 2.948 reg_loss: 0.064), lr: 0.000689893066574681
epoch: 1000, acc: 0.390, loss

In [169]:
X_test, y_test = spiral_data(samples=1000, classes=3)

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)
dense3.forward(activation2.output)
activation3.forward(dense3.output)
dense4.forward(activation3.output)
loss = loss_activation.forward(dense4.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.889, loss: 0.521


In [165]:
dense1 = Layer_Dense(2, 256,
                     weight_regularizer_l2=5e-4,
                     bias_regularizer_l2=5e-4)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(256, 128)
activation2 = Activation_ReLU()
dense3 = Layer_Dense(128, 128)
activation3 = Activation_ReLU()
dense4 = Layer_Dense(128, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.005,
                          decay=2e-4)

for epoch in range(100001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    activation3.forward(dense3.output)
    dense4.forward(activation3.output)
    data_loss = loss_activation.forward(dense4.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2) + \
        loss_activation.loss.regularization_loss(dense3) + \
        loss_activation.loss.regularization_loss(dense4)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense4.backward(loss_activation.dinputs)
    activation3.backward(dense4.dinputs)
    dense3.backward(activation3.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.update_params(dense4)
    optimizer.post_update_params()

epoch: 0, acc: 0.299, loss: 8.335, (data_loss: 8.250 reg_loss: 0.084), lr: 0.005
epoch: 100, acc: 0.371, loss: 3.297, (data_loss: 3.251 reg_loss: 0.045), lr: 0.004902922141596391
epoch: 200, acc: 0.438, loss: 1.403, (data_loss: 1.362 reg_loss: 0.041), lr: 0.004808617041738796
epoch: 300, acc: 0.475, loss: 1.415, (data_loss: 1.375 reg_loss: 0.039), lr: 0.004717871296471032
epoch: 400, acc: 0.536, loss: 0.946, (data_loss: 0.907 reg_loss: 0.039), lr: 0.004630487127245786
epoch: 500, acc: 0.583, loss: 0.858, (data_loss: 0.819 reg_loss: 0.039), lr: 0.004546281142025823
epoch: 600, acc: 0.538, loss: 1.027, (data_loss: 0.989 reg_loss: 0.038), lr: 0.00446508305054474
epoch: 700, acc: 0.566, loss: 0.900, (data_loss: 0.864 reg_loss: 0.035), lr: 0.004386734514827163
epoch: 800, acc: 0.612, loss: 0.804, (data_loss: 0.768 reg_loss: 0.035), lr: 0.004311088118641146
epoch: 900, acc: 0.637, loss: 0.780, (data_loss: 0.745 reg_loss: 0.036), lr: 0.004238006441769792
epoch: 1000, acc: 0.669, loss: 0.750, 

In [178]:
X_test, y_test = spiral_data(samples=1000, classes=3)


dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)
dense3.forward(activation2.output)
activation3.forward(dense3.output)
dense4.forward(activation3.output)
loss = loss_activation.forward(dense4.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)

print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.901, loss: 0.521


Here is what DeepSeek has to say about the above results and discrepancy:

The discrepancy between your training and test performance suggests overfitting, but the static test metrics indicate deeper issues. Here are the key reasons and solutions:

1. Severe Overfitting
Evidence: Training acc=0.955 vs Test acc=0.901

Causes:

** Network is too complex for spiral data (4 dense layers are excessive)

** L2 regularization (5e-4) is too weak

2. Data Distribution Mismatch
Evidence: Static test metrics across runs

3. Incorrect Test Evaluation
Bug: You're reusing loss_activation object

4. Optimizer Saturation
Evidence: Final lr=0.000238 is too small

5. Gradient Flow Issues

The static test metrics strongly suggest either a data leak (are you accidentally training on test data?) or improper data splitting. The spiral dataset should achieve >95% test accuracy with proper regularization.

In [182]:
dense1 = Layer_Dense(2, 128,
                     weight_regularizer_l2=1e-4,
                     bias_regularizer_l2=1e-5)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(128, 64)
activation2 = Activation_ReLU()
dense3 = Layer_Dense(64, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_Adam(learning_rate=0.005,
                          decay=2e-4)
print("Train mean/std:", X.mean(), X.std())
# Standard training set with test set later
X = (X - X.mean()) / X.std()

for epoch in range(100001):

    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    data_loss = loss_activation.forward(dense3.output, y)

    regularization_loss = loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2) + \
        loss_activation.loss.regularization_loss(dense3) + \
        loss_activation.loss.regularization_loss(dense4)

    loss = data_loss + regularization_loss

    predictions = np.argmax(loss_activation.output,
                            axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f"epoch: {epoch}, " +
              f"acc: {accuracy:.3f}, " +
              f"loss: {loss:.3f}, (" +
              f"data_loss: {data_loss:.3f} " +
              f"reg_loss: {regularization_loss:.3f}), " +
              f"lr: {optimizer.current_learning_rate}")

    loss_activation.backward(loss_activation.output, y)
    dense3.backward(loss_activation.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.post_update_params()


Train mean/std: 1.0172526e-08 1.0
epoch: 0, acc: 0.328, loss: 5.546, (data_loss: 5.537 reg_loss: 0.010), lr: 0.005
epoch: 100, acc: 0.435, loss: 1.069, (data_loss: 1.061 reg_loss: 0.008), lr: 0.004902922141596391
epoch: 200, acc: 0.453, loss: 1.055, (data_loss: 1.047 reg_loss: 0.008), lr: 0.004808617041738796
epoch: 300, acc: 0.462, loss: 1.053, (data_loss: 1.046 reg_loss: 0.008), lr: 0.004717871296471032
epoch: 400, acc: 0.466, loss: 1.022, (data_loss: 1.014 reg_loss: 0.007), lr: 0.004630487127245786
epoch: 500, acc: 0.502, loss: 0.981, (data_loss: 0.974 reg_loss: 0.007), lr: 0.004546281142025823
epoch: 600, acc: 0.559, loss: 0.929, (data_loss: 0.921 reg_loss: 0.007), lr: 0.00446508305054474
epoch: 700, acc: 0.600, loss: 0.881, (data_loss: 0.874 reg_loss: 0.007), lr: 0.004386734514827163
epoch: 800, acc: 0.629, loss: 0.828, (data_loss: 0.820 reg_loss: 0.008), lr: 0.004311088118641146
epoch: 900, acc: 0.655, loss: 0.776, (data_loss: 0.768 reg_loss: 0.008), lr: 0.004238006441769792
epoc

In [209]:
X_test, y_test = spiral_data(samples=1000, classes=3)

test_loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

X_test = (X_test - X_test.mean()) / X_test.std()

dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)
dense3.forward(activation2.output)
loss = loss_activation.forward(dense3.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)



print(f"validation, acc: {accuracy:.3f}, loss: {loss:.3f}")

validation, acc: 0.896, loss: 0.354
