In [1]:
import numpy as np
import nnfs
from nnfs.datasets import sine_data, spiral_data

nnfs.init()

class Layer_Dense:
    def __init__(self, n_inputs, n_neurons, weight_regularizer_l1 = 0,
                weight_regularizer_l2 = 0,
                bias_regularizer_l1 = 0, bias_regularizer_l2 = 0):




        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        #set regularization strength
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2= bias_regularizer_l2


    def forward(self, inputs):
        #remember input values
        self.inputs = inputs
        #calculate output value from inputs, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    #backward pass

    def backward(self, dvalues):
        #gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        #Gradient on regularization
        #L1 on weights
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1

        #L2 on weights
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2* self.weight_regularizer_l2* self.weights

        #L1 on biases
        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.weights < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1


        #L2 on biases
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        #Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

#Dropout

class Layer_Dropout:
    #init
    def __init__(self, rate):
        #store rate, we inter ti as for example for dropout of 0.1 we need siccess rate of 0.9
        self.rate = 1 - rate
    #forward pass
    def forward(self, inputs):
        #save input values
        self.inputs = inputs

        #if not in the training_mode -return values
  

        #Generate and save scaled mask
        self.binary_mask = np.random.binomial(1, self.rate, size = inputs.shape )/ self.rate

        #Apply mask to output values
        self.output = inputs * self.binary_mask

        #backward pass
    def backward(self, dvalues):
        #Gradient on values
        self.dinputs = dvalues * self.binary_mask

class Layer_Input:

    #forward pass
    def forward (self, inputs):
        self.output = inputs

class Activation_ReLU:
    #forward pass
    def forward(self, inputs):
        #remember input values
        self.inputs = inputs
        #calculate output values from inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        #since we need to modify original variable,
        #let's make a copy of values first
        self.dinputs = dvalues.copy()

        #Zero gradient where input values were negative
        self.dinputs[self.dinputs <= 0] = 0

    
    #calculate predictions for outputs
    def predictions(self,outputs):
        return outputs
#Softmax activation

class Activation_Softmax:
    def forward(self, inputs, training):
        #remember input values
        self.inputs = inputs

        #get nunnormalizer probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True))
        
        #normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis =1, keepdims = True )
        self.output = probabilities
    
    def backward(self, dvalues):
        #create unintialized array
        self.dinputs = np.empty_like(dvalues)

        #enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            #flatten the output array
            single_output = single_output.reshape(-1,1)
            
            #calculate jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            
            #calculate sample-wise gradient
            #and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
    
    def predictions(self, outputs):
        return np.argmax(outputs, axis = 1)

#Sigmoid activation
class Activation_Sigmoid:

    #forward pass
    def forward(self, inputs, training):
        #save input and calculate / save output of the sigmoid function

        self.inputs = inputs
        self.output = 1/ (1 + np.exp(-inputs))

        #bakcward pass
    def backward(self, dvalues):
        #derivative - calculates from output of the sigmoid function
        self.dinputs = dvalues * (1- self.output) * self.output

    #calculate prediction for outputs
    def predictions(self, outputs):
        return(outputs > 0.5) * 1

#Linear activation

class Activation_Linear:

    #forward pass
    def forward(self, inputs):
        #just remember values
        self.inputs = inputs
        self.output = inputs

    #Backward pass
    def backward(self, dvalues):
        #derivative is 1,1 * dvalues = dvalues - the chain rule
        self.dinputs = dvalues.copy()

    #Calculate predictions for outputs
    def predictions(self, outputs):
        return outputs

class Optimizer_SGD:
    #initialize optimizer -set settings,
    #learning rate of 1 is default for this optimizer

    def __init__(self, learning_rate = 1., decay = 0., momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    #Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay + self.iterations))

    def update_params(self, layer):
        if self.momentum:

            #if layer does not contain momentum arrays. create them
            #filled with zeros

            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                #if there is no momentum array for weights
                #the array does not exist for biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)

                #build weight updates with momentum - take previous
                #updates multiplied by retain factor and update with current gradient
                weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
                layer.weight_momentums = weight_updates

                #build bias update
                bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
                layer.bias_momentums = bias_updates
            #vanilla SGD updates

            else:
                weight_updates = -self.current_learning_rate * layer.dweights
                bias_updates = -self.current_learning_rate * layer.dbiases

            #udpate weights and biases using either vanilla or momentum updates

            layer.weights += weight_updates
            layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1


#Adagrad Optimizer
class Optimizer_Adagrad:
    #initialize optimizer - set settings:

    def __init__(self, learning_rate = 1., decay = 0, epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    
    def update_params(self, layer):
        # if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        #update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2

        # Vanilla SGD parameter udpate + normalizatoin
        #with square rooted cache
        layer.weights += self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache)+ self.epsilon)

        #call once after any parameter updates

    def post_update_params(self):
        self.iterations += 1


#RMSprop optimizer

class Optimizer_RMSprop:

    #initialize optimizer - set setiings

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, rho = 0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
    
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        #if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        #update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases ** 2


        #Vanilla SGD Parameter update + normalization with square rooted cache
        layer.weights += - self.current_learning_rate * layer.dweights (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.bias += self.current_learning_rate * layer.dbiases(np.sqrt(layer.bias_cache) + self.epsilon)

        #call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


class Optimizer_Adam:

    #Initialize optimizer - set setiings

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay =decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
    
    #call once before any paramter updates
    def pre_update_params(self):
        if self.decay: 
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    #Update parameters
    def update_params(self, layer):

        #if layer does not conatin cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache= np.zeros_like(layer.biases)

            #update momentum with current gradients
        layer.weights_momentums = self.beta_1 * layer.weight_momentums +(1- self.beta_1)* layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / (1- self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1- self.beta_1 ** (self.iterations + 1))
        #update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1- self.beta_2)* layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1- self.beta_2) * layer.dbiases ** 2

        #get corrected cache
        weight_cache_corrected = layer.weight_cache / (1- self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1- self.beta_2 ** (self.iterations + 1))

        #Vanilla SGD parameter update + normalization with square rooted cache

        layer.weights += -self.current_learning_rate * weight_momentums_corrected/ (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

class Loss:

    #regularization loss calculation
    def regularization_loss(self, layer):
        #0 by default
        regularization_loss = 0

        #calculate regularization loss
        #iterate all trainable layers

        #L1 regularization weight 
        #calculate only when factor greater than 0

        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

        #L2 regularization weights
        if layer.weight_regularizer_l2 >0:
            regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)

        #L1 regularization biases
        #calcualte only factor greater than 0

        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.dbiases))

        #L2 regularizer -biases
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases* layer.biases)

        return regularization_loss

    def remember_trainable_layers(self, trainable_layers):
        self.trainable_layers = trainable_layers
        
        #calculates the data and regularization losses
        #given model output and gorund truth values

    def calculate(self, output, y, * , include_regularization = False):
        #calculate sample losses
        sample_losses = self.forward(output, y)

        #calculate mean loss
        data_loss = np.mean(sample_losses)

        #if not just data loss return it
        if not include_regularization:
            return data_loss

        #Return the data and regularization losses
        return data_loss, self.regularization_loss()

class Loss_CategoricalCrossentropy(Loss):

    #forward pass
    def forward(self, y_pred, y_true):
        #Number of samples in a batch
        samples= len(y_pred)

        #clip data to prevent division by 0
        #clip both sides to not drag mean towards and value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1- 1e-7)

        #probabilites for taget values only if categorical variables
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[
                range(samples), y_true
            ]
        #mask values only for one hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(
                y_pred_clipped * y_true,
                axis = 1
            )

        #losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, dvalues, y_true):

        #Number of samples
        samples = len(dvalues)

        #Number of labels in every sample
        #We'll use the first sample to count them
        labels = len(dvalues[0])

        #if labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]
        
        #calculate gradient
        self.dinputs = -y_true / dvalues

        #Normalize gradient
        self.dinputs = self.dinputs / samples

    #Softmax classifier- combined softmax activation
    #and cross entropy loss for faster backward step

class Activation_Softmax_Loss_CatergoricalCrossentropy():
    #backward pass

    def backward(self, dvalues, y_true):
        #number of samples

        samples = len(dvalues)

        #if labels are one hot encoded turn them into discrete values
        if len(y_true.shape) ==2:
            y_true = np.argmax(y_true, axis = 1)

        #copy so we can safely modify()
        self.dinputs = dvalues.copy()

        #calculate gradient
        self.dinputs[range(samples), y_true] -=1
        #Normalize gradient
        self.dinputs = self.dinputs / samples

class Loss_BinaryCrossentropy(Loss):

    def forward(self, y_pred, y_true):

        #clip data to prevent division by 0
        #clip both sides to not drage man towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1- 1e-7)

        #calculate sample wise loss
        sample_losses = -(y_true * np.log(y_pred_clipped) + (1- y_true) * np.log(1 - y_pred_clipped))
        sample_losses = np.mean(sample_losses, axis = 1)

        #Return losses
        return sample_losses

    def backward(self, dvalues, y_true):
        #number of samples
        samples = len(dvalues)

        #Number of outputs in every sample
        #We'll use the first sample to count them
        outputs = len(dvalues[0])

        #clip data to prevent division by 0
        #clip both sides to not drage mean towards any value
        clipped_dvalues = np.clip(dvalues, 1e-7, 1-1e-7)

        #calculate gradient
        self.dinputs = -(y_true / clipped_dvalues - ( 1- y_true) / (1 - clipped_dvalues)) / outputs
        
        #normalize gradient
        self.dinputs =self.dinputs / samples

class Loss_MeanSquaredError(Loss):

    #forward pass
    def forward(self, y_pred, y_true):
        #calculate loss
        sample_losses = np.mean((y_true - y_pred)** 2, axis = 1)
        return sample_losses

    #backward pass
    def backward(self, dvalues, y_true):
        #number of samples
        samples = len(dvalues)

        #number of outputs in every sample
        #we'll use the first sample to count htem
        outputs = len(dvalues[0])

        #Gradient on values
        self.dinputs = -2 * (y_true - dvalues) / outputs
        #Normalize Gradient
        self.dinputs = self.dinputs / samples

#Mean Absolute Error Loss

class Loss_MeanAbsoluteError(Loss):

    def forward(self, y_pred, y_true):
        #calculate_loss

        sample_losses = np.mean(np.abs(y_true - y_pred),axis = 1)
        return sample_losses

    #backward pass
    def backward(self, dvalues, y_true):
        #number of samples
        samples = len(dvalues)
        
        #number of outputs in every samples
        #we'll use the first sample to count them
        outputs = len(dvalues[0])

        #calculate gradient
        self.dinputs = np.sign(y_true, dvalues) / outputs
        #Normalize graident
        self.dinputs = self.dinputs /samples
class Accuracy:
    #calculates an accuracy
    #given predictions and ground truth values

    def calculate(self, predictions, y):
        #get comparison results
        comparisons= self.compare(predictions, y)

        #calculate an accuracy 
        accuracy = np.mean(comparisons)

        #return accuracy
        return accuracy


class Accuracy_Categorical(Accuracy):
    def init(self, y):
        pass
    #compares predictions to the ground truth values

    def compare(self, predictions, y):
        if len(y.shape) == 2:
            y = np.argmax(y, axis =1 )
        return predictions == y

class Accuracy_Regression(Accuracy):
    def __init__(self):
        self.precision = None

    #calculates precision value
    #based on passed in the ground truth values
    def init(self, y , reinit = False):
        if self.precision is None or reinit:
            self.precision = np.std(y) / 250

    def compare(self, predictions,y ):
        return np.absolut(predictions - y) < self.precision

class Model:

    def __init__(self):
        #creat a list of network objects
        self.layers= []

        #Softmax classifiers output object
        self.softmax_classifier_output = None

    #add objects to the model
    
    def add(self, layer):
        self.layers.append(layer)

    #set loss, optimizer and accuracy

    def set(self, *, loss, optimizer, accuracy):
        self.loss = loss
        self.optimizer = optimizer
        self.accuracy = accuracy

    #finalize the model

    def finalize(self):
        
        #create and set the input layer
        self.input_layer = Layer_Input()

        #count all the objects
        layer_count = len(self.layers)

        #Initialize a list of containing trainable layers:
        self.trainable_layers = []

        #Iterate the object
        for i in range(layer_count):

            #if it's the first layer, the previous layer is the input layer
            if i ==0:
                self.layers[i].prev = self.input_layer
                self.layers[i].next = self.layers[i+1]

            #all layers ecept for the first and the last

            elif i < layer_count - 1:
                self.layers[i].prev = self.layers[i-1]
                self.layers[i].next = self.layers[i+1]

            #the last layer - the next object is the loss
            #also let's ave aside the reference ot the last object
            #whose output is the model's output 

            else:
                self.layers[i].prev = self.layers[i - 1]
                self.layers[i].next = self.loss
                self.output_layer_activation =self.layers[i]

            #if layer contains an attribute called weights
            #it is a trainable layer
            #add it to the list of trainable layers
            #We don't need to check for biases
            #checking for weights is enough

            if hasattr(self.layers[i], 'weights'):
                self.trainable_layers.append(self.layers[i])

            #update loss object with trainable layers
            self.loss.remember_trainable_layers(self.trainable_layers)

            #if output activation is SOftmax and loss function is categorical cross entropy
            #create an object of combined activation
            # and loss function containing faster gradient calculation

        if isinstance(self.layers[-1], Activation_Softmax) and isinstance(self.loss, Loss_CategoricalCrossentropy):
            #create an object of combined activation
            #and loss function

            self.softmax_classifier_output = Activation_Softmax_Loss_CatergoricalCrossentropy()
   
    def train(self, X, y, * , epochs = 1, print_every = 1, validation_data = None):
        self.accuracy.init(y)

        #main training loop

        for epoch in range(1, epochs + 1):
            #perform the forward pass
            output = self.forward(X)
            
            #calculate loss
            data_loss , regularization_loss = self.loss.calculate(output, y, include_regularization = True)
            loss = data_loss + regularization_loss

            #Get predictions and calculate an accuracy
            predictions = self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions,y )

            #perform a backward pass
            self.backward(output, y)

            #optimize(update parameters)

            self.optimizer.pre_update_params()
            for layer in self.trainable_layers:
                self.optimizer.update_params(layer)
            self.optimizer.post_update_params()

            #print a summary 
            if not epoch % print_every:
                print ( f'epoch: {epoch} , ' +
                        f'acc: {accuracy :.3f} , ' +
                        f'loss: {loss :.3f} (' +
                        f'data_loss: {data_loss :.3f} , ' +
                        f'reg_loss: {regularization_loss :.3f} ), ' +
                        f'lr: {self.optimizer.current_learning_rate} ' )

        if validation_data is not None:
            #for better readability
            X_val, y_val = validation_data

            #perform the forward pass
            output = self.forward(X_val)

            #calculate the loss
            loss = self.loss.calculate(output, y_val)

            #get predictions and calcualte an accuracy
            predictions= self.output_layer_activation.predictions(output)
            accuracy = self.accuracy.calculate(predictions, y_val)
            print ( f'validation, ' +
                    f'acc: {accuracy :.3f} , ' +
                    f'loss: {loss :.3f} ' )

     #perform a forward pass               
    def forward(self, X):

        #call forward method on the input layer this will set the output property that the first layer in "prev" object is expecting
        self.input_layer.forward(X)

   

        #call forward method of every object in a chain
        #pass output of the previous bject as a paramter

        for layer in self.layers:
            layer.forward(layer.prev.output)

        #layer is now the last object from the list, return its output
        return layer.output

    def backward(self, output, y):
        
        #if softmax classifier
        if self.softmax_classifier_output is not None:
            #first call backward method
            #on the combined activation / loss
            #this will set dinputs property
            self.softmax_classifier_output.backward(output, y)

            #since we'll not call backward method of the last layer
            #which is softmax activation
            #as we used combined activation / loss
            #object, let's set dinputs in this objects
            self.layers[-1].dinputs = self.softmax_classifier_output.dinputs

            #call backward method going trhough
            #all the objects but last
            #in reversed oreder passing dinputs as a parameter

            for layer in reversed(self.layers[:-1]):
                layer.backward(layer.next.dinputs)
            #first call backward method on the loss
            #this will set dinputs property that the last layer will try to access shortly
    
            return  
        self.loss.backward(output,y)

        #call backward method going through all the objects
        #in reversed order passing dinputs as a parameters
        for layer in reversed(self.layers):
            layer.backward(layer.next.dinputs)
            


In [2]:
X, y = sine_data()
# Create Dense layer with 1 input feature and 64 output values
dense1 = Layer_Dense( 1 , 64 )
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()
# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 64 output values
dense2 = Layer_Dense( 64 , 64 )
# Create ReLU activation (to be used with Dense layer):
activation2 = Activation_ReLU()
# Create third Dense layer with 64 input features (as we take output
# of previous layer here) and 1 output value
dense3 = Layer_Dense( 64 , 1 )
# Create Linear activation:
activation3 = Activation_Linear()
# Create loss function
loss_function = Loss_MeanSquaredError()
# Create optimizer
optimizer = Optimizer_Adam( learning_rate = 0.005 , decay = 1e-3 )
# Accuracy precision for accuracy calculation
# There are no really accuracy factor for regression problem,
# but we can simulate/approximate it. We'll calculate it by checking
# how many values have a difference to their ground truth equivalent
# less than given precision
# We'll calculate this precision as a fraction of standard deviation
# of all the ground truth values
accuracy_precision = np.std(y) / 250
# Train in loop
for epoch in range ( 10001 ):
    # Perform a forward pass of our training data through this layer
    dense1.forward(X)
    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)
    # Perform a forward pass through second Dense layer
    # takes outputs of activation function
    # of first layer as inputs
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    # Perform a forward pass through third Dense layer
    # takes outputs of activation function of second layer as inputs
    dense3.forward(activation2.output)
    # Perform a forward pass through activation function
    # takes the output of third dense layer here
    activation3.forward(dense3.output)
    # Calculate the data loss
    data_loss = loss_function.calculate(activation3.output, y)
    # Calculate regularization penalty
    regularization_loss = \
    loss_function.regularization_loss(dense1) + \
    loss_function.regularization_loss(dense2) + \
    loss_function.regularization_loss(dense3)
    # Calculate overall loss
    loss = data_loss + regularization_loss

    
# Calculate accuracy from output of activation2 and targets
# To calculate it we're taking absolute difference between
# predictions and ground truth values and compare if differences
# are lower than given precision value
predictions = activation3.output
accuracy = np.mean(np.absolute(predictions - y) <accuracy_precision) #if the difference between predictions and observation outside of std deviation bound, we call it error. 
loss_function.backward(activation3.output, y)
activation3.backward(loss_function.dinputs)
dense3.backward(activation3.dinputs)
activation2.backward(dense3.dinputs)
dense2.backward(activation2.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)
# Update weights and biases
optimizer.pre_update_params()
optimizer.update_params(dense1)
optimizer.update_params(dense2)
optimizer.update_params(dense3)
optimizer.post_update_params()

In [5]:
accuracy_precision

0.0028284271247461905

In [13]:
np.mean(np.absolute(predictions - y)<accuracy_precision)

0.002