In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style 
from nnfs.datasets import spiral_data
from nnfs.datasets import sine_data
import nnfs

# apply matplotlib's "dark_background" style sheet to the figures drawn by this notebook
style.use("dark_background")

# Dense Layer Class

In [2]:
class Layer_Dense:
    """Fully connected layer with optional L1/L2 penalties on its parameters.

    Attributes written by the methods below:
        weights, biases            -- parameters, shaped (n_inputs, n_neurons)
                                      and (1, n_neurons)
        inputs, output             -- stored by forward()
        dweights, dbiases, dinputs -- stored by backward()
    """

    # Layer initialization
    def __init__(self, n_inputs, n_neurons,
                 weight_lambda_l1=0, weight_lambda_l2=0,
                 bias_lambda_l1=0, bias_lambda_l2=0):
        """Create the parameters and remember the regularization strengths.

        n_inputs  -- number of values each sample feeds into the layer
        n_neurons -- number of outputs the layer produces per sample
        *_lambda_l1 / *_lambda_l2 -- strength of the L1 / L2 penalty on the
            weights or biases; a value of 0 disables that penalty
        """
        # small random weights, zero biases
        self.weights = 0.1 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        # regularization strength (lambda)
        self.weight_lambda_l1 = weight_lambda_l1
        self.weight_lambda_l2 = weight_lambda_l2
        self.bias_lambda_l1 = bias_lambda_l1
        self.bias_lambda_l2 = bias_lambda_l2

    # Forward Pass
    def forward(self, inputs, training):
        """Compute `inputs . weights + biases` and store it in `self.output`.

        `training` is not used by this layer.
        """
        # remember the inputs: backward() needs them for the weight gradient
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward Pass
    def backward(self, dvalues):
        """Compute gradients given `dvalues`, the gradient w.r.t. this layer's output.

        Stores `dweights` and `dbiases` (including the gradient of every
        enabled regularization penalty) and `dinputs`, the gradient w.r.t.
        the layer's inputs.
        """
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # gradients on regularization
        # L1 on weights: derivative of |w| is -1 for negative w, +1 otherwise
        if self.weight_lambda_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            # fixed: this line used the undefined name `dl1` (NameError)
            self.dweights += self.weight_lambda_l1 * dL1
        # L2 on weights: derivative of w^2 is 2w
        if self.weight_lambda_l2 > 0:
            self.dweights += 2 * self.weight_lambda_l2 * self.weights
        # L1 on biases
        if self.bias_lambda_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_lambda_l1 * dL1
        # L2 on biases
        if self.bias_lambda_l2 > 0:
            self.dbiases += 2 * self.bias_lambda_l2 * self.biases

        # gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

    # retrieve layer parameters
    def get_parameters(self):
        """Return the `(weights, biases)` pair (the arrays themselves, not copies)."""
        return self.weights, self.biases

    # set weights and biases in layer instance
    def set_parameters(self, weights, biases):
        """Replace the layer's weights and biases with the given objects."""
        self.weights = weights
        self.biases = biases
    

# ReLU activation function class
Used in the hidden layers of a neural network; it is rarely used as an output-layer activation.

In [3]:
#ReLu code
class ReLU_Activation:
    """Rectified linear unit: y = x where x > 0, otherwise y = 0."""

    def forward(self, inputs, training):
        """Store the inputs and the element-wise maximum of 0 and the inputs.

        `training` is not used by this activation.
        """
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass the gradient through where the input was positive, zero it elsewhere."""
        # positions where the unit was inactive (input <= 0)
        inactive = self.inputs <= 0

        # work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        gradient[inactive] = 0
        self.dinputs = gradient

    def predictions(self, outputs):
        """Return the outputs unchanged."""
        return outputs

# Softmax Activation Function Class
Used as the output-layer activation for multi-class classification.

In [4]:
class Activation_Softmax:
    """Softmax activation: turns each row of scores into a probability distribution."""

    def forward(self, inputs, training):
        """Store the inputs and the row-wise softmax of them in `self.output`.

        softmax(z)_j = e^(z_j) / sum_l e^(z_l), computed per row.
        `training` is not used by this activation.
        """
        self.inputs = inputs

        # subtract each row's maximum before exponentiating so the largest
        # exponent is 0 and np.exp cannot overflow
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        row_totals = np.sum(exponentials, axis=1, keepdims=True)
        self.output = exponentials / row_totals

    def backward(self, dvalues):
        """Multiply each sample's gradient by that sample's softmax Jacobian."""
        # filled row by row below
        self.dinputs = np.empty_like(dvalues)

        for row, (probabilities, gradient) in enumerate(zip(self.output, dvalues)):
            # column vector of this sample's probabilities
            column = probabilities.reshape(-1, 1)

            # Jacobian of softmax: diag(p) - p p^T
            jacobian = np.diagflat(column) - np.dot(column, column.T)

            # sample-wise gradient
            self.dinputs[row] = np.dot(jacobian, gradient)

    def predictions(self, outputs):
        """Return the index of the largest value in each row."""
        return np.argmax(outputs, axis=1)

# Common Loss Class:
Holds the operations shared by every loss function: computing the mean data loss for a batch, accumulating the loss across batches, and computing the regularization loss of the trainable layers.

In [5]:
class Loss:
    """Base class for losses: mean data loss, accumulation and regularization.

    Subclasses provide `forward(output, y)`, which returns one loss value
    per sample.
    """

    # Class-level defaults so a loss object works before new_pass() or
    # remember_trainable_layers() has been called. Without them calculate()
    # and regularization_loss() raised AttributeError on a fresh instance.
    # Assignments and `+=` on immutable values rebind on the instance, so
    # these defaults are never shared or mutated between instances.
    trainable_layers = ()
    accumulated_sum = 0
    accumulated_count = 0

    # regularization loss calculation
    def regularization_loss(self):
        """Return the summed L1/L2 penalty over all remembered trainable layers.

        A penalty is only added when its lambda is greater than 0; with no
        remembered layers the result is 0.
        """
        # 0 by default
        regularization_loss = 0

        for layer in self.trainable_layers:
            # L1 regularization - weights
            if layer.weight_lambda_l1 > 0:
                regularization_loss += layer.weight_lambda_l1 * \
                                        np.sum(np.abs(layer.weights))
            # L2 regularization - weights
            if layer.weight_lambda_l2 > 0:
                regularization_loss += layer.weight_lambda_l2 * \
                                        np.sum(layer.weights * layer.weights)

            # L1 regularization - biases
            if layer.bias_lambda_l1 > 0:
                regularization_loss += layer.bias_lambda_l1 * \
                                        np.sum(np.abs(layer.biases))
            # L2 regularization - biases
            if layer.bias_lambda_l2 > 0:
                regularization_loss += layer.bias_lambda_l2 * \
                                        np.sum(layer.biases * layer.biases)

        return regularization_loss

    # Set/remember trainable layers
    def remember_trainable_layers(self, trainable_layers):
        """Store the layers whose parameters regularization_loss() iterates over."""
        self.trainable_layers = trainable_layers

    # calculate the data and regularization losses
    # given model output and ground truths (y_pred, y_target)
    def calculate(self, output, y, *, include_regularization=False):
        """Return the mean loss of this batch and add it to the running totals.

        Returns the data loss alone, or the tuple
        `(data_loss, regularization_loss)` when `include_regularization`
        is true.
        """
        # per-sample losses from the subclass's forward()
        sample_losses = self.forward(output, y)

        # mean loss of this batch
        data_loss = np.mean(sample_losses)

        # running totals used by calculate_accumulated()
        self.accumulated_sum += np.sum(sample_losses)
        self.accumulated_count += len(sample_losses)

        # data loss only - this is the default
        if not include_regularization:
            return data_loss

        return data_loss, self.regularization_loss()

    # Calculates accumulated loss
    def calculate_accumulated(self, *, include_regularization=False):
        """Return the mean loss over everything accumulated since new_pass().

        Same return convention as calculate(). The division is by the
        accumulated sample count, so the result is undefined (division by
        zero) when nothing has been accumulated yet.
        """
        data_loss = self.accumulated_sum / self.accumulated_count

        # data loss only - this is the default
        if not include_regularization:
            return data_loss

        return data_loss, self.regularization_loss()

    # reset variables for accumulated loss
    def new_pass(self):
        """Reset the running sum and sample count to zero."""
        self.accumulated_sum = 0
        self.accumulated_count = 0

# Categorical Cross Entropy Loss Class
Used as the loss function with a softmax classifier.

In [6]:
#Cross Entropy Loss
#inherits the "Loss" class
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss: L_i = -sum_j(y_true[i, j] * log(y_pred[i, j])).

    i indexes the sample, j the label/output. Targets may be sparse class
    indices (1-D) or one-hot rows (2-D).
    """

    # forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the target class for each sample.

        Raises ValueError when `y_true` is neither 1-D nor 2-D.
        """
        # number of samples in the batch
        num_samples = len(y_pred)

        # clip so log(0) can never occur; clipping both ends keeps the
        # adjustment symmetric
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_dims = len(y_true.shape)
        if label_dims == 1:
            # sparse labels: pick the predicted probability of the target class
            correct_confidences = y_pred_clipped[range(num_samples), y_true]
        elif label_dims == 2:
            # one-hot labels: multiplying by 0 drops non-targets, by 1 keeps
            # the target probability
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # previously this fell through and failed on an unbound local
            raise ValueError(
                "y_true must be 1-D (class indices) or 2-D (one-hot), "
                "got %d dimensions" % label_dims)

        # losses
        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store in `self.dinputs` the gradient of the loss w.r.t. the predictions.

        `dvalues` holds the predicted probabilities. The result is divided
        by the number of samples so the gradient magnitude does not depend
        on the batch size.
        """
        samples = len(dvalues)  # number of samples

        # number of labels in every sample, counted on the first sample
        labels = len(dvalues[0])

        # if labels are sparse, turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Clip exactly as forward() does. An unclipped prediction of 0
        # (softmax underflow) produced 0/0 = nan for non-target classes and
        # a division by zero for the target class.
        clipped_dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # derivative of the cross-entropy w.r.t. its inputs
        self.dinputs = -y_true / clipped_dvalues

        # normalize gradient: optimizers sum per-sample gradients, so divide
        # by the sample count to make the result independent of batch size
        self.dinputs = self.dinputs / samples
        

# Combined Softmax and Cross-Entropy function 
Combining the two allows a faster calculation of the gradient by capitalizing on the chain rule. Softmax and categorical cross-entropy are often used together as the output-layer activation and the loss function, respectively, of a neural network.

In [7]:
#Combined softmax activation and cross-entropy loss
#for faster back step
class Activation_Softmax_Loss_CategoricalCrossEntropy():
    """Backward step for softmax followed by categorical cross-entropy.

    Only the backward pass is implemented here; the earlier constructor and
    forward pass of this class are obsolete and no longer part of it.
    """

    # backward pass
    def backward(self, dvalues, y_true):
        """Store `(dvalues - one_hot(y_true)) / number_of_samples` in `self.dinputs`.

        `y_true` may hold class indices (1-D) or one-hot rows (2-D).
        """
        batch_size = len(dvalues)

        # one-hot targets are reduced to discrete class indices
        if len(y_true.shape) == 2:
            class_indices = np.argmax(y_true, axis=1)
        else:
            class_indices = y_true

        # copy so the caller's array is not modified
        gradient = dvalues.copy()
        # subtract 1 at each sample's target class
        gradient[range(batch_size), class_indices] -= 1
        # normalize by the number of samples
        self.dinputs = gradient / batch_size

# Stochastic Gradient Descent Optimizer Class With Optional Momentum Functionality

In [8]:
class Optimizer_SGD:
    """Stochastic gradient descent with optional learning-rate decay and momentum."""

    def __init__(self, learning_rate=1.0, decay=0, momentum=0.0):
        """Store the hyperparameters and start the iteration counter at 0."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        """Apply learning-rate decay; call once before any parameter updates."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update `layer.weights` and `layer.biases` in place from its gradients."""
        rate = self.current_learning_rate

        if not self.momentum:
            # vanilla SGD: step in the negative direction of the slope
            weight_step = -rate * layer.dweights
            bias_step = -rate * layer.dbiases
        else:
            # the first time a layer is seen it has no momentum arrays yet;
            # weights and biases are always created together
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # previous step scaled by the retain factor, minus the current
            # gradient step
            weight_step = self.momentum * layer.weight_momentums - \
                rate * layer.dweights
            bias_step = self.momentum * layer.bias_momentums - \
                rate * layer.dbiases

            # remember this step for the next update
            layer.weight_momentums = weight_step
            layer.bias_momentums = bias_step

        layer.weights += weight_step
        layer.biases += bias_step

    def post_update_params(self):
        """Advance the iteration counter; call once after any parameter updates."""
        self.iterations += 1

# SGD with Adaptive Gradient (AdaGrad) Optimizer

In [9]:
class Optimizer_Adagrad:
    """AdaGrad: SGD whose per-parameter step is normalized by accumulated squared gradients."""

    def __init__(self, learning_rate=1.0, decay=0, epsilon=1e-7):
        """Store the hyperparameters and start the iteration counter at 0."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        """Apply learning-rate decay; call once before any parameter updates."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update `layer.weights` and `layer.biases` in place from its gradients."""
        # the first time a layer is seen, give it zero-filled caches;
        # weight and bias caches are always created together
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        rate = self.current_learning_rate
        eps = self.epsilon

        # the caches keep growing: add the squared current gradients
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        # SGD step divided by the square-rooted cache; epsilon keeps the
        # denominator away from zero
        layer.weights += -rate * layer.dweights / \
            (np.sqrt(layer.weight_cache) + eps)
        layer.biases += -rate * layer.dbiases / \
            (np.sqrt(layer.bias_cache) + eps)

    def post_update_params(self):
        """Advance the iteration counter; call once after any parameter updates."""
        self.iterations += 1

## SGD with Root Mean Square Propagation (RMSProp) Optimization

In [10]:
class Optimizer_RMSProp:
    """RMSProp: SGD normalized by a moving average of squared gradients."""

    def __init__(self, learning_rate=0.001, decay=0, epsilon=1e-7, rho=0.9):
        """Store the hyperparameters and start the iteration counter at 0."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        """Apply learning-rate decay; call once before any parameter updates."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update `layer.weights` and `layer.biases` in place from its gradients."""
        # the first time a layer is seen, give it zero-filled caches;
        # weight and bias caches are always created together
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        rate = self.current_learning_rate
        eps = self.epsilon

        # moving average: rho of the old cache plus (1 - rho) of the squared
        # current gradients
        layer.weight_cache = self.rho * layer.weight_cache + \
            (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + \
            (1 - self.rho) * layer.dbiases**2

        # SGD step divided by the square-rooted cache
        layer.weights += -rate * layer.dweights / \
            (np.sqrt(layer.weight_cache) + eps)
        layer.biases += -rate * layer.dbiases / \
            (np.sqrt(layer.bias_cache) + eps)

    def post_update_params(self):
        """Advance the iteration counter; call once after any parameter updates."""
        self.iterations += 1

# SGD with Adaptive Moment Estimation (Adam) Optimization

In [11]:
class Optimizer_Adam:
    """Adam: momentum plus an RMSProp-style cache, both bias-corrected."""

    def __init__(self, learning_rate=0.001, decay=0, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        """Store the hyperparameters and start the iteration counter at 0."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Apply learning-rate decay; call once before any parameter updates."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update `layer.weights` and `layer.biases` in place from its gradients."""
        # the first time a layer is seen, give it zero-filled momentum and
        # cache arrays for both weights and biases
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # self.iterations is 0 on the first pass, but the bias correction
        # needs a step count that starts at 1
        step = self.iterations + 1
        momentum_correction = 1 - self.beta_1 ** step
        cache_correction = 1 - self.beta_2 ** step

        # momentum: moving average of the gradients
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + \
            (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + \
            (1 - self.beta_1) * layer.dbiases
        weight_momentums_hat = layer.weight_momentums / momentum_correction
        bias_momentums_hat = layer.bias_momentums / momentum_correction

        # cache: moving average of the squared gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases**2
        weight_cache_hat = layer.weight_cache / cache_correction
        bias_cache_hat = layer.bias_cache / cache_correction

        # corrected momentum step divided by the square-rooted corrected cache
        rate = self.current_learning_rate
        layer.weights += -rate * weight_momentums_hat / \
            (np.sqrt(weight_cache_hat) + self.epsilon)
        layer.biases += -rate * bias_momentums_hat / \
            (np.sqrt(bias_cache_hat) + self.epsilon)

    def post_update_params(self):
        """Advance the iteration counter; call once after any parameter updates."""
        self.iterations += 1

# Dropout layer Class
This class implements a dropout layer, which randomly zeroes a fraction of its inputs during training.

In [12]:
class Layer_Dropout:
    """Dropout layer: randomly zeroes inputs while training and rescales the rest."""

    def __init__(self, rate):
        """`rate` is the fraction of values to drop.

        It is stored inverted, as the keep probability: a dropout of 0.1
        needs a success rate of 0.9.
        """
        self.rate = 1 - rate

    def forward(self, inputs, training):
        """Apply the scaled dropout mask when training, otherwise copy the inputs through."""
        self.inputs = inputs

        if training:
            # 1 = keep, 0 = drop; dividing by the keep probability scales
            # the surviving values up
            kept = np.random.binomial(1, self.rate, size=inputs.shape)
            self.binary_mask = kept / self.rate
            self.output = inputs * self.binary_mask
        else:
            # outside training mode the values pass through unchanged
            self.output = inputs.copy()

    def backward(self, dvalues):
        """Multiply the incoming gradient by the mask saved in forward().

        The partial derivative of the layer equals the scaled mask,
        r_i / (1 - q).
        """
        self.dinputs = dvalues * self.binary_mask

# Sigmoid Activation class
This is used for binary logistic regression in the output layer to perform binary classification

In [13]:
class Activation_Sigmoid:
    """Sigmoid activation: 1 / (1 + e^(-x)), applied element-wise."""

    def forward(self, inputs, training):
        """Store the inputs and their sigmoid in `self.output`.

        `training` is not used by this activation.
        """
        self.inputs = inputs
        self.output = 1 / (1 + np.exp(-inputs))

    def backward(self, dvalues):
        """Scale the incoming gradient by the sigmoid derivative.

        The derivative is computed from the stored output:
        sigmoid * (1 - sigmoid).
        """
        self.dinputs = dvalues * (1 - self.output) * self.output

    def predictions(self, outputs):
        """Return 1 where the output is greater than 0.5, otherwise 0."""
        above_threshold = outputs > 0.5
        return above_threshold * 1

# Binary Cross-Entropy Loss
This is used with a binary classifier such as the sigmoid function

In [14]:
class Loss_BinaryCrossEntropy(Loss):
    """Binary cross-entropy loss, averaged over the outputs of each sample."""

    def forward(self, y_pred, y_true):
        """Return the mean binary cross-entropy of each sample."""
        # clip so log(0) can never occur; clipping both ends keeps the
        # adjustment symmetric
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # loss of every individual output
        per_output = -(y_true * np.log(y_pred_clipped) +
                       (1 - y_true) * np.log(1 - y_pred_clipped))

        # average along the last axis to get one value per sample
        return np.mean(per_output, axis=-1)

    def backward(self, dvalues, y_true):
        """Store in `self.dinputs` the gradient of the loss w.r.t. the predictions.

        `dvalues` holds the predictions. The gradient is divided by the
        number of outputs per sample and then by the number of samples.
        """
        num_samples = len(dvalues)

        # outputs per sample (one per neuron), counted on the first sample
        num_outputs = len(dvalues[0])

        # clip to prevent division by 0
        clipped_dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # derivative of the loss for every output
        gradient = -(y_true / clipped_dvalues -
                     (1 - y_true) / (1 - clipped_dvalues)) / num_outputs

        # normalize by the number of samples
        self.dinputs = gradient / num_samples

# Linear Activation Function
used in output layer of regression models

In [15]:
class Activation_Linear:
    """Linear (identity) activation: y = x."""

    def forward(self, inputs, training):
        """Store the inputs as both `self.inputs` and `self.output`.

        `training` is not used by this activation.
        """
        self.output = self.inputs = inputs

    def backward(self, dvalues):
        """Store a copy of `dvalues`: the derivative is 1, and 1 * dvalues = dvalues."""
        self.dinputs = dvalues.copy()

    def predictions(self, outputs):
        """Return the outputs unchanged."""
        return outputs

# Mean Square Error (MSE) Loss Class L2
MSE is used with regression models for loss calculation

In [16]:
class Loss_MeanSquaredError(Loss):
    """Mean squared error (L2) loss, averaged over the outputs of each sample."""

    def forward(self, y_pred, y_true):
        """Return the mean squared difference of each sample."""
        squared_error = (y_true - y_pred)**2
        return np.mean(squared_error, axis=-1)

    def backward(self, dvalues, y_true):
        """Store in `self.dinputs` the gradient of the loss w.r.t. the predictions.

        `dvalues` holds the predictions. The gradient is divided by the
        number of outputs per sample and then by the number of samples.
        """
        num_samples = len(dvalues)

        # outputs per sample, counted on the first sample
        num_outputs = len(dvalues[0])

        # derivative of the squared error for every output
        gradient = -2 * (y_true - dvalues) / num_outputs

        # normalize by the number of samples
        self.dinputs = gradient / num_samples

# Mean Absolute Error (MAE) Loss Class L1

In [17]:
class Loss_MeanAbsoluteError(Loss):
    """Mean absolute error (L1) loss, averaged over the outputs of each sample."""

    # forward pass
    def forward(self, y_pred, y_true):
        """Return the mean absolute difference of each sample."""
        sample_losses = np.mean(np.abs(y_true - y_pred), axis=-1)

        # return losses
        return sample_losses

    # backward pass
    def backward(self, dvalues, y_true):
        """Store in `self.dinputs` the gradient of the loss w.r.t. the predictions.

        `dvalues` holds the predictions. The gradient is divided by the
        number of outputs per sample and then by the number of samples.
        """
        # number of samples
        num_samples = len(dvalues)
        # outputs per sample, counted on the first sample
        num_outputs = len(dvalues[0])

        # d|y_true - y_pred| / d y_pred = sign(y_pred - y_true).
        # The previous sign(y_true - dvalues) had the opposite sign, so a
        # gradient-descent step increased the loss instead of reducing it.
        self.dinputs = np.sign(dvalues - y_true) / num_outputs
        # normalize gradient
        self.dinputs = self.dinputs / num_samples

# Input Class
Used as the first layer in the neural network. Only holds the inputs to the neural network.

In [18]:
class Layer_Input:
    """Placeholder first layer: exposes the samples it is given as its output."""

    def forward(self, inputs, training):
        """Store `inputs` unchanged in `self.output`; `training` is not used."""
        self.output = inputs

# Model Class
Holds all the objects that make up the model (layers, loss, optimizer, and accuracy) and connects the layers to one another.

In [19]:
class Model:
    """Sequential neural-network model.

    Holds the layer objects together with the loss, optimizer and accuracy
    objects, links the layers into a chain, and runs the training and
    evaluation loops.
    """

    def __init__(self):
        #create a list of network objects
        self.layers = []
        #loss, optimizer and accuracy objects, assigned through set()
        #defaulting to None lets finalize() test "self.loss is not None"
        #instead of failing on a missing attribute
        self.loss = None
        self.optimizer = None
        self.accuracy = None
        #softmax classifier's output object
        self.softmax_classifier_output = None

    #add objects to the model
    def add(self, layer):
        """Append ``layer`` to the end of the layer chain."""
        self.layers.append(layer)

    #set loss, optimizer and accuracy
    #'*' ensures the following parameters are keyword arguments
    def set(self, *, loss=None, optimizer=None, accuracy=None):
        """Assign the loss, optimizer and/or accuracy objects.

        Arguments left at None keep their current value.
        """
        if loss is not None:
            self.loss = loss

        if optimizer is not None:
            self.optimizer = optimizer

        if accuracy is not None:
            self.accuracy = accuracy

    #finalize the model
    def finalize(self):
        """Link the layers into a chain and collect the trainable layers.

        Every layer receives ``prev`` and ``next`` references, the last
        layer is remembered as ``output_layer_activation``, and layers that
        have a ``weights`` attribute are gathered in ``trainable_layers``.
        """
        #create and set the input layer
        self.input_layer = Layer_Input()

        #count all the objects
        layer_count = len(self.layers)

        #initialize a list containing trainable layers
        self.trainable_layers = []

        #iterate the objects
        for i, layer in enumerate(self.layers):
            #the first layer reads from the input layer,
            #every other layer reads from the layer before it
            layer.prev = self.input_layer if i == 0 else self.layers[i-1]

            #"first" and "last" are decided independently so that a
            #model with a single layer is wired correctly as well
            if i < layer_count - 1:
                layer.next = self.layers[i+1]

            #the last layer - the next object is the loss
            #we also save a reference to the last object
            #whose output is the model's output
            else:
                layer.next = self.loss
                self.output_layer_activation = layer

            #if the layer contains an attribute "weights",
            #it's a trainable layer -
            #add it to the list of trainable layers
            #weights implies there's a bias
            if hasattr(layer, 'weights'):
                self.trainable_layers.append(layer)

        #update loss object with trainable layers
        if self.loss is not None:
            self.loss.remember_trainable_layers(self.trainable_layers)

        #if output activation is Softmax and
        #loss function is Categorical cross-entropy
        #create an object of combined activation
        #and loss function containing faster gradient calculation
        if isinstance(self.layers[-1], Activation_Softmax) and \
           isinstance(self.loss, Loss_CategoricalCrossEntropy):
            #create an object of combined activation and loss
            self.softmax_classifier_output = \
                Activation_Softmax_Loss_CategoricalCrossEntropy()

    #number of batches needed to cover all the samples
    @staticmethod
    def _count_steps(samples, batch_size):
        """Return how many steps one pass over ``samples`` takes.

        Returns 1 when ``batch_size`` is None (the whole dataset is one
        batch). Otherwise returns the number of full batches, plus one
        when a smaller, final batch remains.
        """
        if batch_size is None:
            return 1

        sample_count = len(samples)
        steps = sample_count // batch_size #integer division
        #integer division rounds down; if some samples remain
        #that don't fill a whole batch, add one more step for them
        if steps * batch_size < sample_count:
            steps += 1
        return steps

    #perform forward pass
    def forward(self, X, training):
        """Run ``X`` through every layer and return the last layer's output."""
        #call forward method on the input layer
        #this will set the 'output' attribute that
        #the first layer in "prev" object is expecting
        self.input_layer.forward(X, training)

        #call forward method of every object in a chain
        #pass output of the previous object as a parameter
        for layer in self.layers:
            layer.forward(layer.prev.output, training)

        #"layer" is now the last object from the list
        #return its output
        return layer.output

    #train the model
    #validation data must be a tuple = (samples, targets)
    def train(self, X, y, *, epochs=1, batch_size=None, print_every=1, validation_data=None):
        """Train the model on ``X`` and ``y``.

        Args:
            X: training samples.
            y: training targets.
            epochs: number of passes over the training data.
            batch_size: samples per step; None uses the whole dataset
                as a single batch.
            print_every: print a step summary every this many steps
                (the last step of an epoch is always printed).
            validation_data: optional ``(samples, targets)`` tuple; when
                given, the model is evaluated on it once training is done.
        """
        #initialize accuracy object
        self.accuracy.init(y)

        #number of steps per epoch
        train_steps = self._count_steps(X, batch_size)

        #main training loop
        for epoch in range(1, epochs+1):

            #print epoch number
            print(f'epoch: {epoch}')

            #reset accumulated values in loss and accuracy
            self.loss.new_pass()
            self.accuracy.new_pass()

            for step in range(train_steps):

                #if batch size is not set -
                #train using one step and full dataset
                if batch_size is None:
                    batch_X = X
                    batch_y = y

                #otherwise slice the dataset
                else:
                    batch_X = X[step*batch_size:(step+1)*batch_size]
                    batch_y = y[step*batch_size:(step+1)*batch_size]

                #perform the forward pass
                output = self.forward(batch_X, training=True)

                #calculate loss
                #include regularization = True for training models
                data_loss, regularization_loss = \
                    self.loss.calculate(output, batch_y,
                                        include_regularization=True)
                loss = data_loss + regularization_loss

                #get predictions and calculate an accuracy
                predictions = self.output_layer_activation.predictions(output)
                accuracy = self.accuracy.calculate(predictions, batch_y)

                #perform backward pass
                self.backward(output, batch_y)

                #optimize (update parameters)
                self.optimizer.pre_update_params()
                for layer in self.trainable_layers:
                    self.optimizer.update_params(layer)
                self.optimizer.post_update_params()

                #print summary
                if not step % print_every or step == train_steps - 1:
                    print(f'step: {step}, ' +
                    f'acc: {accuracy:.3f}, ' +
                    f'loss: {loss:.3f} (' +
                    f'data_loss: {data_loss:.3f}, ' +
                    f'reg_loss: {regularization_loss:.6f}), ' +
                    f'lr: {self.optimizer.current_learning_rate}')

            #get and print epoch loss and accuracy
            #this has to happen inside the epoch loop: the accumulated
            #values are reset by new_pass() when the next epoch starts
            epoch_data_loss, epoch_regularization_loss = \
                self.loss.calculate_accumulated(
                    include_regularization=True)
            epoch_loss = epoch_data_loss + epoch_regularization_loss
            epoch_accuracy = self.accuracy.calculate_accumulated()

            print(f'training, ' +
                f'acc: {epoch_accuracy:.3f}, ' +
                f'loss: {epoch_loss:.3f} (' +
                f'data_loss: {epoch_data_loss:.3f}, ' +
                f'reg_loss: {epoch_regularization_loss:.6f}), ' +
                f'lr: {self.optimizer.current_learning_rate}')

        #if there is validation data
        #validate the model once, after the last epoch
        if validation_data is not None:
            #evaluate the model
            self.evaluate(*validation_data, batch_size=batch_size)

    def backward(self, output, y):
        """Propagate gradients from the loss back through every layer."""
        #if softmax classifier
        if self.softmax_classifier_output is not None:
            #first call backward method
            #on the combined activation/loss
            #this will set dinputs property
            self.softmax_classifier_output.backward(output, y)

            #since we'll not call backward method of the last layer
            #which is Softmax activation (output layer)
            #as we used the combined activation/loss
            #object, we will set dinputs in this object
            self.layers[-1].dinputs = self.softmax_classifier_output.dinputs

            #call backward method going through
            #all the objects but the last
            #in reverse order passing dinputs as parameter
            for layer in reversed(self.layers[:-1]):
                layer.backward(layer.next.dinputs)

            return

        #first call backward method on the loss
        #this will set dinputs property that the last
        #layer will try to access shortly
        self.loss.backward(output, y)

        #call backward method going through all the objects
        #in reversed order passing dinputs as a parameter
        for layer in reversed(self.layers):
            layer.backward(layer.next.dinputs)

    #evaluates the model using passed in dataset
    def evaluate(self, X_val, y_val, *, batch_size=None):
        """Run the model on ``X_val``/``y_val`` and print loss and accuracy.

        Args:
            X_val: validation samples.
            y_val: validation targets.
            batch_size: samples per step; None uses the whole dataset
                as a single batch.
        """
        #number of steps needed to cover the dataset
        validation_steps = self._count_steps(X_val, batch_size)

        #reset accumulated values in loss
        #and accuracy objects
        self.loss.new_pass()
        self.accuracy.new_pass()

        #iterate over steps
        for step in range(validation_steps):

            #if batch size is not set
            #evaluate using one step and full dataset
            if batch_size is None:
                batch_X = X_val
                batch_y = y_val

            #otherwise slice a batch
            else:
                batch_X = X_val[step*batch_size:(step+1)*batch_size]
                batch_y = y_val[step*batch_size:(step+1)*batch_size]

            #perform the forward pass
            output = self.forward(batch_X, training=False)

            #calculate the loss
            #the per-step value is not needed, the call only
            #feeds the accumulated loss
            self.loss.calculate(output, batch_y)

            #get predictions
            predictions = self.output_layer_activation.predictions(output)

            #calculate accuracy
            #the per-step value is not needed, the call only
            #feeds the accumulated accuracy
            self.accuracy.calculate(predictions, batch_y)

        #get validation loss and accuracy
        validation_loss = self.loss.calculate_accumulated()
        validation_accuracy = self.accuracy.calculate_accumulated()

        #print a summary
        print(f'Validation, ' +
              f'acc: {validation_accuracy:.3f}, ' +
              f'loss: {validation_loss:.3f}')

    #retrieves and returns parameters of trainable layers
    def get_parameters(self):
        """Return a list with the parameters of every trainable layer."""
        return [layer.get_parameters() for layer in self.trainable_layers]

    #updates the model with new parameters
    def set_parameters(self, parameters):
        """Give each trainable layer its set of parameters.

        ``parameters`` is paired with ``trainable_layers`` in order, and each
        set is unpacked into that layer's ``set_parameters`` call.
        """
        #iterate over the parameters and layers
        #and update each layer with each set of the parameters
        for parameter_set, layer in zip(parameters, self.trainable_layers):
            layer.set_parameters(*parameter_set)

# Common Accuracy Class

In [20]:
class Accuracy:
    """Common accuracy logic.

    ``calculate`` relies on a ``compare(predictions, y)`` method that this
    class does not define itself, and on ``new_pass`` having created the
    accumulators beforehand.
    """

    def calculate(self, predictions, y):
        """Return the accuracy of one batch and add it to the running totals."""
        #element-wise results of comparing predictions with the targets
        matches = self.compare(predictions, y)

        #accuracy of this batch alone
        batch_accuracy = np.mean(matches)

        #keep the totals needed for the accumulated accuracy
        self.accumulated_sum += np.sum(matches)
        self.accumulated_count += len(matches)

        return batch_accuracy

    def calculate_accumulated(self):
        """Return the accuracy over everything seen since ``new_pass``."""
        return self.accumulated_sum / self.accumulated_count

    def new_pass(self):
        """Reset the running totals, e.g. when a new epoch starts."""
        self.accumulated_sum = 0
        self.accumulated_count = 0

## Regression Model Accuracy Class
Used to calculate accuracy for regression models.

In [21]:
class Accuracy_Regression(Accuracy):
    """Accuracy for regression: a prediction counts as correct when it
    lies within ``precision`` of the target.
    """

    def __init__(self):
        #tolerance used by compare(); computed later by init()
        self.precision = None

    def init(self, y, reinit=False, *, precision_divisor=250):
        """Derive the precision from the ground truth values.

        The precision is ``np.std(y) / precision_divisor``. It is computed
        only when it has not been set yet or when ``reinit`` is true.
        """
        if not (self.precision is None or reinit):
            return
        self.precision = np.std(y) / precision_divisor

    def compare(self, predictions, y):
        """Return where the absolute error is strictly below the precision."""
        absolute_error = np.absolute(predictions - y)
        return absolute_error < self.precision

## Categorical Classification Model Accuracy Class
Used to calculate accuracy for classification models

In [22]:
class Accuracy_Categorical(Accuracy):
    """Accuracy for classification models."""

    def __init__(self, *, binary=False):
        #whether the model runs in binary classification mode
        self.binary = binary

    def init(self, y):
        """Do nothing; present so callers can invoke ``init`` uniformly."""
        pass

    def compare(self, predictions, y):
        """Return the element-wise equality of predictions and targets.

        Outside binary mode, targets with a two-dimensional shape are first
        reduced with ``np.argmax`` along axis 1.
        """
        #binary mode compares against the targets as they are
        if self.binary:
            return predictions == y

        #two-dimensional targets are reduced to one index per row
        if len(y.shape) == 2:
            y = np.argmax(y, axis=1)
        return predictions == y