In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()
import pandas as pd

In [2]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()
# Dense layer
class Layer_Dense :
# Layer initialization
    def __init__ ( self , n_inputs , n_neurons , weight_regularizer_l1 = 0, weight_regularizer_l2 = 0, bias_regularizer_l1 = 0, bias_regularizer_l2 =0):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros(( 1 , n_neurons))
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        #set regularization strength
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases
    # Backward pass
    def backward ( self , dvalues ):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0 , keepdims = True )
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)
        #Gradients on regularization
        #L1 on weights

        if self.weight_regularizer_l1 >0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights <0 ] =-1
            self.dweights += self.weight_regularizer_l1 * dL1
        #l2 on weights
        if self.weight_regularizer_l2 >0:
            self.dweights += 2*self.weight_regularizer_l2 * self.weights

        #L1 regularization - biases

        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases< 0 ] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1

        #l2 on biases
        if self.bias_regularizer_l2> 0:
            self.dbiases +=  2 * self.bias_regularizer_l2 * self.biases
        #Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

    # ReLU activation
class Activation_ReLU :
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum( 0 , inputs)
        # Backward pass
    def backward ( self , dvalues ):
        # Since we need to modify original variable,
        # let's make a copy of values first
        self.dinputs = dvalues.copy()
        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0 ] = 0
    # Softmax activation
class Activation_Softmax :
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Get unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis = 1 ,
        keepdims = True ))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis = 1 ,
        keepdims = True )
        self.output = probabilities
        # Backward pass
    def backward ( self , dvalues ):
        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)
        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in \
        enumerate ( zip (self.output, dvalues)):
        # Flatten output array
            single_output = single_output.reshape( - 1 , 1 )
            # Calculate Jacobian matrix of the output and
            jacobian_matrix = np.diagflat(single_output) - \
            np.dot(single_output, single_output.T)
            # Calculate sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
            single_dvalues)


# Common loss class
class Loss :

    #Regularization on loss calculation

    def regularization_loss(self, layer):
        #0 by default

        regularization_loss = 0
        #L1 regularization-weights
        #calculate only when factor greater than 0
        #L1 regularization’s penalty is the sum of all the absolute values for the weights and biases.
        #This is a linear penalty as regularization loss returned by this function is directly proportional to
        #parameter values. L2 regularization’s penalty is the sum of the squared weights and biases.


        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

        #L2 regularization - weights
        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizerl2 * np.sum(layer.weights*layer.weights)

        #L1 regularization - biases
        #calculate only when factor greater than 0
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

        #L2 regularization - biases
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss       

        
    # Calculates the data and regularization losses
    # given model output and ground truth values
    def calculate ( self , output , y ):
        # Calculate sample losses
        sample_losses = self.forward(output, y)
        # Calculate mean loss
        data_loss = np.mean(sample_losses)
        # Return loss
        return data_loss
        # Cross-entropy loss



class Loss_CategoricalCrossentropy ( Loss ):
    # Forward pass
    def forward ( self , y_pred , y_true ):
    # Number of samples in a batch
        samples = len (y_pred)
        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7 , 1 - 1e-7 )
        # Probabilities for target values -
        # only if categorical labels
        if len (y_true.shape) == 1 :
            correct_confidences = y_pred_clipped[
            range (samples),
            y_true
            ]
        # Mask values - only for one-hot encoded labels
        elif len (y_true.shape) == 2 :
            correct_confidences = np.sum(
            y_pred_clipped * y_true,
            axis = 1
            )
    # Losses
        negative_log_likelihoods = - np.log(correct_confidences)
        return negative_log_likelihoods
        # Backward pass
    def backward ( self , dvalues , y_true ):
        # Number of samples
        samples = len (dvalues)
        # Number of labels in every sample
        # We'll use the first sample to count them
        labels = len (dvalues[ 0 ])
        # If labels are sparse, turn them into one-hot vector
        if len (y_true.shape) == 1 :
            y_true = np.eye(labels)[y_true]
        # Calculate gradient
        self.dinputs = - y_true / dvalues
        # Normalize gradient
        self.dinputs = self.dinputs / samples

        
        # Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy ():
    # Creates activation and loss function objects
    def __init__ ( self ):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        # Forward pass
    def forward ( self , inputs , y_true ):
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)
    # Backward pass
    def backward ( self , dvalues , y_true ):
        # Number of samples
        samples = len (dvalues)
        # If labels are one-hot encoded,
        # turn them into discrete values
        if len (y_true.shape) == 2 :
            y_true = np.argmax(y_true, axis = 1 )
            # Copy so we can safely modify
        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[ range (samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples

class Basic_SGD:

    def __init__(self, learning_rate = 1.0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def update_params(self, layer):
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

class Optimizer_SGD:
    #Initialize optimizer - set settings,
    #learning rate of 1 is default for this optimizer

    def __init__(self, learning_rate = 1, decay = 0, momentum = 0):
        self.name = "optimizerSGD"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        #if later does not contain momentum arrays, create them
        #filled them with zeros
        if self.momentum:

            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                #if there is no momentum array for weights
                #the array does not exist for biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)

            #Build weight updates with momentum - take previous
            #updates multiplied by retain factor and update with
            #current gradients

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            #build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentum = bias_updates
            
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases


        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

    def post_update_params(self):
        self.iterations += 1

# Adagrad optimizer
class Optimizer_Adagrad :
    # Initialize optimizer - set settings
    def __init__ ( self , learning_rate = 1. , decay = 0. , epsilon = 1e-7 ):
        self.name = "OptimizerAdagrad"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
    # Call once before any parameter updates
    def pre_update_params ( self ):
        if self.decay:
            self.current_learning_rate = self.learning_rate *  ( 1. / ( 1. + self.decay * self.iterations))
        
    # Update parameters
    def update_params ( self , layer ):
        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr (layer, 'weight_cache' ):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2
        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += - self.current_learning_rate * layer.dweights /  (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += - self.current_learning_rate *  layer.dbiases /       (np.sqrt(layer.bias_cache) + self.epsilon)
        # Call once after any parameter updates
    def post_update_params ( self ):
        self.iterations += 1

class Optimizer_RMSprop:

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, rho=0.9):
        self.name = "RMSProp"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
    
    #Update parameters
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    
    def update_params(self, layer):

        #if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'): 
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        #update cache with squared current gradient
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache =  self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases ** 2

        layer.weights += - self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += - self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
    
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adam:

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.name = "OptimizerAdam"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.iterations = 0
        self.beta_1 = beta_1
        self.beta_2 = beta_2
    
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        #if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            

        layer.weight_momentums = self.beta_1 * layer.weight_momentums  + (1- self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1- self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / ( 1 - self.beta_1 ** (self.iterations + 1 ))
        bias_momentums_corrected = layer.bias_momentums / ( 1 - self.beta_1 ** (self.iterations + 1 ))
# Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + ( 1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + ( 1 - self.beta_2) * layer.dbiases ** 2

        weight_cache_corrected = layer.weight_cache / ( 1 - self.beta_2 ** (self.iterations + 1 ))
        bias_cache_corrected = layer.bias_cache / ( 1 - self.beta_2 ** (self.iterations + 1 ))

        layer.weights += - self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += - self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
    
    def post_update_params ( self ):
        self.iterations += 1



In [3]:

X, y = spiral_data( samples = 100 , classes = 3 )
# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense( 2 , 64 , weight_regularizer_l1=5e-4, bias_regularizer_l2= 5e-4)
# Create ReLU activation (to be used with Dense layer):

activation1 = Activation_ReLU()
# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values (output values)

dense2 = Layer_Dense( 64 ,3  )
# Create Softmax classifier's combined loss and activation

loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Perform a forward pass of our training data through this layer
optimizer_class =[ 
Optimizer_Adagrad(decay = 1e-4),
Optimizer_SGD(decay = 1e-3, momentum = 0.9),
Optimizer_RMSprop( decay = 1e-4 ),
Optimizer_RMSprop( learning_rate = 0.02 , decay = 1e-5 ,rho = 0.999 ),
Optimizer_Adam( learning_rate = 0.02 , decay = 1e-5 )]

scores = {"method":[],"epoch":[],"loss":[],"lr":[],"acc":[], "reg_loss":[],"data_loss":[]}



In [4]:
for  optimizer in optimizer_class: 
    
    for epoch in range (10001):
        

        dense1.forward(X)
        #this is whhen input * weight + bias
        # Perform a forward pass through activation function
        # takes the output of first dense layer here

        activation1.forward(dense1.output) #this is relu
        # Perform a forward pass through second Dense layer
        # takes outputs of activation function of first layer as inputs

        dense2.forward(activation1.output) #this is input * weight + bias
        # Perform a forward pass through the activation/loss function
        # takes the output of second dense layer here and returns loss

        data_loss = loss_activation.forward(dense2.output, y)
        
        #calculate regularization penalty

        regularization_loss = loss_activation.loss.regularization_loss(dense1) + loss_activation.loss.regularization_loss(dense2)

        #calculate overall loss
        loss = data_loss + regularization_loss

        predictions = np.argmax(loss_activation.output, axis = 1 )
        if len (y.shape) == 2 :
            y = np.argmax(y, axis = 1 )
        accuracy = np.mean(predictions == y)

        if not epoch % 5000:
        

            print(f'epoch: {epoch} , ' +
                    f'acc: {accuracy :.3f} , ' +
                    f'loss: {loss :.3f} ' +
                    f'data_loss: {data_loss :.3f} , ' +
                    f'reg_loss: {regularization_loss :.3f} , ' +
                    f'lr: {optimizer.current_learning_rate} ' +
                    f'method: {optimizer.name}')

            scores["method"].append(optimizer.name)
            scores["epoch"].append(epoch)
            scores["acc"].append(accuracy)
            scores["loss"].append(loss)
            scores["data_loss"].append(data_loss)
            scores["reg_loss"].append(regularization_loss)
            scores["lr"].append(optimizer.current_learning_rate)       


        # Backward pass
        loss_activation.backward(loss_activation.output, y)
        dense2.backward(loss_activation.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)

            
        optimizer.pre_update_params()
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.post_update_params()



epoch: 0 , acc: 0.360 , loss: 1.099 data_loss: 1.099 , reg_loss: 0.001 , lr: 1.0 method: OptimizerAdagrad
epoch: 5000 , acc: 0.743 , loss: 0.646 data_loss: 0.584 , reg_loss: 0.062 , lr: 0.6667111140742716 method: OptimizerAdagrad
epoch: 10000 , acc: 0.780 , loss: 0.594 data_loss: 0.529 , reg_loss: 0.065 , lr: 0.5000250012500626 method: OptimizerAdagrad
epoch: 0 , acc: 0.780 , loss: 0.594 data_loss: 0.529 , reg_loss: 0.065 , lr: 1 method: optimizerSGD
epoch: 5000 , acc: 0.597 , loss: 0.844 data_loss: 0.790 , reg_loss: 0.054 , lr: 0.16669444907484582 method: optimizerSGD
epoch: 10000 , acc: 0.607 , loss: 0.797 data_loss: 0.728 , reg_loss: 0.069 , lr: 0.09091735612328393 method: optimizerSGD
epoch: 0 , acc: 0.600 , loss: 0.798 data_loss: 0.729 , reg_loss: 0.069 , lr: 0.001 method: RMSProp
epoch: 5000 , acc: 0.863 , loss: 0.447 data_loss: 0.361 , reg_loss: 0.086 , lr: 0.0006667111140742717 method: RMSProp
epoch: 10000 , acc: 0.900 , loss: 0.383 data_loss: 0.290 , reg_loss: 0.094 , lr: 0.00

In [5]:
df_scores = pd.DataFrame(data = scores)
df_scores

Unnamed: 0,method,epoch,loss,lr,acc,reg_loss,data_loss
0,OptimizerAdagrad,0,1.099095,1.0,0.36,0.0005,1.098594
1,OptimizerAdagrad,5000,0.645967,0.666711,0.743333,0.06236,0.583607
2,OptimizerAdagrad,10000,0.594087,0.500025,0.78,0.064821,0.529266
3,optimizerSGD,0,0.593893,1.0,0.78,0.064811,0.529082
4,optimizerSGD,5000,0.844198,0.166694,0.596667,0.054026,0.790172
5,optimizerSGD,10000,0.796557,0.090917,0.606667,0.06896,0.727597
6,RMSProp,0,0.798036,0.001,0.6,0.068923,0.729113
7,RMSProp,5000,0.446535,0.000667,0.863333,0.085515,0.36102
8,RMSProp,10000,0.383417,0.0005,0.9,0.093591,0.289826
9,RMSProp,0,0.383395,0.02,0.906667,0.093596,0.289799
