In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()
import pandas as pd

In [2]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()
# Dense layer
class Layer_Dense :
# Layer initialization
    def __init__ ( self , n_inputs , n_neurons , weight_regularizer_l1 = 0, weight_regularizer_l2 = 0, bias_regularizer_l1 = 0, bias_regularizer_l2 =0):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros(( 1 , n_neurons))
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        #set regularization strength
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases
    # Backward pass
    def backward ( self , dvalues ):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0 , keepdims = True )
        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)
        #Gradients on regularization
        #L1 on weights

        if self.weight_regularizer_l1 >0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights <0 ] =-1
            self.dweights += self.weight_regularizer_l1 * dL1
        #l2 on weights
        if self.weight_regularizer_l2 >0:
            self.dweights += 2*self.weight_regularizer_l2 * self.weights

        #L1 regularization - biases

        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases< 0 ] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1

        #l2 on biases
        if self.bias_regularizer_l2> 0:
            self.dbiases +=  2 * self.bias_regularizer_l2 * self.biases
        #Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

class Layer_Dropout:
    #init
    def __init__(self, rate):
        #store the rate, we intert it as for example for dropout of 0.1, we need success rate of 0.9
        self.rate = 1 - rate

    def forward(self, inputs):
        #save input values
        self.inputs = inputs
        #Generate and save scaled mask
        self.binary_mask = np.random.binomial(1, self.rate, size = inputs.shape) / self.rate
        #apply mask to output value
        self.output = inputs * self.binary_mask

    def backward(self, dvalues):
        #gradient on values
        self.dinputs = dvalues * self.binary_mask


    # ReLU activation
class Activation_ReLU :
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum( 0 , inputs)
        # Backward pass
    def backward ( self , dvalues ):
        # Since we need to modify original variable,
        # let's make a copy of values first
        self.dinputs = dvalues.copy()
        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0 ] = 0
    # Softmax activation

class Activation_Softmax :
    # Forward pass
    def forward ( self , inputs ):
        # Remember input values
        self.inputs = inputs
        # Get unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis = 1 ,
        keepdims = True ))
        # Normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis = 1 ,
        keepdims = True )
        self.output = probabilities
        # Backward pass
    def backward ( self , dvalues ):
        # Create uninitialized array
        self.dinputs = np.empty_like(dvalues)
        # Enumerate outputs and gradients
        for index, (single_output, single_dvalues) in \
        enumerate ( zip (self.output, dvalues)):
        # Flatten output array
            single_output = single_output.reshape( - 1 , 1 )
            # Calculate Jacobian matrix of the output and
            jacobian_matrix = np.diagflat(single_output) - \
            np.dot(single_output, single_output.T)
            # Calculate sample-wise gradient
            # and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix,
            single_dvalues)

class Activation_Sigmoid:
    #forward pass
    def forward(self, inputs):
        #save input and calculate / save output
        #ot the sigmoid function

        self.inputs = inputs
        self.output = 1/ (1 + np.exp(-inputs))

    def backward(self, dvalues):
        #derivative - calculates from output of the sigmoid function
        self.dinputs = dvalues * (1- self.output) * self.output

class Optimizer_SGD:
    #Initialize optimizer - set settings,
    #learning rate of 1 is default for this optimizer

    def __init__(self, learning_rate = 1, decay = 0, momentum = 0):
        self.name = "optimizerSGD"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        #if later does not contain momentum arrays, create them
        #filled them with zeros
        if self.momentum:

            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                #if there is no momentum array for weights
                #the array does not exist for biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)

            #Build weight updates with momentum - take previous
            #updates multiplied by retain factor and update with
            #current gradients

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            #build bias updates
            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentum = bias_updates
            
        else:
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases


        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adagrad :
    # Initialize optimizer - set settings
    def __init__ ( self , learning_rate = 1. , decay = 0. , epsilon = 1e-7 ):
        self.name = "OptimizerAdagrad"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
    # Call once before any parameter updates
    def pre_update_params ( self ):
        if self.decay:
            self.current_learning_rate = self.learning_rate *  ( 1. / ( 1. + self.decay * self.iterations))
        
    # Update parameters
    def update_params ( self , layer ):
        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr (layer, 'weight_cache' ):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2
        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += - self.current_learning_rate * layer.dweights /  (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += - self.current_learning_rate *  layer.dbiases /       (np.sqrt(layer.bias_cache) + self.epsilon)
        # Call once after any parameter updates
    def post_update_params ( self ):
        self.iterations += 1

class Optimizer_RMSprop:

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, rho=0.9):
        self.name = "RMSProp"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
    
    #Update parameters
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
    
    def update_params(self, layer):

        #if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'): 
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        #update cache with squared current gradient
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache =  self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases ** 2

        layer.weights += - self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += - self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
    
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adam:

    def __init__(self, learning_rate = 0.001, decay = 0, epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.name = "OptimizerAdam"
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.iterations = 0
        self.beta_1 = beta_1
        self.beta_2 = beta_2
    
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):

        #if layer does not contain cache arrays
        #create them filled with zeros

        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            

        layer.weight_momentums = self.beta_1 * layer.weight_momentums  + (1- self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1- self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / ( 1 - self.beta_1 ** (self.iterations + 1 ))
        bias_momentums_corrected = layer.bias_momentums / ( 1 - self.beta_1 ** (self.iterations + 1 ))
# Update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + ( 1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + ( 1 - self.beta_2) * layer.dbiases ** 2

        weight_cache_corrected = layer.weight_cache / ( 1 - self.beta_2 ** (self.iterations + 1 ))
        bias_cache_corrected = layer.bias_cache / ( 1 - self.beta_2 ** (self.iterations + 1 ))

        layer.weights += - self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += - self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
    
    def post_update_params ( self ):
        self.iterations += 1

# Common loss class
class Loss :

    #Regularization on loss calculation

    def regularization_loss(self, layer):
        #0 by default

        regularization_loss = 0
        #L1 regularization-weights
        #calculate only when factor greater than 0
        #L1 regularization’s penalty is the sum of all the absolute values for the weights and biases.
        #This is a linear penalty as regularization loss returned by this function is directly proportional to
        #parameter values. L2 regularization’s penalty is the sum of the squared weights and biases.


        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

        #L2 regularization - weights
        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizerl2 * np.sum(layer.weights*layer.weights)

        #L1 regularization - biases
        #calculate only when factor greater than 0
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

        #L2 regularization - biases
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss       

        
    # Calculates the data and regularization losses
    # given model output and ground truth values
    def calculate ( self , output , y ):
        # Calculate sample losses
        sample_losses = self.forward(output, y)
        # Calculate mean loss
        data_loss = np.mean(sample_losses)
        # Return loss
        return data_loss
        # Cross-entropy loss

class Loss_CategoricalCrossentropy ( Loss ):
    # Forward pass
    def forward ( self , y_pred , y_true ):
    # Number of samples in a batch
        samples = len (y_pred)
        # Clip data to prevent division by 0
        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7 , 1 - 1e-7 )
        # Probabilities for target values -
        # only if categorical labels
        if len (y_true.shape) == 1 :
            correct_confidences = y_pred_clipped[
            range (samples),
            y_true
            ]
        # Mask values - only for one-hot encoded labels
        elif len (y_true.shape) == 2 :
            correct_confidences = np.sum(
            y_pred_clipped * y_true,
            axis = 1
            )
    # Losses
        negative_log_likelihoods = - np.log(correct_confidences)
        return negative_log_likelihoods
        # Backward pass
    def backward ( self , dvalues , y_true ):
        # Number of samples
        samples = len (dvalues)
        # Number of labels in every sample
        # We'll use the first sample to count them
        labels = len (dvalues[ 0 ])
        # If labels are sparse, turn them into one-hot vector
        if len (y_true.shape) == 1 :
            y_true = np.eye(labels)[y_true]
        # Calculate gradient
        self.dinputs = - y_true / dvalues
        # Normalize gradient
        self.dinputs = self.dinputs / samples

# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy ():
    # Creates activation and loss function objects
    def __init__ ( self ):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        # Forward pass
    def forward ( self , inputs , y_true ):
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)
    # Backward pass
    def backward ( self , dvalues , y_true ):
        # Number of samples
        samples = len (dvalues)
        # If labels are one-hot encoded,
        # turn them into discrete values
        if len (y_true.shape) == 2 :
            y_true = np.argmax(y_true, axis = 1 )
            # Copy so we can safely modify
        self.dinputs = dvalues.copy()
        # Calculate gradient
        self.dinputs[ range (samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples


class Loss_BinaryCrossentropy(Loss):
    
    #forward pass
    def forward(self, y_pred, y_true):
        #clip data to prevent division by 0
        #clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        #calculate sample-wise loss

        sample_losses = -(y_true * np.log(y_pred_clipped)+ (1 - y_true) * np.log(1- y_pred_clipped) )
        sample_losses = np.mean(sample_losses, axis = 1)

        return sample_losses
    
    def backward(self, dvalues, y_true):
        #Number of samples

        samples = len(dvalues)

        #number of outputs in every sample
        #well use the first sample to count them

        outputs = len(dvalues[0])

        #clip data to prevent division by 0
        #clip both sides to not drag mean towards any value

        clipped_dvalues = np.clip(dvalues, 1e-7, 1-1e-7)

        #calculate gradient
        self.dinputs = -(y_true / clipped_dvalues - (1 - y_true) / (1 - clipped_dvalues)) / outputs

        #Normalize gradient
        self.dinputs = self.dinputs / samples

        


In [3]:

X, y = spiral_data( samples = 100 , classes = 2 )
# Create Dense layer with 2 input features and 3 output values

y = y.reshape(-1,1)
dense1 = Layer_Dense( 2 , 512 , weight_regularizer_l1=5e-4, bias_regularizer_l2= 5e-4)
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

#create dropout layer
dropout1 = Layer_Dropout(0.1)

# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values (output values)

dense2 = Layer_Dense( 512 ,3  )
# Create Softmax classifier's combined loss and activation

activation2 = Activation_Sigmoid()

loss_function = Loss_BinaryCrossentropy()

# Perform a forward pass of our training data through this layer
optimizer_class =[ 
#Optimizer_Adagrad(decay = 1e-4),
#Optimizer_SGD(decay = 1e-3, momentum = 0.9),
#Optimizer_RMSprop( decay = 1e-4 ),
#Optimizer_RMSprop( learning_rate = 0.02 , decay = 1e-5 ,rho = 0.999 ),
Optimizer_Adam( learning_rate = 0.05 , decay = 5e-5 )]

scores = {"method":[],"epoch":[],"loss":[],"lr":[],"acc":[], "reg_loss":[],"data_loss":[]}



In [4]:
for  optimizer in optimizer_class: 
    
    for epoch in range (10001):
        

        dense1.forward(X)
        #this is whhen input * weight + bias
        # Perform a forward pass through activation function
        # takes the output of first dense layer here

        activation1.forward(dense1.output) #this is relu
        # Perform a forward pass through second Dense layer
        # takes outputs of activation function of first layer as inputs


        # Perform a forward pass through the activation/loss function
        # takes the output of second dense layer here and returns loss
        dense2.forward(activation1.output) #this is input * weight + bias

        activation2.forward(dense2.output)

        
        

        data_loss = loss_function.calculate(activation2.output, y)
        
        #calculate regularization penalty

        regularization_loss = loss_function.regularization_loss(dense1) + loss_function.regularization_loss(dense2)

        #calculate overall loss
        loss = data_loss + regularization_loss

        predictions = (activation2.output > 0.5) * 1
        accuracy = np.mean(predictions == y)

        if not epoch % 100:
        

            print(f'epoch: {epoch} , ' +
                    f'acc: {accuracy:.3f} , ' +
                    f'loss: {loss:.3f} ' +
                    f'data_loss: {data_loss :.3f} , ' +
                    f'reg_loss: {regularization_loss :.3f} , ' +
                    f'lr: {optimizer.current_learning_rate} ' +
                    f'method: {optimizer.name}')

            scores["method"].append(optimizer.name)
            scores["epoch"].append(epoch)
            scores["acc"].append(accuracy)
            scores["loss"].append(loss)
            scores["data_loss"].append(data_loss)
            scores["reg_loss"].append(regularization_loss)
            scores["lr"].append(optimizer.current_learning_rate)       


        # Backward pass
        loss_function.backward(activation2.output, y)
        activation2.backward(loss_function.dinputs)
        dense2.backward(activation2.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)

            
        optimizer.pre_update_params()
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.post_update_params()
#Validate the model

#create test dataset



epoch: 0 , acc: 0.555 , loss: 0.697 data_loss: 0.693 , reg_loss: 0.004 , lr: 0.05 method: OptimizerAdam
epoch: 100 , acc: 0.872 , loss: 0.414 data_loss: 0.352 , reg_loss: 0.062 , lr: 0.04975371909050202 method: OptimizerAdam
epoch: 200 , acc: 0.945 , loss: 0.252 data_loss: 0.171 , reg_loss: 0.081 , lr: 0.049507401356502806 method: OptimizerAdam
epoch: 300 , acc: 0.960 , loss: 0.220 data_loss: 0.125 , reg_loss: 0.095 , lr: 0.0492635105177595 method: OptimizerAdam
epoch: 400 , acc: 0.975 , loss: 0.179 data_loss: 0.089 , reg_loss: 0.090 , lr: 0.04902201088288642 method: OptimizerAdam
epoch: 500 , acc: 0.975 , loss: 0.166 data_loss: 0.078 , reg_loss: 0.088 , lr: 0.048782867456949125 method: OptimizerAdam
epoch: 600 , acc: 0.980 , loss: 0.155 data_loss: 0.070 , reg_loss: 0.086 , lr: 0.04854604592455945 method: OptimizerAdam
epoch: 700 , acc: 0.980 , loss: 0.147 data_loss: 0.064 , reg_loss: 0.083 , lr: 0.048311512633460556 method: OptimizerAdam
epoch: 800 , acc: 0.980 , loss: 0.139 data_loss

ValueError: operands could not be broadcast together with shapes (300,1) (200,3) 

In [7]:

X_test, y_test = spiral_data(samples = 100, classes =2)
y_test = y_test.reshape(-1,1)
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)

loss = loss_function.calculate(activation2.output, y_test)

predictions = (activation2.output > 0.5 ) * 1
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss {loss:.3f}')


validation, acc: 0.995, loss 0.023


In [None]:
df_scores = pd.DataFrame(data = scores)
df_scores

In [None]:
import random
dropout_rate = 0.5
# Example output containing 10 values
example_output = [ 0.27 , - 1.03 , 0.67 , 0.99 , 0.05 , - 0.37 , - 2.01 , 1.13 , - 0.07 , 0.73 ]

while True:
    #Randomly choose inde and set value to 0

    index = random.randint(0, len(example_output) - 1)
    example_output[index ] =0 
    
    #we might set an index that already is zeroed
    #there are different ways of overcoming this problem
    #for simplicity we count values that are exactly 0
    #while it's extremely rare in real model that weights
    #are exacly 0, this is not the best method for sure

    dropped_out = 0

    for value in example_output:
        if value == 0:
            dropped_out += 1

    #if required number of outputs is zeroed - leave the loop
    if dropped_out / len(example_output) >= dropout_rate:
        break

print(example_output)



In [None]:
random.randint( 0 , len (example_output) - 1 )

In [None]:
np.random.binomial( 2 , 0.5 , size = 5 )

In [None]:
dropout_rate = 0.20
np.random.binomial( 1 , 1 - dropout_rate, size = 5 )

#this means that there are 80 % that it can be True for once toss. The results are F, T, T, F , F

In [None]:
import numpy as np
dropout_rate = 0.3
example_output = np.array([ 0.27 , - 1.03 , 0.67 , 0.99 , 0.05 ,
- 0.37 , - 2.01 , 1.13 , - 0.07 , 0.73 ])
example_output *= np.random.binomial( 1 , 1 - dropout_rate,
example_output.shape)
print (example_output)

In [None]:
import numpy as np
dropout_rate = 0.2
example_output = np.array([ 0.27 , - 1.03 , 0.67 , 0.99 , 0.05 ,- 0.37 , - 2.01 , 1.13 , - 0.07 , 0.73 ])
print (f'sum initial { sum (example_output)}')
sums = []
for i in range ( 1000000 ):
    example_output2 = example_output *  np.random.binomial( 1 , 1 - dropout_rate, example_output.shape) /    ( 1 - dropout_rate)
    sums.append( sum (example_output2))
print ( f'mean sum: {np.mean(sums)} ' )