In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
import pandas as pd

class Layer_Dense:

    #layer initialization 
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1,n_neurons))

    #forward pass
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        #derivative wrt of weights is inputs. Dimension adjustment is needed
        self.dweights = np.dot(self.inputs.T, dvalues)
        #derivative of bias is column sums
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        #derivative wrt of inputs is weights
        self.dinputs = np.dot(dvalues, self.weights.T)

class Activation_ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0,inputs) #if the inputs is lower than 0, we make it 0, uf not, then we pass on

    def backward(self, dvalues):
        self.dinputs = dvalues.copy() #copy the gradient matrix
        self.dinputs[self.inputs<= 0] = 0 #if it is lower than 0 then we'll make it zero

class Activation_Softmax:
    def forward(self, inputs):
        self.inputs = inputs
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True)) #make the max = 1 and the min is -inf
        probabilities = exp_values / np.sum(exp_values, axis = 1, keepdims = True)# normalize

        self.output = probabilities

    def backward(self, dvalues):
        self.dinputs = np.empty_like(dvalues)

        for index, (single_output, single_dvalues)  in enumerate(zip(self.output, dvalues)):
            #single output is self.output[0]. it has the dimension 1xn, then the next line, we'll shift if to n,1
            single_output = single_output.reshape(-1,1) #reshape the output to become (,1). n row with 1 column
            
            #diagflat is to craete the matrix where diagonal is a value and the rest is 0. then based on the formula 
            #diagflat will have n,n matrix with diagonal is single_output and the rest is 0
            #then we have single.output where dim = n,1 and single.output.T dim = 1,n. this will result n,n
            jacobian_matrix  = np.diagflat(single_output) - np.dot(single_output, single_output.T)

            #jacobian matrix is n,n and single_values is n,1. Why Dvalues has n,n dimension and single is only the first row
            #this will result n,1 dimension
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
class Loss:
    def calculate(self, output, y):
        sample_losses = self.forward(output,y) #output in here is the preddiction
        data_loss = np.mean(sample_losses)
        return data_loss

class Loss_CategoricalCrossentropy(Loss):
    def forward(self, y_pred, y_true):
        samples = len(y_pred)

        #clip data to avoid division by 0
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
        
        if len(y_true.shape) == 1 : 
            #softmax_outputs1 = np.array([[ 0.7 , 0.1 , 0.2 ],
            #[ 0.1 , 0.5 , 0.4 ],
            #[ 0.02 , 0.9 , 0.08 ]])
            #class_targets1 = np.array([ 0 , 1 , 1 ])
            #for example [0,1,1] just go to observation where it is y true. 
            #y_pred has a dimension nxn so for the first row, take index 0 y_pred_clipped[0,0]
            #y_pred_clipped[1,1]
            #y_pred_clipped[2,1]

            correct_cofidences = y_pred_clipped[ range(samples), y_true]   

        elif len(y_true.shape) == 2:
            #softmax_outputs2 = np.array([[ 0.7 , 0.1 , 0.2 ],
            #[ 0.1 , 0.5 , 0.4 ],
            #[ 0.02 , 0.9 , 0.08 ]])
            #class_targets2 = np.array([[ 1 , 0 , 0 ],
            #[ 0 , 1 , 0 ],
            #[ 0 , 1 , 0 ]])
            #since this is one hot encoding. only 1 value is 1 and the rest is zero. so when multiplying, only the given 1 will yield a result
            correct_confidences1 = np.sum(y_pred_clipped * y_true, axis = 1) 
        
        negative_log_likelihoods = -np.log(correct_cofidences)
        return negative_log_likelihoods
    
    def backward(self, dvalues, y_true):
        #number of samples
        samples = len(dvalues) #np.array([[ 1 , 2 , 3 ],[ 5 , 6 , 7 ],[ 9 , 10 , 11  ]])

        labels = len(dvalues[0]) #array([1,2,3])

        #if y_true is [0,1,1]
        #then np.eye will make it 
        #array([[1., 0., 0.],
        #       [0., 1., 0.],
        #       [0., 1., 0.]], dtype=float32)

        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true] 
    
        self.dinputs = -y_true / dvalues #partial derivatives with respect tp inputs = matrix 3x3 - 3x3
        #the derivative of this loss fucntion with respect ot is input = ground truth vector / vector of predicted values

        self.dinputs = self.dinputs / samples
        #normalize to make the sum magnitude invariant to the number of samples. 

        
class Activation_Softmax_Loss_CategoricalCrossentropy():
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        self.activation.forward(inputs) #use softmax activation
        self.output = self.activation.output #the output is a probability
        return self.loss.calculate(self.output, y_true) #calculate loss between predicted (self.output) and y_true
    
    def backward(self, dvalues, y_true):
        samples = len(dvalues)

        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1) #convert from one hot encoder to the discrete true labels
        
        self.dinputs = dvalues.copy()
        self.dinputs [range(samples) ,y_true] -= 1 #only at the given ytue, the value is minus by one. why?
        #becayse the partial derivative of loss wrt of softmax function inputs. 
        self.dinputs = self.dinputs/samples

class Optimizer_SGD:
    def __init__(self, learning_rate = 1., decay  = 0., momentum  = 0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.iterations = 0
        self.decay = decay
        self.current_decay = decay
        self.momentum = momentum
        self.name = f'OptimizerSGD lr:{self.learning_rate}, decay ={self.decay}, momentum = {self.momentum}'
        

    def pre_udpate_params(self):
        if self.decay:
            self.current_decay = 1./ (1.+ self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * (1./ (1.+ self.decay * self.iterations))

    def update_params(self, layer):
        if self.momentum:
        #if we use momentum
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases  
        
        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adagrad:
    def __init__(self, learning_rate = 1, decay  = 0.,  epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.iterations = 0
        self.decay = decay
        self.current_decay = decay
        self.epsilon = epsilon #is only to prevent division by 0
        self.name = f'Adagrad decay =  {self.decay}'
        

    def pre_udpate_params(self):
        if self.decay:
            self.current_decay = 1./ (1.+ self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * (1./ (1.+ self.decay * self.iterations))

    def update_params(self, layer):
        #if we use momentum
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2


        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases  / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1
        
class Optimizer_RMSprop:
    def __init__(self, learning_rate = 0.001, decay  = 0., rho = 0.9, epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.iterations = 0
        self.decay = decay
        self.current_decay = decay
        self.epsilon = epsilon #is only to prevent division by 0
        self.name = f'RMSProp =  {self.decay}'
        self.rho = rho
        

    def pre_udpate_params(self):
        if self.decay:
            self.current_decay = 1./ (1.+ self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * (1./ (1.+ self.decay * self.iterations))

    def update_params(self, layer):
        #if we use momentum
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache = self.rho * layer.weight_cache + ( 1- self.rho) * layer.dweights **2
        layer.bias_cache = self.rho * layer.bias_cache + ( 1- self.rho) * layer.dbiases **2


        layer.weights += -self.learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.learning_rate * layer.dbiases  / (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1


class Optimizer_Adam:
    def __init__(self, learning_rate = 0.001, decay  = 0., epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.iterations = 0
        self.decay = decay
        self.current_decay = decay
        self.epsilon = epsilon #is only to prevent division by 0
        self.name = f'OptimizerAdam lr =  {self.learning_rate}, decay = {self.current_decay}'
        self.beta_1 = beta_1
        self.beta_2 = beta_2

        

    def pre_udpate_params(self):
        if self.decay:
            self.current_decay = 1./ (1.+ self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * (1./ (1.+ self.decay * self.iterations))

    def update_params(self, layer):
        #if we use momentum
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1-self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1- self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / (1- self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1- self.beta_1 ** (self.iterations + 1))

        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1- self.beta_2) * layer.dbiases ** 2

        #get corrected cache
        weight_cache_corrected = layer.weight_cache / (1- self.beta_2 ** (self.iterations +1))
        bias_cache_corrected = layer.bias_cache / (1- self.beta_2 ** (self.iterations + 1))

        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
            

    def post_update_params(self):
        self.iterations += 1

In [2]:
# Create dataset
X, y = spiral_data( samples = 100 , classes = 3 )
# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense( 2 , 64 )
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()
# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense( 64 , 3 )
# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Create optimizer
optimizer_class = [
Optimizer_SGD( learning_rate= 1),
Optimizer_SGD( learning_rate= 0.85),
Optimizer_SGD( decay = 1e-2),
Optimizer_SGD( decay = 1e-3),
Optimizer_SGD(decay = 1e-3,momentum = 0.5),
Optimizer_SGD(decay = 1e-3,momentum = 0.9),
Optimizer_Adagrad(decay = 1e-4),
Optimizer_RMSprop(decay = 1e-4),
Optimizer_Adam(learning_rate = 0.05,decay = 5e-7)


]


scores = {"method":[],"epoch":[],"loss":[],"lr":[],"acc":[],"decay":[]}

for optimizer in optimizer_class:
    dense1 = Layer_Dense( 2 , 64 )
    activation1 = Activation_ReLU()
    dense2 = Layer_Dense( 64 , 3 )
    for epoch in range ( 10001 ):
        # Perform a forward pass of our training data through this layer
        dense1.forward(X)
        # Perform a forward pass through activation function
        # takes the output of first dense layer here
        activation1.forward(dense1.output)
        # Perform a forward pass through second Dense layer
        # takes outputs of activation function of first layer as inputs
        dense2.forward(activation1.output)
        # Perform a forward pass through the activation/loss function
        # takes the output of second dense layer here and returns loss
        loss = loss_activation.forward(dense2.output, y)
        # Calculate accuracy from output of activation2 and targets
        # calculate values along first axis
        predictions = np.argmax(loss_activation.output, axis = 1 )
        if len (y.shape) == 2 :
            y = np.argmax(y, axis = 1 )
        accuracy = np.mean(predictions == y)
        if not epoch % 10000 :
            print (f'epoch: {epoch} , ' +
                    f'acc: {accuracy :.3f} , ' +
                    f'loss: {loss :.3f},' +
                    f'lr:{optimizer.current_learning_rate:.3f}' )

            scores["method"].append(optimizer.name)
            scores["epoch"].append(epoch)
            scores["acc"].append(accuracy)
            scores["loss"].append(loss)
            scores["lr"].append(optimizer.current_learning_rate)
            scores["decay"].append(optimizer.current_decay)    
                # Backward pass
        loss_activation.backward(loss_activation.output, y)
        dense2.backward(loss_activation.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)
        # Update weights and biases
        optimizer.pre_udpate_params()
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.post_update_params()

epoch: 0 , acc: 0.343 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.677 , loss: 0.729,lr:1.000
epoch: 0 , acc: 0.313 , loss: 1.099,lr:0.850
epoch: 10000 , acc: 0.760 , loss: 0.561,lr:0.850
epoch: 0 , acc: 0.367 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.703 , loss: 0.644,lr:0.010
epoch: 0 , acc: 0.410 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.570 , loss: 0.922,lr:0.091
epoch: 0 , acc: 0.383 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.810 , loss: 0.440,lr:0.091
epoch: 0 , acc: 0.357 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.947 , loss: 0.136,lr:0.091
epoch: 0 , acc: 0.343 , loss: 1.099,lr:1.000
epoch: 10000 , acc: 0.840 , loss: 0.350,lr:0.500
epoch: 0 , acc: 0.380 , loss: 1.099,lr:0.001
epoch: 10000 , acc: 0.693 , loss: 0.692,lr:0.001
epoch: 0 , acc: 0.307 , loss: 1.099,lr:0.050
epoch: 10000 , acc: 0.923 , loss: 0.169,lr:0.050


In [3]:
df_scores = pd.DataFrame(data = scores)
df_scores[df_scores.epoch !=0]

Unnamed: 0,method,epoch,loss,lr,acc,decay
1,"OptimizerSGD lr:1, decay =0.0, momentum = 0",10000,0.729148,1.0,0.676667,0.0
3,"OptimizerSGD lr:0.85, decay =0.0, momentum = 0",10000,0.560901,0.85,0.76,0.0
5,"OptimizerSGD lr:1.0, decay =0.01, momentum = 0",10000,0.643755,0.009902,0.703333,0.009902
7,"OptimizerSGD lr:1.0, decay =0.001, momentum = 0",10000,0.921615,0.090917,0.57,0.090917
9,"OptimizerSGD lr:1.0, decay =0.001, momentum = 0.5",10000,0.439945,0.090917,0.81,0.090917
11,"OptimizerSGD lr:1.0, decay =0.001, momentum = 0.9",10000,0.13562,0.090917,0.946667,0.090917
13,Adagrad decay = 0.0001,10000,0.35009,0.500025,0.84,0.500025
15,RMSProp = 0.0001,10000,0.691779,0.0005,0.693333,0.500025
17,"OptimizerAdam lr = 0.05, decay = 5e-07",10000,0.168533,0.049751,0.923333,0.995025


In [4]:
test = Optimizer_SGD()
print(test.decay)
print(test.iterations)

0.0
0


In [5]:
'''test.momentum# Create dataset
X, y = spiral_data( samples = 100 , classes = 3 )
# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense( 2 , 64 )
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()
# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense( 64 , 3 )
# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Create optimizer

optimizer =Optimizer_SGD(decay = 1e-3,momentum = 0.5)

for epoch in range ( 11):
    # Perform a forward pass of our training data through this layer
    dense1.forward(X)
    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)
    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)
    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    loss = loss_activation.forward(dense2.output, y)
    # Calculate accuracy from output of activation2 and targets
    # calculate values along first axis
    predictions = np.argmax(loss_activation.output, axis = 1 )
    if len (y.shape) == 2 :
        y = np.argmax(y, axis = 1 )
    accuracy = np.mean(predictions == y)
    if not epoch % 1 :
        print (f'epoch: {epoch} , ' +
                f'acc: {accuracy :.3f} , ' +
                f'loss: {loss :.3f},' +
                f'lr:{optimizer.current_learning_rate:.3f}' )   
            # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    # Update weights and biases
    print(f'optimizer momentum ={optimizer.momentum}')


    optimizer.pre_udpate_params()
    print(f'decay = {optimizer.current_decay}')
    print(f'clr: {optimizer.current_learning_rate}')
    optimizer.update_params(dense1)
    print(dense1.weight_momentums[:2,:5])
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    '''
    

"test.momentum# Create dataset\nX, y = spiral_data( samples = 100 , classes = 3 )\n# Create Dense layer with 2 input features and 64 output values\ndense1 = Layer_Dense( 2 , 64 )\n# Create ReLU activation (to be used with Dense layer):\nactivation1 = Activation_ReLU()\n# Create second Dense layer with 64 input features (as we take output\n# of previous layer here) and 3 output values (output values)\ndense2 = Layer_Dense( 64 , 3 )\n# Create Softmax classifier's combined loss and activation\nloss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()\n# Create optimizer\n\noptimizer =Optimizer_SGD(decay = 1e-3,momentum = 0.5)\n\nfor epoch in range ( 11):\n    # Perform a forward pass of our training data through this layer\n    dense1.forward(X)\n    # Perform a forward pass through activation function\n    # takes the output of first dense layer here\n    activation1.forward(dense1.output)\n    # Perform a forward pass through second Dense layer\n    # takes outputs of activati

In [6]:
dense1.weight_momentums = np.zeros_like(dense1.weights)
print(dense1.weight_momentums[:2,:5])
print(f'weights:{dense1.weights.shape}')
print(f' weight momentum :{dense1.weight_momentums.shape}')
optimizer.update_params(dense1)
print(f'after:{dense1.weight_momentums[:2,:5]}')
weight_updates = optimizer.momentum * dense1.weight_momentums - optimizer.current_learning_rate * dense1.dweights
dense1.weight_momentums = weight_updates

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
weights:(2, 64)
 weight momentum :(2, 64)
after:[[ 0.00000000e+00 -4.17712674e-05  1.97196539e-04 -1.15758173e-04
  -5.40050515e-05]
 [ 0.00000000e+00 -3.10440995e-05 -1.64139961e-04 -1.14780413e-04
  -7.17771644e-05]]


AttributeError: 'Optimizer_Adam' object has no attribute 'momentum'