In [21]:
import numpy as np 
import random

In [22]:
class LayerDense : 

    def __init__(self , n_input , n_layer):
        self.weights = 0.01 * np.random.rand(n_input , n_layer)
        self.biases = np.zeros((1,n_layer))


    def forward(self , X) : 

        self.input = X

        self.output = np.dot( X , self.weights ) + self.biases 


    def backward(self , dvalues): 

        self.dweights = np.dot( self.input.T, dvalues )
        self.dbiases = np.sum(dvalues , axis=0 , keepdims=True)

        self.dinput = np.dot(dvalues , self.weights.T)


class ReLU : 

    def forward (self , X) : 

        self.input = X 

        self.output = np.maximum(0,X)
    
    def backward(self , dvalues) : 

        self.dinput = dvalues.copy()

        self.dinput[self.input <= 0] = 0


    



class Softmax : 

    def forward(self , X) :

        exp_values = np.exp(X - np.max(X , axis=1 , keepdims=True)) 

        probability = exp_values / np.sum(exp_values , axis=1 , keepdims=1)
        self.output = probability


class loss :

    def calculate(self , X , y_true) : 

        loss = self.forward( X , y_true)

        return np.mean(loss)


class CategorycalEntropyLoss(loss) : 

    def forward(self , y_pred , y_true) : 

        y_pred_clipped = np.clip(y_pred , 1e-7 , 1 - 1e-7 )

        if len(y_true.shape) == 1 : 
            
            right_probability = y_pred_clipped[range(len(y_pred)) , y_true]
        
        elif len(y_true.shape) == 2 : 

            right_probability = np.sum(y_pred_clipped * y_true , axis=1)

    
        negative_log = -np.log(right_probability)
        
        return negative_log
    
    

class entropyloss_and_softmax:

    def __init__(self) : 
        self.activation = Softmax()
        self.loss = CategorycalEntropyLoss()

    def forward (self , input , y_true) :

        self.activation.forward(input) 

        self.output = self.activation.output

        return self.loss.calculate(self.output , y_true)
    

    def backward(self , dvalues , y_true) : 

        sample = len(dvalues)

        if len(y_true.shape) == 2 : 
            y_true = np.argmax(y_true , axis=1)
        
        self.dinput = dvalues.copy()

        self.dinput[range(sample) , y_true] -= 1

        self.dinput = self.dinput / sample








    

In [33]:
class Optimazer_SDG : 

    def __init__(self , learning_rate = 0.01):
        self.learning_rate = learning_rate

    def optimazer(self , layer) : 

        layer.weights +=  -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

In [34]:
class Optimazer_decay : 

    def __init__(self , decay , learning_rate = 1):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iteration = 1

    def pre_optimize(self) :

        if self.decay : 

            self.current_learning_rate = self.learning_rate / (1 + self.decay * self.iteration)

    def optimazer(self , layer) : 

        layer.weights +=  -self.current_learning_rate * layer.dweights
        layer.biases += -self.current_learning_rate * layer.dbiases

    def post_optimizer(self) : 

        self.iteration += 1

In [35]:
class Optimazer_decay_momentum : 

    def __init__(self , decay , momentum , learning_rate = 1):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iteration = 1
        self.momentum = momentum

    def pre_optimize(self) :


        if self.decay : 

            self.current_learning_rate = self.learning_rate / (1 + self.decay * self.iteration)


    def optimazer(self, layer):

        if self.momentum:
            if not hasattr(layer, "weight_momentum"):

                layer.weight_momentum = np.zeros_like(layer.weights)
                layer.bias_momentum = np.zeros_like(layer.biases)

            weight_updates = self.momentum * layer.weight_momentum - self.current_learning_rate * layer.dweights
            layer.weight_momentum = weight_updates

            bias_updates = self.momentum * layer.bias_momentum - self.current_learning_rate * layer.dbiases
            layer.bias_momentum = bias_updates
 
        else:
            weight_updates = -self.current_learning_rate * layer.dweights 
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates


    def post_optimizer(self) : 
        self.iteration += 1

In [37]:
class Adagrad : 

    def __init__(self , decay=0 , learning_rate=1 , epsilon=1e-7):
        
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.iteration = 0

    def pre_optimize(self) : 

        if self.decay :

            self.current_learning_rate = self.learning_rate / (1 + self.decay * self.iteration) 

    def optimazer(self , layer) : 

        if not hasattr(layer , "weight_cache") : 

            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
        
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2


        layer.weights += -self.current_learning_rate * layer.dweights / ( np.sqrt(layer.weight_cache) + self.epsilon )
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)


    def post_optimizer(self) : 
        self.iteration += 1

In [None]:
class Optimizer_RMSprop:
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_optimize(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / (1. + self.decay * self.iterations))

    def optimazer(self, layer):

        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_cache = self.rho * layer.weight_cache + \
                             (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + \
                           (1 - self.rho) * layer.dbiases**2

        layer.weights += -self.current_learning_rate * \
                         layer.dweights / \
                         (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                        layer.dbiases / \
                        (np.sqrt(layer.bias_cache) + self.epsilon)

    def post_optimizer(self):
        self.iterations += 1


In [None]:
class adam : 

    def __init__(self, learning_rate =0.001, decay=0 , epsilon=1e-7 , beta_1 = 0.9 , beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    

    def pre_optimize(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / (1. + self.decay * self.iterations))
            
    
        

In [None]:
from nnfs.datasets import  spiral_data

x,y = spiral_data(samples=100 , classes=3)
x

In [54]:
layer1 = LayerDense(2,64)
relu = ReLU()
layer2 = LayerDense(64 , 3)
softmax_loss = entropyloss_and_softmax()
optim = Optimizer_RMSprop()
epocs = 30000


for i in range(epocs) : 

    layer1.forward(x)
    relu.forward(layer1.output)
    layer2.forward(relu.output)

    loss =  softmax_loss.forward(layer2.output , y)

    prediction = np.argmax(softmax_loss.output , axis=1)
    acc = np.mean(prediction == y)
    softmax_loss.backward(softmax_loss.output , y)

    layer2.backward(softmax_loss.dinput)

    relu.backward(layer2.dinput)
    layer1.backward(relu.dinput)

    optim.pre_optimize()
    optim.optimazer(layer1)
    optim.optimazer(layer2)
    optim.post_optimizer()
  

    if i % 1000 == 0 : 

      print(f"iteration : {i}  , loss : {loss} , accuracy : {acc}" )



iteration : 0  , loss : 1.0986117386100822 , accuracy : 0.27666666666666667
iteration : 1000  , loss : 1.0326355594633727 , accuracy : 0.45
iteration : 2000  , loss : 0.9535570844206355 , accuracy : 0.5166666666666667
iteration : 3000  , loss : 0.9009000682920081 , accuracy : 0.55
iteration : 4000  , loss : 0.8489119919675346 , accuracy : 0.6066666666666667
iteration : 5000  , loss : 0.8207684345871544 , accuracy : 0.6333333333333333
iteration : 6000  , loss : 0.7834207268769776 , accuracy : 0.6833333333333333
iteration : 7000  , loss : 0.7562210486740308 , accuracy : 0.6766666666666666
iteration : 8000  , loss : 0.7369792981214695 , accuracy : 0.6766666666666666
iteration : 9000  , loss : 0.7203880586142581 , accuracy : 0.7
iteration : 10000  , loss : 0.7071444368452298 , accuracy : 0.7033333333333334
iteration : 11000  , loss : 0.6965966382839466 , accuracy : 0.71
iteration : 12000  , loss : 0.6871502848990354 , accuracy : 0.6933333333333334
iteration : 13000  , loss : 0.678192875391