
## Neural Network 

### 1. Initialization
   - Input: Layer dimensions, learning rate, number of epochs
   - Initialize weights $W^{[l]}$ and biases $b^{[l]}$ for each layer $l$ with small random values (weights) and zeros (biases)

### 2. Forward Propagation for each layer $l$
   - Input: $X$, weights $W^{[l]}$, biases $b^{[l]}$
   - Linear step:
      - Compute $Z^{[l]} = W^{[l]} \cdot A^{[l-1]} + b^{[l]}$
   - Activation step (ReLU for hidden layers, Softmax for the output layer):
      - ReLU: $A^{[l]} = \max(0, Z^{[l]})$
      - Softmax: $A^{[L]} = \frac{{e^{Z^{[L]}}}}{{\sum e^{Z^{[L]}}}}$
   - Store cache for backpropagation: $(A^{[l-1]}, W^{[l]}, b^{[l]}, Z^{[l]})$

### 3. Compute Cost (Cross-Entropy Loss)
   - $\text{{Cost}} = -\frac{1}{m} \sum (y \cdot \log(A^{[L]}) + (1-y) \cdot \log(1-A^{[L]}))$

### 4. Backward Propagation for each layer $ l $
   - Initialize: $ dA^{[L]} = A^{[L]} - y$
   - For $l$ from $L$ to $1$:
      - Retrieve cache: $(A^{[l-1]}, W^{[l]}, b^{[l]}, Z^{[l]})$
      - Compute $dZ^{[l]}$:
         - ReLU: $ dZ^{[l]} = dA^{[l]} \cdot \mathbf{1}(Z^{[l]} > 0) $
         - Softmax: $ dZ^{[L]} = dA^{[L]} $ (derivative is handled in previous step)
      - Compute gradients:
         - $ dW^{[l]} = \frac{1}{m} \cdot dZ^{[l]} \cdot A^{[l-1]\top} $
         - $ db^{[l]} = \frac{1}{m} \cdot \sum dZ^{[l]} $
         - $ dA^{[l-1]} = W^{[l]\top} \cdot dZ^{[l]} $

### 5. Update Parameters
   - For $l$ from 1 to $L$:
      - $W^{[l]} = W^{[l]} - \alpha \cdot dW^{[l]}$
      - $b^{[l]} = b^{[l]} - \alpha \cdot db^{[l]}$

### 6. Repeat steps 2-5 for the given number of epochs

### 7. Predictions
   - Compute forward propagation with final weights and biases
   - Return the class with the highest probability from the Softmax output

Here:
- $m$: Number of examples
- $ \alpha $: Learning rate
- $A^{[l]}$: Activation at layer $l$
- $W^{[l]}$: Weights at layer $l$
- $b^{[l]}$: Biases at layer $l$
- $Z^{[l]}$: Linear output at layer $l$
- $L$: Total number of layers



In [2]:
import numpy as np

class NeuralNet():
    
    """
    A class used to represent a Simple Feed-forward DNN

    Attributes
    ----------
    X : ndarray
        Input features, must be 2-dimensional
    y : ndarray
        Target variable, must be 1-dimensional
    hidden_layers_and_units : list, optional
        A list of integers representing units in each hidden layer (default is [4])

    Methods
    -------
    _init_params():
        Initializes parameters (weights and biases) for the network.

    train(learning_rate=0.01, epochs=1000):
        Trains the network using backpropagation and gradient descent.
        
    _forward_step():
        Performs L forward propegation phase
        
    def _forward_1(self, A_previous, W, b, activation):
        performs a 1 forward step
        
    compute_cost():
        computing the cost function
        
    
    train(self, learning_rate=0.01, epochs=1000):
        Train our neural netword, it takes 2 parameters: (1) learning rate, (2) number of epochs
        
    _backward_steps():
        Perform L back propegation phase
    """
    def __init__(self, X, y, hidden_layers_and_units=[4]):
        self.X = X.T
        self.y = y.reshape(1, -1)
        self.hidden_layers_and_units = hidden_layers_and_units
        self.L = 2 + len(hidden_layers_and_units)
        self.layers_dim = [X.shape[1]] + hidden_layers_and_units + [len(set(y))]
        self.parameters = {}
        self.caches = []
        self._init_params()
        
    def _init_params(self):
        for l in range(1, self.L):
            self.parameters['W' + str(l)] = np.random.randn(self.layers_dim[l], self.layers_dim[l-1]) * 0.01
            self.parameters['b' + str(l)] = np.zeros((self.layers_dim[l], 1))  # Reshaping the bias
            
    def _forward_1(self, A_previous, W, b, activation):
        Z = np.dot(W, A_previous) + b
        linear_cache = (A_previous, W, b)
        if activation == "relu":
            A, activation_cache = self._ReLu(Z)
        elif activation == "softmax":
            A, activation_cache = self._softmax(Z)
        return A, (linear_cache, activation_cache)
    
    def _forward_step(self):
        A = self.X
        L = len(self.parameters) // 2
        for l in range(1, L):
            A_previous = A
            A, cache = self._forward_1(A_previous, self.parameters['W' + str(l)], self.parameters['b' + str(l)], "relu")
            self.caches.append(cache)
        self.A_last, cache = self._forward_1(A, self.parameters['W' + str(L)], self.parameters['b' + str(L)], "softmax")
        self.caches.append(cache)
            
    def _ReLu(self, Z):
        return np.maximum(0, Z), Z

    def _softmax(self, Z):
        expZ = np.exp(Z - np.max(Z))
        return expZ / expZ.sum(axis=0, keepdims=True), Z
    
    def _backward_steps(self):
        self.grads = {}
        L = self.L - 1
        m = self.A_last.shape[1]
        dA_last = self.A_last - self.y
        
        for l in reversed(range(1, L+1)):
            if l != L:
                dA_last, dW, db = self._backward_1(dA_last, self.caches[l-1], 'relu')
            else:
                dA_last, dW, db = self._backward_1(dA_last, self.caches[l-1], 'softmax')
            print(f"shapes in backward {l}: {dW.shape}, {db.shape}")
            self.grads['dW' + str(l)] = dW
            self.grads['db' + str(l)] = db

    def _backward_1(self, dA, cache_c, activation):
        linear_c, activation_c = cache_c
        Z = activation_c
        
        if activation == 'relu':
            dZ = self._ReLu_backward(dA, Z)
        else:
            dZ = dA
            
        A_prev, W, b = linear_c
        m = A_prev.shape[1]
        
        dW = (1.0/m) * np.dot(dZ, A_prev.T)
        db = (1.0/m) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = np.dot(W.T, dZ)
        return dA_prev, dW, db
    
    def _ReLu_backward(self, dA, cache):
        Z = cache
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ

    def compute_cost(self):
        m = self.y.shape[1]
        logprobs = np.multiply(-np.log(self.A_last), self.y) + np.multiply(-np.log(1 - self.A_last), 1 - self.y)
        cost = 1./m * np.nansum(logprobs)
        return np.squeeze(cost)
        
    def __str__(self):
        return f"NeuralNet(X={self.X}, y={self.y}, hidden_layers={self.hidden_layers_and_units}, L={self.L})"
    
    def train(self, learning_rate=0.01, epochs=1000):
        
        for i in range(epochs):
            # Forward pass
            self._forward_step()

            # Compute cost
            cost = self.compute_cost()

            # Backward pass
            self._backward_steps()

            # Update parameters
            for l in range(1, self.L):
                print(f"shapes before update {l}: {self.parameters['W' + str(l)].shape}, {self.grads['dW' + str(l)].shape}")
                self.parameters["W" + str(l)] -= learning_rate * self.grads["dW" + str(l)]
                self.parameters["b" + str(l)] -= learning_rate * self.grads["db" + str(l)]

            if i % 100 == 0:
                print(f"Cost after iteration {i}: {cost}")

# Example dataset
X = np.array([[ 0.74204416, -3.42524143,  1.65980218,  1.49509867],
              [ 1.6924546 , -4.42287433, -0.74715829,  0.70896364],
              [ 0.2344157 , -1.52637437, -1.11731035, -2.85961623],
              [ 0.61720311,  1.23169963,  0.12015895,  1.72073855],
              [-0.63699565,  1.44634283,  0.05080775, -1.14651383],
              [ 2.10025514,  1.37496472,  0.19091548, -0.22942496],
              [-0.0126646 , -1.60290743, -0.67124613, -1.52320683],
              [-0.35224985,  0.78712117,  0.30017032,  0.55132541],
              [-0.84520564, -0.60483688, -0.6871727 , -2.00875146],
              [-0.88762896, -1.55118469, -0.19183555,  1.03584131]])

y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 0, 1])

NN = NeuralNet(X, y, hidden_layers_and_units=[10, 5])
NN.train(epochs=5000, learning_rate=0.01)

shapes in backward 3: (2, 5), (2, 1)
shapes in backward 2: (5, 10), (5, 1)
shapes in backward 1: (10, 4), (10, 1)
shapes before update 1: (10, 4), (10, 4)
shapes before update 2: (5, 10), (5, 10)
shapes before update 3: (2, 5), (2, 5)
Cost after iteration 0: 1.3862943611226113
shapes in backward 3: (2, 5), (2, 1)
shapes in backward 2: (5, 10), (5, 1)
shapes in backward 1: (10, 4), (10, 1)
shapes before update 1: (10, 4), (10, 4)
shapes before update 2: (5, 10), (5, 10)
shapes before update 3: (2, 5), (2, 5)
shapes in backward 3: (2, 5), (2, 1)
shapes in backward 2: (5, 10), (5, 1)
shapes in backward 1: (10, 4), (10, 1)
shapes before update 1: (10, 4), (10, 4)
shapes before update 2: (5, 10), (5, 10)
shapes before update 3: (2, 5), (2, 5)
shapes in backward 3: (2, 5), (2, 1)
shapes in backward 2: (5, 10), (5, 1)
shapes in backward 1: (10, 4), (10, 1)
shapes before update 1: (10, 4), (10, 4)
shapes before update 2: (5, 10), (5, 10)
shapes before update 3: (2, 5), (2, 5)
shapes in backwar