# Neural Network From Scratch

In [2]:
import numpy as np
import math

## Layers

In [3]:
# Base Layer

class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    def forward(self, input): # returns output
        pass

    def backward(self, output_gradient, learning_rate): # returns input
        pass

In [4]:
# Dense Layer inherits from Base Layer

class Dense(Layer):
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(output_size, input_size)
        self.bias = np.random.randn(output_size, 1)

    def forward(self, input):
        self.input = input
        return np.dot(self.weights, self.input) + self.bias         # Y = WX + B

    def backward(self, output_gradient, learning_rate):             # returns input
        weights_gradient = np.dot(output_gradient, self.input.T)    # Gradient of the loss w.r.t. weights, computed using the chain rule
        input_gradient = np.dot(self.weights.T, output_gradient)    # Computing and returning the gradient of the loss w.r.t. the input for further backpropagation
        self.weights -= learning_rate * weights_gradient            # Updating the weights using the computed gradient and learning rate
        self.bias -= learning_rate * output_gradient                # Updating the biases using the gradient of the loss w.r.t. biases and learning rate
        return input_gradient                                       # output_gradient is the gradient of the loss w.r.t. the output of this layer

In [5]:
# Activation Layer inherits from Base Layer

class Activation(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation_prime
    
    def forward(self, input):
        self.input = input
        return self.activation(self.input)

    def backward(self, output_gradient, learning_rate):
        return np.multiply(output_gradient, self.activation_prime(self.input))

In [6]:
# tanh Activation Function

class Tanh(Activation):
    def __init__(self):
        tanh = lambda x: np.tanh(x)
        tanh_prime = lambda x: 1 - np.tanh(x) ** 2
        super().__init__(tanh, tanh_prime)

In [7]:
# Mean Squared Error

def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_true, y_pred):
    return 2 * (y_pred - y_true) / np.size(y_true)

In [8]:
# XOR

X = np.reshape([[0, 0], [0, 1], [1, 0], [1, 1]], (4, 2, 1))  # Input data reshaped to (4, 2, 1)
Y = np.reshape([[0], [1], [1], [0]], (4, 1, 1))            # Output data reshaped to (4, 1, 1)

# Neural Network Architecture
network = [
    Dense(2, 3),  # Dense layer with 2 inputs and 3 neurons
    Tanh(),       # Hyperbolic tangent activation function
    Dense(3, 1),  # Dense layer with 3 inputs (from previous layer) and 1 neuron
    Tanh()        # Hyperbolic tangent activation function
]

# Training parameters
epochs = 10000      # Number of training epochs
learning_rate = 0.1 # Learning rate for gradient descent

# Training loop
for epoch in range(epochs):
    error = 0  # Initialize error for this epoch to 0
    
    for x, y in zip(X, Y):  # Iterate through each training example
        
        # Forward Pass
        output = x  # Initialize output with input data
        for layer in network:  # Pass output through each layer in the network
            output = layer.forward(output)
        
        # Error Calculation
        error += mse(y, output)  # Accumulate Mean Squared Error
        
        # Backward Pass (Backpropagation)
        grad = mse_prime(y, output)  # Compute gradient of error w.r.t output
        for layer in reversed(network):  # Pass gradient through each layer in reverse order
            grad = layer.backward(grad, learning_rate)
    
    error /= len(X)  # Compute average error for this epoch
    
    error /= len(X)
    if (epoch + 1) % 500 == 0:
        print(f"{epoch+1}/{epochs}\t error={error}")

500/10000	 error=0.00012169175324081647
1000/10000	 error=5.131869007928736e-05
1500/10000	 error=3.179126701487351e-05
2000/10000	 error=2.2809762034287286e-05
2500/10000	 error=1.769257508688477e-05
3000/10000	 error=1.4474708648482511e-05
3500/10000	 error=1.2300988773262742e-05
4000/10000	 error=1.0572963990655387e-05
4500/10000	 error=9.259986135201076e-06
5000/10000	 error=8.405699656072939e-06
5500/10000	 error=7.475428466972509e-06
6000/10000	 error=6.778242055427937e-06
6500/10000	 error=5.674644982959735e-05
7000/10000	 error=5.754851455596804e-06
7500/10000	 error=5.3251049445709665e-06
8000/10000	 error=6.533152779058971e-06
8500/10000	 error=4.672538583964776e-06
9000/10000	 error=4.3743378320576285e-06
9500/10000	 error=4.270621176687159e-06
10000/10000	 error=4.0953461633043206e-06


In [14]:
def predict(network, input_data):
    output = input_data
    for layer in network:
        output = layer.forward(output)
    return output

input_data = [[0], [1]]
output_data = predict(network, input_data)

print(f"{input_data[0][0]} XOR {input_data[0][1]} \t => \t {round(output_data[0][0])}")

Prediction 	 => 	 1
