In [2]:
import numpy as np

class Activation:
    def __init__(self, activation_type):
        # Initialize the activation object with a specified type of activation function.
        self.type = activation_type

    def forward(self, x):
        # Compute the forward pass output using the specified activation function.
        if self.type == 'linear':
            # Linear activation does nothing and returns the input as is.
            return x
        elif self.type == 'relu':
            # ReLU (Rectified Linear Unit) activation returns the input if it's positive, otherwise 0.
            return np.maximum(0, x)
        elif self.type == 'sigmoid':
            # Sigmoid activation squashes the input to a range between 0 and 1.
            return 1 / (1 + np.exp(-x))
        elif self.type == 'tanh':
            # Tanh (Hyperbolic Tangent) activation squashes the input to a range between -1 and 1.
            return np.tanh(x)
        elif self.type == 'softmax':
            # Softmax activation computes the exponential of each input, then normalizes by dividing by the sum of all exponentials.
            e_x = np.exp(x - np.max(x, axis=0, keepdims=True))  # Subtract max for numerical stability.
            return e_x / np.sum(e_x, axis=0, keepdims=True)
        else:
            # If an unsupported activation type is specified, raise an error.
            raise ValueError(f"Unsupported activation type: {self.type}")

    def derivative(self, x):
        # Compute the derivative of the activation function for backpropagation.
        if self.type == 'linear':
            # The derivative of the linear function is 1.
            return np.ones_like(x)
        elif self.type == 'relu':
            # The derivative of ReLU is 1 for positive inputs and 0 otherwise.
            return np.where(x > 0, 1, 0)
        elif self.type == 'sigmoid':
            # The derivative of the sigmoid function can be expressed in terms of its output.
            s = self.forward(x)
            return s * (1 - s)
        elif self.type == 'tanh':
            # The derivative of the tanh function can be expressed in terms of its output as well.
            return 1 - np.tanh(x) ** 2
        else:
            # If the derivative is called for an unsupported or not required activation function, raise an error.
            raise NotImplementedError(f"Derivative not implemented or not required for: {self.type}")

class Parameters:
    def __init__(self, input_dim, output_dim):
        # Constructor for the Parameters class to initialize weights and biases.
        
        # input_dim: The size of the input layer or the number of features.
        # output_dim: The size of the output from this layer or the number of neurons in the layer.
        
        # Initialize the weights matrix with small random values.
        # The shape of the weights matrix is (output_dim, input_dim) to match the matrix multiplication
        # requirements during the forward pass: (output_dim x input_dim) dot (input_dim x m) = (output_dim x m),
        # where m is the number of examples. Multiplying by 0.01 keeps the initial weights small, aiding in
        # maintaining the stability of the learning process by preventing saturation of neurons.
        self.weights = np.random.randn(output_dim, input_dim) * 0.01
        
        # Initialize the bias vector with zeros.
        # The shape of the bias vector is (output_dim, 1) to ensure it can be added directly to the weighted inputs
        # for each neuron in the layer. Each neuron in the layer has a single bias term.
        self.bias = np.zeros((output_dim, 1))

class Layer:
    def __init__(self, input_dim, output_dim, activation_type):
        # Constructor for the Layer class to initialize a layer in the neural network.
        
        # input_dim: The size of the input for this layer, or the number of features/units in the previous layer.
        # output_dim: The number of neurons in this layer, determining the size of the output from this layer.
        # activation_type: A string specifying the type of activation function to be used in this layer.
        
        # Initialize the parameters of the layer (weights and biases) using the Parameters class.
        # This encapsulation simplifies the management of the layer's parameters.
        self.params = Parameters(input_dim, output_dim)
        
        # Initialize the activation function for the layer based on the specified type.
        # The Activation class encapsulates various activation functions, facilitating easy switching
        # and experimentation with different activation types.
        self.activation = Activation(activation_type)

    def forward(self, a_prev):
        # Compute the forward pass for this layer.
        
        # a_prev: The activations from the previous layer (or the input data for the first layer).
        # This is a matrix with the shape (input_dim, m), where m is the number of examples.
        
        # Compute the linear part of the layer's forward pass: Z = W.X + b, where
        # W is the weight matrix, X is the input, and b is the bias vector.
        # np.dot(self.params.weights, a_prev) performs the matrix multiplication W.X,
        # and adding self.params.bias broadcasts the bias to each column of the result.
        z = np.dot(self.params.weights, a_prev) + self.params.bias
        
        # Apply the activation function to the linear combination computed above.
        # This introduces non-linearity to the layer's output, allowing the network
        # to learn more complex functions. The activation function's output is the
        # final output (A) of this layer. The method returns both A and Z for use
        # in subsequent calculations, particularly during backpropagation.
        return self.activation.forward(z), z

class DeepNeuralNetwork:
    def __init__(self, layer_dims, activation_types):
        # Constructor for initializing a deep neural network.
        # layer_dims: List of integers, where each integer represents the number of neurons in each layer.
        # activation_types: List of strings, specifying the activation function for each layer.

        self.layers = []  # List to store each layer of the network.
        self.L = len(layer_dims) - 1  # Total number of layers in the network excluding the input layer.

        # Loop through the layer dimensions and create each layer with the appropriate dimensions
        # and activation function.
        for i in range(1, len(layer_dims)):
            self.layers.append(Layer(layer_dims[i-1], layer_dims[i], activation_types[i-1]))

    def forward_propagation(self, X):
        # Perform forward propagation through the network.
        # X: Input data, a numpy array of shape (number of features, number of examples).

        cache = {"A0": X}  # Initialize cache to store activations (A) and linear combinations (Z) for each layer.
        A = X  # Set the initial activation to the input data.

        # Loop through each layer and perform forward propagation, updating the cache with
        # activations and linear combinations for each layer.
        for i, layer in enumerate(self.layers, 1):
            A, Z = layer.forward(A)  # Get the activation and linear combination from the current layer.
            cache[f"A{i}"] = A  # Store the activation for the current layer in the cache.
            cache[f"Z{i}"] = Z  # Store the linear combination for the current layer in the cache.

        return A, cache  # Return the final activation output and the cache.

    def compute_cost(self, AL, Y):
        # Compute the cost (loss) using the output of the network and the true labels.
        # AL: Final layer activations, the predictions of the network.
        # Y: True labels.

        m = Y.shape[1]  # Number of examples.
        # Compute the cross-entropy cost.
        cost = -np.sum(Y * np.log(AL + 1e-8)) / m
        return cost

    def backward_propagation(self, Y, cache):
        # Perform backward propagation to compute gradients for learning.
        # Y: True labels.
        # cache: Cache containing the activations and linear combinations from forward propagation.

        grads = {}  # Dictionary to store gradients.
        m = Y.shape[1]  # Number of examples.
        AL = cache[f"A{self.L}"]  # Activation of the last layer.
        Y = Y.reshape(AL.shape)  # Ensure Y has the same shape as AL for operations.

        # Initialize backpropagation. The gradient of the cost with respect to AL differs
        # based on the activation type of the last layer.
        if self.layers[-1].activation.type == 'softmax':
            dAL = AL - Y  # Gradient for softmax with cross-entropy loss.
        else:
            dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

        # Loop through layers in reverse order to perform backpropagation.
        for l in reversed(range(self.L)):
            current_layer = self.layers[l]
            prev_A = cache[f"A{l}"]
            Z = cache[f"Z{l+1}"]

            # Calculate gradients for the current layer.
            if l == self.L - 1:
                dZ = dAL
            else:
                next_layer_W = self.layers[l + 1].params.weights
                next_layer_dZ = grads[f"dZ{l+2}"]
                dZ = np.dot(next_layer_W.T, next_layer_dZ) * current_layer.activation.derivative(Z)

            dW = np.dot(dZ, prev_A.T) / m
            db = np.sum(dZ, axis=1, keepdims=True) / m
            if l > 0:
                dA_prev = np.dot(current_layer.params.weights.T, dZ)

            # Store gradients for the current layer.
            grads[f"dW{l+1}"] = dW
            grads[f"db{l+1}"] = db
            if l > 0:  # dZ is not needed for the input layer
                grads[f"dZ{l+1}"] = dZ

        return grads

    def update_parameters(self, grads, learning_rate):
        # Update the network's weights and biases based on the calculated gradients and the learning rate.
        for l in range(self.L):
            # Loop through each layer of the network.

            # Update the weights for layer l by subtracting the product of the learning rate and the gradient of the weights.
            self.layers[l].params.weights -= learning_rate * grads[f"dW{l+1}"]

            # Update the bias for layer l by subtracting the product of the learning rate and the gradient of the bias.
            self.layers[l].params.bias -= learning_rate * grads[f"db{l+1}"]

    def train(self, X, Y, epochs, learning_rate):
        # Train the neural network using the provided training data.
        # X: Input data, a numpy array of shape (number of features, number of examples).
        # Y: True labels, a numpy array of shape (1, number of examples) for binary classification.
        # epochs: Number of iterations to run the training process.
        # learning_rate: Step size at each iteration while moving toward a minimum of the cost function.

        for i in range(epochs):
            # Loop through the specified number of epochs to perform training.

            # Perform forward propagation to compute the network's output AL and cache intermediate values.
            AL, cache = self.forward_propagation(X)

            # Compute the cost (loss) using the network's output AL and the true labels Y.
            cost = self.compute_cost(AL, Y)

            # Perform backward propagation to compute the gradients of the cost with respect to the parameters.
            grads = self.backward_propagation(Y, cache)

            # Update the network's parameters (weights and biases) using the computed gradients.
            self.update_parameters(grads, learning_rate)

            # Optionally, print the cost every 100 epochs to monitor the training progress.
            if i % 100 == 0:
                print(f"Loss (Cost) after iteration {i}: {cost}")
           






# Simulating a dataset for training
m = 1000  # Number of examples

input_dim = 784  # Example input size (e.g., a flattened 28x28 grayscale image)
output_dim = 1  # Output size for binary classification
X = np.random.randn(input_dim, m)  # Simulating input features with random values
Y = np.random.randint(0, 2, (output_dim, m))  # Simulating binary labels (0 or 1)

# Network configuration
layer_dims = [784, 128, 64, 1]  # Example architecture: input layer, two hidden layers, and output layer
activation_types = ['relu', 'relu', 'sigmoid']  # Using ReLU for hidden layers and sigmoid for the output layer

# Initialize the network
network = DeepNeuralNetwork(layer_dims, activation_types)

# Train the network
epochs = 2000
learning_rate = 0.01
network.train(X, Y, epochs, learning_rate)


Loss (Cost) after iteration 0: 0.33697825686595484
Loss (Cost) after iteration 100: 0.34563715756580515
Loss (Cost) after iteration 200: 0.3487991281537742
Loss (Cost) after iteration 300: 0.34994046986925637
Loss (Cost) after iteration 400: 0.35033837009809127
Loss (Cost) after iteration 500: 0.3504464553051579
Loss (Cost) after iteration 600: 0.35043806965701635
Loss (Cost) after iteration 700: 0.35034123094199654
Loss (Cost) after iteration 800: 0.3501569402715359
Loss (Cost) after iteration 900: 0.34979467137769993
Loss (Cost) after iteration 1000: 0.349033024652897
Loss (Cost) after iteration 1100: 0.3468630997512577
Loss (Cost) after iteration 1200: 0.33844624559249215
Loss (Cost) after iteration 1300: 0.28993378867921216
Loss (Cost) after iteration 1400: 0.06751102251317229


  dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))


Loss (Cost) after iteration 1500: nan
Loss (Cost) after iteration 1600: nan
Loss (Cost) after iteration 1700: nan
Loss (Cost) after iteration 1800: nan
Loss (Cost) after iteration 1900: nan
