In [15]:
import numpy as np

class Activation:
    def __init__(self, activation_type):
        self.type = activation_type

    def forward(self, x):
        if self.type == 'linear':
            return x
        elif self.type == 'relu':
            return np.maximum(0, x)
        elif self.type == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        elif self.type == 'tanh':
            return np.tanh(x)
        elif self.type == 'softmax':
            e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
            return e_x / np.sum(e_x, axis=0, keepdims=True)
        else:
            raise ValueError(f"Unsupported activation type: {self.type}")

    def derivative(self, x):
        if self.type == 'linear':
            return np.ones_like(x)
        elif self.type == 'relu':
            return np.where(x > 0, 1, 0)
        elif self.type == 'sigmoid':
            s = self.forward(x)
            return s * (1 - s)
        elif self.type == 'tanh':
            return 1 - np.tanh(x) ** 2
        else:
            raise NotImplementedError(f"Derivative not implemented or not required for: {self.type}")

class Parameters:
    def __init__(self, input_dim, output_dim):
        self.weights = np.random.randn(output_dim, input_dim) * 0.01
        self.bias = np.zeros((output_dim, 1))

class Layer:
    def __init__(self, input_dim, output_dim, activation_type):
        self.params = Parameters(input_dim, output_dim)
        self.activation = Activation(activation_type)

    def forward(self, a_prev):
        z = np.dot(self.params.weights, a_prev) + self.params.bias
        return self.activation.forward(z), z

class DeepNeuralNetwork:
    def __init__(self, layer_dims, activation_types):
        self.layers = []
        self.L = len(layer_dims) - 1  # number of layers
        for i in range(1, len(layer_dims)):
            self.layers.append(Layer(layer_dims[i-1], layer_dims[i], activation_types[i-1]))

    def forward_propagation(self, X):
        cache = {"A0": X}
        A = X
        for i, layer in enumerate(self.layers, 1):
            A, Z = layer.forward(A)
            cache[f"A{i}"] = A
            cache[f"Z{i}"] = Z
        return A, cache

    def compute_cost(self, AL, Y):
        m = Y.shape[1]
        cost = -np.sum(Y * np.log(AL + 1e-8)) / m
        return cost

    def backward_propagation(self, Y, cache):
        grads = {}
        m = Y.shape[1]
        AL = cache[f"A{self.L}"]
        Y = Y.reshape(AL.shape)  # Ensure Y is the same shape as AL

    # Initialize backpropagation for the last layer differently if using softmax
        if self.layers[-1].activation.type == 'softmax':
            dAL = AL - Y  # This works directly for softmax with cross-entropy
        else:
            dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

        for l in reversed(range(self.L)):
            current_layer = self.layers[l]
            prev_A = cache[f"A{l}"]
            Z = cache[f"Z{l+1}"]

        # Calculating gradients
            if l == self.L - 1:
                dZ = dAL
            else:
                next_layer_W = self.layers[l + 1].params.weights
                next_layer_dZ = grads[f"dZ{l+2}"]
                dZ = np.dot(next_layer_W.T, next_layer_dZ) * current_layer.activation.derivative(Z)

            dW = np.dot(dZ, prev_A.T) / m
            db = np.sum(dZ, axis=1, keepdims=True) / m
            dA_prev = np.dot(current_layer.params.weights.T, dZ)

        # Saving gradients
            grads[f"dW{l+1}"] = dW
            grads[f"db{l+1}"] = db
            grads[f"dZ{l+1}"] = dZ  # Saving dZ for calculation in previous layer

        return grads

    def update_parameters(self, grads, learning_rate):
        # Update the network's weights and biases based on the calculated gradients and the learning rate.
        for l in range(self.L):
            # Loop through each layer of the network.

            # Update the weights for layer l by subtracting the product of the learning rate and the gradient of the weights.
            self.layers[l].params.weights -= learning_rate * grads[f"dW{l+1}"]

            # Update the bias for layer l by subtracting the product of the learning rate and the gradient of the bias.
            self.layers[l].params.bias -= learning_rate * grads[f"db{l+1}"]

    def train(self, X, Y, epochs, learning_rate):
        # Train the neural network using the provided training data.
        # X: Input data, a numpy array of shape (number of features, number of examples).
        # Y: True labels, a numpy array of shape (1, number of examples) for binary classification.
        # epochs: Number of iterations to run the training process.
        # learning_rate: Step size at each iteration while moving toward a minimum of the cost function.

        for i in range(epochs):
            # Loop through the specified number of epochs to perform training.

            # Perform forward propagation to compute the network's output AL and cache intermediate values.
            AL, cache = self.forward_propagation(X)

            # Compute the cost (loss) using the network's output AL and the true labels Y.
            cost = self.compute_cost(AL, Y)

            # Perform backward propagation to compute the gradients of the cost with respect to the parameters.
            grads = self.backward_propagation(Y, cache)

            # Update the network's parameters (weights and biases) using the computed gradients.
            self.update_parameters(grads, learning_rate)

            # Optionally, print the cost every 100 epochs to monitor the training progress.
            if i % 100 == 0:
                print(f"Loss (Cost) after iteration {i}: {cost}")

# Simulating a dataset for training
m = 1000  # Number of examples
input_dim = 784  # Example input size (e.g., a flattened 28x28 grayscale image)
output_dim = 1  # Output size for binary classification
X = np.random.randn(input_dim, m)  # Simulating input features with random values
Y = np.random.randint(0, 2, (output_dim, m))  # Simulating binary labels (0 or 1)

# Network configuration
layer_dims = [784, 128, 64, 1]  # Example architecture: input layer, two hidden layers, and output layer
activation_types = ['relu', 'relu', 'sigmoid']  # Using ReLU for hidden layers and sigmoid for the output layer

# Initialize the network
network = DeepNeuralNetwork(layer_dims, activation_types)

# Train the network
epochs = 2000
learning_rate = 0.01
network.train(X, Y, epochs, learning_rate)


Loss (Cost) after iteration 0: 0.3244606768759905
Loss (Cost) after iteration 100: 0.3438764737302939
Loss (Cost) after iteration 200: 0.3511308043820638
Loss (Cost) after iteration 300: 0.3537922151415003
Loss (Cost) after iteration 400: 0.3547572249820521
Loss (Cost) after iteration 500: 0.35509986163894747
Loss (Cost) after iteration 600: 0.35520347747646647
Loss (Cost) after iteration 700: 0.35520450252153285
Loss (Cost) after iteration 800: 0.35515274359102694
Loss (Cost) after iteration 900: 0.3550455316888044
Loss (Cost) after iteration 1000: 0.3548025387564004
Loss (Cost) after iteration 1100: 0.35423871212845504
Loss (Cost) after iteration 1200: 0.35231277300213726
Loss (Cost) after iteration 1300: 0.3425106879158974
Loss (Cost) after iteration 1400: 0.26831529795365305
Loss (Cost) after iteration 1500: 0.03292301060655237


  dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
  dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
  dZ = np.dot(next_layer_W.T, next_layer_dZ) * current_layer.activation.derivative(Z)


Loss (Cost) after iteration 1600: nan
Loss (Cost) after iteration 1700: nan
