In [1]:
import torch
import numpy as np

## Forward Propagation Process

In [2]:
def initialize_parameters(layer_dims):
    """
    Initialize the parameters for a multi-layer neural network.

    Args:
    layer_dims (list): List containing the dimensions of each layer in the network.

    Returns:
    dict: A dictionary containing the initialized weights and biases.
          Weights 'W' and biases 'b' for each layer l are keyed by 'Wl' and 'bl' respectively.
    """
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        # print(f"Layer {l}, W shape: {parameters['W' + str(l)].shape}, max W: {np.max(parameters['W' + str(l)])}, min W: {np.min(parameters['W' + str(l)])}")  # Debugging statement

    return parameters

In [3]:
def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Args:
    A (numpy.ndarray): Activations from previous layer (or input data): shape (size of previous layer, number of examples)
    W (numpy.ndarray): Weights matrix: shape (size of current layer, size of previous layer)
    b (numpy.ndarray): Bias vector, shape (size of current layer, 1)

    Returns:
    numpy.ndarray: The linear component of the activation function for the current layer. Z = W*A + b
    dict: A cache containing the inputs A, W, and b, to be used in backpropagation.
    """

    Z = np.dot(W, A) + b
    linear_cache = (A, W, b)

    return Z, linear_cache


In [4]:
def softmax(Z):
    """
    Compute the softmax activation for a given input.

    Args:
    Z (numpy.ndarray): The linear component of the activation function from the current layer; shape (size of current layer, number of examples)

    Returns:
    numpy.ndarray: The activations after applying softmax, representing probability distributions over classes.
    numpy.ndarray: Returns Z, cached for use in backpropagation.
    """

    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    A = expZ / np.sum(expZ, axis=0, keepdims=True)
    activation_cache = Z
    return A, activation_cache

In [5]:
def relu(Z):
    """
    compute ReLU activation for a givem input.

    Args:
    Z (numpy,ndarray): The linear component of the activation function

    Returns:
    numpy.ndarray: The activations after applying ReLU, where each element is the max of 0 and the element itself.
    numpy.ndarray: Returns Z, cached for use in backpropagation.
    """

    A = np.maximum(0.0, Z)

    return A, Z

In [6]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer.

    Args:
    A_prev (numpy.ndarray): Activations from previous layer.
    W (numpy.ndarray): Weights matrix.
    B (numpy.ndarray): Bias vector.
    activation (str): The type of activation function to be used ("relu" or "softmax").

    Returns:
    numpy.ndarray: The activations of the current layer.
    dict: A cache containing both linear and activation caches for use in backpropagation.
    """

    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation == "relu":
        A, activation_cache = relu(Z)
    elif activation == "softmax":
        A, activation_cache = softmax(Z)
    cache = (linear_cache, activation_cache)
    return A, cache


#### Batch-Normalization (BN)
Consists of normalizing activation vectors from hidden layers using the first and the second statistical moments (mean and variance) of the current batch.
This normalization step is applied right before (or right after) the nonlinear function.
It is usually used as a module which could be inserted as a standard layer in a DNN.

The BN layer first determines the mean 𝜇 and the variance σ² of the activation values across the batch, using (1) and (2).

It then normalizes the activation vector Z^(i) with (3). That way, each neuron’s output follows a standard normal distribution across the batch. (𝜀 is a constant used for numerical stability)


In [7]:
def apply_batchnorm(Z):
    """
    Apply batch normalization to the linear output Z.

    Args:
    A (numpy.ndarray): Input 2D array where each row corresponds to a feature
                       and each column corresponds to an example.

    Returns:
    numpy.ndarray: The normalized 2D array where each feature (row) has a mean
                   of 0 and a variance of 1.
    """ 
    mean = np.mean(Z, axis=1, keepdims=True)
    var = np.var(Z, axis=1, keepdims=True)
    NA = (Z - mean) / np.sqrt(var + 1e-8)
    return NA

In [8]:
def L_model_forward(X, parameters, use_batchnorm=False):
    """
    Implements forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SOFTMAX computation.

    Args:
    X (numpy.ndarray): Input data of shape (input size, number of examples).
    parameters (dict): Python dictionary containing the parameters "W1", "b1", ..., "WL", "bL".
                       - Wl: weight matrix of shape (size of previous layer, size of current layer)
                       - bl: bias vector of shape (1, size of current layer)
    use_batchnorm (bool): If True, apply batch normalization after each linear transformation.

    Returns:
    AL (numpy.ndarray): Last post-activation value (output of the model).
    caches (list): List of caches containing:
                   - every cache of linear_activation_forward() (for relu) 
                   - the cache of linear_activation_forward() (for softmax)
    """
    caches = []
    A = X
    L = len(parameters) // 2  # Number of layers

    # Forward pass for [LINEAR->RELU]*(L-1)
    for l in range(1, L):
        A_prev = A
        W = parameters['W' + str(l)]
        B = parameters['b' + str(l)]
        A, cache = linear_activation_forward(A_prev, W, B, "relu")
        if use_batchnorm:
            Z = cache[1]  # activation_cache is second in tuple, Z is stored in it
            Z_norm = apply_batchnorm(Z)
            A, _ = relu(Z_norm)  # Reapply activation after batch normalization
        caches.append(cache)

    # Forward pass for the last layer [LINEAR->SOFTMAX]
    W = parameters['W' + str(L)]
    B = parameters['b' + str(L)]
    AL, cache = linear_activation_forward(A, W, B, "softmax")
    caches.append(cache)

    return AL, caches

In [9]:
def compute_cost(AL, Y, parameters ,epsilon_L2=0):
    """
    Compute the categorical cross-entropy cost.

    Args:
    AL (numpy.ndarray): Probability vector corresponding to label predictions, shape (num_of_classes, number of examples)
    Y (numpy.ndarray): Ground truth labels vector, shape (num_of_classes, number of examples)
    parameters:
    epsilon_L2 (float): L2 Regularization parameter

    Returns:
    float: The cross-entropy cost.
    """
    m = Y.shape[1]
    # cost = -np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)) / m
    cost = -np.sum(Y * np.log(AL + 1e-8)) / m  # Categorical cross-entropy cost
    cost = np.squeeze(cost)  # To make sure cost is a scalar

    #add L2 Norm Regularization
    L2_cost = 0
    L = len(parameters) // 2
    for l in range(1, L + 1):
        L2_cost += np.sum(np.square(parameters["W" + str(l)]))
    L2_cost = (epsilon_L2 / (2 * m)) * L2_cost

    return cost + L2_cost

## Backward Propagation Process

In [10]:
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer.

    Args:
    dZ (numpy.ndarray): Gradient of the cost with respect to the linear output of the current layer l
    cache (tuple): Tuple of values (A_prev, W, b) from the forward pass

    Returns:
    dA_prev (numpy.ndarray): Gradient of the cost with respect to the activation of the previous layer
    dW (numpy.ndarray): Gradient of the cost with respect to W (current layer l)
    db (numpy.ndarray): Gradient of the cost with respect to b (current layer l)
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [11]:
def relu_backward(dA, activation_cache):
    """
    The backward propagation for a single RELU unit.
    """
    Z = activation_cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ

In [12]:
def softmax_backward(dA, activation_cache):
    """
    Implements the backward propagation for a single SOFTMAX unit.

    The derivative of the softmax function is: p_i - y_i,
    where p_i is the softmax-adjusted probability of the class and y_i is the “ground truth”.

    Args:
    dA (numpy.ndarray): The post-activation gradient, typically AL - Y.
    activation_cache (numpy.ndarray): Contains Z (stored during the forward propagation).

    Returns:
    dZ (numpy.ndarray): Gradient of the cost with respect to Z.
    """

    # The gradient of the cost with respect to Z for softmax is simply the difference
    # between the predicted probabilities (AL) and the true labels (Y), which is dA.
    # In the context of backpropagation, dA is typically computed as AL - Y.
    # Therefore, we return dA directly as dZ.
    return dA


In [13]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Args:
    dA (numpy array): post-activation gradient for current layer
    cache (tuple): contains the linear_cache and activation_cache (linear_cache, activation_cache)
    activation (str): the activation to be used in this layer, stored as a text string: "relu", "softmax"

    Returns:
    dA_prev (numpy array): Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW (numpy array): Gradient of the cost with respect to W (current layer l), same shape as W
    db (numpy array): Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
    else:
        raise ValueError("Unsupported activation function")

    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

In [14]:
def L_model_backward(AL, Y, caches):
    """
    Implements the backward propagation for the entire network.

    Args:
    AL (numpy.ndarray): Probability vector, output of the forward propagation (L_model_forward).
    Y (numpy.ndarray): True "label" vector (the same shape as AL).
    caches (list): List of caches containing:
                   - every cache of linear_activation_forward() with "relu"
                   - the cache of linear_activation_forward() with "softmax"

    Returns:
    grads (dict): A dictionary with the gradients:
                  - "dA1", "dA2", ..., "dAL"
                  - "dW1", "dW2", ..., "dWL"
                  - "db1", "db2", ..., "dbL"
    """
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)

    # dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # dAL = - (np.divide(Y, AL + 1e-8))
    dAL = AL - Y

    current_cache = caches[-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "softmax")

    # loop from L-2 to 0
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, "relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads


In [15]:
def update_parameters(parameters, grads, learning_rate, epsilon_L2):
    """
    Update parameters using gradient descent.

    Args:
    parameters (dict): A dictionary containing the DNN architecture's parameters
    grads (dict): A dictionary containing the gradients (generated by L_model_backward)
    learning_rate (float): The learning rate, alpha, used to update the parameters
    epsilon_L2 (float): L2 Regularization parameter

    Returns:
    dict: The updated values of the parameters
    """
    L = len(parameters) // 2  # number of layers in the neural network


    # Update rule for each parameter
    for l in range(1, L+1):
        m = parameters["W" + str(l)].shape[1]
        parameters["W" + str(l)] -= learning_rate * (grads["dW" + str(l)] +
                                               epsilon_L2 / m * parameters["W" + str(l)])
        parameters["b" + str(l)] -= learning_rate * grads["db" + str(l)]

    return parameters


## Train the Network and Produce Predictions

In [16]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """
    function to split X and Y to mini batches of size mini_batch_size

    Args:
    X (numpy.ndarray): Input data, shape (height*width, number_of_examples).
    Y (numpy.ndarray): True labels, shape (num_of_classes, number of examples).
    mini_batch_size(int): size of the mini-batches

    Returns:
    mini_batches: array of mini batches
    """
    # np.random.seed(3)
    m = X.shape[1]
    mini_batches = []

    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))

    num_complete_minibatches = m // mini_batch_size
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size : m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches

In [17]:
def L_layer_model(X, Y, layers_dims, learning_rate , num_iterations, batch_size, use_batchnorm=False, epsilon_L2=0):
    """
    Implements a L-layer neural network.
    All layers but the last should have the ReLU activation function,
    and the final layer will apply the softmax activation function.
    The size of the output layer should be equal to the number of labels in the data.
    Please select a batch size that enables your code to run well
    (i.e. no memory overflows while still running relatively fast).
    """
    costs = []
    parameters = initialize_parameters(layers_dims)

    for i in range(0, num_iterations):
        minibatches = random_mini_batches(X, Y, batch_size) #TODO: check if neccecry

        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch

            AL, caches = L_model_forward(minibatch_X, parameters, use_batchnorm=use_batchnorm)
            cost = compute_cost(AL, minibatch_Y, parameters, epsilon_L2)
            grads = L_model_backward(AL, minibatch_Y, caches)
            parameters = update_parameters(parameters, grads, learning_rate, epsilon_L2)

        if i % 100 == 0:
            costs.append(cost)
            print(f"Cost after iteration {i}: {cost}")

    return parameters, costs

In [18]:
def predict(X, Y, parameters):
    """
    Predicts the results using a deep learning model and calculates the accuracy.

    Args:
    X (numpy.ndarray): Input data, shape (height*width, number_of_examples).
    Y (numpy.ndarray): True labels, shape (num_of_classes, number of examples).
    parameters (dict): Trained parameters of the DNN.

    Returns:
    float: Accuracy of the predictions.
    """
    AL, _ = L_model_forward(X, parameters)
    predictions = np.argmax(AL, axis=0)
    true_labels = np.argmax(Y, axis=0)
    accuracy = np.mean(predictions == true_labels) * 100
    # return predictions, accuracy
    return accuracy

## classify the MNIST dataset and present a summary report

### Prepare mnist dataset

In [19]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Download and preprocess the MNIST dataset
mnist = tf.keras.datasets.mnist
(X_train_full, Y_train_full), (X_test, Y_test) = mnist.load_data()

# Normalize the data
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

# Flatten the data
X_train_full = X_train_full.reshape(X_train_full.shape[0], -1).T
X_test = X_test.reshape(X_test.shape[0], -1).T

# One-hot encode the labels
Y_train_full = tf.keras.utils.to_categorical(Y_train_full, 10).T
Y_test = tf.keras.utils.to_categorical(Y_test, 10).T

# Split the training set into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_full.T, Y_train_full.T, test_size=0.2, random_state=42)
X_train = X_train.T
X_val = X_val.T
Y_train = Y_train.T
Y_val = Y_val.T

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

2024-05-19 12:35:14.606842: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Training set shape: (784, 48000)
Validation set shape: (784, 12000)
Test set shape: (784, 10000)


In [20]:
from time import perf_counter

def train_model_on_mnist(X_train, Y_train, X_val, Y_val, use_batchnorm=False, epsilon_L2=0, patience=100, improvement_threshold=1e-4):
    """
    Function to train a DNN model on the MNIST dataset.
    Train the network until there is no improvement on the validation set
    (or the improvement is very small) for a specified number of training steps.

    Args:
    X_train (numpy.ndarray): Training data.
    Y_train (numpy.ndarray): Training labels.
    X_val (numpy.ndarray): Validation data.
    Y_val (numpy.ndarray): Validation labels.
    use_batchnorm (boolean): Flag to use batch normalization.
    epsilon_L2 (float): L2 Regularization parameter.
    patience (int): Number of training steps to wait for an improvement before stopping.
    improvement_threshold (float): Minimum improvement to consider.

    Returns:
    parameters (dict): Trained model parameters.
    training_steps (int): Number of training steps performed.
    epochs (int): Number of epochs performed.
    """
    # Define the network architecture
    layers_dims = [X_train.shape[0], 20, 7, 5, 10]
    learning_rate = 0.009
    batch_size = 64

    best_val_cost = float('inf')
    training_steps = 0
    epochs_without_improvement = 0
    total_iterations = 0

    iterations_per_epoch = X_train.shape[1] // batch_size

    while epochs_without_improvement < patience:
        # Train the model for 100 iterations
        parameters, costs = L_layer_model(X_train, Y_train, layers_dims, learning_rate=learning_rate,
                                          num_iterations=100, batch_size=batch_size, use_batchnorm=use_batchnorm, epsilon_L2=epsilon_L2)
        training_steps += 100
        total_iterations += 100

        # Compute validation loss
        AL_val, _ = L_model_forward(X_val, parameters, use_batchnorm=use_batchnorm)
        val_cost = compute_cost(AL_val, Y_val, parameters, epsilon_L2=epsilon_L2)

        if best_val_cost - val_cost > improvement_threshold:
            best_val_cost = val_cost
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 100

        print(f"Training steps: {training_steps}, Validation cost: {val_cost:.4f}, Best validation cost: {best_val_cost:.4f}")

    # Calculate the number of epochs based on total iterations and iterations per epoch
    total_epochs = total_iterations / iterations_per_epoch

    return parameters, training_steps, total_epochs


### Without batchnorm

In [21]:
start_time = perf_counter()
parameters, training_steps, total_epochs = train_model_on_mnist(X_train, Y_train, X_val, Y_val, use_batchnorm=False, epsilon_L2=0)
end_time = perf_counter()

time_without_batchnorm = end_time - start_time

# Evaluate on training set
train_accuracy = predict(X_train, Y_train, parameters)
print(f"Train accuracy: {train_accuracy:.2f}%")

# Evaluate on test set
test_accuracy = predict(X_test, Y_test, parameters)
print(f"Test accuracy: {test_accuracy:.2f}%")

# Evaluate on validation set
val_accuracy = predict(X_val, Y_val, parameters)
print(f"Validation accuracy: {val_accuracy:.2f}%")

print(f"Total iterations: {training_steps}")
print(f"Total epochs: {total_epochs:.2f}")
print(f"Training time: {time_without_batchnorm:.2f} seconds")

Cost after iteration 0: 1.7001067863708594
Training steps: 100, Validation cost: 0.2983, Best validation cost: 0.2983
Cost after iteration 0: 1.775134069081191
Training steps: 200, Validation cost: 0.2530, Best validation cost: 0.2530
Cost after iteration 0: 1.5937900178811995
Training steps: 300, Validation cost: 0.2244, Best validation cost: 0.2244
Cost after iteration 0: 1.1332600397598123
Training steps: 400, Validation cost: 0.2440, Best validation cost: 0.2244
Train accuracy: 97.02%
Test accuracy: 94.31%
Validation accuracy: 93.92%
Total iterations: 400
Total epochs: 0.53
Training time: 164.41 seconds


### With batchnorm
classify the MNIST dataset and present a summary report (batchnorm function is “on”)

In [23]:
start_time = perf_counter()
parameters_batchnorm, training_steps_batchnorm, total_epochs_batchnorm = train_model_on_mnist(X_train, Y_train, X_val, Y_val, use_batchnorm=True, epsilon_L2=0)
end_time = perf_counter()

time_with_batchnorm = end_time - start_time

# Evaluate on training set
train_accuracy_batchnorm = predict(X_train, Y_train, parameters_batchnorm)
print(f"Train accuracy: {train_accuracy_batchnorm:.2f}%")

# Evaluate on test set
test_accuracy_batchnorm = predict(X_test, Y_test, parameters_batchnorm)
print(f"Test accuracy: {test_accuracy_batchnorm:.2f}%")

# Evaluate on validation set
val_accuracy_batchnorm = predict(X_val, Y_val, parameters_batchnorm)
print(f"Validation accuracy: {val_accuracy_batchnorm:.2f}%")

print(f"Total iterations: {training_steps_batchnorm}")
print(f"Total epochs: {total_epochs_batchnorm:.2f}")
print(f"Training time: {time_with_batchnorm:.2f} seconds")

Cost after iteration 0: 1.9186942963148828
Training steps: 100, Validation cost: 1.5564, Best validation cost: 1.5564
Cost after iteration 0: 2.065306905636067
Training steps: 200, Validation cost: 1.8803, Best validation cost: 1.5564
Train accuracy: 13.63%
Test accuracy: 13.47%
Validation accuracy: 13.41%
Total iterations: 200
Total epochs: 0.27
Training time: 108.77 seconds


compare this experiment to the previous one (performance, running time, number of training steps etc.)

In [24]:
print(f"Running time: ")
print(f"\twithout batchnorm: {time_without_batchnorm: .4f} sec")
print(f"\twith batchnorm: {time_with_batchnorm: .4f} sec")

print(f"Test accuracy: ")
print(f"\twithout batchnorm: {test_accuracy: .4f}%")
print(f"\twith batchnorm: {test_accuracy_batchnorm: .4f}%")

print(f"Train accuracy: ")
print(f"\twithout batchnorm: {train_accuracy: .4f}%")
print(f"\twith batchnorm: {train_accuracy_batchnorm: .4f}%")

print(f"Validate accuracy: ")
print(f"\twithout batchnorm: {val_accuracy: .4f}%")
print(f"\twith batchnorm: {val_accuracy_batchnorm: .4f}%")

print(f"Training steps: ")
print(f"\twithout batchnorm: {training_steps: .1f}")
print(f"\twith batchnorm: {training_steps_batchnorm: .1f}")

Running time: 
	without batchnorm:  164.4076 sec
	with batchnorm:  108.7736 sec
Test accuracy: 
	without batchnorm:  94.3100%
	with batchnorm:  13.4700%
Train accuracy: 
	without batchnorm:  97.0208%
	with batchnorm:  13.6250%
Validate accuracy: 
	without batchnorm:  93.9167%
	with batchnorm:  13.4083%
Training steps: 
	without batchnorm:  400.0
	with batchnorm:  200.0


### Modify the code to support the L2 standard functionality

Changes in the code:


*   compute_cost: add parameters: epsilon_L2 and the dict 'parameters'. Then calulate the L2 cost and add to the cros-entropy cost.
*   update_parameters: add parameter epsilon_L2 and update in the function the equation of updates W with the L2 Regularization
*   L_layer_model: add parameter epsilon_L2 to the signature and update the call to compute_cost with the new parameters.



In [28]:
start_time = perf_counter()
parameters_L2, training_steps_L2, total_epochs_L2 = train_model_on_mnist(X_train, Y_train, X_val, Y_val, use_batchnorm=True, epsilon_L2=0)
end_time = perf_counter()

time_with_L2 = end_time - start_time

# Evaluate on training set
train_accuracy_L2 = predict(X_train, Y_train, parameters_L2)
print(f"Train accuracy: {train_accuracy_L2:.2f}%")

# Evaluate on test set
test_accuracy_L2 = predict(X_test, Y_test, parameters_L2)
print(f"Test accuracy: {test_accuracy_L2:.2f}%")

# Evaluate on validation set
val_accuracy_L2 = predict(X_val, Y_val, parameters_L2)
print(f"Validation accuracy: {val_accuracy_L2:.2f}%")

print(f"Total iterations: {training_steps_L2}")
print(f"Total epochs: {total_epochs_L2:.2f}")
print(f"Training time: {time_with_L2:.2f} seconds")


Cost after iteration 0: 1.8162497788469927
Training steps: 100, Validation cost: 1.6965, Best validation cost: 1.6965
Cost after iteration 0: 2.043473990150488
Training steps: 200, Validation cost: 1.5872, Best validation cost: 1.5872
Cost after iteration 0: 2.002569312064333
Training steps: 300, Validation cost: 1.7081, Best validation cost: 1.5872
Train accuracy: 11.62%
Test accuracy: 11.67%
Validation accuracy: 11.57%
Total iterations: 300
Total epochs: 0.40
Training time: 163.10 seconds


In [29]:
print(f"Running time: ")
print(f"\twithout L2 norm : {time_without_batchnorm: .4f} sec")
print(f"\twith L2 norm: {time_with_L2: .4f} sec")

print(f"Test accuracy: ")
print(f"\twithout L2 norm: {test_accuracy: .4f}%")
print(f"\twith L2 norm: {test_accuracy_L2: .4f}%")

print(f"Train accuracy: ")
print(f"\twithout L2 norm: {train_accuracy: .4f}%")
print(f"\twith L2 norm: {train_accuracy_L2: .4f}%")

print(f"Validate accuracy: ")
print(f"\twithout L2 norm: {val_accuracy: .4f}%")
print(f"\twith L2 norm: {val_accuracy_L2: .4f}%")

print(f"Training steps: ")
print(f"\twithout L2 norm: {training_steps: .1f}")
print(f"\twith L2 norm: {training_steps_L2: .1f}")

Running time: 
	without L2 norm :  164.4076 sec
	with L2 norm:  163.0952 sec
Test accuracy: 
	without L2 norm:  94.3100%
	with L2 norm:  11.6700%
Train accuracy: 
	without L2 norm:  97.0208%
	with L2 norm:  11.6208%
Validate accuracy: 
	without L2 norm:  93.9167%
	with L2 norm:  11.5667%
Training steps: 
	without L2 norm:  400.0
	with L2 norm:  300.0


In [30]:
def compute_frobenius_norm(parameters):
    """
    Compute the Frobenius norm of the weights in the parameter dictionary.

    Arguments:
    parameters -- python dictionary containing the parameters "W1", "b1", ..., "WL", "bL"

    Returns:
    norms -- list of Frobenius norms of the weight matrices
    """
    L = len(parameters) // 2
    norms = []
    for l in range(1, L + 1):
        norm = np.linalg.norm(parameters['W' + str(l)], 'fro')
        norms.append(norm)
    return norms

# Compute the Frobenius norms of the weights for both models
norms_no_L2 = compute_frobenius_norm(parameters)
norms_L2 = compute_frobenius_norm(parameters_L2)

# Print the comparison of the norms
for l in range(len(norms_no_L2)):
    print(f"Layer {l+1}:")
    print(f"\tFrobenius norm without regularization: {norms_no_L2[l]}")
    print(f"\tFrobenius norm with regularization: {norms_L2[l]}")
    print("")

Layer 1:
	Frobenius norm without regularization: 9.416273058342117
	Frobenius norm with regularization: 35486324.266033225

Layer 2:
	Frobenius norm without regularization: 6.757597343145094
	Frobenius norm with regularization: 5818.79530094917

Layer 3:
	Frobenius norm without regularization: 6.340671153811876
	Frobenius norm with regularization: 69.33215855108277

Layer 4:
	Frobenius norm without regularization: 6.9658014538908635
	Frobenius norm with regularization: 9.691750845831352

