# Testing against other software libraries

You should test your results against similar code written with Scikit-Learn (see the examples in the above-mentioned lecture notes from weeks 41 and 42), TensorFlow/Keras, or PyTorch (for PyTorch, see chapters 12 and 13 of Raschka et al.’s text).

Furthermore, you should verify that your derivatives are calculated correctly by comparing them against automatic differentiation, for example with the Autograd library or the JAX library. Using these libraries in your own implementation is optional for the present project; here they serve as useful tests of our derivatives.

In [1]:
# import
import autograd.numpy as np
from autograd import elementwise_grad
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error


# custom imports
from settings import ETA_VALUES, LAMBDA_VALUES, VERBOSE, NP_RANDOM_SEED
from runge_preprocessing import y_noise, x, y, x_train_scaled, y_train, x_test_scaled, y_test, RUNGE_HIDDEN_LAYERS, RUNGE_MAX_ITERATIONS, runge_function

In [2]:

# define function to test different hyperparameters
def nn_model(x_train_scaled, x_test_scaled, y_test, etas, lambdas, hidden_layers,
             activation_type, solver_type, max_iterations, verbose=False,
             y_train=None, random_state=None):
    """
    Fit one scikit-learn MLPRegressor per (learning rate, L2 penalty) pair and
    evaluate each fitted model on the test set.

    Parameters
    ----------
    x_train_scaled : array-like
        Training inputs passed to ``MLPRegressor.fit``.
    x_test_scaled : array-like
        Test inputs used for prediction and scoring.
    y_test : array-like
        Test targets.
    etas : iterable of float
        Initial learning rates; each value is passed as ``learning_rate_init``
        (scikit-learn only uses it for the 'sgd' and 'adam' solvers).
    lambdas : iterable of float
        L2 regularisation strengths; each value is passed as ``alpha``.
    hidden_layers, activation_type, solver_type, max_iterations
        Forwarded to ``hidden_layer_sizes``, ``activation``, ``solver`` and
        ``max_iter`` of ``MLPRegressor``.
    verbose : bool, default False
        When true, print the metrics of every grid point.
    y_train : array-like, optional
        Training targets (anything with a ``ravel`` method). When omitted, the
        module-level ``y_train`` is used, which is what this function
        previously always did.
    random_state : int, optional
        Forwarded to ``MLPRegressor`` to make weight initialisation and
        shuffling reproducible. ``None`` keeps scikit-learn's default.

    Returns
    -------
    list
        ``[eta_list, lambdas_list, score_list, r2_list, mse_list]`` with one
        entry per grid point, ordered with ``etas`` as the outer loop and
        ``lambdas`` as the inner loop.
    """
    if y_train is None:
        # Backward-compatible fallback for callers that rely on the notebook-level variable.
        y_train = globals()["y_train"]
    train_targets = y_train.ravel()

    eta_list = []
    lambdas_list = []
    score_list = []
    r2_list = []
    mse_list = []

    for eta in etas:
        for lmbd in lambdas:
            # The grid values must reach the estimator; otherwise every grid
            # point trains the same default-configured model.
            dnn = MLPRegressor(
                hidden_layer_sizes=hidden_layers,
                activation=activation_type,
                solver=solver_type,
                max_iter=max_iterations,
                learning_rate_init=eta,
                alpha=lmbd,
                random_state=random_state,
            )
            dnn.fit(x_train_scaled, train_targets)

            y_predicted = dnn.predict(x_test_scaled)

            # score(X, y) takes the test inputs and the true targets (it returns R^2).
            score = dnn.score(x_test_scaled, y_test)
            r2 = r2_score(y_test, y_predicted)
            mse = mean_squared_error(y_test, y_predicted)

            eta_list.append(eta)
            lambdas_list.append(lmbd)
            score_list.append(score)
            r2_list.append(r2)
            mse_list.append(mse)

            if verbose:
                print('Solver type: ' , solver_type)
                print("Learning rate  = ", eta)
                print("Lambda = ", lmbd)
                print("Accuracy score on test set, dnn.score: ", score)
                print("Accuracy score on test set, R2: ", r2)
                print("Accuracy score on test set, MSE: ", mse)
                print()

    return [eta_list, lambdas_list, score_list, r2_list, mse_list]


In [3]:
# Compare scikit-learn's 'lbfgs', 'sgd' and 'adam' solvers (ReLU activation) over the eta/lambda grid

# TODO: consider moving these reshapes into runge_preprocessing (they are also used in part b)
# Turn every data set into a single-column 2-D array
x_train_scaled = np.reshape(np.array(x_train_scaled), (-1, 1))
x_test_scaled = np.reshape(np.array(x_test_scaled), (-1, 1))
y_train = np.reshape(np.array(y_train), (-1, 1))
y_test = np.reshape(np.array(y_test), (-1, 1))

# One grid search per solver, run in the order lbfgs -> sgd -> adam
relu_lbfgs, relu_sgd, relu_adam = [
    nn_model(
        x_train_scaled,
        x_test_scaled,
        y_test,
        ETA_VALUES,
        LAMBDA_VALUES,
        hidden_layers=RUNGE_HIDDEN_LAYERS,
        activation_type='relu',
        solver_type=solver_name,
        max_iterations=RUNGE_MAX_ITERATIONS,
        verbose=VERBOSE,
    )
    for solver_name in ('lbfgs', 'sgd', 'adam')
]



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iteratio

# Optional: check that the derivatives are calculated correctly using automatic differentiation

In [None]:
# Seems to be correct results, must verify

from neural_network import create_layers, create_layers_batch, backpropagation, backpropagation_batch, feed_forward, feed_forward_batch
from cost_functions import mse, mse_derivative
from activation_functions import sigmoid, sigmoid_derivative, RELU, RELU_derivative, linear, linear_derivative
from runge_preprocessing import y_noise, x, y, x_train_scaled, x_test_scaled, y_train, RUNGE_HIDDEN_LAYERS, RUNGE_MAX_ITERATIONS

# Network architecture: two sigmoid hidden layers followed by a linear output layer
activation_functions = [sigmoid] * 2 + [linear]
activation_functions_derivatives = [sigmoid_derivative] * 2 + [linear_derivative]

# One input feature; widths of the first two hidden layers plus a single output unit
network_input_size = 1
layer_output_sizes = [RUNGE_HIDDEN_LAYERS[idx] for idx in (0, 1)] + [1]

# The batch version of backpropagation is fed column vectors
x_train_scaled = np.reshape(np.array(x_train_scaled), (-1, 1))
y_train = np.reshape(np.array(y_train), (-1, 1))

# Build the parameters, then compute the hand-written backpropagation gradients
layers = create_layers_batch(network_input_size, layer_output_sizes)
gradients = backpropagation_batch(
    x_train_scaled,
    layers,
    activation_functions,
    y_train,
    activation_functions_derivatives,
    cost_der=mse_derivative,
)


def cost(layers, input, activation_funcs, target):
    """
    Run the batch forward pass and score the network output against `target`.

    Parameters:
    ----------
    layers : list of tuples
        The network parameters, one (W, b) tuple per layer.

    input : np.ndarray
        The inputs handed to `feed_forward_batch`.

    activation_funcs : list of callable
        One activation function per layer, forwarded to `feed_forward_batch`.

    target : np.ndarray
        The expected outputs the prediction is compared with.

    Returns:
    -------
    The value returned by `mse(prediction, target)`.
    """
    forward_result = feed_forward_batch(input, layers, activation_funcs)
    # Only the third element of the forward-pass result (the prediction) is needed.
    _, _, prediction = forward_result
    return mse(prediction, target)


from autograd import grad
# Differentiate `cost` with respect to its first positional argument, i.e. `layers`
cost_grad = grad(cost, argnum=0)
autograd_gradients = cost_grad(
    layers,
    x_train_scaled,
    activation_functions,
    y_train,
)


def compare_gradients(manual_grads, autograd_grads, atol=1e-6):
    """
    Compare two per-layer gradient lists and report whether they agree.

    Parameters
    ----------
    manual_grads : sequence of (dW, db) tuples
        Gradients from the hand-written implementation, one tuple per layer.
    autograd_grads : sequence of (dW, db) tuples
        Reference gradients in the same layout.
    atol : float, default 1e-6
        Absolute tolerance passed to ``np.allclose`` (its default relative
        tolerance still applies on top).

    Returns
    -------
    bool
        True only if both sequences have the same number of layers and every
        dW and db pair is close. The per-layer result is also printed.
    """
    # zip() silently stops at the shorter sequence, so a missing layer would
    # otherwise go unnoticed.
    all_match = len(manual_grads) == len(autograd_grads)
    if not all_match:
        print(f"Layer count mismatch: {len(manual_grads)} manual vs {len(autograd_grads)} autograd")

    for i, ((dW_manual, db_manual), (dW_auto, db_auto)) in enumerate(zip(manual_grads, autograd_grads)):
        w_close = bool(np.allclose(dW_manual, dW_auto, atol=atol))
        b_close = bool(np.allclose(db_manual, db_auto, atol=atol))
        print(f"Layer {i}: dW match: {w_close}, db match: {b_close}")
        all_match = all_match and w_close and b_close

    return all_match

# Report, layer by layer, whether the backpropagation gradients agree with autograd's within tolerance
compare_gradients(gradients, autograd_gradients)



def show_gradient_differences(manual_grads, autograd_grads):
    """
    Print the raw difference between two per-layer gradient lists.

    Both arguments are sequences of (dW, db) tuples, one per layer. For each
    layer the function prints ``dW_manual - dW_auto`` followed by
    ``db_manual - db_auto``. Nothing is returned.
    """
    layer_no = 0
    for manual_pair, auto_pair in zip(manual_grads, autograd_grads):
        weight_manual, bias_manual = manual_pair
        weight_auto, bias_auto = auto_pair

        weight_gap = weight_manual - weight_auto
        print(f"Layer {layer_no} - dW diff:\n{weight_gap}")

        bias_gap = bias_manual - bias_auto
        print(f"Layer {layer_no} - db diff:\n{bias_gap}")

        layer_no += 1
# Print the element-wise differences between the backpropagation and autograd gradients
show_gradient_differences(gradients, autograd_gradients)




def finite_diff_grad_b(layers, inputs, activations, targets, layer_idx, eps=1e-6):
    """
    Estimate the gradient of `cost` with respect to one layer's bias using
    central finite differences.

    Parameters
    ----------
    layers : sequence of (W, b) tuples
        Network parameters; they are not modified.
    inputs, activations, targets
        Forwarded unchanged to `cost`.
    layer_idx : int
        Index of the layer whose bias is perturbed.
    eps : float, default 1e-6
        Perturbation size; each element uses (f(b + eps) - f(b - eps)) / (2 * eps).

    Returns
    -------
    np.ndarray
        Array with the same shape as the selected bias, holding the numerical
        derivative of the cost with respect to each bias element.
    """
    import copy

    # Work on a private copy so the caller's parameters are never touched.
    base_layers = copy.deepcopy(layers)
    W, b = base_layers[layer_idx]
    num_grad = np.zeros_like(b)

    # Flat indexing perturbs exactly one element at a time, whatever the
    # bias shape (a 1-D bias behaves exactly as with plain indexing).
    for k in range(b.size):
        b_plus = b.copy()
        b_plus.flat[k] += eps
        b_minus = b.copy()
        b_minus.flat[k] -= eps

        # Shallow list copies are enough: only the tuple at layer_idx is
        # replaced and the perturbed biases are fresh arrays.
        layers_plus = list(base_layers)
        layers_minus = list(base_layers)
        layers_plus[layer_idx] = (W, b_plus)
        layers_minus[layer_idx] = (W, b_minus)

        loss_plus = cost(layers_plus, inputs, activations, targets)
        loss_minus = cost(layers_minus, inputs, activations, targets)
        num_grad.flat[k] = (loss_plus - loss_minus) / (2 * eps)

    return num_grad
# Finite-difference check of the bias gradient in the last layer
i = len(layers) - 1

# Recompute the backpropagation gradients and pick out the last layer's pair
manual = backpropagation_batch(
    x_train_scaled,
    layers,
    activation_functions,
    y_train,
    activation_functions_derivatives,
    cost_der=mse_derivative,
)
dW_manual, db_manual = manual[i]

# Numerical estimate of the same bias gradient
db_num = finite_diff_grad_b(layers, x_train_scaled, activation_functions, y_train, layer_idx=i)

# Euclidean norm of the gap between the two bias gradients
print("||db_manual - db_num||:", np.linalg.norm(db_manual - db_num))



Layer 0: dW match: True, db match: True
Layer 1: dW match: True, db match: True
Layer 2: dW match: True, db match: True
Layer 0 - dW diff:
[[ 2.22044605e-16  0.00000000e+00  0.00000000e+00 -2.22044605e-16
   0.00000000e+00  0.00000000e+00 -2.22044605e-16  0.00000000e+00
   8.88178420e-16  3.33066907e-16 -5.55111512e-17  0.00000000e+00
  -5.55111512e-17 -2.77555756e-17 -2.22044605e-16  4.44089210e-16
  -1.11022302e-16  2.22044605e-16  0.00000000e+00  0.00000000e+00
   4.44089210e-16 -4.44089210e-16 -6.66133815e-16 -8.32667268e-17
  -1.11022302e-16  2.08166817e-17 -1.66533454e-16 -5.55111512e-17
  -4.44089210e-16 -2.22044605e-16  2.22044605e-16  1.11022302e-16
  -1.11022302e-16 -1.11022302e-16  5.55111512e-17  5.55111512e-17
   2.22044605e-16  3.33066907e-16  2.22044605e-16  1.11022302e-16
  -4.44089210e-16  1.11022302e-16 -1.11022302e-16 -5.55111512e-17
  -4.44089210e-16 -1.11022302e-16  0.00000000e+00  0.00000000e+00
   0.00000000e+00  6.66133815e-16]]
Layer 0 - db diff:
[ 4.44089210e-