In [None]:
import sys
sys.path.insert(0, '/Users/livestorborg/Desktop/FYS-STK4155/project2/code')

import numpy as np
from sklearn.model_selection import train_test_split
from src.neural_network import NeuralNetwork
from src.activations import Sigmoid, Linear
from src.losses import MSE
from src.optimizers import GD, RMSprop, Adam
from src.training import train
from src.metrics import mse
from src.utils import runge, polynomial_features, scale_data, OLS_parameters, inverse_scale_y

SEED = 42
np.random.seed(SEED)

# Setup for OLS Regression

In [11]:
SEED = 42
np.random.seed(SEED)

N = 300
x = np.linspace(-1, 1, N)
y_true = runge(x)
y_noise = y_true + np.random.normal(0, 0.01, N)

In [12]:
X_poly = polynomial_features(x, p=14, intercept=False)

X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y_noise, test_size=0.2, random_state=SEED
)

theta_ols = OLS_parameters(X_train_poly, y_train)
y_pred_ols = X_test_poly @ theta_ols
ols_mse = mse(y_test.reshape(-1, 1), y_pred_ols.reshape(-1, 1))

print(f"OLS Test MSE: {ols_mse:.6f}")

OLS Test MSE: 0.083335


# Setup for FFNN

In [13]:
X_train_raw, X_test_raw, y_train_nn, y_test_nn = train_test_split(
    x.reshape(-1, 1), y_noise.reshape(-1, 1), 
    test_size=0.2, random_state=SEED
)

# Scale
X_train_s, y_train_s, X_mean, X_std, y_mean = scale_data(X_train_raw, y_train_nn)
X_test_s, y_test_s, _, _, _ = scale_data(X_test_raw, y_test_nn, X_mean, X_std, y_mean)

y_test_real = inverse_scale_y(y_test_s, y_mean)

# Learning rates to test

In [14]:
eta_gd = np.logspace(-3, 1, 20)     
eta_rms = np.logspace(-3, -1, 20)    
eta_adam = np.logspace(-3, -1, 20)

# Parameters for FFNN

In [15]:
network_input_size = 1
loss = MSE()

# Experiment 1: one hidden layer and 50 hidden nodes
layer_output_sizes_1 = [50, 1]
activations_1 = [Sigmoid(), Linear()]

# Experiment 2: two hidden layers and 100 hidden nodes each
layer_output_sizes_2 = [100, 100, 1]
activations_2 = [Sigmoid(), Sigmoid(), Linear()]


num_iter = 500     # GD full batch
epochs = 500       # RMSprop and Adam (mini-batch)
batch_size = 32    # RMSprop and Adam (mini-batch)

# Full batch Gradient Descent

In [16]:
best_gd_eta_1 = None
best_gd_mse_1 = float('inf')

best_gd_eta_2 = None
best_gd_mse_2 = float('inf')

for eta in eta_gd:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_gd_1 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_1, activations=activations_1, loss=loss, seed=SEED)
    train(nn_gd_1, X_train_s, y_train_s, X_test_s, y_test_s, GD(eta), 
          epochs=500, batch_size=len(X_train_s), verbose=False, seed=SEED) 
    y_pred_gd_s = nn_gd_1.predict(X_test_s)
    y_pred_gd = inverse_scale_y(y_pred_gd_s, y_mean)
    gd_mse = mse(y_test_real, y_pred_gd)

    print(f"  eta={eta:.4f}  MSE={gd_mse:.6f}")

    if gd_mse < best_gd_mse_1:  
        best_gd_mse_1 = gd_mse
        best_gd_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_gd_2 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_2, activations=activations_2, loss=loss, seed=SEED)
    train(nn_gd_2, X_train_s, y_train_s, X_test_s, y_test_s, GD(eta), 
          epochs=500, batch_size=len(X_train_s), verbose=False, seed=SEED) 
    y_pred_gd_s = nn_gd_2.predict(X_test_s)
    y_pred_gd = inverse_scale_y(y_pred_gd_s, y_mean)
    gd_mse = mse(y_test_real, y_pred_gd)

    print(f"  eta={eta:.4f}  MSE={gd_mse:.6f}")

    if gd_mse < best_gd_mse_2:  
        best_gd_mse_2 = gd_mse
        best_gd_eta_2 = eta

  eta=0.0010  MSE=0.070507
  eta=0.0010  MSE=0.067735
  eta=0.0016  MSE=0.069999
  eta=0.0016  MSE=0.067737
  eta=0.0026  MSE=0.069725
  eta=0.0026  MSE=0.067746
  eta=0.0043  MSE=0.069308
  eta=0.0043  MSE=0.067746
  eta=0.0070  MSE=0.068628
  eta=0.0070  MSE=0.067799
  eta=0.0113  MSE=0.067506
  eta=0.0113  MSE=0.067860
  eta=0.0183  MSE=0.065574
  eta=0.0183  MSE=0.058716
  eta=0.0298  MSE=0.061927
  eta=0.0298  MSE=0.066984
  eta=0.0483  MSE=0.053909
  eta=0.0483  MSE=2.722146
  eta=0.0785  MSE=0.132213
  eta=0.0785  MSE=2.456288
  eta=0.1274  MSE=5.731522
  eta=0.1274  MSE=24.994946
  eta=0.2069  MSE=1.901176
  eta=0.2069  MSE=206.800797
  eta=0.3360  MSE=16.801429
  eta=0.3360  MSE=1326.513466
  eta=0.5456  MSE=137.711600
  eta=0.5456  MSE=5915.703690
  eta=0.8859  MSE=1044.394417
  eta=0.8859  MSE=17510.644148
  eta=1.4384  MSE=7034.400424
  eta=1.4384  MSE=34191.580075
  eta=2.3357  MSE=33398.155638
  eta=2.3357  MSE=55164.583186
  eta=3.7927  MSE=131023.824219
  eta=3.7927  MS

# Stochastic Gradient Descent with RMSprop

In [17]:
best_rms_eta_1 = None
best_rms_mse_1 = float('inf')

best_rms_eta_2 = None
best_rms_mse_2 = float('inf')

for eta in eta_rms:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_rms_1 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_1, activations=activations_1, loss=loss, seed=SEED)
    train(nn_rms_1, X_train_s, y_train_s, X_test_s, y_test_s, RMSprop(eta), 
          epochs=500, batch_size=batch_size, verbose=False, seed=SEED) 
    y_pred_rms_s = nn_rms_1.predict(X_test_s)
    y_pred_rms = inverse_scale_y(y_pred_rms_s, y_mean)
    rms_mse = mse(y_test_real, y_pred_rms)

    print(f"  eta={eta:.4f}  MSE={rms_mse:.6f}")

    if rms_mse < best_rms_mse_1:  
        best_rms_mse_1 = rms_mse
        best_rms_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_rms_2 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_2, activations=activations_2, loss=loss, seed=SEED)
    train(nn_rms_2, X_train_s, y_train_s, X_test_s, y_test_s, RMSprop(eta), 
          epochs=500, batch_size=batch_size, verbose=False, seed=SEED) 
    y_pred_rms_s = nn_rms_2.predict(X_test_s)
    y_pred_rms = inverse_scale_y(y_pred_rms_s, y_mean)
    rms_mse = mse(y_test_real, y_pred_rms)

    print(f"  eta={eta:.4f}  MSE={rms_mse:.6f}")

    if rms_mse < best_rms_mse_2:  
        best_rms_mse_2 = rms_mse
        best_rms_eta_2 = eta

  eta=0.0010  MSE=0.001580
  eta=0.0010  MSE=0.011819
  eta=0.0013  MSE=0.001169
  eta=0.0013  MSE=0.004024
  eta=0.0016  MSE=0.000862
  eta=0.0016  MSE=0.009034
  eta=0.0021  MSE=0.001485
  eta=0.0021  MSE=0.015193
  eta=0.0026  MSE=0.001950
  eta=0.0026  MSE=0.025427
  eta=0.0034  MSE=0.002147
  eta=0.0034  MSE=0.024204
  eta=0.0043  MSE=0.001452
  eta=0.0043  MSE=0.068910
  eta=0.0055  MSE=0.008652
  eta=0.0055  MSE=0.002622
  eta=0.0070  MSE=0.006988
  eta=0.0070  MSE=0.004834
  eta=0.0089  MSE=0.015672
  eta=0.0089  MSE=0.007343
  eta=0.0113  MSE=0.015336
  eta=0.0113  MSE=0.008380
  eta=0.0144  MSE=0.015892
  eta=0.0144  MSE=0.004569
  eta=0.0183  MSE=0.012096
  eta=0.0183  MSE=0.004655
  eta=0.0234  MSE=0.015332
  eta=0.0234  MSE=0.023632
  eta=0.0298  MSE=0.015263
  eta=0.0298  MSE=0.007202
  eta=0.0379  MSE=0.067092
  eta=0.0379  MSE=0.001900
  eta=0.0483  MSE=0.070515
  eta=0.0483  MSE=0.001586
  eta=0.0616  MSE=0.016195
  eta=0.0616  MSE=0.060242
  eta=0.0785  MSE=0.005750
 

# Stochastic Gradient Descent with Adam

In [18]:
best_adam_eta_1 = None
best_adam_mse_1 = float('inf')

best_adam_eta_2 = None
best_adam_mse_2 = float('inf')

for eta in eta_adam:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_adam_1 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_1, activations=activations_1, loss=loss, seed=SEED)
    train(nn_adam_1, X_train_s, y_train_s, X_test_s, y_test_s, Adam(eta),
          epochs=500, batch_size=batch_size, verbose=False, seed=SEED) 
    y_pred_adam_s = nn_adam_1.predict(X_test_s)
    y_pred_adam = inverse_scale_y(y_pred_adam_s, y_mean)
    adam_mse = mse(y_test_real, y_pred_adam)

    print(f"  eta={eta:.4f}  MSE={rms_mse:.6f}")

    if rms_mse < best_adam_mse_1:  
        best_adam_mse_1 = rms_mse
        best_adam_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_adam_2 = NeuralNetwork(network_input_size=network_input_size, layer_output_sizes=layer_output_sizes_2, activations=activations_2, loss=loss, seed=SEED)
    train(nn_adam_2, X_train_s, y_train_s, X_test_s, y_test_s, Adam(eta), 
          epochs=500, batch_size=batch_size, verbose=False, seed=SEED) 
    y_pred_adam_s = nn_adam_2.predict(X_test_s)
    y_pred_adam = inverse_scale_y(y_pred_adam_s, y_mean)
    adam_mse = mse(y_test_real, y_pred_adam)

    print(f"  eta={eta:.4f}  MSE={adam_mse:.6f}")

    if adam_mse < best_adam_mse_2:  
        best_adam_mse_2 = adam_mse
        best_adam_eta_2 = eta

  eta=0.0010  MSE=0.069129
  eta=0.0010  MSE=0.000074
  eta=0.0013  MSE=0.069129
  eta=0.0013  MSE=0.000074
  eta=0.0016  MSE=0.069129
  eta=0.0016  MSE=0.000075
  eta=0.0021  MSE=0.069129
  eta=0.0021  MSE=0.000073
  eta=0.0026  MSE=0.069129
  eta=0.0026  MSE=0.000081
  eta=0.0034  MSE=0.069129
  eta=0.0034  MSE=0.000079
  eta=0.0043  MSE=0.069129
  eta=0.0043  MSE=0.000077
  eta=0.0055  MSE=0.069129
  eta=0.0055  MSE=0.000077
  eta=0.0070  MSE=0.069129
  eta=0.0070  MSE=0.000100
  eta=0.0089  MSE=0.069129
  eta=0.0089  MSE=0.000076
  eta=0.0113  MSE=0.069129
  eta=0.0113  MSE=0.000066
  eta=0.0144  MSE=0.069129
  eta=0.0144  MSE=0.000074
  eta=0.0183  MSE=0.069129
  eta=0.0183  MSE=0.000095
  eta=0.0234  MSE=0.069129
  eta=0.0234  MSE=0.000088
  eta=0.0298  MSE=0.069129
  eta=0.0298  MSE=0.000070
  eta=0.0379  MSE=0.069129
  eta=0.0379  MSE=0.000087
  eta=0.0483  MSE=0.069129
  eta=0.0483  MSE=0.000069
  eta=0.0616  MSE=0.069129
  eta=0.0616  MSE=0.000071
  eta=0.0785  MSE=0.069129
 

In [19]:
print("\n" + "="*50)
print("ONE HIDDEN LAYER & 50 HIDDEN NODES")
print("="*50)
print(f"OLS (deg 14):      {ols_mse:.6f}")
print(f"NN + GD:           {best_gd_mse_1:.6f}  (eta={best_gd_eta_1:.4f})")
print(f"NN + RMSprop:      {best_rms_mse_1:.6f}  (eta={best_rms_eta_1:.4f})")
print(f"NN + Adam:         {best_adam_mse_1:.6f}  (eta={best_adam_eta_1:.4f})")
print("="*50)


ONE HIDDEN LAYER & 50 HIDDEN NODES
OLS (deg 14):      0.083335
NN + GD:           0.053909  (eta=0.0483)
NN + RMSprop:      0.000862  (eta=0.0016)
NN + Adam:         0.069129  (eta=0.0010)


In [20]:
print("\n" + "="*50)
print("TWO HIDDEN LAYERS & 100 HIDDEN NODES EACH")
print("="*50)
print(f"OLS (deg 14):      {ols_mse:.6f}")
print(f"NN + GD:           {best_gd_mse_2:.6f}  (eta={best_gd_eta_2:.4f})")
print(f"NN + RMSprop:      {best_rms_mse_2:.6f}  (eta={best_rms_eta_2:.4f})")
print(f"NN + Adam:         {best_adam_mse_2:.6f}  (eta={best_adam_eta_2:.4f})")
print("="*50)


TWO HIDDEN LAYERS & 100 HIDDEN NODES EACH
OLS (deg 14):      0.083335
NN + GD:           0.058716  (eta=0.0183)
NN + RMSprop:      0.001586  (eta=0.0483)
NN + Adam:         0.000066  (eta=0.0113)
