In [1]:
## Importing the required libraries
import numpy as np
import matplotlib.pyplot as plt
data = np.load('ct_data.npz')  ## Loading the dataset

## Slicing the dataset required parts
X_train = data['X_train']; X_val = data['X_val']; X_test = data['X_test']
y_train = data['y_train']; y_val = data['y_val']; y_test = data['y_test']

## Getting standard error 
print("Mean of y_train:",np.mean(y_train))
print("Standard error for y_train (for 5785 entries):",np.std(y_train[:5785], ddof=1)/np.sqrt(len(y_train[:5785])))
print("Mean of y_val (for 5785 entries):",np.mean(y_val))
print("Standard error for y_val (for 5785 entries):",np.std(y_val, ddof=1)/np.sqrt(len(y_val)))



Mean of y_train: -9.13868774539957e-15
Standard error for y_train (for 5785 entries): 0.011927303389170828
Mean of y_val (for 5785 entries): -0.2160085093241599
Standard error for y_val (for 5785 entries): 0.01290449880016868


In [2]:
## Data preprocessing

## 1st step is to remove Constant columns from the data 
rm_idx0=[]    

for i in range(len(X_train[1])):
    col=X_train[:,i]
    if all(col[0]==col):
        rm_idx0.append(i)

## Removing them from training as well from other too
X_train = np.delete(X_train,rm_idx0,axis=1)

X_val = np.delete(X_val, rm_idx0, axis=1)


X_test = np.delete(X_test, rm_idx0, axis=1)

print("Removed Constant columns Indices are:", rm_idx0)        

## Removing the Duplicate columns

indices= np.unique(X_train,return_index=True,axis=1)[1]

indices=np.sort(indices)
rm_idx=list(set(range(indices[0],indices[-1]+1))-set(indices))

## Removing them from training as well from other too
X_train = np.delete(X_train, rm_idx,axis=1)

X_val = np.delete(X_val, rm_idx, axis=1)


X_test = np.delete(X_test, rm_idx, axis=1)

## Printing the incides of removed columns
print("                                          ")
print("Removed Duplicate column indices are:",rm_idx)

Removed Constant columns Indices are: [59, 69, 179, 189, 351]
                                          
Removed Duplicate column indices are: [354, 195, 76, 77, 185, 283]


In [4]:
## Defining the linear regression using least-square est.

def fit_linreg(X, yy, alpha):
    k=len(X[1])                                 ## getting number of input features
    yy = np.concatenate((yy, np.zeros(k)))      ## adding 0_k to the y_train array
    z_k = np.sqrt(alpha) * np.eye(k)
    X = np.vstack((X,z_k))  
    
    b = np.concatenate((np.ones(len(X)-k), np.zeros(k)))[:,None]
                                                
    X = np.insert(X,[0],b,axis=1)

    w_fit = np.linalg.lstsq(X, yy, rcond=None)[0]
    
    
    return w_fit[1:], w_fit[0]                  ## returning the fitted params
    
    

In [5]:
alpha = 30  ## value of lambda

ww0, bb0 = fit_linreg(X_train, y_train, alpha)

print("Bias is:",bb0)
print("Weights are:","\n",ww0)

Bias is: 0.09105350649797651
Weights are: 
 [-6.08615525e-02 -1.06095667e-01  7.53031143e-02  2.78057665e-01
  2.59611794e-01  1.19816553e-01  1.05110635e-02  2.32613667e-01
 -3.17361266e-01 -7.77510933e-02 -4.17318470e-02  4.99788622e-02
 -2.21837493e-02 -2.47999863e-03 -7.29573201e-03  3.75843616e-02
  2.67414563e-02  3.32447940e-03 -8.15353393e-02  1.75498538e-01
  7.92614640e-02  2.86048166e-02 -4.09313322e-02 -6.22094524e-03
 -6.01902872e-02 -5.63195592e-02 -1.47997259e-02 -4.11106517e-01
  3.40469070e-01  2.63461215e-01 -1.35315466e-02  1.73477525e-03
 -9.11983421e-03 -6.98890681e-03  6.51972330e-02 -1.75618988e-01
 -1.79686338e-01  2.20532723e-01 -9.06442066e-02 -3.28680365e-02
 -1.80905054e-01  1.91033029e-02 -1.75010207e-01  5.31362120e-02
 -1.40996396e-01 -2.02005581e-02 -2.04166648e-02  2.13763380e-02
 -4.53472384e-01  8.70663389e-02 -9.25631074e-02 -3.52428163e-03
 -7.86791379e-02 -5.89370945e-02 -3.60229412e-02 -8.00712607e-02
  4.99929907e-02 -1.32414827e-01  1.16614759e-

In [6]:
## Support code
import numpy as np
from scipy.optimize import minimize
from scipy.linalg import cho_factor, cho_solve

def params_unwrap(param_vec, shapes, sizes):
    """Helper routine for minimize_list"""
    args = []
    pos = 0
    for i in range(len(shapes)):
        sz = sizes[i]
        args.append(param_vec[pos:pos+sz].reshape(shapes[i]))
        pos += sz
    return args


def params_wrap(param_list):
    """Helper routine for minimize_list"""
    param_list = [np.array(x) for x in param_list]
    shapes = [x.shape for x in param_list]
    sizes = [x.size for x in param_list]
    param_vec = np.zeros(sum(sizes))
    pos = 0
    for param in param_list:
        sz = param.size
        param_vec[pos:pos+sz] = param.ravel()
        pos += sz
    unwrap = lambda pvec: params_unwrap(pvec, shapes, sizes)
    return param_vec, unwrap


def linreg_cost(params, X, yy, alpha):
    """Regularized least squares cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # forward computation of error
    ff = np.dot(X, ww) + bb
    res = ff - yy
    E = np.dot(res, res) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    ff_bar = 2*res
    bb_bar = np.sum(ff_bar)
    ww_bar = np.dot(X.T, ff_bar) + 2*alpha*ww

    return E, [ww_bar, bb_bar]

def minimize_list(cost, init_list, args):
    """Optimize a list of arrays (wrapper of scipy.optimize.minimize)

    The input function "cost" should take a list of parameters,
    followed by any extra arguments:
        cost(init_list, *args)
    should return the cost of the initial condition, and a list in the same
    format as init_list giving gradients of the cost wrt the parameters.

    The options to the optimizer have been hard-coded. You may wish
    to change disp to True to get more diagnostics. You may want to
    decrease maxiter while debugging. Although please report all results
    in Q2-5 using maxiter=500.
    """
    opt = {'maxiter': 500, 'disp': False}
    init, unwrap = params_wrap(init_list)
    def wrap_cost(vec, *args):
        E, params_bar = cost(unwrap(vec), *args)
        vec_bar, _ = params_wrap(params_bar)
        return E, vec_bar
    res = minimize(wrap_cost, init, args, 'L-BFGS-B', jac=True, options=opt)
    return unwrap(res.x)


def fit_linreg_gradopt(X, yy, alpha):
    """
    fit a regularized linear regression model with gradient opt

         ww, bb = fit_linreg_gradopt(X, yy, alpha)

     Find weights and bias by using a gradient-based optimizer
     (minimize_list) to improve the regularized least squares cost:

       np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

     Inputs:
             X N,D design matrix of input features
            yy N,  real-valued targets
         alpha     scalar regularization constant

     Outputs:
            ww D,  fitted weights
            bb     scalar fitted bias
    """
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(linreg_cost, init, args)
    return ww, bb

def logreg_cost(params, X, yy, alpha):
    """Regularized logistic regression cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration of fitting a similar function.

    Inputs:
    params: tuple (ww, bb): weights ww (D,), bias bb scalar
         X: N,D design matrix of input features
        yy: N,  real-valued targets
     alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # Force targets to be +/- 1
    yy = 2*(yy==1) - 1

    # forward computation of error
    aa = yy*(np.dot(X, ww) + bb)
    sigma = 1/(1 + np.exp(-aa))
    E = -np.sum(np.log(sigma)) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    aa_bar = sigma - 1
    bb_bar = np.dot(aa_bar, yy)
    ww_bar = np.dot(X.T, yy*aa_bar) + 2*alpha*ww

    return E, (ww_bar, bb_bar)

def nn_cost(params, X, yy=None, alpha=None):
    """NN_COST simple neural network cost function and gradients, or predictions

           E, params_bar = nn_cost([ww, bb, V, bk], X, yy, alpha)
                    pred = nn_cost([ww, bb, V, bk], X)

     Cost function E can be minimized with minimize_list

     Inputs:
             params (ww, bb, V, bk), where:
                    --------------------------------
                        ww K,  hidden-output weights
                        bb     scalar output bias
                         V K,D hidden-input weights
                        bk K,  hidden biases
                    --------------------------------
                  X N,D input design matrix
                 yy N,  regression targets
              alpha     scalar regularization for weights

     Outputs:
                     E  sum of squares error
            params_bar  gradients wrt params, same format as params
     OR
               pred N,  predictions if only params and X are given as inputs
    """
    # Unpack parameters from list
    ww, bb, V, bk = params

    # Forwards computation of cost
    A = np.dot(X, V.T) + bk[None,:] # N,K
    P = 1 / (1 + np.exp(-A)) # N,K
    F = np.dot(P, ww) + bb # N,
    if yy is None:
        # user wants prediction rather than training signal:
        return F
    res = F - yy # N,
    E = np.dot(res, res) + alpha*(np.sum(V*V) + np.dot(ww,ww)) # 1x1

    # Reverse computation of gradients
    F_bar = 2*res # N,
    ww_bar = np.dot(P.T, F_bar) + 2*alpha*ww # K,
    bb_bar = np.sum(F_bar) # scalar
    P_bar = np.dot(F_bar[:,None], ww[None,:]) # N,K
    A_bar = P_bar * P * (1 - P) # N,K
    V_bar = np.dot(A_bar.T, X) + 2*alpha*V # K,D
    bk_bar = np.sum(A_bar, 0)

    return E, (ww_bar, bb_bar, V_bar, bk_bar)

def rbf_fn(X1, X2):
    """Helper routine for gp_post_par"""
    return np.exp((np.dot(X1,(2*X2.T))-np.sum(X1*X1,1)[:,None]) - np.sum(X2*X2,1)[None,:])


def gauss_kernel_fn(X1, X2, ell, sigma_f):
    """Helper routine for gp_post_par"""
    return sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))


def gp_post_par(X_rest, X_obs, yy, sigma_y=0.05, ell=5.0, sigma_f=0.1):
    """GP_POST_PAR means and covariances of a posterior Gaussian process

         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy)
         rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy, sigma_y, ell, sigma_f)

     Calculate the means and covariances at all test locations of the posterior Gaussian
     process conditioned on the observations yy at observed locations X_obs.

     Inputs:
                 X_rest GP test locations
                  X_obs locations of observations
                     yy observed values
                sigma_y observation noise standard deviation
                    ell kernel function length scale
                sigma_f kernel function standard deviation

     Outputs:
           rest_cond_mu mean at each location in X_rest
          rest_cond_cov covariance matrix between function values at all test locations
    """
    X_rest = X_rest[:, None]
    X_obs = X_obs[:, None]
    K_rest = gauss_kernel_fn(X_rest, X_rest, ell, sigma_f)
    K_rest_obs = gauss_kernel_fn(X_rest, X_obs, ell, sigma_f)
    K_obs = gauss_kernel_fn(X_obs, X_obs, ell, sigma_f)
    M = K_obs + sigma_y**2 * np.eye(yy.size)
    M_cho, M_low = cho_factor(M)
    rest_cond_mu = np.dot(K_rest_obs, cho_solve((M_cho, M_low), yy))
    rest_cond_cov = K_rest - np.dot(K_rest_obs, cho_solve((M_cho, M_low), K_rest_obs.T))

    return rest_cond_mu, rest_cond_cov

In [9]:
## calling the gradient descent method to compare
ww1,bb1 = fit_linreg_gradopt(X_train, y_train, 30)

print("Bias: ",bb1,"\n")
print("weights: ",ww1)

Bias:  0.08983942508643088 

weights:  [-6.08928794e-02 -1.06109231e-01  7.53199790e-02  2.78058745e-01
  2.59650357e-01  1.19767171e-01  1.05065679e-02  2.32640328e-01
 -3.17410095e-01 -7.79395725e-02 -4.16863910e-02  4.99515149e-02
 -2.22130095e-02 -2.47116900e-03 -7.34062422e-03  3.76237659e-02
  2.67685541e-02  3.28086014e-03 -8.14198810e-02  1.75633257e-01
  7.92432171e-02  2.86749071e-02 -4.09596766e-02 -6.22108255e-03
 -6.01534646e-02 -5.63056508e-02 -1.47844013e-02 -4.10930801e-01
  3.40022329e-01  2.63335827e-01 -1.35872609e-02  1.66221922e-03
 -9.11711030e-03 -7.00848604e-03  6.51838705e-02 -1.75613930e-01
 -1.79693155e-01  2.20564235e-01 -9.07051362e-02 -3.30882147e-02
 -1.80930149e-01  1.91309155e-02 -1.74997057e-01  5.31853600e-02
 -1.40982937e-01 -2.01732057e-02 -2.04407470e-02  2.17917797e-02
 -4.51780456e-01  8.75602083e-02 -9.25508965e-02 -3.47882886e-03
 -7.86813533e-02 -5.90120793e-02 -3.59756491e-02 -8.00154974e-02
  5.00354796e-02 -1.32535849e-01  1.16684096e-01 -7

In [10]:
## Defining the root mean square cost function 
def rmse(pred,yy):
    return np.sqrt(np.mean((pred-yy)**2))


In [11]:
## Calulating the errors for least square method
pred1_train = np.dot(X_train,ww0)+bb0
pred2_val = np.dot(X_val,ww0)+bb0
print("Root means square for training set(using least square method):",rmse(pred1_train, y_train))
print("Root means square for validation set(using least square method):",rmse(pred2_val, y_val))

Root means square for training set(using least square method): 0.3567565397204054
Root means square for validation set(using least square method): 0.4230521968394701


In [12]:
# Calulating the errors for gradient method
pred1_train = np.dot(X_train,ww1)+bb1
pred2_val = np.dot(X_val,ww1)+bb1
print("Root means square for training set(using least square method):",rmse(pred1_train, y_train))
print("Root means square for validation set(using least square method):",rmse(pred2_val, y_val))

Root means square for training set(using least square method): 0.3567569385517838
Root means square for validation set(using least square method): 0.4230540100048547


In [14]:
def fit_logreg_gradopt(X, yy, alpha):
    """
    fit a regularized linear regression model with gradient opt

         ww, bb = fit_linreg_gradopt(X, yy, alpha)

     Find weights and bias by using a gradient-based optimizer
     (minimize_list) to improve the regularized least squares cost:

       np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

     Inputs:
             X N,D design matrix of input features
            yy N,  real-valued targets
         alpha     scalar regularization constant

     Outputs:
            ww D,  fitted weights
            bb     scalar fitted bias
    """
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(logreg_cost, init, args)
    return ww, bb


K = 20 ## number of thresholded classification problems to fit

## Setting the threshold 
mx = np.max(y_train); mn = np.min(y_train); hh = (mx-mn)/(K+1)
thresholds = np.linspace(mn+hh, mx-hh, num=K, endpoint=True)  

## Creating an array to store weights and biases 
w_fit2= np.array([[0.0]* (len(X_train[1])+1)] * K)

## getting the weights and biases for K problems
for kk in range(K):
    labels = y_train > thresholds[kk]
    ww2, bb2 = fit_logreg_gradopt(X_train, labels, alpha=30)
    w_fit2[kk,0] = bb2
    w_fit2[kk,1:]=ww2


In [15]:
## getting the parameter values 
bb2_hat = w_fit2[:,0]
ww2_hat = w_fit2[:,1:]

## Defining a sigmoid function
def sigmoid(a):
    return 1 / (1+np.exp(-a))


## getting the new probabilities for train and validation sets 
X_train_new = sigmoid(np.dot(X_train, np.transpose(ww2_hat))+bb2_hat) 
X_val_new = sigmoid(np.dot(X_val, np.transpose(ww2_hat))+bb2_hat)


## Fitting values using the least square estimator again
nn_ww, nn_bb = fit_linreg(X_train_new, y_train, alpha=30)

In [16]:
## Getting the prediction for updated parameters
pred1_train = np.dot(X_train_new, nn_ww) + nn_bb
pred2_val = np.dot(X_val_new, nn_ww) + nn_bb

print("Root means square for training set:",rmse(pred1_train, y_train))
print("Root means square for validation set:",rmse(pred2_val, y_val))

Root means square for training set: 0.15441150430439968
Root means square for validation set: 0.2542477298370707


In [17]:
## Setting the seed for current session
np.random.seed(42)


## Defining a function to fit a Neural network
def fit_nn_gradopt(X, yy, K, alpha, w_random = True):
    """
    fit a regularized linear regression model with gradient opt

         ww, bb = fit_linreg_gradopt(X, yy, alpha)

     Find weights and bias by using a gradient-based optimizer
     (minimize_list) to improve the regularized least squares cost:

       np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

     Inputs:
             X N,D design matrix of input features
            yy N,  real-valued targets
         alpha     scalar regularization constant

     Outputs:
            ww D,  fitted weights
            bb     scalar fitted bias
    """
    args = (X, yy, alpha)  ## Storing values in a tuple
    
    ## if we want to fit using random initialized weights
    if w_random:
        D = len(X_train[1])
        # calculate the range for the weights
        l = len(X_train[1]) 
    
        # generate random numbers for weights
        ww = 0.1 * np.random.randn(K) / np.sqrt(K)
        V = 0.1 * np.random.randn(K,D)/ np.sqrt(D)
        
        ## Biases should be zero
        bk = np.zeros(K)
        bb = 0
        
        init = (ww, bb, V, bk)
        ww, bb, V, bk = minimize_list(nn_cost, init, args)
        return (ww, bb, V, bk)
    
    else:
        init = (nn_ww,nn_bb,ww2_hat, bb2_hat)    ## Initialization from the results we obtained above cells
        ww, bb, V, bk = minimize_list(nn_cost, init, args)
        return (ww, bb, V, bk)
        
        

In [23]:
## Getting the fitted parameters using random initialization
params = fit_nn_gradopt(X_train, y_train,K=20, alpha=30)

## Getting predictions for train & validation sets
pred_train_nn = nn_cost(params, X_train, yy=None, alpha=30)
pred_val_nn = nn_cost(params, X_val, yy=None, alpha=30)
print("Training set RMSE for NN(with random initialization):", rmse(pred_train_nn, y_train))
print("Validation set RMSE for NN(with random initialization):", rmse(pred_val_nn, y_val))

Training set RMSE for NN(with random initialization): 0.13698780122060805
Validation set RMSE for NN(with random initialization): 0.26538584109603225


In [15]:
## Getting the fitted parameters without using random initialization
params2 = fit_nn_gradopt(X_train, y_train,K=20, alpha=30, w_random = False)

## Getting predictions for train & validation sets
pred1_train_nn = nn_cost(params2, X_train, yy=None, alpha=30)
pred1_val_nn = nn_cost(params2, X_val, yy=None, alpha=30)

print("Training set RMSE for NN:", rmse(pred1_train_nn, y_train))
print("Validation set RMSE for NN:", rmse(pred1_val_nn, y_val))

Training set RMSE for NN: 0.13972707083230795
Validation set RMSE for NN: 0.26789560258337225


In [21]:
## function defined to train NN for specific values of alpha
def train_nn_reg(X_train, X_val, yy, y_val, train_alpha):
    
    param = fit_nn_gradopt(X_train, yy, K=20, alpha= train_alpha)
    
    pred_val = nn_cost(param, X_val, yy=None, alpha= train_alpha)

    return (rmse(pred_val,y_val), param)  ## returning RMSE & fitted parameters
    
## Defining a range values of alpha's to choose from 
alpha= np.arange(0,50,0.02)   

## getting the indicies for randomly selected values 
indicies = np.random.choice(len(alpha),3) 

## Putting them in a already observed values
obs_alpha = np.array(alpha[indicies])

## Getting values of alpha to test
test_alpha = np.delete(alpha,indicies)

## getting the RMSE for observed values
obs_alpha_val = np.array([])
for alpha in obs_alpha: 
    val_rmse = train_nn_reg(X_train, X_val, y_train, y_val, alpha)[0]
    obs_alpha_val = np.append(obs_alpha_val, val_rmse )
    print("Validation RMSE for the value of alpha {0} => {1}".format(alpha,
       val_rmse))
          
    


   

Validation RMSE for the value of alpha 8.92 => 0.2601516227720478
Validation RMSE for the value of alpha 24.78 => 0.2696628649925378
Validation RMSE for the value of alpha 15.44 => 0.25991405180552374


In [26]:
## using Bayesian optimization to find best values of alpha(lambda)

## importing scipy
import scipy.stats


## Taking base value as Validation rmse if weights are randomized
log_base_rmse = np.log(rmse(pred_val_nn, y_val))    

## Subtracting the base from observed RMSE(alphas)
y = np.array(log_base_rmse - np.log(obs_alpha_val))

## Getting posterior mean and covarinaces 
post_mean, post_cov  = gp_post_par(test_alpha, obs_alpha, y)

## getting the Standard deviation
post_std = np.sqrt(np.diag(post_cov))         

## Defining Probability acquistion function
def phi(post_mean, post_std, y):
    return scipy.stats.norm.cdf((post_mean - max(y)) / post_std)

## Initilzing the parameters to extract
best_alpha = 0.0
best_alpha_rmse = 9999.0
best_params = set()

## Running for 10 alpha values
for _ in range(10):
    ## getting the maximum prob. for specific alpha 
    prob_max = phi(post_mean, post_std, y)
    
    ## getting it's index
    idx = np.argmax(prob_max)
    
    ## Calculating it's RMSE to compare and treat as prior(observed)
    alpha_val_rmse, params = train_nn_reg(X_train, X_val, y_train, y_val, test_alpha[idx])
    
    ## Selecting the best parameters from whole iteration
    if  alpha_val_rmse < best_alpha_rmse:
        best_alpha = test_alpha[idx]
        best_alpha_rmse = alpha_val_rmse
        best_params = params
    
    print("Maximum probability for Alpha(={0}) is {1}, Validation RMSE => {2}".format( 
                                test_alpha[idx], prob_max[idx], alpha_val_rmse))
    
    ## appending the observed RMSE values for specific alpha 
    obs_alpha_val = np.append(obs_alpha_val, alpha_val_rmse)
    
    ## appending the observed alpha values 
    obs_alpha = np.append(obs_alpha, test_alpha[idx])
    
    ## deleting them from values to be tested 
    test_alpha = np.delete(test_alpha,idx)
    
    ## Subtracting the base from observed RMSE(alphas)
    y = np.array(log_base_rmse - np.log(obs_alpha_val))
    
    ## Getting posterior mean and covarinaces
    post_mean, post_cov  = gp_post_par(test_alpha, obs_alpha, y)
    
    ## getting standard deviation 
    post_std = np.sqrt(np.diag(post_cov))
   
    
    

Maximum probability for Alpha(=12.38) is 0.4957708365024785, Validation RMSE => 0.2650949277648916
Maximum probability for Alpha(=4.96) is 0.45218243927731866, Validation RMSE => 0.2508676456097986
Maximum probability for Alpha(=2.7800000000000002) is 0.4069772196565155, Validation RMSE => 0.2496093182383833
Maximum probability for Alpha(=2.34) is 0.41801510111029233, Validation RMSE => 0.23065023115091857
Maximum probability for Alpha(=0.0) is 0.16521437452495163, Validation RMSE => 0.25922174997459124
Maximum probability for Alpha(=49.980000000000004) is 0.08033464746951624, Validation RMSE => 0.280199754230829
Maximum probability for Alpha(=36.6) is 0.07664688603151762, Validation RMSE => 0.2785540428735084
Maximum probability for Alpha(=43.22) is 0.022153485643845148, Validation RMSE => 0.2784977084805509
Maximum probability for Alpha(=30.34) is 0.017782503302289965, Validation RMSE => 0.27051755101601144
Maximum probability for Alpha(=19.56) is 0.015045885844013005, Validation RMS

In [27]:
## Traning on best alpha to get test error
pred_test = nn_cost(best_params, X_test, yy=None, alpha=best_alpha)   ## Prediction for test set
test_error = rmse(pred_test, y_test)

print("The Best value for alpha is {0}".format(best_alpha))
print("The Validation error is {0}".format(best_alpha_rmse))
print("The Test error is {0}".format(test_error))

The Best value for alpha is 2.34
The Validation error is 0.23065023115091857
The Test error is 0.2703016646642553
