In [62]:
%reset

In [63]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic, Matern, WhiteKernel, RBF
from sklearn.gaussian_process.kernels import Sum
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from scipy import stats

import numpy as np
import pandas as pd
import optuna
import warnings

In [64]:
#Set this value to true if hyperparameter tuning is complete and the test set should be loaded and predicted on
OUTPUT_TEST = False

In [65]:
#Load the training and validation datasets
X_train = pd.read_csv("../data/cleaned/training.csv")
y_train = pd.read_csv("../data/cleaned/training_labels.csv")
X_val = pd.read_csv("../data/cleaned/validation.csv")
y_val = pd.read_csv("../data/cleaned/validation_labels.csv")

In [66]:
#Some columns headers contain '[' or ']' which are not compatable with sklearn. They are change to '(' and ')' respectively.
columns = X_train.columns
for col in columns:
    if '[' in col or ']' in col:
        old_name = col
        col = col.replace('[', '(')
        col = col.replace(']', ')')
        
        X_train = X_train.rename(columns={old_name:col})
        X_val = X_val.rename(columns={old_name:col})

In [67]:
#Splitting of the training set into a vedrification and training set with a 90/10 split. This verification set is used for optuna hyperparameter tuning.
X_train, X_verif, y_train, y_verif = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [68]:
#Reset the indicies after splitting the dataset
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_verif = X_verif.reset_index(drop=True)
y_verif = y_verif.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

In [69]:
#Defining the guassian process search space for Optuna.
def define_kernel(trial):
    kernels = []
    n_kernels = trial.suggest_int('n_kernels', 1, 3) #Number of simple kernels used to create the final kernel
    for i in range(n_kernels):
        kernel_type = trial.suggest_categorical(f'kernel_type_{i}', ["Matern", "RationalQuadratic"]) #Select a type of simple kernel

        #Depending on the kernel type selected, certain metrics need to specificed, each of those metrics is selected below by optuna as the tuning takes place
        if kernel_type == 'RationalQuadratic':
            quad_params = {
                'length_scale': trial.suggest_float(f'RationalQuadratic_{i}_length_scale', 1e-1, 1e5),
                'alpha': trial.suggest_float(f'RationalQuadratic_{i}_alpha', 1e-1, 1e5)
            }   
            kernel = RationalQuadratic(length_scale=quad_params['length_scale'], alpha=quad_params['alpha'], length_scale_bounds=(1e-8,1e8))
        elif kernel_type == 'Matern':
            matern_params = {
                'length_scale': trial.suggest_float(f'Matern_{i}_length_scale', 1e-1, 1e5),
                'nu': trial.suggest_float(f'Matern_{i}_nu', 0.5, 5)
            }
            kernel = Matern(length_scale=matern_params['length_scale'], nu=matern_params['nu'], length_scale_bounds=(1e-8,1e8))
        else:
            print("WRONG KERNEL NAME FOR:", kernel_type)
            TypeError
        kernels.append(kernel)

    white_noise_params = {
                'noise_level': trial.suggest_float(f'WhiteKernel_{n_kernels}_noise_level', 1e-5, 1e5),
            }
    #kernel = WhiteKernel(noise_level=white_noise_params['noise_level'])
    #kernels.append(kernel)
    if n_kernels == 1:
        combined_kernel = kernels[0]
    else:
        combined_kernel = Sum(kernels[0], kernels[1])
        for j in range(1, n_kernels-1):
            combined_kernel = Sum(combined_kernel, kernels[j+1])
    return combined_kernel

In [70]:
def objective(trial):
    """Define the objective function"""
    kernel = define_kernel(trial)
    params = {
        'alpha': trial.suggest_float('alpha', 1e-3, 1e3, log=True),
        'n_restarts_optimizer': trial.suggest_int('n_restarts_optimizer', 0, 10),
    }

    params["kernel"] = kernel
    print(params)
    # Fit the model
    optuna_model = GaussianProcessRegressor(**params)
    
    batch_size = 500

    # Take a random sample of the DataFrame
    X_train_sampled = X_train.sample(n=batch_size)

    # Access the indexes of the sampled rows
    sampled_indexes = X_train_sampled.index
    y_train_sampled = y_train.loc[sampled_indexes]

    optuna_model.fit(X_train_sampled, y_train_sampled)
    
    # Make predictions
    verif_pred = optuna_model.predict(X_verif)
    verif_loss = mean_absolute_percentage_error(y_verif,verif_pred)*100
    verif_error = mean_squared_error(y_verif,verif_pred,squared=False)

    # Evaluate predictions
    error = verif_loss + verif_error
    
    return error

In [71]:
warnings.filterwarnings("ignore", category=RuntimeWarning)
study = optuna.create_study(pruner=optuna.pruners.SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-12-19 17:22:42,257][0m A new study created in memory with name: no-name-67ce1b1a-c429-4e04-ba6b-7562cf0fa2ac[0m


{'alpha': 0.4322961695854962, 'n_restarts_optimizer': 10, 'kernel': RationalQuadratic(alpha=2.77e+04, length_scale=3.85e+04) + Matern(length_scale=8.92e+04, nu=1.73) + Matern(length_scale=4.56e+03, nu=4.55)}


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
[32m[I 2023-12-19 17:26:33,190][0m Trial 0 finished with value: 48.51528888534959 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 38541.8416897705, 'RationalQuadratic_0_alpha': 27689.71642443232, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 89153.08672572863, 'Matern_1_nu': 1.726037581968306, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 4557.4908305312, 'Matern_2_nu': 4.551686402008937, 'WhiteKernel_3_noise_level': 46271.995898288056, 'alpha': 0.4322961695854962, 'n_restarts_optimizer': 10}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 1.2212138429940311, 'n_restarts_optimizer': 1, 'kernel': Matern(length_scale=7.86e+04, nu=0.848) + RationalQuadratic(alpha=7.77e+04, length_scale=8.73e+04)}


[32m[I 2023-12-19 17:27:00,680][0m Trial 1 finished with value: 49.66094240423003 and parameters: {'n_kernels': 2, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 78623.28077857281, 'Matern_0_nu': 0.8476520284471216, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 87259.83313914941, 'RationalQuadratic_1_alpha': 77720.5492153229, 'WhiteKernel_2_noise_level': 39932.49011801804, 'alpha': 1.2212138429940311, 'n_restarts_optimizer': 1}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 432.9535822424795, 'n_restarts_optimizer': 7, 'kernel': Matern(length_scale=3.42e+04, nu=1.85) + RationalQuadratic(alpha=8.57e+04, length_scale=6.71e+04)}


[32m[I 2023-12-19 17:27:51,430][0m Trial 2 finished with value: 58.7329045690527 and parameters: {'n_kernels': 2, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 34230.40350996027, 'Matern_0_nu': 1.8468250246883953, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 67058.65748434357, 'RationalQuadratic_1_alpha': 85652.03602721292, 'WhiteKernel_2_noise_level': 30357.123142221968, 'alpha': 432.9535822424795, 'n_restarts_optimizer': 7}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 200.34620398260066, 'n_restarts_optimizer': 3, 'kernel': RationalQuadratic(alpha=1.62e+04, length_scale=8.47e+04) + Matern(length_scale=9.45e+04, nu=1.86)}


[32m[I 2023-12-19 17:28:18,009][0m Trial 3 finished with value: 52.221927661118436 and parameters: {'n_kernels': 2, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 84674.85793800294, 'RationalQuadratic_0_alpha': 16233.422477122365, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 94544.62050641587, 'Matern_1_nu': 1.8642852059291561, 'WhiteKernel_2_noise_level': 92325.1131189338, 'alpha': 200.34620398260066, 'n_restarts_optimizer': 3}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 351.94670525145904, 'n_restarts_optimizer': 0, 'kernel': RationalQuadratic(alpha=6.02e+04, length_scale=9.36e+04)}


[32m[I 2023-12-19 17:28:19,080][0m Trial 4 finished with value: 65.3329260350107 and parameters: {'n_kernels': 1, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 93629.10224075186, 'RationalQuadratic_0_alpha': 60222.53501614783, 'WhiteKernel_1_noise_level': 21427.438994376975, 'alpha': 351.94670525145904, 'n_restarts_optimizer': 0}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.18991128718631514, 'n_restarts_optimizer': 2, 'kernel': Matern(length_scale=3.46e+04, nu=1.62) + Matern(length_scale=6.73e+04, nu=0.9)}


[32m[I 2023-12-19 17:29:06,635][0m Trial 5 finished with value: 50.20416917388933 and parameters: {'n_kernels': 2, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 34584.14764636267, 'Matern_0_nu': 1.6175443344533544, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 67288.39008079149, 'Matern_1_nu': 0.8998589617510898, 'WhiteKernel_2_noise_level': 3225.1954524307066, 'alpha': 0.18991128718631514, 'n_restarts_optimizer': 2}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 80.12896644067072, 'n_restarts_optimizer': 2, 'kernel': RationalQuadratic(alpha=7.35e+04, length_scale=7.05e+04) + Matern(length_scale=8.54e+04, nu=2.38) + RationalQuadratic(alpha=9.12e+04, length_scale=5.25e+03)}


[32m[I 2023-12-19 17:29:27,379][0m Trial 6 finished with value: 49.96794186850515 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 70532.00430542821, 'RationalQuadratic_0_alpha': 73514.79265722145, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 85401.23362900731, 'Matern_1_nu': 2.376469385421698, 'kernel_type_2': 'RationalQuadratic', 'RationalQuadratic_2_length_scale': 5248.631501571996, 'RationalQuadratic_2_alpha': 91234.62381813877, 'WhiteKernel_3_noise_level': 45798.61297106272, 'alpha': 80.12896644067072, 'n_restarts_optimizer': 2}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 33.86422299719959, 'n_restarts_optimizer': 8, 'kernel': Matern(length_scale=1.44e+04, nu=2.43)}


[32m[I 2023-12-19 17:30:08,900][0m Trial 7 finished with value: 49.95175673570424 and parameters: {'n_kernels': 1, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 14425.390626538121, 'Matern_0_nu': 2.4327242702023724, 'WhiteKernel_1_noise_level': 7772.49140687531, 'alpha': 33.86422299719959, 'n_restarts_optimizer': 8}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.004744966594095622, 'n_restarts_optimizer': 5, 'kernel': Matern(length_scale=4.65e+04, nu=4.75)}


[32m[I 2023-12-19 17:30:53,392][0m Trial 8 finished with value: 94.42001726488158 and parameters: {'n_kernels': 1, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 46480.785360740214, 'Matern_0_nu': 4.7511895181824215, 'WhiteKernel_1_noise_level': 24284.823852786227, 'alpha': 0.004744966594095622, 'n_restarts_optimizer': 5}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.003906363892735846, 'n_restarts_optimizer': 9, 'kernel': Matern(length_scale=1.6e+03, nu=0.724) + Matern(length_scale=6.5e+04, nu=0.675) + Matern(length_scale=6.86e+04, nu=4.51)}


[32m[I 2023-12-19 17:34:55,258][0m Trial 9 finished with value: 49.707179496745326 and parameters: {'n_kernels': 3, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 1601.5620475704468, 'Matern_0_nu': 0.7235582015812488, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 64980.65706703034, 'Matern_1_nu': 0.675053624731113, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 68644.39769146984, 'Matern_2_nu': 4.508445511382413, 'WhiteKernel_3_noise_level': 98282.87916777482, 'alpha': 0.003906363892735846, 'n_restarts_optimizer': 9}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.22116995682972637, 'n_restarts_optimizer': 10, 'kernel': RationalQuadratic(alpha=1.56e+04, length_scale=1.55e+04) + Matern(length_scale=9.35e+03, nu=4.6) + Matern(length_scale=1.35e+03, nu=0.845)}


[32m[I 2023-12-19 17:39:46,054][0m Trial 10 finished with value: 49.849223873809926 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 15487.572251159268, 'RationalQuadratic_0_alpha': 15570.834430916977, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 9352.770330805964, 'Matern_1_nu': 4.595014617766851, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 1349.555467320628, 'Matern_2_nu': 0.8451234384756727, 'WhiteKernel_3_noise_level': 7781.337379391334, 'alpha': 0.22116995682972637, 'n_restarts_optimizer': 10}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 3.7897877541987146, 'n_restarts_optimizer': 6, 'kernel': RationalQuadratic(alpha=3.7e+04, length_scale=3.03e+04) + RationalQuadratic(alpha=3.61e+04, length_scale=9.15e+04)}


[32m[I 2023-12-19 17:40:05,218][0m Trial 11 finished with value: 50.183610507958136 and parameters: {'n_kernels': 2, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 30254.812520691667, 'RationalQuadratic_0_alpha': 37030.85197132474, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 91480.94832868088, 'RationalQuadratic_1_alpha': 36116.768980955094, 'WhiteKernel_2_noise_level': 63351.51772213593, 'alpha': 3.7897877541987146, 'n_restarts_optimizer': 6}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 2.6184804419583445, 'n_restarts_optimizer': 4, 'kernel': Matern(length_scale=9.49e+04, nu=3.89) + RationalQuadratic(alpha=8.74e+04, length_scale=1.11e+04) + Matern(length_scale=49.3, nu=4.83)}


[32m[I 2023-12-19 17:42:00,288][0m Trial 12 finished with value: 49.283208047356375 and parameters: {'n_kernels': 3, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 94855.67130518469, 'Matern_0_nu': 3.886649642109063, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 11062.646706963955, 'RationalQuadratic_1_alpha': 87415.13693201271, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 49.28091891940767, 'Matern_2_nu': 4.828696030250407, 'WhiteKernel_3_noise_level': 52684.50907147756, 'alpha': 2.6184804419583445, 'n_restarts_optimizer': 4}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.043436828348113465, 'n_restarts_optimizer': 5, 'kernel': RationalQuadratic(alpha=9.91e+04, length_scale=5.16e+04) + RationalQuadratic(alpha=1.71e+03, length_scale=1.7e+03) + Matern(length_scale=1.62e+03, nu=4.98)}


[32m[I 2023-12-19 17:44:26,854][0m Trial 13 finished with value: 49.40309645446377 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 51614.27872486621, 'RationalQuadratic_0_alpha': 99116.89550652742, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 1695.8276535698096, 'RationalQuadratic_1_alpha': 1705.3985083769367, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 1615.1718510629357, 'Matern_2_nu': 4.975657442157939, 'WhiteKernel_3_noise_level': 52047.16076033533, 'alpha': 0.043436828348113465, 'n_restarts_optimizer': 5}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 12.759563687141059, 'n_restarts_optimizer': 4, 'kernel': RationalQuadratic(alpha=1.61e+03, length_scale=4.34e+04) + RationalQuadratic(alpha=9.96e+04, length_scale=1.39e+04) + Matern(length_scale=2.52e+04, nu=3.56)}


[32m[I 2023-12-19 17:45:08,893][0m Trial 14 finished with value: 50.021647214439675 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 43420.711676517014, 'RationalQuadratic_0_alpha': 1614.407257312596, 'kernel_type_1': 'RationalQuadratic', 'RationalQuadratic_1_length_scale': 13878.73493023758, 'RationalQuadratic_1_alpha': 99575.18821537952, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 25214.190182964412, 'Matern_2_nu': 3.5647429398259365, 'WhiteKernel_3_noise_level': 50219.3012119731, 'alpha': 12.759563687141059, 'n_restarts_optimizer': 4}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.031062472346451615, 'n_restarts_optimizer': 7, 'kernel': Matern(length_scale=9.85e+04, nu=4.04) + Matern(length_scale=2.77e+04, nu=3.81) + Matern(length_scale=9.9e+04, nu=3.41)}


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
[32m[I 2023-12-19 17:49:35,723][0m Trial 15 finished with value: 48.656037489654445 and parameters: {'n_kernels': 3, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 98514.69338410931, 'Matern_0_nu': 4.043799677232762, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 27657.84359535279, 'Matern_1_nu': 3.8137626485491465, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 99021.67315999194, 'Matern_2_nu': 3.407393200010882, 'WhiteKernel_3_noise_level': 66801.01831054635, 'alpha': 0.031062472346451615, 'n_restarts_optimizer': 7}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.03544444647030521, 'n_restarts_optimizer': 10, 'kernel': Matern(length_scale=7.25e+04, nu=3.64) + Matern(length_scale=2.5e+04, nu=4.11) + RationalQuadratic(alpha=804, length_scale=9.96e+04)}


[32m[I 2023-12-19 17:53:04,037][0m Trial 16 finished with value: 49.91763999747879 and parameters: {'n_kernels': 3, 'kernel_type_0': 'Matern', 'Matern_0_length_scale': 72455.101666054, 'Matern_0_nu': 3.6446069027724644, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 25040.77712417939, 'Matern_1_nu': 4.109579879220222, 'kernel_type_2': 'RationalQuadratic', 'RationalQuadratic_2_length_scale': 99570.08716297068, 'RationalQuadratic_2_alpha': 804.1040980528269, 'WhiteKernel_3_noise_level': 75925.14252745925, 'alpha': 0.03544444647030521, 'n_restarts_optimizer': 10}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.0012661989972729496, 'n_restarts_optimizer': 8, 'kernel': RationalQuadratic(alpha=4.13e+04, length_scale=2.71e+03) + Matern(length_scale=4.36e+04, nu=3.3) + Matern(length_scale=9.71e+04, nu=2.77)}


[32m[I 2023-12-19 17:55:04,379][0m Trial 17 finished with value: 49.99931819406822 and parameters: {'n_kernels': 3, 'kernel_type_0': 'RationalQuadratic', 'RationalQuadratic_0_length_scale': 2706.879296918072, 'RationalQuadratic_0_alpha': 41296.41221101981, 'kernel_type_1': 'Matern', 'Matern_1_length_scale': 43576.496980144766, 'Matern_1_nu': 3.3031630250424016, 'kernel_type_2': 'Matern', 'Matern_2_length_scale': 97104.08995075297, 'Matern_2_nu': 2.770954131503216, 'WhiteKernel_3_noise_level': 23106.93990919988, 'alpha': 0.0012661989972729496, 'n_restarts_optimizer': 8}. Best is trial 0 with value: 48.51528888534959.[0m


{'alpha': 0.24052398094282287, 'n_restarts_optimizer': 8, 'kernel': Matern(length_scale=9.42e+04, nu=4.65) + Matern(length_scale=3.96e+04, nu=3.31)}


KeyboardInterrupt: 

In [73]:
#Reconstruct the kernel based on the results from the optuna test.
def reconstruct_kernel(encoding):
    n_kernels = encoding['n_kernels']
    kernels = []
    for i in range(n_kernels):
        kernel_type = encoding[f'kernel_type_{i}']
        length_scale = encoding.get(f'{kernel_type}_{i}_length_scale', None)
        nu = encoding.get(f'{kernel_type}_{i}_nu', None)

        if kernel_type == 'Matern':
            kernel = Matern(length_scale=length_scale, nu=nu)
        elif kernel_type == 'RationalQuadratic':
            alpha = encoding.get(f'{kernel_type}_{i}_alpha', 1.0)
            kernel = RationalQuadratic(length_scale=length_scale, alpha=alpha)
        elif kernel_type == 'WhiteKernel':
            noise_level = encoding.get(f'{kernel_type}_{i}_noise_level', 1.0)
            print(noise_level)
            kernel = WhiteKernel(noise_level=noise_level)
        # Add more conditions for other kernel types if needed

        kernels.append(kernel)

    #noise_level = encoding.get(f'WhiteKernel_{n_kernels}_noise_level', 1)
    #kernel = WhiteKernel(noise_level=noise_level)
    #kernels.append(kernel)

    # Sum the individual kernels to get the final composite kernel
    if n_kernels == 1:
        final_kernel = kernels[0]
    else:
        final_kernel = Sum(kernels[0], kernels[1])
        for j in range(1, n_kernels-1):
            final_kernel = Sum(final_kernel, kernels[j+1])
    return final_kernel

In [74]:
#Check performance with no tuning to ensure performance is improving
sanity_check = GaussianProcessRegressor(kernel=RBF())
sanity_check.fit(X_train.iloc[0:499], y_train.iloc[0:499])
val_pred = sanity_check.predict(X_val)
verif_pred = sanity_check.predict(X_verif)
sanity_verif_error = mean_squared_error(y_verif,verif_pred,squared=False)
sanity_val_error = mean_squared_error(y_val,val_pred,squared=False)
print("SANITY CHECK VALUES:")
print("Verification RMSE:", sanity_verif_error)
print("Validation RMSE:", sanity_val_error)

SANITY CHECK VALUES:
Verification RMSE: 16.005811334029733
Validation RMSE: 16.326713639930887


In [75]:
params = trial.params
print(params)
kernel = reconstruct_kernel(params)
print(kernel)
gp = GaussianProcessRegressor(kernel=kernel,alpha=params['alpha'], n_restarts_optimizer=params['n_restarts_optimizer'])

NameError: name 'trial' is not defined

In [None]:
gp.fit(X_train.iloc[0:499], y_train.iloc[0:499])

In [None]:
val_pred, std_prediction = gp.predict(X_val, return_std=True)
error = mean_squared_error(y_val,val_pred,squared=False)
print("RMSE:", error)
print("Difference from sanity check:", sanity_val_error - error)

RMSE: 4.678907046098204
Difference from sanity check: 11.647806593832684


In [None]:
val_r = r2_score(y_val,val_pred)

print("val R:")
print(val_r)

val R:
-0.024127588444852188


In [None]:
if not OUTPUT_TEST:
    raise ValueError("OUTPUT_TEST set to False. If you would like to output final test values set to True and continue running from here")

ValueError: OUTPUT_TEST set to False. If you would like to output final test values set to True and continue running from here

In [None]:
X_test = pd.read_csv("../data/cleaned/test.csv")
y_test = pd.read_csv("../data/cleaned/test_labels.csv")

In [None]:
columns = X_test.columns
for col in columns:
    if '[' in col or ']' in col:
        old_name = col
        col = col.replace('[', '(')
        col = col.replace(']', ')')
        
        X_test = X_test.rename(columns={old_name:col})

In [None]:
test_preds = gp.predict(X_test)
train_preds = gp.predict(X_train)

In [None]:
#Save test true vals and predictions to csv

pred_data = pd.DataFrame(test_preds)
pred_filepath = '../data/predictions/GP/test_pred_gp.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)
pred_data = pd.DataFrame(y_test)
pred_filepath = '../data/predictions/GP/test_true_gp.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)

#Save train true vals and predictions to csv

pred_data = pd.DataFrame(train_preds)
pred_filepath = '../data/predictions/GP/train_pred_gp.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)
pred_data = pd.DataFrame(y_train)
pred_filepath = '../data/predictions/GP/train_true_gp.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)

In [None]:
#Save inputs to csv

pred_data = pd.DataFrame(X_train)
pred_filepath = '../data/predictions/GP/train_input_gp.csv'
pred_data.to_csv(pred_filepath, index=False, header=False)
true_data = pd.DataFrame(X_test)
true_filepath = '../data/predictions/GP/test_input_gp.csv'
true_data.to_csv(true_filepath, index=False, header=False)

In [None]:
#Read in values from csv and calculate RMSE and r values

test_pred_data = np.genfromtxt('../data/predictions/GP/test_pred_gp.csv', delimiter=',', filling_values=np.nan)
test_true_data = np.genfromtxt('../data/predictions/GP/test_true_gp.csv', delimiter=',', filling_values=np.nan)
train_pred_data = np.genfromtxt('../data/predictions/GP/train_pred_gp.csv', delimiter=',', filling_values=np.nan)
train_true_data = np.genfromtxt('../data/predictions/GP/train_true_gp.csv', delimiter=',', filling_values=np.nan)

test_rmse = mean_squared_error(test_true_data,test_pred_data,squared=False)
test_r = r2_score(test_true_data,test_pred_data)

train_rmse = mean_squared_error(train_true_data,train_pred_data,squared=False)
train_r = r2_score(train_true_data,train_pred_data)

print("Train:")
print(train_rmse)
print('Test:')
print(test_rmse)
print(test_r)

Train:
4.6359895675101335
Test:
4.6813758913658905
-0.011374792120455002


In [None]:
print("percent Error:", mean_absolute_percentage_error(test_true_data, test_pred_data)*100)

percent Error: 74.22485943098808
