In [1]:
import numpy as np
import pandas as pd

import warnings

import params as p
import functions as f

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import mean_squared_error

import optuna
from optuna.integration import XGBoostPruningCallback

  from pandas import MultiIndex, Int64Index


### Settings

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

### Import Data

In [3]:
train, test, val = f.load_split_datasets(part='03')

In [4]:
X_train, y_train = f.split_data_X_y(train)
X_test, y_test = f.split_data_X_y(test)
X_val, y_val = f.split_data_X_y(val)

In [5]:
train_data = {'X' : X_train, 'y' : y_train}
test_data = {'X' : X_test, 'y' : y_test}
val_data = {'X' : X_val, 'y' : y_val}

### Create Models

#### XGB

In [6]:
xgb = XGBRegressor(random_state = p.RANDOM_STATE).fit(**train_data)

#### RandomForest

In [7]:
rf = RandomForestRegressor(random_state=p.RANDOM_STATE).fit(**train_data)

#### Naive Bayes

In [8]:
nb = GaussianNB().fit(**train_data)

### Evaluate Initial Models on Validation Data

In [9]:
rmse = {'RMSE' : [f.evaluate_model(model, metric = 'rmse', **val_data) for model in [xgb, rf, nb]]}
pd.DataFrame(rmse, index = ['XGBoost', 'RandomForest', 'Naive Bayes'])

Unnamed: 0,RMSE
XGBoost,210.904391
RandomForest,222.486178
Naive Bayes,531.287298


### Tuning with Optuna

### XGB

In [32]:
xgb_base_params = {
    'learning_rate' : 0.1,
    'verbosity' : 0,
    'n_jobs' : -1,
    'random_state' : p.RANDOM_STATE,
    }

def xgb_objective(trial, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test, base_params = xgb_base_params):

    obj_params = {
        'max_depth' : trial.suggest_int('max_depth', 2, 7),
        'min_child_weight' : trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 50, 1000),
        'subsample' : trial.suggest_float('subsample', 0.1, 1),

        **base_params
        }

    pruning_callback = XGBoostPruningCallback(trial, observation_key = 'validation_1-rmse')

    xgb_obj = XGBRegressor(**obj_params)
    xgb_obj.fit(
        X_train,
        y_train,
        eval_metric = 'rmse',
        eval_set = [(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds = obj_params['n_estimators'] * 0.1,
        verbose = 0,
        callbacks = [pruning_callback]
        )

    y_pred = xgb_obj.predict(X_test)

    rmse = (mean_squared_error(y_true = y_test, y_pred = y_pred)) ** 0.5
    
    return rmse

xgb_study = optuna.create_study(direction = 'minimize')
xgb_study.optimize(xgb_objective, n_trials = 200, n_jobs = -1)

[32m[I 2022-03-25 15:48:40,065][0m A new study created in memory with name: no-name-4a3c1b33-8b3a-4748-90aa-af0d3b97b142[0m
[32m[I 2022-03-25 15:48:42,856][0m Trial 7 finished with value: 282.1720298279659 and parameters: {'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 102, 'subsample': 0.22383988843343447}. Best is trial 7 with value: 282.1720298279659.[0m
[32m[I 2022-03-25 15:48:44,316][0m Trial 6 finished with value: 224.8017787685713 and parameters: {'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 94, 'subsample': 0.3801307148255616}. Best is trial 6 with value: 224.8017787685713.[0m
[32m[I 2022-03-25 15:48:49,394][0m Trial 8 finished with value: 207.9168375729017 and parameters: {'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 126, 'subsample': 0.280295481842614}. Best is trial 8 with value: 207.9168375729017.[0m
[32m[I 2022-03-25 15:48:53,782][0m Trial 3 finished with value: 247.7556842713255 and parameters: {'max_depth': 2, 'min_child_weight':

In [33]:
xgb_tuned_params = {**xgb_study.best_params, **xgb_base_params}
xgb_tuned_params

{'max_depth': 7,
 'min_child_weight': 3,
 'n_estimators': 228,
 'subsample': 0.5817689153524592,
 'learning_rate': 0.1,
 'verbosity': 0,
 'n_jobs': -1,
 'random_state': 73}

### RandomForest

In [13]:
rf_base_params = {
    'verbose' : 0,
    'random_state' : p.RANDOM_STATE,
    }

def rf_objective(trial, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test, base_params = rf_base_params):

    obj_params = {
        'max_depth' : trial.suggest_int('max_depth', 2, 20),
        'max_features' : trial.suggest_categorical('max_features', ['auto', 'sqrt']),
        'n_estimators' : trial.suggest_int('n_estimators', 50, 1000),
        'bootstrap' : trial.suggest_categorical('bootstrap', [True, False]),
        'min_samples_leaf' : trial.suggest_float('min_samples_leaf', 0, 0.5),
        'min_samples_split' : trial.suggest_float('min_samples_split', 0, 0.5),

        **base_params
    }

    rf_obj = RandomForestRegressor(**obj_params)
    rf_obj.fit(X_train, y_train)

    y_pred = rf_obj.predict(X_test)

    rmse = (mean_squared_error(y_true = y_test, y_pred = y_pred)) ** 0.5
    
    return rmse

rf_study = optuna.create_study(direction='minimize')
rf_study.optimize(rf_objective, n_trials = 200, n_jobs = -1)

[32m[I 2022-03-25 15:33:53,987][0m A new study created in memory with name: no-name-b705b25e-e9f4-45a6-93f9-143934fc7948[0m
[32m[I 2022-03-25 15:33:54,961][0m Trial 1 finished with value: 639.3504878011229 and parameters: {'max_depth': 2, 'max_features': 'auto', 'n_estimators': 63, 'bootstrap': True, 'min_samples_leaf': 0.36230300992729975, 'min_samples_split': 0.41351973714248497}. Best is trial 1 with value: 639.3504878011229.[0m
[32m[I 2022-03-25 15:33:55,686][0m Trial 6 finished with value: 587.6081697743637 and parameters: {'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 296, 'bootstrap': False, 'min_samples_leaf': 0.3768703700734861, 'min_samples_split': 0.4009409313202093}. Best is trial 6 with value: 587.6081697743637.[0m
[32m[I 2022-03-25 15:33:56,491][0m Trial 8 finished with value: 513.2887565192293 and parameters: {'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 355, 'bootstrap': False, 'min_samples_leaf': 0.16395296943147403, 'min_samples_split'

In [14]:
rf_tuned_params = {**rf_study.best_params, **rf_base_params}
rf_tuned_params

{'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 939,
 'bootstrap': True,
 'min_samples_leaf': 0.00045236418526066295,
 'min_samples_split': 0.011118086880203824,
 'verbose': 0,
 'random_state': 73}

### Naive Bayes

In [25]:
def nb_objective(trial, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):

    obj_params = {
        'var_smoothing' : trial.suggest_loguniform('var_smoothing', 1e-9, 1),
    }

    nb_obj = GaussianNB(**obj_params)
    nb_obj.fit(X_train, y_train)

    y_pred = nb_obj.predict(X_test)

    rmse = (mean_squared_error(y_true = y_test, y_pred = y_pred)) ** 0.5
    
    return rmse

nb_study = optuna.create_study(direction='minimize')
nb_study.optimize(nb_objective, n_trials = 200, n_jobs = -1)

[32m[I 2022-03-25 15:43:36,656][0m A new study created in memory with name: no-name-0169a8b8-602a-41ac-a6ee-b4ad7b4fd329[0m
[32m[I 2022-03-25 15:43:39,346][0m Trial 1 finished with value: 756.8851303809026 and parameters: {'var_smoothing': 0.0057463396284133435}. Best is trial 1 with value: 756.8851303809026.[0m
[32m[I 2022-03-25 15:43:39,373][0m Trial 3 finished with value: 508.749198275105 and parameters: {'var_smoothing': 7.657589236506456e-09}. Best is trial 3 with value: 508.749198275105.[0m
[32m[I 2022-03-25 15:43:39,378][0m Trial 6 finished with value: 509.53472801493564 and parameters: {'var_smoothing': 1.6238950512607424e-05}. Best is trial 3 with value: 508.749198275105.[0m
[32m[I 2022-03-25 15:43:39,399][0m Trial 4 finished with value: 846.6320197837292 and parameters: {'var_smoothing': 0.0008155436912047956}. Best is trial 3 with value: 508.749198275105.[0m
[32m[I 2022-03-25 15:43:39,408][0m Trial 7 finished with value: 843.2359212788028 and parameters: {'v

In [26]:
nb_tuned_params = nb_study.best_params.copy()
nb_tuned_params

{'var_smoothing': 8.446788876917902e-07}

### Evaluate Final, Tuned Models on validation data

### XGB

In [34]:
xgb_tuned = XGBRegressor(**xgb_tuned_params)
xgb_tuned.fit(
    **train_data,
    eval_metric = 'rmse',
    eval_set = [(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds = xgb_tuned_params['n_estimators'] * 0.1,
    verbose = 0
    )

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=228, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=73,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.5817689153524592, tree_method='exact',
             validate_parameters=1, verbosity=0)

### RandomForest

In [35]:
rf_tuned = RandomForestRegressor(**rf_tuned_params).fit(**train_data)

### Naive Bayes

In [36]:
nb_tuned = GaussianNB(**nb_tuned_params).fit(**train_data)

### Evaluate results on validation data

In [37]:
tuned_rmse = {'RMSE' : [f.evaluate_model(model, metric = 'rmse', **val_data) for model in [xgb_tuned, rf_tuned, nb_tuned]]}
pd.DataFrame(tuned_rmse, index = ['XGBoost', 'RandomForest', 'Naive Bayes'])

Unnamed: 0,RMSE
XGBoost,196.635275
RandomForest,261.215704
Naive Bayes,440.577962
