In [2]:
import numpy as np
import joblib

import warnings

import params as p
import functions as f

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import mean_squared_error

import optuna
from optuna.integration import XGBoostPruningCallback

  from pandas import MultiIndex, Int64Index


### Settings

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

### Import Data

In [4]:
# Load the three data splits through the project helper in `functions`.
# NOTE(review): `part='03'` presumably selects the output of an earlier
# pipeline step — confirm against functions.load_split_datasets.
train, test, val = f.load_split_datasets(part='03')

In [5]:
# Separate features from target for each split, in train / test / val order.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    f.split_data_X_y(split) for split in (train, test, val)
)

In [6]:
# Bundle each split as keyword arguments so it can be splatted into
# `.fit(**...)` and `f.evaluate_model(model, **...)`.
train_data = dict(X=X_train, y=y_train)
test_data = dict(X=X_test, y=y_test)
val_data = dict(X=X_val, y=y_val)

### Create Models

#### XGB

In [7]:
# Baseline: XGBoost with library defaults (only the seed is fixed),
# trained on the training split.
xgb = XGBRegressor(
    random_state=p.RANDOM_STATE,
).fit(X=train_data['X'], y=train_data['y'])

In [26]:
# Score the baseline model on the validation split.
f.evaluate_model(xgb, X=val_data['X'], y=val_data['y'])

rmse : 210.90439104107836
mae : 128.63407599728723
r2 : 0.8921120431393053


#### RandomForest

In [None]:
# Random forest baseline (not fitted yet). Seeded like the XGB models so
# that results are reproducible, and parallelised across all cores.
rf = RandomForestRegressor(random_state=p.RANDOM_STATE, n_jobs=-1)

#### Naive Bayes

### Tuning with Optuna

In [20]:
# XGBoost settings that are held fixed: they are merged into every Optuna
# trial's parameters and into the final tuned parameter set.
base_params = dict(
    learning_rate=0.1,
    verbosity=0,
    n_jobs=-1,
    random_state=p.RANDOM_STATE,
)

def objective(trial, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test, base_params = base_params):
    """Optuna objective: train one XGBRegressor candidate and return its RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `max_depth`, `min_child_weight`,
        `n_estimators` and `subsample`.
    X_train, y_train
        Data the candidate model is fitted on.
    X_test, y_test
        Data used for early stopping, pruning and the returned score.
    base_params : dict
        Fixed XGBoost parameters; they take precedence over sampled ones
        when keys collide because they are unpacked last.

    Returns
    -------
    float
        RMSE of the fitted model on (`X_test`, `y_test`); lower is better.
    """
    obj_params = {
        'max_depth' : trial.suggest_int('max_depth', 2, 7),
        'min_child_weight' : trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators' : trial.suggest_int('n_estimators', 50, 1000),
        'subsample' : trial.suggest_float('subsample', 0.1, 1),

        **base_params
    }

    # 'validation_1' is the second entry of `eval_set` below, i.e. the
    # (X_test, y_test) pair, so pruning follows the held-out RMSE.
    pruning_callback = XGBoostPruningCallback(trial, observation_key='validation_1-rmse')

    # Patience is 10% of the boosting budget. XGBoost documents
    # `early_stopping_rounds` as an integer, so use integer division
    # rather than passing a float such as 14.2.
    patience = max(1, obj_params['n_estimators'] // 10)

    xgb_obj = XGBRegressor(**obj_params)
    # Fit on the function's own arguments (not the global `train_data`) so
    # the training data always matches the first `eval_set` entry.
    xgb_obj.fit(
        X_train, y_train,
        eval_metric = 'rmse',
        eval_set = [(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds = patience,
        verbose = 0,
        callbacks = [pruning_callback]
        )

    y_pred = xgb_obj.predict(X_test)

    rmse = mean_squared_error(y_true = y_test, y_pred = y_pred) ** 0.5

    return rmse

# Seed the sampler so the hyper-parameter search is repeatable.
# NOTE: with n_jobs=-1 trials finish in a non-deterministic order, so runs can
# still differ slightly; set n_jobs=1 when exact reproducibility is required.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=p.RANDOM_STATE),
)
study.optimize(objective, n_trials = 200, n_jobs = -1)

[32m[I 2022-03-25 14:43:14,341][0m A new study created in memory with name: no-name-f74b5b17-ed66-4f66-9305-50d3a9d6165b[0m
[32m[I 2022-03-25 14:43:18,653][0m Trial 5 finished with value: 218.29431247617296 and parameters: {'max_depth': 4, 'min_child_weight': 10, 'n_estimators': 142, 'subsample': 0.5884444564902216}. Best is trial 5 with value: 218.29431247617296.[0m
[32m[I 2022-03-25 14:43:21,017][0m Trial 1 finished with value: 194.29362272710657 and parameters: {'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 111, 'subsample': 0.3972683130629646}. Best is trial 1 with value: 194.29362272710657.[0m
[32m[I 2022-03-25 14:43:21,229][0m Trial 0 finished with value: 215.6383744461178 and parameters: {'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 182, 'subsample': 0.13487288342032044}. Best is trial 1 with value: 194.29362272710657.[0m
[32m[I 2022-03-25 14:43:23,133][0m Trial 2 finished with value: 215.44059756812203 and parameters: {'max_depth': 4, 'min_child

In [21]:
# Start from the best trial's sampled values, then overlay the fixed
# settings (the fixed settings win if a key appears in both).
tuned_params = dict(study.best_params)
tuned_params.update(base_params)
tuned_params

{'max_depth': 7,
 'min_child_weight': 4,
 'n_estimators': 400,
 'subsample': 0.8488153719247543,
 'learning_rate': 0.1,
 'verbosity': 0,
 'n_jobs': -1,
 'random_state': 73}

### Evaluate Final XGB Model

In [22]:
# Refit XGBoost with the tuned parameters on the training split.
# Early stopping watches the last `eval_set` entry (the test split); patience
# is 10% of `n_estimators`, passed as an integer because XGBoost documents
# `early_stopping_rounds` as an int.
xgb_tuned = XGBRegressor(**tuned_params)
xgb_tuned.fit(
    **train_data,
    eval_metric = 'rmse',
    eval_set = [(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds = max(1, tuned_params['n_estimators'] // 10),
    verbose = 0
    )

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=4, missing=nan,
             monotone_constraints='()', n_estimators=400, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=73,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.8488153719247543, tree_method='exact',
             validate_parameters=1, verbosity=0)

In [25]:
# Score the tuned model on the validation split, which was not used
# for fitting, early stopping or the Optuna search.
f.evaluate_model(xgb_tuned, X=val_data['X'], y=val_data['y'])

rmse : 198.67011629703538
mae : 119.62577776174032
r2 : 0.9042658651421513
