In [5]:
# load packages
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBRegressor

## Load Data

In [6]:
# load data
df_energy_weather_proc = pd.read_csv('input data/energy_weather_table_proc.csv',
                                     parse_dates=['dt_iso'])

df_energy_weather_proc['dt_iso'] = pd.to_datetime(df_energy_weather_proc['dt_iso'], utc=True, infer_datetime_format=True)
df_energy_weather = df_energy_weather_proc.set_index('dt_iso')

df_energy_weather_proc = df_energy_weather.select_dtypes(exclude=['object'])


## XGBoost Hyperparameter tuning

### Define the Grid Search function

In [7]:
# grid search function
def grid_search(actual_column, model_name):
    # prepare datasets
    X = df_energy_weather_proc.drop(columns=[actual_column])
    y = df_energy_weather_proc[actual_column]

    # prepare train and test data
    # this will be used for the rmse computation
    data_train = df_energy_weather_proc[df_energy_weather_proc.year != 2018.0]
    X_train = data_train.drop(columns=[actual_column])
    y_train = data_train[actual_column]

    data_test = df_energy_weather_proc[df_energy_weather_proc.year == 2018.0]
    X_test = data_test.drop(columns=[actual_column])
    y_test = data_test[actual_column]

    # perform the grid search!
    xgb1 = XGBRegressor()
    parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
                  'objective':['reg:linear'],
                  'learning_rate': [.03, 0.05, .07], #so called `eta` value
                  'max_depth': [5, 6, 7],
                  'min_child_weight': [4],
                  'silent': [1],
                  'subsample': [0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [500]}

    # dummy parameters to simplify testing
    # parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
    #               'objective':['reg:linear'],
    #               'learning_rate': [.07], #so called `eta` value
    #               'max_depth': [3],
    #               'min_child_weight': [4],
    #               'silent': [1],
    #               'subsample': [0.7],
    #               'colsample_bytree': [0.7],
    #               'n_estimators': [10]}

    xgb_grid = GridSearchCV(xgb1,
                            parameters,
                            cv = 2,
                            n_jobs = 5,
                            verbose=True)

    xgb_grid.fit(X, y)

    xgb_best_params = xgb_grid.best_params_

    # measure the RMSE for the model using the best parameters
    xg_reg = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=xgb_best_params['colsample_bytree'],
                              learning_rate=xgb_best_params['learning_rate'], max_depth=xgb_best_params['max_depth'],
                              min_child_weight=xgb_best_params['min_child_weight'], alpha=10,
                              n_estimators=xgb_best_params['n_estimators'], subsample=xgb_best_params['subsample'])

    xg_reg.fit(X_train,y_train)

    preds = xg_reg.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))

    print("===================================")
    print(model_name + "_grid.best_score_: {}".format(xgb_grid.best_score_))
    print(model_name + "_grid: {}".format(xgb_grid.best_params_))
    print(model_name + " RMSE: %f" % (rmse))
    print("===================================")

    return None

### Run the grid search for hyperparameter tuning

In [8]:
# call the function to do the grid search per target value
grid_search('price_actual', 'xgb_price')
# xgb_price_grid: {'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 7,
# 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}

grid_search('total_load_actual', 'xgb_load')
# xgb_load_grid: {'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 7,
# 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}

grid_search('gen_solar_totals', 'xgb_solar')
# xgb_solar_grid: {'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 6,
# 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}

grid_search('gen_wind_totals', 'xgb_wind')
# xgb_wind_grid: {'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 5,
# 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}

# we will save these values and used them for our XGB model


Fitting 2 folds for each of 9 candidates, totalling 18 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


xgb_price_grid.best_score_: 0.9463222875921785
xgb_price_grid: {'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
xgb_price RMSE: 11.744174
Fitting 2 folds for each of 9 candidates, totalling 18 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


xgb_load_grid.best_score_: 0.9779974656293

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:  3.0min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:  3.6min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:  4.1min finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:  4.0min finished
