In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

Here we're going to use randomized search to look for the best hyperparamters as it is faster than a gridsearch

In [2]:
df_cyclic = pd.read_pickle('../data/fe_temp_cyclic_data.pkl')

In [3]:
split_date = '2023-10-01'
train_df = df_cyclic[df_cyclic['DateTime'] < split_date]
test_df = df_cyclic[df_cyclic['DateTime'] >= split_date]

X_train = train_df.drop(columns=['demand','DateTime','date','time'])
y_train = train_df['demand']

X_test = test_df.drop(columns=['demand','DateTime','date','time'])
y_test = test_df['demand']

In [None]:
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': np.linspace(0.01, 0.2, 5),
    'max_depth': np.arange(3, 10),
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

xgb_model = XGBRegressor()
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, 
                                   n_iter=100, scoring='neg_mean_squared_error', cv=5, verbose=1)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_
print(f"best parameters found: {best_params}")

best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"rmse: {rmse}")
print(f"mae: {mae}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.0575, 'gamma': 0.3, 'colsample_bytree': 1.0}
RMSE: 0.11890621521990276
MAE: 0.0465065576229467