# GBM GridSearchCV Results Evaluation

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Warnings
import warnings

# Path
from pathlib import Path

# OS & Time
import os
from datetime import datetime

# Data Visualization
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt

# Joblib
import joblib

# Scikit-Learn & Statsmodels
from sklearn.metrics import roc_auc_score
from statsmodels.api import OLS, add_constant
from sklearn.tree import DecisionTreeRegressor, export_graphviz

In [2]:
# Render floats with thousands separators and four decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

# Use seaborn's plain white theme for every figure in the notebook.
sns.set_style("white")

# Seed NumPy's global random number generator.
np.random.seed(42)

In [4]:
# Read the hold-out features and target from the HDF5 store.
# The store is opened read-only: HDFStore's default mode is append, which
# would create an empty file when the path is wrong and only fail later on
# the key lookup.
with pd.HDFStore('data/tuning_sklearn_gbm.h5', mode='r') as store:
    test_feature_data = store['holdout/features']
    test_target = store['holdout/target']

# Column labels of the hold-out features; reused below to label the
# model's feature importances.
test_features = test_feature_data.columns

### GBM GridSearchCV with sklearn

In [5]:
class OneStepTimeSeriesSplit:
    """Empty placeholder class.

    NOTE(review): presumably defined so that the persisted grid-search
    result, which may reference a class of this name, can be loaded in
    this notebook - confirm against the notebook that produced the file.
    """

#### Loading Result

In [6]:
# Load the persisted grid-search result from disk.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files
# from a trusted source.
gridsearch_result = joblib.load('results/sklearn_gbm_gridsearch.joblib')

### Best Parameters & AUC Score

In [None]:
# Show the search's best_params_ as a Series (one row per hyperparameter).
pd.Series(gridsearch_result.best_params_)

In [None]:
# The search's best_score_, formatted to four decimals.
# NOTE(review): an AUC according to the section heading - confirm the
# scoring used when the search was run.
f'{gridsearch_result.best_score_:.4f}'

### Best Model Evaluation

#### Test on Hold-Out Set

In [7]:
# Estimator exposed by the search as best_estimator_; used for the
# hold-out evaluation and feature-importance cells below.
best_model = gridsearch_result.best_estimator_

In [8]:
# Shorthand for building MultiIndex slices in .loc lookups.
idx = pd.IndexSlice

# Distinct values of the 'date' index level, in ascending order.
date_level = test_feature_data.index.get_level_values('date')
test_dates = sorted(date_level.unique())

In [9]:
# Per-date hold-out AUC, keyed by the position of the date in test_dates.
auc = {}

# roc_auc_score ranks samples by a continuous score. Hard class labels from
# predict() collapse the ROC curve to a single operating point, so prefer
# the positive-class probability whenever the model exposes predict_proba.
# NOTE(review): assumes a binary target whose positive class is the second
# column of predict_proba - confirm.
use_proba = hasattr(best_model, 'predict_proba')

for i, test_date in enumerate(test_dates):
    # All rows for this date, across every value of the first index level.
    test_data = test_feature_data.loc[idx[:, test_date], :]
    if use_proba:
        preds = best_model.predict_proba(test_data)[:, 1]
    else:
        # Fall back to the original behaviour for models without probabilities.
        preds = best_model.predict(test_data)
    auc[i] = roc_auc_score(y_true=test_target.loc[test_data.index], y_score=preds)

In [10]:
# Convert the {position: AUC} mapping into a Series for summarising and plotting.
auc = pd.Series(auc)

In [11]:
# Peek at the first few per-date AUC values.
auc.head()

In [13]:
# Average AUC across all hold-out dates; shown in the title and as a reference line.
mean_auc = auc.mean()

# Horizontal bars, one per test date, drawn in descending index order.
ax = (auc
      .sort_index(ascending=False)
      .plot.barh(figsize=(8, 4),
                 xlim=(.45, .55),
                 title=f'Test AUC: {mean_auc:.2%}'))

# Dashed vertical line marking the mean.
ax.axvline(mean_auc, c='k', lw=1, ls='--')
sns.despine()
plt.tight_layout()
plt.show()

#### Global Feature Importance Inspection

In [14]:
# Label the model's feature importances with the hold-out column names.
importances = pd.Series(best_model.feature_importances_, index=test_features)

# Keep the 25 largest values (ascending sort, so the biggest bar ends up on top).
top_importances = importances.sort_values().tail(25)
top_importances.plot.barh(figsize=(8, 5))

sns.despine()
plt.tight_layout()
plt.show()

### CV Train-Test Scores

In [15]:
# Tabulate the search's cv_results_ with one row per hyperparameter
# combination. The 'params' column (the per-row parameter dict) is dropped
# because the cells below select parameter columns with filter(like='param'),
# which would otherwise pick it up as well.
results = pd.DataFrame(gridsearch_result.cv_results_).drop('params', axis=1)

In [16]:
# Preview the first rows of the results table.
results.head()

### Getting Parameter Values & Mean Test Scores

In [17]:
# Parameter columns plus the mean test score, side by side.
param_cols = results.filter(like='param')
test_scores = param_cols.join(results[['mean_test_score']])

# Strip the first underscore-delimited token from every column name
# (e.g. 'mean_test_score' becomes 'test_score').
short_names = {col: '_'.join(col.split('_')[1:]) for col in test_scores.columns}
test_scores = test_scores.rename(columns=short_names)
test_scores.info()

In [18]:
# Every column name except the last one (the test score is the last column).
params = test_scores.columns[:-1].tolist()

In [19]:
# Reshape to long format: one row per (test score, parameter, value).
# Note: this cell rebinds test_scores, so re-running it on its own reshapes
# the already-long frame again; re-run the cell that builds test_scores first.
long_scores = test_scores.set_index('test_score').stack()
test_scores = long_scores.reset_index()
test_scores.columns = ['test_score', 'parameter', 'value']
test_scores.head()

In [20]:
# Column dtypes and non-null counts of the long-format frame.
test_scores.info()

In [21]:
def get_test_scores(df):
    """Select parameter values and mean test scores from a CV results frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with parameter columns (names containing 'param') and a
        'mean_test_score' column.

    Returns
    -------
    pandas.DataFrame
        The parameter columns followed by the score column, with the first
        underscore-delimited token removed from every column name
        (e.g. 'param_max_depth' -> 'max_depth',
        'mean_test_score' -> 'test_score').
    """
    # Take the score from the frame that was passed in rather than from the
    # notebook-level `results` variable, so the function works on any frame.
    data = df.filter(like='param').join(df[['mean_test_score']])
    return data.rename(columns={c: '_'.join(c.split('_')[1:]) for c in data.columns})

### Plotting Test Scores vs Parameter Settings

In [23]:
# Parameter values and scores, without the 'min_impurity_decrease' column.
scores_by_param = get_test_scores(results)
plot_data = scores_by_param.drop(columns='min_impurity_decrease')

# Everything except the trailing score column is a parameter to plot.
plot_params = plot_data.columns[:-1].tolist()
plot_data.info()

In [24]:
# 2 x 3 grid of panels, flattened so it can be indexed by position.
fig, grid = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))
axes = grid.flatten()

# One swarm plot of test scores per hyperparameter.
for position, param in enumerate(plot_params):
    sns.swarmplot(data=plot_data, x=param, y='test_score', ax=axes[position])

fig.suptitle('Mean Test Score Distribution by Hyperparameter', fontsize=14)
fig.tight_layout()
# Leave headroom for the figure-level title after tight_layout.
fig.subplots_adjust(top=.94)
fig.savefig('sklearn_cv_scores_by_param', dpi=300)
plt.show()

### Dummy-Encode Parameters

In [25]:
# Parameter values and scores for every grid point.
param_scores = get_test_scores(results)

# All columns but the trailing score are hyperparameters.
params = param_scores.columns[:-1].tolist()

# One indicator column per (parameter, value) pair; every level is kept.
data = pd.get_dummies(param_scores, columns=params, drop_first=False)
data.info()

### Building Regression Tree

In [26]:
# Shallow regression tree that explains the test score by the dummy-encoded
# hyperparameter settings.
# The split criterion and `min_impurity_split` are deliberately left at their
# defaults: the spellings criterion='mse' / min_impurity_split=None were
# removed in newer scikit-learn releases and raise there, while the defaults
# give the same squared-error tree on every version.
reg_tree = DecisionTreeRegressor(splitter='best',
                                 max_depth=4,
                                 min_samples_split=5,
                                 min_samples_leaf=10,
                                 min_weight_fraction_leaf=0.0,
                                 max_features=None,
                                 random_state=42,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0)

In [27]:
# Every column except the score is a predictor.
gbm_features = data.columns.drop('test_score')

# Fit the tree on the dummy-encoded parameters against the test score.
reg_tree.fit(X=data[gbm_features], y=data['test_score'])

In [28]:
# Importance of each dummy-encoded feature in the fitted tree.
reg_tree.feature_importances_

#### Tree Visualization

In [29]:
# Destination of the Graphviz description of the fitted tree.
out_file = 'results/gbm_sklearn_tree.dot'

# Write the tree to out_file in DOT format.
export_graphviz(reg_tree,
                out_file=out_file,
                feature_names=gbm_features,
                max_depth=4,
                filled=True,
                rounded=True,
                special_characters=True)

# Read the DOT source back from disk so it can be rendered inline.
dot_data = Path(out_file).read_text()

graphviz.Source(dot_data)

### Computing Feature Importance

In [30]:
# Regression tree without a depth limit, used to measure how much each
# hyperparameter setting contributes to the test score.
# The split criterion and `min_impurity_split` are deliberately left at their
# defaults: the spellings criterion='mse' / min_impurity_split=None were
# removed in newer scikit-learn releases and raise there, while the defaults
# give the same squared-error tree on every version.
reg_tree = DecisionTreeRegressor(splitter='best',
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_features=None,
                                 random_state=42,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0)

# Every column except the score is a predictor.
gbm_features = data.drop('test_score', axis=1).columns
reg_tree.fit(X=data[gbm_features], y=data.test_score)

In [31]:
# Feature importances of the fitted tree, largest first, labelled by dummy column.
gbm_fi = (pd.Series(reg_tree.feature_importances_,
                    index=gbm_features)
          .sort_values(ascending=False))

# Keep only the settings the tree actually used.
gbm_fi = gbm_fi[gbm_fi > 0]

# Relabel 'param_name_value' as 'param_name=value' by splitting on the last
# underscore. The labels are built without rebinding `idx`, which is the
# notebook-level pd.IndexSlice alias that the hold-out AUC cell relies on.
gbm_fi.index = [f'{name}={level}'
                for name, sep, level in (col.rpartition('_') for col in gbm_fi.index)]

gbm_fi.sort_values().plot.barh(figsize=(5, 5))
plt.title('Hyperparameter Importance')
sns.despine()
plt.tight_layout()
plt.show()

### Running Linear Regression

In [32]:
# Parameter values and scores for every grid point.
data = get_test_scores(results)
params = data.columns[:-1].tolist()

# Dummy-encode with the first level of each parameter dropped as the baseline.
# dtype=float is requested explicitly: recent pandas versions emit boolean
# indicator columns by default, and mixing them with the float constant
# produces an object-dtype design matrix that statsmodels rejects.
data = pd.get_dummies(data, columns=params, drop_first=True, dtype=float)

# Regress the test score on the indicators using HC3 robust standard errors.
model = OLS(endog=data.test_score, exog=add_constant(data.drop('test_score', axis=1))).fit(cov_type='HC3')
print(model.summary())