# Regression - HistGradientBoosting

## Notebook Setup

### Import Libraries

In [1]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import shap

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

Loading dotenv file


In [2]:
# Import Modeling Libraries
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
import neptune

In [3]:
# Ask the environment helper which environment we are in and fetch its variables
deepnote, env_vars = ef.load_env_vars()

# Publish every returned variable as a notebook-level global in one call
globals().update(env_vars)

# Outside of DeepNote the AWS credentials come from the environment file;
#   bundle them in one dict that is handed to all aws s3 functions.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

Loading dotenv file


In [4]:
# Pandas: silence chained-assignment checks, 2-decimal floats, no row/column truncation
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress the warning categories that clutter the cell outputs
import warnings
for _category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=_category)

# MapBox token for Plotly maps (None when the variable is not set)
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

# Scikit-learn: transformers return pandas objects
set_config(transform_output="pandas")

### Common Functions

In [5]:
def objective(space):
    """
    Hyperopt objective function for the HistGradientBoostingRegressor search.

    A regressor is built from the sampled hyperparameters and scored with a
    shuffled 5-fold cross validation (neg_mean_absolute_error). The fold MAEs,
    their mean and standard deviation, and the hyperparameters are sent to Neptune.

    Notebook globals used (must exist before calling):
      X_train_trans, y_train_trans : training data passed to cross_val_score
      run                          : the Neptune run the values are logged to

    Parameters:
      space (dict): sampled hyperparameters with the keys 'learning_rate',
        'max_iter', 'max_leaf_nodes', 'min_samples_leaf' and 'l2_regularization'.
        The dict is not modified.

    Returns:
      dict with 'loss' (mean MAE over the folds), 'status' (STATUS_OK) and
      'std' (standard deviation of the fold MAEs).
    """
    # Cast the quantised floats to int on a copy, so the caller's dict is left untouched
    params = dict(space)
    for int_param in ('max_iter', 'min_samples_leaf', 'max_leaf_nodes'):
        params[int_param] = int(params[int_param])

    # max_bins and max_depth are not part of the search and keep the estimator defaults
    model = HistGradientBoostingRegressor(
        learning_rate=params['learning_rate'],
        max_iter=params['max_iter'],
        # A sampled value of 0 is translated to None
        max_leaf_nodes=None if params['max_leaf_nodes'] == 0 else params['max_leaf_nodes'],
        min_samples_leaf=params['min_samples_leaf'],
        l2_regularization=params['l2_regularization'],
        early_stopping=False,
        loss='squared_error',
        random_state=42,
    )

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # The scorer returns negated MAE; flip the sign to get positive errors
    mae_scores = -cross_val_score(
        model, X_train_trans, y_train_trans, cv=kf, scoring='neg_mean_absolute_error'
    )
    mean_mae = mae_scores.mean()
    std_mae = mae_scores.std()

    # Emit model specific params to Neptune
    # NOTE(review): newer Neptune clients may prefer .append() over .log() - confirm
    for name in ('learning_rate', 'max_iter', 'max_leaf_nodes',
                 'min_samples_leaf', 'l2_regularization'):
        run[f"parameters/{name}"].log(params[name])

    # Emit standard params and metrics to Neptune
    run['parameters/all_parameters'].log(str(params))
    run['metrics/mae_scores'].log(str(mae_scores.tolist()))
    run["metrics/mean_mae"].log(mean_mae)
    run["metrics/std_mae"].log(std_mae)

    return {'loss': mean_mae, 'status': STATUS_OK, 'std': std_mae}

In [6]:
# Hyperparameter search space handed to hyperopt
# (max_depth and max_bins are currently left out of the search)
space = dict(
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    max_iter=hp.quniform('max_iter', 100, 500, 1),
    max_leaf_nodes=hp.quniform('max_leaf_nodes', 2, 1500, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 16, 128, 1),
    l2_regularization=hp.loguniform('l2_regularization', np.log(0.001), np.log(1)),
)

# Caribbean Region

## Import the Data

In [7]:
# Load the train / validation / holdout features and targets for the 'CARB' location
(
    X_train_trans, X_val_trans, X_holdout_trans,
    y_train_trans, y_val_trans, y_holdout_trans,
) = cf.import_data(location_name='CARB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [8]:
# Instantiate Neptune instance (objective() logs every trial to this run)
run = neptune.init_run(
    name="HistGradientBoosting Reg Caribbean",
    tags=["HistGradientBoosting", "regression", "hyperopt", "MAE", "Caribbean", "CARB"],
    description="HistGradientBoosting Hyperopt with MAE on Caribbean"
)

# Create the Trials object
CARB_trials = Trials()

try:
    # Run the TPE search: at most 1000 evaluations, with the
    # no_progress_loss(100) early-stop criterion
    CARB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = CARB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even when the search raises or is interrupted
    run.stop()

CARB_best_trial = CARB_trials.best_trial
CARB_best_hps = CARB_best_hyperparams.copy()

# hp.quniform yields floats; the model expects integers for these parameters
CARB_best_hps['max_iter'] = int(CARB_best_hps['max_iter'])
CARB_best_hps['min_samples_leaf'] = int(CARB_best_hps['min_samples_leaf'])
CARB_best_hps['max_leaf_nodes'] = int(CARB_best_hps['max_leaf_nodes'])



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-182
 25%|██▍       | 246/1000 [2:56:44<9:01:43, 43.11s/trial, best loss: 9.836539541369458] 
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 9 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 9 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-182/metadata


In [9]:
# run.stop()

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [10]:
# What's the best trial from the search? Report the mean CV MAE and the
# standard deviation of the fold MAEs (objective() stores mae_scores.std() under 'std')
print(f"Best Mean Absolute Error: {CARB_trials.best_trial['result']['loss']:.4f}")
print(f"Best MAE Standard Deviation: {CARB_trials.best_trial['result']['std']:.4f}")

Best Mean Absolute Error: 9.8365
Best Standard Error: 0.3220


In [11]:
# Display the best hyperparameters
CARB_best_hps

{'l2_regularization': 0.0067298442112987585,
 'learning_rate': 0.02189097914588318,
 'max_iter': 445,
 'max_leaf_nodes': 1028,
 'min_samples_leaf': 16}

## Recreate the model and get new MAE and Feature Importance

In [12]:
# Rebuild the model from the best hyperparameters and report validation + holdout metrics
CARB_model = cf.model_score(
    hps=CARB_best_hps,
    model_type='hgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# No feature importance plot here:
# HistGradientBoosting does not have feature importance vars

 
Validation Mean Absolute Error: 9.2763
Validation Mean Squared Error: 250.5776
Validation Root Mean Squared Error: 15.8296
Validation R^2 Score: 0.5913
 
Holdout Mean Absolute Error: 9.2632
Holdout Mean Squared Error: 253.7178
Holdout Root Mean Squared Error: 15.9285
Holdout R^2 Score: 0.5843


## Feature Ablation

In [13]:
%%time
# Feature ablation, pass 1
# Its results show which features are candidates for removal
(
    CARB_feature_ablation_df,
    CARB_baseline_mae_val,
    CARB_baseline_mae_train,
) = cf.feat_ablation(
    model=CARB_model,
    hps=CARB_best_hps,
    model_type='hgbm',
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

Baseline Mean MAE: 3.4767, Validation MAE: 9.2763
CPU times: total: 6min 8s
Wall time: 9min


In [14]:
# Ablation results ordered by the change in validation MAE, largest change first
CARB_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

Unnamed: 0,Removed_Feature,Train_MAE,Train_MAE_Change,Train_MAE_Pct_Change,Val_MAE,Val_MAE_Change,Val_MAE_Pct_Change
1,Cyclone_Frequency,3.49,-0.02,-0.47,9.22,0.06,0.64
17,TSA_cubed_SSTA_squared,3.5,-0.02,-0.56,9.23,0.05,0.55
15,TSA_Frequency,3.48,-0.01,-0.2,9.23,0.05,0.55
16,TSA_cubed_SSTA,3.49,-0.02,-0.46,9.24,0.04,0.39
11,SSTA_DHW,3.51,-0.03,-0.9,9.24,0.04,0.38
5,Fert_and_Turbidity_Interaction,3.5,-0.02,-0.54,9.24,0.03,0.35
10,SSTA,3.53,-0.05,-1.52,9.27,0.01,0.12
13,TSA,3.49,-0.01,-0.39,9.27,0.01,0.09
4,Distance_to_Shore,3.56,-0.09,-2.52,9.27,0.0,0.05
7,Fishing_Inter,3.5,-0.02,-0.7,9.28,0.0,0.01


In [15]:
%%time

# Features whose removal kept the validation MAE at or below the baseline,
# ordered by Val_MAE_Change (largest first)
CARB_abl_candidates = (
    CARB_feature_ablation_df
    .loc[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
)
CARB_feat_ablation_len = len(CARB_abl_candidates)

# Default outcome: nothing is dropped and every feature is kept
CARB_drop_cols = []

if CARB_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    CARB_abl_list_to_combo = CARB_abl_candidates.head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested
    CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = cf.feat_ablation(
        model=CARB_model, hps=CARB_best_hps, model_type='hgbm',
        Xtt=X_train_trans, ytt=y_train_trans,
        Xvt=X_val_trans, yvt=y_val_trans,
        Xht=X_holdout_trans, yht=y_holdout_trans,
        abl_list_to_combo=CARB_abl_list_to_combo)

    # The baseline is re-measured by the second run, so it is possible that no
    # row meets it any more; guard against indexing into an empty selection
    CARB_best_removals = (
        CARB_feature_ablation_df
        .loc[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val]
        .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
    )
    if len(CARB_best_removals) > 0:
        # The top entry may name several features (combinations), so split the string
        CARB_drop_cols = CARB_best_removals.iloc[0].split(', ')

# Create the feature columns list for saving out (all columns when nothing is dropped)
CARB_feat_columns = X_train_trans.drop(columns=CARB_drop_cols).columns.to_list()

if CARB_drop_cols:
    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    CARB_experiment_model = cf.model_score(
        CARB_best_hps, val=True, model_type='hgbm',
        Xtt=X_train_trans.drop(columns=CARB_drop_cols), ytt=y_train_trans,
        Xvt=X_val_trans.drop(columns=CARB_drop_cols), yvt=y_val_trans,
        Xht=X_holdout_trans.drop(columns=CARB_drop_cols), yht=y_holdout_trans)

    # No feature importance plot: HistGradBoosting does not support FeatImportance


Baseline Mean MAE: 3.4767, Validation MAE: 9.2763
 
Validation Mean Absolute Error: 9.1984
Validation Mean Squared Error: 249.0401
Validation Root Mean Squared Error: 15.7810
Validation R^2 Score: 0.5938


ValueError: HistGradientBoostingRegressor does not support Feature Importance

### Output the Model, Trials, Parameters and Feature List to disk

In [16]:
# Write out the model artifacts to disk
# RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)
# NOTE(review): CARB_model was fit on every column of X_train_trans, while
#   CARB_feat_columns leaves out CARB_drop_cols when ablation dropped columns
#   (the reduced-column fit is CARB_experiment_model) - confirm which model
#   should be saved together with this feature list.
# cf.write_out(model=CARB_model, trials=CARB_trials, params=CARB_best_hps, feat_cols=CARB_feat_columns, model_family='histgradboost_reg', location_name='CARB')

## Review Validation Truth vs Predictions

In [17]:
# Predict the validation set and put truth, prediction and their difference side by side
CARB_val_pred = CARB_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=CARB_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [18]:
# 20 rows with the largest diff (truth minus prediction)
eval_df.sort_values('diff', ascending=False).iloc[:20]

Unnamed: 0,y_val,predictions,diff
5945,95.0,7.66,87.34
49424,75.0,-4.07,79.07
60718,86.96,8.14,78.82
54735,90.09,13.88,76.21
3147,100.0,25.3,74.7
60719,93.46,19.59,73.86
56053,75.0,2.71,72.29
6240,83.2,13.27,69.93
48455,97.5,28.03,69.47
54514,75.0,5.76,69.24


In [19]:
# eval_df is built from the validation split (y_val_trans / X_val_trans),
# so the title refers to validation rather than holdout
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between validation truth and prediction where true value > 5',
)

# Show the plot
fig.show()

In [20]:
# Uncomment below to explore the error of our predictions
# (random_state is fixed so the displayed sample is the same on every run)

#eval_df[(eval_df['diff'].between(-5,5))].sample(10, random_state=42)
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)].sample(10, random_state=42)
eval_df[~(eval_df['diff'].between(-5,5))].sample(10, random_state=42)

Unnamed: 0,y_val,predictions,diff
47806,1.25,35.7,-34.45
5466,25.0,15.88,9.12
56269,25.0,1.21,23.79
5197,1.67,22.55,-20.88
52966,75.0,9.97,65.03
6060,50.0,36.63,13.37
5061,50.0,17.99,32.01
5841,0.0,15.96,-15.96
60046,1.84,7.54,-5.69
57240,60.0,47.8,12.2


## SHAP Analysis

### Create SHAP Objects

In [21]:
# Uncomment to re-run SHAP analysis
# Note: THIS CELL TAKES A LONG TIME TO RUN. RUN AT YOUR OWN RISK. RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)

# %%time
# CARB_X_val_trans = X_val_trans.copy(deep=True)
# CARB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(CARB_model)
# shap_values = explainer(CARB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(CARB_X_val_trans)
# sv = explainer.shap_values(CARB_X_val_trans)
# shap.initjs()

In [22]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [23]:
#investigate = CARB_X_val_trans.index.get_loc(5809)
# print(CARB_y_val_trans.iloc[investigate])

In [24]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [25]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(CARB_X_val_trans)[investigate], CARB_X_val_trans, feature_display_range=slice(-1,-51,-1))

# South East Asia and Australia Region

## Import the Data

In [26]:
# Load the train / validation / holdout features and targets for the 'SEAA' location
(
    X_train_trans, X_val_trans, X_holdout_trans,
    y_train_trans, y_val_trans, y_holdout_trans,
) = cf.import_data(location_name='SEAA')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [27]:
# Instantiate Neptune instance (objective() logs every trial to this run)
run = neptune.init_run(
    name="HistGradientBoosting Reg South East Asia and Australia",
    tags=["HistGradientBoosting", "regression", "hyperopt", "MAE", "South East Asia and Australia", "SEAA"],
    description="HistGradientBoosting Hyperopt with MAE on South East Asia and Australia"
)

# Create the Trials object
SEAA_trials = Trials()

try:
    # Run the TPE search: at most 1000 evaluations, with the
    # no_progress_loss(100) early-stop criterion
    SEAA_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = SEAA_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even when the search raises or is interrupted
    run.stop()

SEAA_best_trial = SEAA_trials.best_trial
SEAA_best_hps = SEAA_best_hyperparams.copy()

# hp.quniform yields floats; the model expects integers for these parameters
SEAA_best_hps['max_iter'] = int(SEAA_best_hps['max_iter'])
SEAA_best_hps['min_samples_leaf'] = int(SEAA_best_hps['min_samples_leaf'])
SEAA_best_hps['max_leaf_nodes'] = int(SEAA_best_hps['max_leaf_nodes'])

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-187
 12%|█▏        | 123/1000 [1:00:08<7:08:46, 29.33s/trial, best loss: 8.134234962998857]
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 9 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 9 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-187/metadata


### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [28]:
# What's the best trial from the search? Report the mean CV MAE and the
# standard deviation of the fold MAEs (objective() stores mae_scores.std() under 'std')
print(f"Best Mean Absolute Error: {SEAA_trials.best_trial['result']['loss']:.4f}")
print(f"Best MAE Standard Deviation: {SEAA_trials.best_trial['result']['std']:.4f}")

Best Mean Absolute Error: 8.1342
Best Standard Error: 0.1168


In [29]:
# Display the best hyperparameters
SEAA_best_hps

{'l2_regularization': 0.07568293170337922,
 'learning_rate': 0.022197274276110148,
 'max_iter': 230,
 'max_leaf_nodes': 361,
 'min_samples_leaf': 19}

## Recreate the model and get new MAE and Feature Importance

In [30]:
# Rebuild the model from the best hyperparameters and report validation + holdout metrics
SEAA_model = cf.model_score(
    hps=SEAA_best_hps,
    model_type='hgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# No feature importance plot here:
# HistGradientBoosting does not have feature importance vars

 
Validation Mean Absolute Error: 8.0909
Validation Mean Squared Error: 235.5992
Validation Root Mean Squared Error: 15.3492
Validation R^2 Score: 0.2831
 
Holdout Mean Absolute Error: 7.4783
Holdout Mean Squared Error: 192.2259
Holdout Root Mean Squared Error: 13.8646
Holdout R^2 Score: 0.3574


## Feature Ablation

In [31]:
%%time
# Feature ablation, pass 1
# Its results show which features are candidates for removal
(
    SEAA_feature_ablation_df,
    SEAA_baseline_mae_val,
    SEAA_baseline_mae_train,
) = cf.feat_ablation(
    model=SEAA_model,
    hps=SEAA_best_hps,
    model_type='hgbm',
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

Baseline Mean MAE: 4.7307, Validation MAE: 8.0909
CPU times: total: 2min 54s
Wall time: 4min 4s


In [32]:
# Ablation results ordered by the change in validation MAE, largest change first
SEAA_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

Unnamed: 0,Removed_Feature,Train_MAE,Train_MAE_Change,Train_MAE_Pct_Change,Val_MAE,Val_MAE_Change,Val_MAE_Pct_Change
10,SSTA,4.75,-0.02,-0.48,8.08,0.01,0.13
2,Depth_m,5.03,-0.3,-6.37,8.09,0.0,0.01
0,ClimSST,4.78,-0.05,-0.97,8.09,-0.0,-0.04
17,TSA_cubed_SSTA_squared,4.75,-0.02,-0.34,8.1,-0.01,-0.08
11,SSTA_DHW,4.76,-0.03,-0.63,8.1,-0.01,-0.1
5,Fert_and_Turbidity_Interaction,4.75,-0.02,-0.47,8.1,-0.01,-0.13
13,TSA,4.75,-0.02,-0.49,8.11,-0.02,-0.26
4,Distance_to_Shore,4.77,-0.04,-0.84,8.12,-0.02,-0.3
18,Turbidity,4.75,-0.02,-0.47,8.12,-0.02,-0.3
3,Dist_to_Shore_and_Turbidity_Interaction,4.77,-0.04,-0.83,8.12,-0.02,-0.3


In [33]:
%%time

# Features whose removal kept the validation MAE at or below the baseline,
# ordered by Val_MAE_Change (largest first)
SEAA_abl_candidates = (
    SEAA_feature_ablation_df
    .loc[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
)
SEAA_feat_ablation_len = len(SEAA_abl_candidates)

# Default outcome: nothing is dropped and every feature is kept
SEAA_drop_cols = []

if SEAA_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    SEAA_abl_list_to_combo = SEAA_abl_candidates.head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested
    SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = cf.feat_ablation(
        model=SEAA_model, hps=SEAA_best_hps, model_type='hgbm',
        Xtt=X_train_trans, ytt=y_train_trans,
        Xvt=X_val_trans, yvt=y_val_trans,
        Xht=X_holdout_trans, yht=y_holdout_trans,
        abl_list_to_combo=SEAA_abl_list_to_combo)

    # The baseline is re-measured by the second run, so it is possible that no
    # row meets it any more; guard against indexing into an empty selection
    SEAA_best_removals = (
        SEAA_feature_ablation_df
        .loc[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val]
        .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
    )
    if len(SEAA_best_removals) > 0:
        # The top entry may name several features (combinations), so split the string
        SEAA_drop_cols = SEAA_best_removals.iloc[0].split(', ')

# Create the feature columns list for saving out (all columns when nothing is dropped)
SEAA_feat_columns = X_train_trans.drop(columns=SEAA_drop_cols).columns.to_list()

if SEAA_drop_cols:
    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    SEAA_experiment_model = cf.model_score(
        SEAA_best_hps, val=True, model_type='hgbm',
        Xtt=X_train_trans.drop(columns=SEAA_drop_cols), ytt=y_train_trans,
        Xvt=X_val_trans.drop(columns=SEAA_drop_cols), yvt=y_val_trans,
        Xht=X_holdout_trans.drop(columns=SEAA_drop_cols), yht=y_holdout_trans)

    # No feature importance plot: HistGradBoosting does not support FeatImportance



Baseline Mean MAE: 4.7307, Validation MAE: 8.0909
 
Validation Mean Absolute Error: 8.0800
Validation Mean Squared Error: 235.2955
Validation Root Mean Squared Error: 15.3393
Validation R^2 Score: 0.2840
CPU times: total: 3min 24s
Wall time: 4min 30s


### Output the Model, Trials, Parameters and Feature List to disk

In [34]:
# Write out the model artifacts to disk
# RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)
# NOTE(review): SEAA_model was fit on every column of X_train_trans, while
#   SEAA_feat_columns leaves out SEAA_drop_cols when ablation dropped columns
#   (the reduced-column fit is SEAA_experiment_model) - confirm which model
#   should be saved together with this feature list.

# cf.write_out(model=SEAA_model, trials=SEAA_trials, params=SEAA_best_hps, feat_cols=SEAA_feat_columns, model_family='histgradboost_reg', location_name='SEAA')

## Review Validation Truth vs Predictions

In [35]:
# Predict the validation set and put truth, prediction and their difference side by side
SEAA_val_pred = SEAA_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=SEAA_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [36]:
# 20 rows with the largest diff (truth minus prediction)
eval_df.sort_values('diff', ascending=False).iloc[:20]

Unnamed: 0,y_val,predictions,diff
33637,100.0,2.46,97.54
28560,95.0,1.58,93.42
20027,100.0,9.4,90.6
27809,95.0,4.59,90.41
38375,100.0,10.96,89.04
27555,100.0,13.94,86.06
16252,85.0,0.72,84.28
25806,88.75,5.42,83.33
11564,85.0,7.24,77.76
27813,80.0,2.3,77.7


In [37]:
# eval_df is built from the validation split (y_val_trans / X_val_trans),
# so the title refers to validation rather than holdout
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between validation truth and prediction where true value > 5',
)

# Show the plot
fig.show()

In [38]:
# Uncomment below to explore the error of our predictions
# (random_state is fixed so the displayed sample is the same on every run)

#eval_df[(eval_df['diff'].between(-5,5))].sample(10, random_state=42)
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)].sample(10, random_state=42)
eval_df[~(eval_df['diff'].between(-5,5))].sample(10, random_state=42)

Unnamed: 0,y_val,predictions,diff
25441,0.0,11.97,-11.97
13305,0.0,6.43,-6.43
27473,0.0,7.24,-7.24
40323,50.0,3.53,46.47
4663,2.0,8.5,-6.5
10592,17.5,3.33,14.17
39936,1.0,25.76,-24.76
18477,10.0,2.36,7.64
25197,0.0,21.59,-21.59
7439,100.0,68.35,31.65


## SHAP Analysis

### Create SHAP Objects

In [39]:
# Uncomment to re-run SHAP analysis
# Note: THIS CELL TAKES A LONG TIME TO RUN. RUN AT YOUR OWN RISK. RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)

#%%time
# SEAA_X_val_trans = X_val_trans.copy(deep=True)
# SEAA_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(SEAA_model)
# shap_values = explainer(SEAA_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(SEAA_X_val_trans)
# sv = explainer.shap_values(SEAA_X_val_trans)
# shap.initjs()

CPU times: total: 0 ns
Wall time: 0 ns


In [40]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [41]:
#investigate = SEAA_X_val_trans.index.get_loc(5809)
# print(SEAA_y_val_trans.iloc[investigate])

In [42]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [43]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(SEAA_X_val_trans)[investigate], SEAA_X_val_trans, feature_display_range=slice(-1,-51,-1))

# Global Region

## Import the Data

In [44]:
# Load the train / validation / holdout features and targets for the 'GLOB' location
(
    X_train_trans, X_val_trans, X_holdout_trans,
    y_train_trans, y_val_trans, y_holdout_trans,
) = cf.import_data(location_name='GLOB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [45]:
# Instantiate Neptune instance (objective() logs every trial to this run)
run = neptune.init_run(
    name="HistGradientBoosting Reg Global",
    tags=["HistGradientBoosting", "regression", "hyperopt", "MAE", "Global", "GLOB"],
    description="HistGradientBoosting Hyperopt with MAE on Global"
)

# Create the Trials object
GLOB_trials = Trials()

try:
    # Run the TPE search: at most 1000 evaluations, with the
    # no_progress_loss(100) early-stop criterion
    GLOB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = GLOB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even when the search raises or is interrupted
    run.stop()

GLOB_best_trial = GLOB_trials.best_trial
GLOB_best_hps = GLOB_best_hyperparams.copy()

# hp.quniform yields floats; the model expects integers for these parameters
GLOB_best_hps['max_iter'] = int(GLOB_best_hps['max_iter'])
GLOB_best_hps['min_samples_leaf'] = int(GLOB_best_hps['min_samples_leaf'])
GLOB_best_hps['max_leaf_nodes'] = int(GLOB_best_hps['max_leaf_nodes'])

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-190
 16%|█▋        | 163/1000 [3:24:29<17:30:01, 75.27s/trial, best loss: 8.986974619550029] 
[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 9 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 9 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-190/metadata


### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [46]:
# What's the best trial from the search? Report the mean CV MAE and the
# standard deviation of the fold MAEs (objective() stores mae_scores.std() under 'std')
print(f"Best Mean Absolute Error: {GLOB_trials.best_trial['result']['loss']:.4f}")
print(f"Best MAE Standard Deviation: {GLOB_trials.best_trial['result']['std']:.4f}")

Best Mean Absolute Error: 8.9870
Best Standard Error: 0.2859


In [47]:
# Display the best hyperparameters
GLOB_best_hps

{'l2_regularization': 0.9964032200290492,
 'learning_rate': 0.04111638407562157,
 'max_iter': 248,
 'max_leaf_nodes': 1438,
 'min_samples_leaf': 17}

## Recreate the model and get new MAE and Feature Importance

In [48]:
# Rebuild the model from the best hyperparameters and report validation + holdout metrics
GLOB_model = cf.model_score(
    hps=GLOB_best_hps,
    model_type='hgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# No feature importance plot here:
# HistGradientBoosting does not have feature importance vars

 
Validation Mean Absolute Error: 8.9492
Validation Mean Squared Error: 236.6226
Validation Root Mean Squared Error: 15.3825
Validation R^2 Score: 0.4915
 
Holdout Mean Absolute Error: 9.0904
Holdout Mean Squared Error: 239.9994
Holdout Root Mean Squared Error: 15.4919
Holdout R^2 Score: 0.5065


## Feature Ablation

In [49]:
%%time
# Feature ablation, pass 1
# Its results show which features are candidates for removal
(
    GLOB_feature_ablation_df,
    GLOB_baseline_mae_val,
    GLOB_baseline_mae_train,
) = cf.feat_ablation(
    model=GLOB_model,
    hps=GLOB_best_hps,
    model_type='hgbm',
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

Baseline Mean MAE: 6.1032, Validation MAE: 8.9916
CPU times: total: 2min 28s
Wall time: 3min 53s


In [50]:
# Ablation results ordered by the change in validation MAE, largest change first
GLOB_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

Unnamed: 0,Removed_Feature,Train_MAE,Train_MAE_Change,Train_MAE_Pct_Change,Val_MAE,Val_MAE_Change,Val_MAE_Pct_Change
10,SSTA,5.87,0.23,3.79,8.89,0.1,1.08
13,TSA,6.04,0.07,1.11,8.93,0.07,0.73
15,TSA_Frequency,6.1,0.01,0.08,8.95,0.04,0.47
16,TSA_cubed_SSTA,5.92,0.19,3.05,8.95,0.04,0.46
11,SSTA_DHW,5.86,0.24,3.93,8.95,0.04,0.41
17,TSA_cubed_SSTA_squared,5.88,0.23,3.74,8.96,0.04,0.4
1,Cyclone_Frequency,5.92,0.18,2.96,8.97,0.02,0.26
3,Dist_to_Shore_and_Turbidity_Interaction,6.04,0.06,1.02,8.97,0.02,0.25
18,Turbidity,6.07,0.04,0.62,8.99,0.0,0.0
4,Distance_to_Shore,5.88,0.23,3.73,9.0,-0.01,-0.09


In [51]:
%%time

# Features whose removal kept the validation MAE at or below the baseline,
# ordered by Val_MAE_Change (largest first)
GLOB_abl_candidates = (
    GLOB_feature_ablation_df
    .loc[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
)
GLOB_feat_ablation_len = len(GLOB_abl_candidates)

# Default outcome: nothing is dropped and every feature is kept
GLOB_drop_cols = []

if GLOB_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    GLOB_abl_list_to_combo = GLOB_abl_candidates.head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested
    GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = cf.feat_ablation(
        model=GLOB_model, hps=GLOB_best_hps, model_type='hgbm',
        Xtt=X_train_trans, ytt=y_train_trans,
        Xvt=X_val_trans, yvt=y_val_trans,
        Xht=X_holdout_trans, yht=y_holdout_trans,
        abl_list_to_combo=GLOB_abl_list_to_combo)

    # The baseline is re-measured by the second run, so it is possible that no
    # row meets it any more; guard against indexing into an empty selection
    GLOB_best_removals = (
        GLOB_feature_ablation_df
        .loc[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val]
        .sort_values('Val_MAE_Change', ascending=False)['Removed_Feature']
    )
    if len(GLOB_best_removals) > 0:
        # The top entry may name several features (combinations), so split the string
        GLOB_drop_cols = GLOB_best_removals.iloc[0].split(', ')

# Create the feature columns list for saving out (all columns when nothing is dropped)
GLOB_feat_columns = X_train_trans.drop(columns=GLOB_drop_cols).columns.to_list()

if GLOB_drop_cols:
    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    GLOB_experiment_model = cf.model_score(
        GLOB_best_hps, val=True, model_type='hgbm',
        Xtt=X_train_trans.drop(columns=GLOB_drop_cols), ytt=y_train_trans,
        Xvt=X_val_trans.drop(columns=GLOB_drop_cols), yvt=y_val_trans,
        Xht=X_holdout_trans.drop(columns=GLOB_drop_cols), yht=y_holdout_trans)

    # No feature importance plot: HistGradBoosting does not support FeatImportance



Baseline Mean MAE: 5.9975, Validation MAE: 9.0140
 
Validation Mean Absolute Error: 8.9343
Validation Mean Squared Error: 237.3922
Validation Root Mean Squared Error: 15.4075
Validation R^2 Score: 0.4899
CPU times: total: 6min 24s
Wall time: 9min 56s


### Output the Model, Trials, Parameters and Feature List to disk

In [52]:
# Write out the model artifacts to disk
# RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)
# NOTE(review): GLOB_model was fit on every column of X_train_trans, while
#   GLOB_feat_columns leaves out GLOB_drop_cols when ablation dropped columns
#   (the reduced-column fit is GLOB_experiment_model) - confirm which model
#   should be saved together with this feature list.

# cf.write_out(model=GLOB_model, trials=GLOB_trials, params=GLOB_best_hps, feat_cols=GLOB_feat_columns, model_family='histgradboost_reg', location_name='GLOB')

## Review Validation Truth vs Predictions

In [53]:
# Predict the validation set and put truth, prediction and their difference side by side
GLOB_val_pred = GLOB_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=GLOB_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [54]:
# 20 rows with the largest diff (truth minus prediction)
eval_df.sort_values('diff', ascending=False).iloc[:20]

Unnamed: 0,y_val,predictions,diff
33637,100.0,4.18,95.82
5696,98.0,8.07,89.93
6844,93.0,5.21,87.79
27617,100.0,14.88,85.12
42188,90.0,7.21,82.79
58672,83.33,1.51,81.82
5891,87.0,5.5,81.5
50522,100.0,20.16,79.84
2640,90.0,10.67,79.33
11339,100.0,21.73,78.27


In [55]:
# eval_df is built from the validation split (y_val_trans / X_val_trans),
# so the title refers to validation rather than holdout
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between validation truth and prediction where true value > 5',
)

# Show the plot
fig.show()

In [56]:
# Uncomment below to explore the error of our predictions
# (random_state is fixed so the displayed sample is the same on every run)

#eval_df[(eval_df['diff'].between(-5,5))].sample(10, random_state=42)
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)].sample(10, random_state=42)
eval_df[~(eval_df['diff'].between(-5,5))].sample(10, random_state=42)

Unnamed: 0,y_val,predictions,diff
56747,75.0,50.17,24.83
33817,15.5,9.07,6.43
25737,15.0,34.66,-19.66
38835,0.0,31.11,-31.11
40341,0.0,5.84,-5.84
5912,33.33,14.08,19.25
46622,0.0,12.58,-12.58
57145,10.0,20.04,-10.04
32920,44.25,19.68,24.57
57259,36.36,28.03,8.33


## SHAP Analysis

### Create SHAP Objects

In [57]:
# Uncomment to re-run SHAP analysis
# Note: THIS CELL TAKES A LONG TIME TO RUN. RUN AT YOUR OWN RISK. RUN WITH YOUR OWN CREDENTIALS (DEEPNOTE ACCT/S3 BUCKET)

# %%time
# GLOB_X_val_trans = X_val_trans.copy(deep=True)
# GLOB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(GLOB_model)
# shap_values = explainer(GLOB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(GLOB_X_val_trans)
# sv = explainer.shap_values(GLOB_X_val_trans)
# shap.initjs()

In [58]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [59]:
#investigate = GLOB_X_val_trans.index.get_loc(5809)
# print(GLOB_y_val_trans.iloc[investigate])

In [60]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [61]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(GLOB_X_val_trans)[investigate], GLOB_X_val_trans, feature_display_range=slice(-1,-51,-1))