# Regression - RandomForest

## Notebook Setup

### Import Libraries

In [1]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import shap

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

Loading dotenv file


In [2]:
# Import Modeling Libraries
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
import neptune

In [3]:
# Resolve which environment we are running in and fetch its variables
deepnote, env_vars = ef.load_env_vars()

# Publish every returned variable as a notebook-level global
globals().update(env_vars)

# Outside of DeepNote the AWS credentials come from the environment file.
#   Bundle them into a single dict (per the original note, this dict is
#   what gets handed to the aws s3 functions).
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

Loading dotenv file


In [4]:
# Pandas options: silence chained-assignment checks, 2-decimal floats,
#   and no truncation of displayed rows / columns
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress the warning categories that flood the cell outputs
import warnings
for ignored_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=ignored_category)

# MapBox token for Plotly maps (None when MAPBOX_TOKEN is not set)
px.set_mapbox_access_token(os.environ.get("MAPBOX_TOKEN"))

# Scikit-learn: have transformers emit pandas output
set_config(transform_output="pandas")

### Common Functions

In [5]:
def objective(space):
    """
    Hyperopt objective function for the RandomForest regressor.

    Builds a RandomForestRegressor from the sampled hyperparameters, scores it
    with 5-fold shuffled cross-validation using mean absolute error, logs the
    parameters and metrics to Neptune, and returns the result for the Trials
    object.

    Relies on notebook globals that must exist before the search is started:
      - X_train_trans, y_train_trans: the training data of the current region
      - run: an active Neptune run

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with the keys 'n_estimators', 'max_features',
        'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf'
        and 'max_leaf_nodes'. The dict is not modified.

    Returns
    -------
    dict
        'loss'   : mean of the per-fold MAE scores
        'status' : STATUS_OK
        'std'    : standard deviation of the per-fold MAE scores
    """
    # Work on a copy so the caller's dict is never mutated
    params = dict(space)

    # These parameters are cast to int before being handed to scikit-learn
    for int_param in ('n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes'):
        params[int_param] = int(params[int_param])

    model = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_features=params['max_features'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
        # A value of 0 is treated as "no limit on leaf nodes"
        max_leaf_nodes=None if params['max_leaf_nodes'] == 0 else params['max_leaf_nodes'],
        max_depth=None,
        bootstrap=True,
        criterion='squared_error',
        n_jobs=-1,
        random_state=42
    )

    # The scorer returns negated MAE, so flip the sign to get positive errors
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = -cross_val_score(model, X_train_trans, y_train_trans, cv=kf, scoring='neg_mean_absolute_error')
    mean_mae = mae_scores.mean()
    std_mae = mae_scores.std()

    # Emit model specific params and metrics to Neptune
    run["parameters/n_estimators"].log(params['n_estimators'])
    run["parameters/min_samples_split"].log(params['min_samples_split'])
    run["parameters/min_samples_leaf"].log(params['min_samples_leaf'])
    run["parameters/min_weight_fraction_leaf"].log(params['min_weight_fraction_leaf'])
    run["parameters/max_leaf_nodes"].log(params['max_leaf_nodes'])

    # Emit standard params and metrics to Neptune
    run['parameters/all_parameters'].log(str(params))
    run['metrics/mae_scores'].log(str(mae_scores.tolist()))
    run["metrics/mean_mae"].log(mean_mae)
    run["metrics/std_mae"].log(std_mae)

    return {'loss': mean_mae, 'status': STATUS_OK, 'std': std_mae}

In [6]:
# Hyperparameter search space sampled by Hyperopt
#   - the quniform parameters are cast to int inside objective()
#   - the order of the max_features options must match the list used to
#     decode the hp.choice index after each search
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 2500, 1),
    max_features=hp.choice('max_features', ['sqrt', 'log2', None]),
    min_samples_leaf=hp.quniform('min_samples_leaf', 3, 24, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 12, 1),
    min_weight_fraction_leaf=hp.uniform('min_weight_fraction_leaf', 0.0, 0.5),
    max_leaf_nodes=hp.quniform('max_leaf_nodes', 2, 1500, 1),
)

# Caribbean Region

## Import the Data

In [7]:
# Load the Caribbean splits. objective() reads X_train_trans / y_train_trans
#   as globals, so this cell must run before the Caribbean search.
(
    X_train_trans,
    X_val_trans,
    X_holdout_trans,
    y_train_trans,
    y_val_trans,
    y_holdout_trans,
) = cf.import_data(location_name='CARB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [8]:
# Instantiate Neptune instance
# (objective() optimises mean absolute error, so the run is labelled MAE)
run = neptune.init_run(
    name="RandomForest Reg Caribbean",
    tags=["RandomForest", "regression", "hyperopt", "MAE", "Caribbean", "CARB"],
    description="RandomForest Hyperopt with MAE on Caribbean"
)

# Create the Trials object
CARB_trials = Trials()

try:
    # Run the search: up to 1000 evaluations, stopping early after
    #   100 evaluations without an improvement in the loss
    CARB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = CARB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even if the search is interrupted or fails
    run.stop()

CARB_best_trial = CARB_trials.best_trial
CARB_best_hps = CARB_best_hyperparams.copy()

# max_features comes back as an index into the hp.choice option list;
#   the remaining tuned integer parameters are cast to int
CARB_best_hps['max_features'] = ['sqrt', 'log2', None][CARB_best_hps['max_features']]
CARB_best_hps['n_estimators'] = int(CARB_best_hps['n_estimators'])
CARB_best_hps['min_samples_split'] = int(CARB_best_hps['min_samples_split'])
CARB_best_hps['min_samples_leaf'] = int(CARB_best_hps['min_samples_leaf'])
CARB_best_hps['max_leaf_nodes'] = int(CARB_best_hps['max_leaf_nodes'])



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-173
  5%|▍         | 47/1000 [03:08<1:03:42,  4.01s/trial, best loss: 12.432571646565716]


KeyboardInterrupt: 

In [9]:
# Safety net: make sure the Neptune run is closed in case the search cell above
#   exited (e.g. was interrupted) before reaching its own run.stop()
run.stop()

[neptune] [info   ] Shutting down background jobs, please wait a moment...
[neptune] [info   ] Done!
[neptune] [info   ] All 0 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/mads-capstone/capstone/e/CAP-173/metadata


### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE and 'std' is the standard deviation
#   of the per-fold MAE scores, as returned by objective()
print(f"Best Mean Absolute Error: {CARB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {CARB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (max_features already decoded from its
#   hp.choice index and the integer parameters already cast by the search cell)
CARB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters
#   (val / holdout flags presumably enable scoring on those splits; see cf.model_score)
CARB_model = cf.model_score(
    hps=CARB_best_hps,
    model_type='rf',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# Feature importance of the rebuilt model
cf.plot_feat_importance(CARB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = cf.feat_ablation(
    model=CARB_model, hps=CARB_best_hps, model_type='rf',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Feature ablation results, ordered by the change in validation MAE (largest first)
CARB_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

In [None]:
%%time

CARB_feat_ablation_len = len(CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'])

if CARB_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    CARB_abl_list_to_combo = CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested 
    CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = cf.feat_ablation(
        model=CARB_model, hps=CARB_best_hps, model_type='rf',
        Xtt=X_train_trans, ytt=y_train_trans, 
        Xvt=X_val_trans, yvt=y_val_trans, 
        Xht=X_holdout_trans, yht=y_holdout_trans, 
        abl_list_to_combo=CARB_abl_list_to_combo)

    # Show the top 1 feature that decreased the MAE in the second ablation run, which we'll remove.  
    # Might be multiple features due to itertools combinations, so we split the string
    CARB_drop_cols = CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(1).values[0].split(', ')

    # Create the feature columns list for saving out
    CARB_feat_columns = X_train_trans.drop(columns=CARB_drop_cols).columns.to_list()

    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    CARB_experiment_model = cf.model_score(
        CARB_best_hps, val=True, model_type='rf',
        Xtt=X_train_trans.drop(columns=CARB_drop_cols), ytt=y_train_trans, 
        Xvt=X_val_trans.drop(columns=CARB_drop_cols), yvt=y_val_trans, 
        Xht=X_holdout_trans.drop(columns=CARB_drop_cols), yht=y_holdout_trans)

    # Plot the feature importance for this model
    cf.plot_feat_importance(CARB_experiment_model)

else:
    CARB_feat_columns = X_train_trans.columns.to_list()



### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Persist the model, trials, parameters and feature list
# NOTE(review): this passes CARB_model (built from the full X_train_trans) while
#   feat_cols may exclude ablated columns; CARB_experiment_model is the one built
#   on the reduced set. Confirm which model is meant to be saved.
cf.write_out(
    model=CARB_model,
    trials=CARB_trials,
    params=CARB_best_hps,
    feat_cols=CARB_feat_columns,
    model_family='randomforest_reg',
    location_name='CARB',
)

## Review Validation Truth vs Predictions

In [None]:
# Validation truth next to the model's predictions; diff = truth - prediction
eval_df = y_val_trans.copy(deep=True)
CARB_val_pred = CARB_model.predict(X_val_trans)
eval_df = eval_df.assign(predictions=CARB_val_pred.tolist())
eval_df = eval_df.assign(diff=eval_df['y_val'] - eval_df['predictions'])

In [None]:
# The 20 rows with the largest diff (y_val minus prediction)
(
    eval_df
    .sort_values(by='diff', ascending=False)
    .head(20)
)

In [None]:
# Distribution of the diff on the validation split, restricted to rows whose
#   true value exceeds 5 (eval_df is built from y_val_trans / X_val_trans)
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between prediction and validation where true value > 5',
)

# Show the plot
fig.show()

In [None]:
# Rows whose diff falls outside the inclusive range [-5, 5]
# Alternative views kept for reference:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df.loc[~eval_df['diff'].between(-5, 5)]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# CARB_X_val_trans = X_val_trans.copy(deep=True)
# CARB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(CARB_model)
# shap_values = explainer(CARB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(CARB_X_val_trans)
# sv = explainer.shap_values(CARB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
# investigate = CARB_X_val_trans.index.get_loc(5466)
# print(CARB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(CARB_X_val_trans)[investigate], CARB_X_val_trans, feature_display_range=slice(-1,-51,-1))

# South East Asia and Australia Region

## Import the Data

In [None]:
# Load the South East Asia and Australia splits. objective() reads
#   X_train_trans / y_train_trans as globals, so this cell must run before
#   the SEAA search.
(
    X_train_trans,
    X_val_trans,
    X_holdout_trans,
    y_train_trans,
    y_val_trans,
    y_holdout_trans,
) = cf.import_data(location_name='SEAA')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# (objective() optimises mean absolute error, so the run is labelled MAE)
run = neptune.init_run(
    name="RandomForest Reg South East Asia and Australia",
    tags=["RandomForest", "regression", "hyperopt", "MAE", "South East Asia and Australia", "SEAA"],
    description="RandomForest Hyperopt with MAE on South East Asia and Australia"
)

# Create the Trials object
SEAA_trials = Trials()

try:
    # Run the search: up to 1000 evaluations, stopping early after
    #   100 evaluations without an improvement in the loss
    SEAA_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = SEAA_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even if the search is interrupted or fails
    run.stop()

SEAA_best_trial = SEAA_trials.best_trial
SEAA_best_hps = SEAA_best_hyperparams.copy()

# max_features comes back as an index into the hp.choice option list;
#   the remaining tuned integer parameters are cast to int
SEAA_best_hps['max_features'] = ['sqrt', 'log2', None][SEAA_best_hps['max_features']]
SEAA_best_hps['n_estimators'] = int(SEAA_best_hps['n_estimators'])
SEAA_best_hps['min_samples_split'] = int(SEAA_best_hps['min_samples_split'])
SEAA_best_hps['min_samples_leaf'] = int(SEAA_best_hps['min_samples_leaf'])
SEAA_best_hps['max_leaf_nodes'] = int(SEAA_best_hps['max_leaf_nodes'])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE and 'std' is the standard deviation
#   of the per-fold MAE scores, as returned by objective()
print(f"Best Mean Absolute Error: {SEAA_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {SEAA_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (max_features already decoded from its
#   hp.choice index and the integer parameters already cast by the search cell)
SEAA_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters
#   (val / holdout flags presumably enable scoring on those splits; see cf.model_score)
SEAA_model = cf.model_score(
    hps=SEAA_best_hps,
    model_type='rf',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# Feature importance of the rebuilt model
cf.plot_feat_importance(SEAA_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = cf.feat_ablation(
    model=SEAA_model, hps=SEAA_best_hps, model_type='rf',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Feature ablation results, ordered by the change in validation MAE (largest first)
SEAA_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

In [None]:
%%time

SEAA_feat_ablation_len = len(SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'])

if SEAA_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    SEAA_abl_list_to_combo = SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested 
    SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = cf.feat_ablation(
        model=SEAA_model, hps=SEAA_best_hps, model_type='rf',
        Xtt=X_train_trans, ytt=y_train_trans, 
        Xvt=X_val_trans, yvt=y_val_trans, 
        Xht=X_holdout_trans, yht=y_holdout_trans, 
        abl_list_to_combo=SEAA_abl_list_to_combo)

    # Show the top 1 feature that decreased the MAE in the second ablation run, which we'll remove.  
    # Might be multiple features due to itertools combinations, so we split the string
    SEAA_drop_cols = SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(1).values[0].split(', ')

    # Create the feature columns list for saving out
    SEAA_feat_columns = X_train_trans.drop(columns=SEAA_drop_cols).columns.to_list()

    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    SEAA_experiment_model = cf.model_score(
        SEAA_best_hps, val=True, model_type='rf',
        Xtt=X_train_trans.drop(columns=SEAA_drop_cols), ytt=y_train_trans, 
        Xvt=X_val_trans.drop(columns=SEAA_drop_cols), yvt=y_val_trans, 
        Xht=X_holdout_trans.drop(columns=SEAA_drop_cols), yht=y_holdout_trans)

    # Plot the feature importance for this model
    cf.plot_feat_importance(SEAA_experiment_model)

else:
    SEAA_feat_columns = X_train_trans.columns.to_list()


### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Persist the model, trials, parameters and feature list
# NOTE(review): this passes SEAA_model (built from the full X_train_trans) while
#   feat_cols may exclude ablated columns; SEAA_experiment_model is the one built
#   on the reduced set. Confirm which model is meant to be saved.
cf.write_out(
    model=SEAA_model,
    trials=SEAA_trials,
    params=SEAA_best_hps,
    feat_cols=SEAA_feat_columns,
    model_family='randomforest_reg',
    location_name='SEAA',
)

## Review Validation Truth vs Predictions

In [None]:
# Validation truth next to the model's predictions; diff = truth - prediction
eval_df = y_val_trans.copy(deep=True)
SEAA_val_pred = SEAA_model.predict(X_val_trans)
eval_df = eval_df.assign(predictions=SEAA_val_pred.tolist())
eval_df = eval_df.assign(diff=eval_df['y_val'] - eval_df['predictions'])

In [None]:
# The 20 rows with the largest diff (y_val minus prediction)
(
    eval_df
    .sort_values(by='diff', ascending=False)
    .head(20)
)

In [None]:
# Distribution of the diff on the validation split, restricted to rows whose
#   true value exceeds 5 (eval_df is built from y_val_trans / X_val_trans)
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between prediction and validation where true value > 5',
)

# Show the plot
fig.show()

In [None]:
# Rows whose diff falls outside the inclusive range [-5, 5]
# Alternative views kept for reference:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df.loc[~eval_df['diff'].between(-5, 5)]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# SEAA_X_val_trans = X_val_trans.copy(deep=True)
# SEAA_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(SEAA_model)
# shap_values = explainer(SEAA_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(SEAA_X_val_trans)
# sv = explainer.shap_values(SEAA_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
# investigate = SEAA_X_val_trans.index.get_loc(38207)
# print(SEAA_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(SEAA_X_val_trans)[investigate], SEAA_X_val_trans, feature_display_range=slice(-1,-51,-1))

# Global Region

## Import the Data

In [None]:
# Load the Global splits. objective() reads X_train_trans / y_train_trans
#   as globals, so this cell must run before the Global search.
(
    X_train_trans,
    X_val_trans,
    X_holdout_trans,
    y_train_trans,
    y_val_trans,
    y_holdout_trans,
) = cf.import_data(location_name='GLOB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
run = neptune.init_run(
    name="RandomForest Reg Global",
    tags=["RandomForest", "regression", "hyperopt", "MAE", "Global", "GLOB"],
    description="RandomForest Hyperopt with MAE on Global"
)

# Create the Trials object
GLOB_trials = Trials()

try:
    # Run the search: up to 1000 evaluations, stopping early after
    #   100 evaluations without an improvement in the loss
    GLOB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = GLOB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Always close the Neptune run, even if the search is interrupted or fails
    run.stop()

GLOB_best_trial = GLOB_trials.best_trial
GLOB_best_hps = GLOB_best_hyperparams.copy()

# max_features comes back as an index into the hp.choice option list;
#   the remaining tuned integer parameters are cast to int
GLOB_best_hps['max_features'] = ['sqrt', 'log2', None][GLOB_best_hps['max_features']]
GLOB_best_hps['n_estimators'] = int(GLOB_best_hps['n_estimators'])
GLOB_best_hps['min_samples_split'] = int(GLOB_best_hps['min_samples_split'])
GLOB_best_hps['min_samples_leaf'] = int(GLOB_best_hps['min_samples_leaf'])
GLOB_best_hps['max_leaf_nodes'] = int(GLOB_best_hps['max_leaf_nodes'])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE and 'std' is the standard deviation
#   of the per-fold MAE scores, as returned by objective()
print(f"Best Mean Absolute Error: {GLOB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {GLOB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (max_features already decoded from its
#   hp.choice index and the integer parameters already cast by the search cell)
GLOB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters
#   (val / holdout flags presumably enable scoring on those splits; see cf.model_score)
GLOB_model = cf.model_score(
    hps=GLOB_best_hps,
    model_type='rf',
    holdout=True,
    val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans,
)

# Feature importance of the rebuilt model
cf.plot_feat_importance(GLOB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = cf.feat_ablation(
    model=GLOB_model, hps=GLOB_best_hps, model_type='rf',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Feature ablation results, ordered by the change in validation MAE (largest first)
GLOB_feature_ablation_df.sort_values(by='Val_MAE_Change', ascending=False)

In [None]:
%%time

GLOB_feat_ablation_len = len(GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'])

if GLOB_feat_ablation_len > 0:
    # Generate the top features to use in feature ablation combinations
    GLOB_abl_list_to_combo = GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

    # Second run through the feature ablation process
    # This time including the top 5 features whose removal decreased the MAE
    # These top 5 features will be combined using itertools, and retested 
    GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = cf.feat_ablation(
        model=GLOB_model, hps=GLOB_best_hps, model_type='rf',
        Xtt=X_train_trans, ytt=y_train_trans, 
        Xvt=X_val_trans, yvt=y_val_trans, 
        Xht=X_holdout_trans, yht=y_holdout_trans, 
        abl_list_to_combo=GLOB_abl_list_to_combo)

    # Show the top 1 feature that decreased the MAE in the second ablation run, which we'll remove.  
    # Might be multiple features due to itertools combinations, so we split the string
    GLOB_drop_cols = GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(1).values[0].split(', ')

    # Create the feature columns list for saving out
    GLOB_feat_columns = X_train_trans.drop(columns=GLOB_drop_cols).columns.to_list()

    # Recreate the model with the dropped columns
    # It should have a lower MAE score than the original
    GLOB_experiment_model = cf.model_score(
        GLOB_best_hps, val=True, model_type='rf',
        Xtt=X_train_trans.drop(columns=GLOB_drop_cols), ytt=y_train_trans, 
        Xvt=X_val_trans.drop(columns=GLOB_drop_cols), yvt=y_val_trans, 
        Xht=X_holdout_trans.drop(columns=GLOB_drop_cols), yht=y_holdout_trans)

    # Plot the feature importance for this model
    cf.plot_feat_importance(GLOB_experiment_model)

else:
    GLOB_feat_columns = X_train_trans.columns.to_list()



### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Persist the model, trials, parameters and feature list
# NOTE(review): this passes GLOB_model (built from the full X_train_trans) while
#   feat_cols may exclude ablated columns; GLOB_experiment_model is the one built
#   on the reduced set. Confirm which model is meant to be saved.
cf.write_out(
    model=GLOB_model,
    trials=GLOB_trials,
    params=GLOB_best_hps,
    feat_cols=GLOB_feat_columns,
    model_family='randomforest_reg',
    location_name='GLOB',
)

## Review Validation Truth vs Predictions

In [None]:
# Validation truth next to the model's predictions; diff = truth - prediction
eval_df = y_val_trans.copy(deep=True)
GLOB_val_pred = GLOB_model.predict(X_val_trans)
eval_df = eval_df.assign(predictions=GLOB_val_pred.tolist())
eval_df = eval_df.assign(diff=eval_df['y_val'] - eval_df['predictions'])

In [None]:
# The 20 rows with the largest diff (y_val minus prediction)
(
    eval_df
    .sort_values(by='diff', ascending=False)
    .head(20)
)

In [None]:
# Distribution of the diff on the validation split, restricted to rows whose
#   true value exceeds 5 (eval_df is built from y_val_trans / X_val_trans)
fig = px.histogram(
    eval_df[eval_df['y_val'] > 5],
    x='diff',
    nbins=100,
    title='Distribution of the diff between prediction and validation where true value > 5',
)

# Show the plot
fig.show()

In [None]:
# Rows whose diff falls outside the inclusive range [-5, 5]
# Alternative views kept for reference:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df.loc[~eval_df['diff'].between(-5, 5)]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# GLOB_X_val_trans = X_val_trans.copy(deep=True)
# GLOB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(GLOB_model)
# shap_values = explainer(GLOB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(GLOB_X_val_trans)
# sv = explainer.shap_values(GLOB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
# investigate = GLOB_X_val_trans.index.get_loc(39332)
# print(GLOB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(GLOB_X_val_trans)[investigate], GLOB_X_val_trans, feature_display_range=slice(-1,-51,-1))