# Regression - LightGBM

## Notebook Setup

### Import Libraries

In [None]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import shap

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

In [None]:
# Import Modeling Libraries
import lightgbm as lgb
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
import neptune

In [None]:
# Load the environment flag and the environment-specific variables
deepnote, env_vars = ef.load_env_vars()

# Publish every loaded variable as a notebook-level global in one step
globals().update(env_vars)

# Outside of DeepNote the AWS credentials come from the environment file;
#   bundle them into a dict that is handed to all aws s3 functions.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

In [None]:
# Pandas Configs: silence chained-assignment checks, show floats with two
#   decimals, and never truncate displayed rows/columns
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Ignore Warnings from these categories
import warnings
for _ignored_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=_ignored_category)

# MapBox Token for Plotly Maps (read from the MAPBOX_TOKEN environment variable)
px.set_mapbox_access_token(os.environ.get("MAPBOX_TOKEN"))

# Scikit Learn Configs: transformers return pandas objects
set_config(transform_output="pandas")

### Common Functions

In [None]:
def objective(space):
    """
    LightGBM hyperparameter objective function for hyperopt.

    The sampled hyperparameters are used to build a DART-boosted
    LGBMRegressor, which is scored with 5-fold shuffled cross-validation
    (mean absolute error). Parameters and metrics are logged to Neptune and
    the mean / standard deviation of the fold MAEs are returned to the
    Trials object.

    Relies on notebook globals that must exist before the search starts:
    ``X_train_trans``, ``y_train_trans`` (training data) and ``run`` (the
    active Neptune run).

    Args:
        space: dict of sampled hyperparameters. Every key is forwarded to
            ``lgb.LGBMRegressor`` as a keyword argument; the dict itself is
            not modified.

    Returns:
        dict with ``loss`` (mean MAE over the folds), ``status``
        (``STATUS_OK``) and ``std`` (standard deviation of the fold MAEs).
    """
    # Work on a copy so the caller's dict is left untouched
    params = dict(space)

    # These values are sampled with hp.quniform and must be integers for LightGBM
    for int_param in ('num_leaves', 'n_estimators', 'min_child_samples', 'subsample_freq'):
        params[int_param] = int(params[int_param])

    model = lgb.LGBMRegressor(
        **params,
        boosting_type='dart',
        n_jobs=-1,
        verbose=-1,
        random_state=42,
    )

    # Fixed seed so every trial is scored on the same folds
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # The scorer returns negated MAE; flip the sign to get positive errors
    mae_scores = -cross_val_score(model, X_train_trans, y_train_trans, cv=kf, scoring='neg_mean_absolute_error')
    mean_mae = mae_scores.mean()
    std_mae = mae_scores.std()

    # Emit model specific params to Neptune, one series per hyperparameter
    for param_name, param_value in params.items():
        run[f'parameters/{param_name}'].log(param_value)

    # Emit standard params and metrics to Neptune
    run['parameters/all_parameters'].log(str(params))
    run['metrics/mae_scores'].log(str(mae_scores.tolist()))
    run["metrics/mean_mae"].log(mean_mae)
    run["metrics/std_mae"].log(std_mae)

    return {'loss': mean_mae, 'status': STATUS_OK, 'std': std_mae}

In [None]:
# Define the Hyperparameter space
# Each entry is a hyperopt sampling expression whose label matches its dict key.
# The four hp.quniform entries are cast to int inside objective() before use.
space = {
    'num_leaves': hp.quniform('num_leaves', 20, 1500, 1),
    # Log-uniform between 0.01 and 0.2
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'n_estimators': hp.quniform('n_estimators', 100, 2500, 1),
    'min_split_gain': hp.uniform('min_split_gain', 0, 1),
    'min_child_weight': hp.uniform('min_child_weight', 0.001, 0.1),
    'min_child_samples': hp.quniform('min_child_samples', 1, 500, 1),
    'subsample': hp.uniform('subsample', 0.25, 1),
    'subsample_freq': hp.quniform('subsample_freq', 1, 20, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.25, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.05, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0.05, 1),
}

# Caribbean Region

## Import the Data

In [None]:
# Load the train / validation / holdout splits for the Caribbean ('CARB') region.
# These names are reused by every region section and read as globals by objective().
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = cf.import_data(location_name='CARB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# The objective function logs every trial to this notebook-level `run`
run = neptune.init_run(
    name="LightGBM Reg Caribbean",
    tags=["LightGBM", "regression", "hyperopt", "MAE", "Caribbean", "CARB"],
    description="LightGBM Hyperopt with MAE on Caribbean"
)

# Create the Trials object
CARB_trials = Trials()

try:
    # Run the TPE search (at most 1000 evaluations, with a no-progress early stop)
    CARB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = CARB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance, even when the search fails or is interrupted
    run.stop()

CARB_best_trial = CARB_trials.best_trial
CARB_best_hps = CARB_best_hyperparams.copy()

# The hp.quniform parameters must be integers before they are handed to LightGBM
for int_param in ('num_leaves', 'n_estimators', 'min_child_samples', 'subsample_freq'):
    CARB_best_hps[int_param] = int(CARB_best_hps[int_param])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the mean CV MAE and the
#   standard deviation of the per-fold MAE scores (the 'std' value returned by objective)
print(f"Best Mean Absolute Error: {CARB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {CARB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (integer-valued ones were already cast to int after the search)
CARB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters, handing over all three
#   data splits with both the validation and holdout flags switched on
CARB_model = cf.model_score(
    hps=CARB_best_hps,
    model_type='lgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

# Chart the feature importance of the rebuilt model
cf.plot_feat_importance(CARB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# Returns the ablation results dataframe plus two baseline MAE values
#   (presumably for the validation and training data, going by the names — TODO confirm)
CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = cf.feat_ablation(
    model=CARB_model, hps=CARB_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
#   (largest Val_MAE_Change first)
CARB_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Generate the top 5 features to use in feature ablation combinations:
#   keep the removals whose validation MAE matched or beat the baseline,
#   order them by Val_MAE_Change (largest first) and take the first five
CARB_abl_list_to_combo = (
    CARB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= CARB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
    .head(5)
    .to_list()
)

# Show the top 5 features
CARB_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# This time the shortlist in CARB_abl_list_to_combo is passed along so that
#   combinations of those features are evaluated as well
#   (presumably built with itertools inside cf.feat_ablation — TODO confirm)
# NOTE: this overwrites the dataframe and baselines produced by the first pass
CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = cf.feat_ablation(
    model=CARB_model, hps=CARB_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=CARB_abl_list_to_combo)

In [None]:
# Show the feature ablation rows that matched or beat the baseline validation MAE,
#   sorted by the change in MAE to Validation data (largest first)
CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Show the top 1 entry, which we'll remove, as a one-element list.
# The entry might name multiple comma-separated features; it is split into
#   individual column names when the drop list is built in the next cell.
CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(1).to_list()

In [None]:
# Removals that matched or beat the baseline validation MAE, best first
CARB_improving_removals = (
    CARB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= CARB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
)

# Define the top features to drop. The winning entry may name several
#   comma-separated features, so split it into individual column names.
# If nothing matched or beat the baseline there is nothing to drop.
if CARB_improving_removals.empty:
    print("No feature removal matched or beat the baseline validation MAE; keeping all features.")
    CARB_drop_cols = []
else:
    CARB_drop_cols = CARB_improving_removals.iloc[0].split(', ')

# Create the feature columns list for saving out
CARB_feat_columns = X_train_trans.drop(columns=CARB_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
CARB_experiment_model = cf.model_score(
    CARB_best_hps, val=True, model_type='lgbm',
    Xtt=X_train_trans.drop(columns=CARB_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=CARB_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=CARB_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
cf.plot_feat_importance(CARB_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# NOTE(review): this saves CARB_model (built from the full feature set) together with
#   CARB_feat_columns (the column list after removing CARB_drop_cols) — confirm whether
#   CARB_experiment_model is the model that should be saved with this feature list
cf.write_out(model=CARB_model, trials=CARB_trials, params=CARB_best_hps, feat_cols=CARB_feat_columns, model_family='lightgbm_reg', location_name='CARB')

## Review Holdout Truth vs Predictions

In [None]:
# Predict on the validation features, then build a comparison frame holding
#   the truth, the prediction and their difference (truth minus prediction)
CARB_val_pred = CARB_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=CARB_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [None]:
# The 20 rows with the largest diff (truth minus prediction), i.e. the biggest under-predictions
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Histogram of diff (truth minus prediction) for rows whose true value is above 5
# NOTE(review): eval_df is built from the validation split (y_val_trans / X_val_trans),
#   while the title says "holdout" — confirm which split is intended
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between prediction and holdout where true value > 5')

# Show the plot
fig.show()

In [None]:
# Alternative views kept for ad-hoc inspection (rows with diff inside [-5, 5]):
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
# Active view: rows whose diff falls outside [-5, 5]
eval_df[~(eval_df['diff'].between(-5,5))]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# CARB_X_val_trans = X_val_trans.copy(deep=True)
# CARB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(CARB_model)
# shap_values = explainer(CARB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(CARB_X_val_trans)
# sv = explainer.shap_values(CARB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = CARB_X_val_trans.index.get_loc(5809)
# print(CARB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(CARB_X_val_trans)[investigate], CARB_X_val_trans, feature_display_range=slice(-1,-51,-1))

# South East Asia and Australia Region

## Import the Data

In [None]:
# Load the train / validation / holdout splits for the South East Asia and Australia ('SEAA') region.
# This rebinds the shared split names, so objective() now scores against the SEAA training data.
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = cf.import_data(location_name='SEAA')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# The objective function logs every trial to this notebook-level `run`
run = neptune.init_run(
    name="LightGBM Reg South East Asia and Australia",
    tags=["LightGBM", "regression", "hyperopt", "MAE", "South East Asia and Australia", "SEAA"],
    description="LightGBM Hyperopt with MAE on South East Asia and Australia"
)

# Create the Trials object
SEAA_trials = Trials()

try:
    # Run the TPE search (at most 1000 evaluations, with a no-progress early stop)
    SEAA_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = SEAA_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance, even when the search fails or is interrupted
    run.stop()

SEAA_best_trial = SEAA_trials.best_trial
SEAA_best_hps = SEAA_best_hyperparams.copy()

# The hp.quniform parameters must be integers before they are handed to LightGBM
for int_param in ('num_leaves', 'n_estimators', 'min_child_samples', 'subsample_freq'):
    SEAA_best_hps[int_param] = int(SEAA_best_hps[int_param])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the mean CV MAE and the
#   standard deviation of the per-fold MAE scores (the 'std' value returned by objective)
print(f"Best Mean Absolute Error: {SEAA_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {SEAA_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (integer-valued ones were already cast to int after the search)
SEAA_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters, handing over all three
#   data splits with both the validation and holdout flags switched on
SEAA_model = cf.model_score(
    hps=SEAA_best_hps,
    model_type='lgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

# Chart the feature importance of the rebuilt model
cf.plot_feat_importance(SEAA_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# Returns the ablation results dataframe plus two baseline MAE values
#   (presumably for the validation and training data, going by the names — TODO confirm)
SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = cf.feat_ablation(
    model=SEAA_model, hps=SEAA_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
#   (largest Val_MAE_Change first)
SEAA_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Shortlist for the combination pass: removals whose validation MAE matched or
#   beat the baseline, ordered by Val_MAE_Change (largest first), first five only
SEAA_abl_list_to_combo = (
    SEAA_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= SEAA_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
    .head(5)
    .to_list()
)

# Review the list
SEAA_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# This time including the top 5 features whose removal decreased the MAE
# The shortlist is passed as abl_list_to_combo so that combinations of those
#   features are evaluated (presumably built with itertools inside cf.feat_ablation — TODO confirm)
# NOTE: this overwrites the dataframe and baselines produced by the first pass
SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = cf.feat_ablation(
    model=SEAA_model, hps=SEAA_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=SEAA_abl_list_to_combo)

In [None]:
# Ablation rows that matched or beat the baseline validation MAE,
#   ordered by the change in validation MAE (largest first)
(
    SEAA_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= SEAA_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
)

In [None]:
# Preview the single best removal. The entry can name several
#   comma-separated features, hence the split into a list of column names
(
    SEAA_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= SEAA_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
    .head(1)
    .values[0]
    .split(', ')
)

In [None]:
# Removals that matched or beat the baseline validation MAE, best first
SEAA_improving_removals = (
    SEAA_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= SEAA_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
)

# Define the top features to drop. The winning entry may name several
#   comma-separated features, so split it into individual column names.
# If nothing matched or beat the baseline there is nothing to drop.
if SEAA_improving_removals.empty:
    print("No feature removal matched or beat the baseline validation MAE; keeping all features.")
    SEAA_drop_cols = []
else:
    SEAA_drop_cols = SEAA_improving_removals.iloc[0].split(', ')

# Create the feature columns list for saving out
SEAA_feat_columns = X_train_trans.drop(columns=SEAA_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
SEAA_experiment_model = cf.model_score(
    SEAA_best_hps, val=True, model_type='lgbm',
    Xtt=X_train_trans.drop(columns=SEAA_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=SEAA_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=SEAA_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
cf.plot_feat_importance(SEAA_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# NOTE(review): this saves SEAA_model (built from the full feature set) together with
#   SEAA_feat_columns (the column list after removing SEAA_drop_cols) — confirm whether
#   SEAA_experiment_model is the model that should be saved with this feature list
cf.write_out(model=SEAA_model, trials=SEAA_trials, params=SEAA_best_hps, feat_cols=SEAA_feat_columns, model_family='lightgbm_reg', location_name='SEAA')

## Review Holdout Truth vs Predictions

In [None]:
# Predict on the validation features, then build a comparison frame holding
#   the truth, the prediction and their difference (truth minus prediction)
SEAA_val_pred = SEAA_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=SEAA_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [None]:
# The 20 rows with the largest diff (truth minus prediction), i.e. the biggest under-predictions
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Histogram of diff (truth minus prediction) for rows whose true value is above 5
# NOTE(review): eval_df is built from the validation split (y_val_trans / X_val_trans),
#   while the title says "holdout" — confirm which split is intended
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between prediction and holdout where true value > 5')

# Show the plot
fig.show()

In [None]:
# Alternative views kept for ad-hoc inspection (rows with diff inside [-5, 5]):
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
# Active view: rows whose diff falls outside [-5, 5]
eval_df[~(eval_df['diff'].between(-5,5))]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# SEAA_X_val_trans = X_val_trans.copy(deep=True)
# SEAA_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(SEAA_model)
# shap_values = explainer(SEAA_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(SEAA_X_val_trans)
# sv = explainer.shap_values(SEAA_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = SEAA_X_val_trans.index.get_loc(5809)
# print(SEAA_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(SEAA_X_val_trans)[investigate], SEAA_X_val_trans, feature_display_range=slice(-1,-51,-1))

# Global Region

## Import the Data

In [None]:
# Load the train / validation / holdout splits for the Global ('GLOB') region.
# This rebinds the shared split names, so objective() now scores against the GLOB training data.
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = cf.import_data(location_name='GLOB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# The objective function logs every trial to this notebook-level `run`
run = neptune.init_run(
    name="LightGBM Reg Global",
    tags=["LightGBM", "regression", "hyperopt", "MAE", "Global", "GLOB"],
    description="LightGBM Hyperopt with MAE on Global"
)

# Create the Trials object
GLOB_trials = Trials()

try:
    # Run the TPE search (at most 1000 evaluations, with a no-progress early stop)
    GLOB_best_hyperparams = fmin(fn = objective,
                                space = space,
                                algo = tpe.suggest,
                                max_evals = 1000,
                                trials = GLOB_trials,
                                show_progressbar=True,
                                early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance, even when the search fails or is interrupted
    run.stop()

GLOB_best_trial = GLOB_trials.best_trial
GLOB_best_hps = GLOB_best_hyperparams.copy()

# The hp.quniform parameters must be integers before they are handed to LightGBM
for int_param in ('num_leaves', 'n_estimators', 'min_child_samples', 'subsample_freq'):
    GLOB_best_hps[int_param] = int(GLOB_best_hps[int_param])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the mean CV MAE and the
#   standard deviation of the per-fold MAE scores (the 'std' value returned by objective)
print(f"Best Mean Absolute Error: {GLOB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {GLOB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters (integer-valued ones were already cast to int after the search)
GLOB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Rebuild the model from the tuned hyperparameters, handing over all three
#   data splits with both the validation and holdout flags switched on
GLOB_model = cf.model_score(
    hps=GLOB_best_hps,
    model_type='lgbm',
    holdout=True,
    val=True,
    Xtt=X_train_trans,
    ytt=y_train_trans,
    Xvt=X_val_trans,
    yvt=y_val_trans,
    Xht=X_holdout_trans,
    yht=y_holdout_trans,
)

# Chart the feature importance of the rebuilt model
cf.plot_feat_importance(GLOB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# Returns the ablation results dataframe plus two baseline MAE values
#   (presumably for the validation and training data, going by the names — TODO confirm)
GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = cf.feat_ablation(
    model=GLOB_model, hps=GLOB_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
#   (largest Val_MAE_Change first)
GLOB_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Shortlist for the combination pass: removals whose validation MAE matched or
#   beat the baseline, ordered by Val_MAE_Change (largest first), first five only
GLOB_abl_list_to_combo = (
    GLOB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= GLOB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
    .head(5)
    .to_list()
)

# Review the list
GLOB_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# This time including the top 5 features whose removal decreased the MAE
# The shortlist is passed as abl_list_to_combo so that combinations of those
#   features are evaluated (presumably built with itertools inside cf.feat_ablation — TODO confirm)
# NOTE: this overwrites the dataframe and baselines produced by the first pass
GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = cf.feat_ablation(
    model=GLOB_model, hps=GLOB_best_hps, model_type='lgbm',
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=GLOB_abl_list_to_combo)

In [None]:
# Ablation rows that matched or beat the baseline validation MAE,
#   ordered by the change in validation MAE (largest first)
(
    GLOB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= GLOB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
)

In [None]:
# Preview the single best removal. The entry can name several
#   comma-separated features, hence the split into a list of column names
(
    GLOB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= GLOB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
    .head(1)
    .values[0]
    .split(', ')
)

In [None]:
# Removals that matched or beat the baseline validation MAE, best first
GLOB_improving_removals = (
    GLOB_feature_ablation_df
    .loc[lambda df: df['Val_MAE'] <= GLOB_baseline_mae_val]
    .sort_values('Val_MAE_Change', ascending=False)
    ['Removed_Feature']
)

# Define the top features to drop. The winning entry may name several
#   comma-separated features, so split it into individual column names.
# If nothing matched or beat the baseline there is nothing to drop.
if GLOB_improving_removals.empty:
    print("No feature removal matched or beat the baseline validation MAE; keeping all features.")
    GLOB_drop_cols = []
else:
    GLOB_drop_cols = GLOB_improving_removals.iloc[0].split(', ')

# Create the feature columns list for saving out
GLOB_feat_columns = X_train_trans.drop(columns=GLOB_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
GLOB_experiment_model = cf.model_score(
    GLOB_best_hps, val=True, model_type='lgbm',
    Xtt=X_train_trans.drop(columns=GLOB_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=GLOB_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=GLOB_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
cf.plot_feat_importance(GLOB_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# NOTE(review): this saves GLOB_model (built from the full feature set) together with
#   GLOB_feat_columns (the column list after removing GLOB_drop_cols) — confirm whether
#   GLOB_experiment_model is the model that should be saved with this feature list
cf.write_out(model=GLOB_model, trials=GLOB_trials, params=GLOB_best_hps, feat_cols=GLOB_feat_columns, model_family='lightgbm_reg', location_name='GLOB')

## Review Holdout Truth vs Predictions

In [None]:
# Predict on the validation features, then build a comparison frame holding
#   the truth, the prediction and their difference (truth minus prediction)
GLOB_val_pred = GLOB_model.predict(X_val_trans)
eval_df = y_val_trans.copy(deep=True).assign(
    predictions=GLOB_val_pred.tolist(),
    diff=lambda frame: frame['y_val'] - frame['predictions'],
)

In [None]:
# The 20 rows with the largest diff (truth minus prediction), i.e. the biggest under-predictions
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Histogram of diff (truth minus prediction) for rows whose true value is above 5
# NOTE(review): eval_df is built from the validation split (y_val_trans / X_val_trans),
#   while the title says "holdout" — confirm which split is intended
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between prediction and holdout where true value > 5')

# Show the plot
fig.show()

In [None]:
# Alternative views kept for ad-hoc inspection (rows with diff inside [-5, 5]):
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
# Active view: rows whose diff falls outside [-5, 5]
eval_df[~(eval_df['diff'].between(-5,5))]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# GLOB_X_val_trans = X_val_trans.copy(deep=True)
# GLOB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(GLOB_model)
# shap_values = explainer(GLOB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(GLOB_X_val_trans)
# sv = explainer.shap_values(GLOB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = GLOB_X_val_trans.index.get_loc(5809)
# print(GLOB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(GLOB_X_val_trans)[investigate], GLOB_X_val_trans, feature_display_range=slice(-1,-51,-1))