<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/5_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and directories

In [None]:
# Colab setup: mount Google Drive and make the project folder importable
import os
import sys
from google.colab import drive

# Project root on the shared drive
base_dir = "/gdrive/Shareddrives/masfi"

# Mount Drive at /gdrive (force_remount=True requests a fresh mount)
drive.mount('/gdrive', force_remount=True)

# Add the resolved project root to sys.path only if it is not already there
_path_to_add = os.path.realpath(base_dir)
if not any(existing == _path_to_add for existing in sys.path):
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Installs
!pip install kaleido
!pip install shap
!pip install scikit-optimize
!pip install tabulate
!pip install xgboost

In [None]:
# Reload imports, replacing those in the cache
%load_ext autoreload
%autoreload 2
# Imports
import json
import pickle
from datetime import datetime, timedelta
from os import makedirs
from os.path import exists, join
from pathlib import Path
from google.colab import runtime
import ipywidgets as widgets
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import random
import shap
import xgboost as xgb
from IPython.display import clear_output
from matplotlib import pyplot as plt
from scipy.stats import randint, uniform
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
    accuracy_score
)
from sklearn.model_selection import KFold
from tabulate import tabulate

# Define GPU: ask TensorFlow for the GPU device name and report whether it is
# the expected first GPU
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f"Found GPU at: {device_name}")
else:
  print('GPU device not found')

In [None]:
# Input directory: final datasets from the 4_datasets stage
datasets_dir = join(base_dir, "4_datasets/final")

# Output directory for this notebook; created up front if it does not exist
models_dir = join(base_dir, "5_models")
makedirs(models_dir, exist_ok=True)

# Compile new model dataset (optional)

## Select a dataset to import

In [None]:
# Print one copy-pasteable assignment per .pkl file found in datasets_dir
print("Select the dataset variates .pkl to add predictors...\n")
for file in (name for name in os.listdir(datasets_dir) if name.endswith(".pkl")):
  print(f'selected_dataset = "{file}"')

In [None]:
selected_dataset = "agbd.pkl"

# Read the pickled dataset, then work on a copy with a fresh 0..n-1 index so
# the object that was read stays untouched
print(f"Importing '{selected_dataset}'")
dataset_read = pd.read_pickle(join(datasets_dir, selected_dataset))
dataset_imported = dataset_read.copy()
dataset_imported = dataset_imported.reset_index(drop=True)

# Summarise what was imported
n_imported_rows = len(dataset_imported)
n_imported_columns = len(dataset_imported.columns)
print(f"'{selected_dataset}' with {n_imported_rows} rows and {n_imported_columns} columns imported.")

## Select ID column

In [None]:
# Print a copy-pasteable assignment for every 'var_' column, so one can be
# chosen as the ID column
var_columns = [col for col in dataset_imported.columns if col[:4] == "var_"]
for col in var_columns:
  print(f'id_column = "{col}"')

In [None]:
# Column chosen as the row identifier (pick one of the names printed by the previous cell)
id_column = "var_shot_number"

# Start the running list of columns that will be kept when the model dataset is compiled
dataset_col_list = [id_column]

## Select a variate to predict

In [None]:
# Print a copy-pasteable assignment for every 'var_' column not already selected
for col in dataset_imported.columns:
  already_selected = col in dataset_col_list
  if col[:4] == "var_" and not already_selected:
    print(f'selected_variate = "{col}"')

In [None]:
# Variate (target) the model will predict
selected_variate = "var_agbd"

# Guarded append keeps the cell idempotent: re-running it must not add the same
# column twice, otherwise the compiled dataset would contain duplicated columns.
if selected_variate not in dataset_col_list:
  dataset_col_list = dataset_col_list + [selected_variate]

## Select uncertainty metrics

In [None]:
# The uncertainty metrics should allow estimation of one standard deviation of the selected variate.

# List the 'var_' columns that have not been selected yet as candidate
# uncertainty columns, followed by the option of having none.
remaining_var_columns = [
    col for col in dataset_imported.columns
    if col[:4] == "var_" and col not in dataset_col_list
]
for col in remaining_var_columns:
  print(f'uncertainty = "{col}"')
print('uncertainty = None')

In [None]:
# Uncertainty column for the selected variate, or None if there is none
uncertainty = "var_agbd_se"

# Only add a real column name, and only once: re-running the cell must not
# duplicate the entry in dataset_col_list.
if uncertainty and uncertainty not in dataset_col_list:
  dataset_col_list = dataset_col_list + [uncertainty]

## Add covariates

In [None]:
# Covariates are used as a predictor in training, but have an 'NA' value for
# validation and testing. These should still be included in scenarios as an
# entirely 'NA' raster. They may / may not improve accuracy.

# Print the unselected 'var_' columns as a copy-pasteable covariates list
print("covariates  = [")
candidate_covariates = [
    col for col in dataset_imported.columns
    if col[:4] == "var_" and col not in dataset_col_list
]
for col in candidate_covariates:
  print(f'"{col}",')
print("]")

In [None]:
covariates = [
"var_beam",
"var_sensitivity",
]

# Rename only the leading 'var_' prefix to 'pre_'. A plain str.replace('var', 'pre')
# would also rewrite any later 'var' in the name (e.g. 'var_variance' -> 'pre_preiance').
covariates_renamed = [
    "pre_" + covariate[len("var_"):] if covariate.startswith("var_") else covariate
    for covariate in covariates
]

# rename() returns a new frame, so dataset_imported itself is left untouched
dataset_imported_covar = dataset_imported.rename(
    columns=dict(zip(covariates, covariates_renamed))
)

# If covariates are object or string, convert to integer for model training.
# The value -> integer mapping is kept so it can be stored with the dataset description.
categorised_covariates = {}
for col in covariates_renamed:
    col_dtype = dataset_imported_covar[col].dtype
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        # dropna() stops sorted() from comparing NaN with strings; missing values
        # are not in the mapping and therefore stay missing after map()
        unique_values = sorted(dataset_imported_covar[col].dropna().unique())
        value_to_int = {val: code for code, val in enumerate(unique_values, start=1)}
        dataset_imported_covar[col] = dataset_imported_covar[col].map(value_to_int).astype('category')
        categorised_covariates[col] = value_to_int
print(categorised_covariates)

# Add only names that are not already listed, so re-running the cell does not
# duplicate columns in the compiled dataset
dataset_col_list = dataset_col_list + [col for col in covariates_renamed if col not in dataset_col_list]

## Select predictors

In [None]:
# Print the unselected 'pre_' columns, alphabetically, as a copy-pasteable list
print("selected_predictors = [")
available_predictors = [
    col for col in sorted(dataset_imported_covar.columns)
    if col[:4] == "pre_" and col not in dataset_col_list
]
for col in available_predictors:
  print(f'  "{col}",')
print("]")

In [None]:
# Predictor columns used for training. Entries that are commented out are
# candidate columns that are deliberately left out of this run; uncomment a
# line to include that predictor.
selected_predictors = [
  "pre_coast_proximity_km",
  "pre_disturbance_with_edge_effects_1994",
  "pre_disturbance_with_edge_effects_1995",
  "pre_disturbance_with_edge_effects_1996",
  "pre_disturbance_with_edge_effects_1997",
  "pre_disturbance_with_edge_effects_1998",
  "pre_disturbance_with_edge_effects_1999",
  "pre_disturbance_with_edge_effects_2000",
  "pre_disturbance_with_edge_effects_2001",
  "pre_disturbance_with_edge_effects_2002",
  "pre_disturbance_with_edge_effects_2003",
  "pre_disturbance_with_edge_effects_2004",
  "pre_disturbance_with_edge_effects_2005",
  "pre_disturbance_with_edge_effects_2006",
  "pre_disturbance_with_edge_effects_2007",
  "pre_disturbance_with_edge_effects_2008",
  "pre_disturbance_with_edge_effects_2009",
  "pre_disturbance_with_edge_effects_2010",
  "pre_disturbance_with_edge_effects_2011",
  "pre_disturbance_with_edge_effects_2012",
  "pre_disturbance_with_edge_effects_2013",
  "pre_disturbance_with_edge_effects_2014",
  "pre_disturbance_with_edge_effects_2015",
  "pre_disturbance_with_edge_effects_2016",
  "pre_disturbance_with_edge_effects_2017",
  "pre_disturbance_with_edge_effects_2018",
  "pre_disturbance_with_edge_effects_2019",
  "pre_disturbance_with_edge_effects_2020",
  "pre_disturbance_with_edge_effects_2021",
  "pre_disturbance_with_edge_effects_2022",
  "pre_forest_with_edge_effects_1994",
  # "pre_forest_with_edge_effects_1995",
  # "pre_forest_with_edge_effects_1996",
  # "pre_forest_with_edge_effects_1997",
  # "pre_forest_with_edge_effects_1998",
  # "pre_forest_with_edge_effects_1999",
  # "pre_forest_with_edge_effects_2000",
  # "pre_forest_with_edge_effects_2001",
  # "pre_forest_with_edge_effects_2002",
  # "pre_forest_with_edge_effects_2003",
  # "pre_forest_with_edge_effects_2004",
  # "pre_forest_with_edge_effects_2005",
  "pre_forest_with_edge_effects_2006",
  # "pre_forest_with_edge_effects_2007",
  # "pre_forest_with_edge_effects_2008",
  # "pre_forest_with_edge_effects_2009",
  # "pre_forest_with_edge_effects_2010",
  # "pre_forest_with_edge_effects_2011",
  # "pre_forest_with_edge_effects_2012",
  # "pre_forest_with_edge_effects_2013",
  # "pre_forest_with_edge_effects_2014",
  # "pre_forest_with_edge_effects_2015",
  # "pre_forest_with_edge_effects_2016",
  # "pre_forest_with_edge_effects_2017",
  "pre_forest_with_edge_effects_2018",
  "pre_forest_with_edge_effects_2019",
  "pre_forest_with_edge_effects_2020",
  "pre_forest_with_edge_effects_2021",
  "pre_forest_with_edge_effects_2022",
  "pre_latitude",
  "pre_longitude",
  # "pre_pa_ais_with_edge_effects",
  # "pre_pa_taman_krau_ais_with_edge_effects",
  "pre_pa_taman_krau_with_edge_effects",
  "pre_topo_cor_smooth_aspect_cosine",
  "pre_topo_cor_smooth_aspect_sine",
  "pre_topo_cor_smooth_circular_variance_aspect_03",
  "pre_topo_cor_smooth_circular_variance_aspect_07",
  "pre_topo_cor_smooth_circular_variance_aspect_11",
  "pre_topo_cor_smooth_deviation_mean_elevation_03",
  "pre_topo_cor_smooth_deviation_mean_elevation_07",
  "pre_topo_cor_smooth_deviation_mean_elevation_11",
  "pre_topo_cor_smooth_eastness",
  "pre_topo_cor_smooth_elevation",
  "pre_topo_cor_smooth_northness",
  "pre_topo_cor_smooth_profile_curvature",
  "pre_topo_cor_smooth_roughness_03",
  "pre_topo_cor_smooth_roughness_07",
  "pre_topo_cor_smooth_roughness_11",
  "pre_topo_cor_smooth_slope",
  "pre_topo_cor_smooth_stream_power_index_log10",
  "pre_topo_cor_smooth_surface_area_ratio",
  "pre_topo_cor_smooth_tangential_curvature",
  "pre_topo_cor_smooth_topographic_position_index_03",
  "pre_topo_cor_smooth_topographic_position_index_07",
  "pre_topo_cor_smooth_topographic_position_index_11",
  "pre_topo_cor_smooth_topographic_ruggedness_index",
  "pre_topo_cor_smooth_topographic_wetness_index",
  "pre_topo_cor_unsmooth_aspect_cosine",
  "pre_topo_cor_unsmooth_aspect_sine",
  "pre_topo_cor_unsmooth_circular_variance_aspect_03",
  "pre_topo_cor_unsmooth_circular_variance_aspect_07",
  "pre_topo_cor_unsmooth_circular_variance_aspect_11",
  "pre_topo_cor_unsmooth_deviation_mean_elevation_03",
  "pre_topo_cor_unsmooth_deviation_mean_elevation_07",
  "pre_topo_cor_unsmooth_deviation_mean_elevation_11",
  "pre_topo_cor_unsmooth_eastness",
  "pre_topo_cor_unsmooth_elevation",
  "pre_topo_cor_unsmooth_northness",
  "pre_topo_cor_unsmooth_profile_curvature",
  "pre_topo_cor_unsmooth_roughness_03",
  "pre_topo_cor_unsmooth_roughness_07",
  "pre_topo_cor_unsmooth_roughness_11",
  "pre_topo_cor_unsmooth_slope",
  "pre_topo_cor_unsmooth_stream_power_index_log10",
  "pre_topo_cor_unsmooth_surface_area_ratio",
  "pre_topo_cor_unsmooth_tangential_curvature",
  "pre_topo_cor_unsmooth_topographic_position_index_03",
  "pre_topo_cor_unsmooth_topographic_position_index_07",
  "pre_topo_cor_unsmooth_topographic_position_index_11",
  "pre_topo_cor_unsmooth_topographic_ruggedness_index",
  "pre_topo_cor_unsmooth_topographic_wetness_index",
]

In [None]:
# Sort the predictors so their column order is deterministic. Covariates are
# not merged into selected_predictors here; they were already added to
# dataset_col_list in the covariates cell.
selected_predictors = sorted(selected_predictors)

# selected_predictors is already sorted, so no second sort is needed. Only add
# names that are not listed yet, so re-running the cell does not duplicate columns.
dataset_col_list = dataset_col_list + [col for col in selected_predictors if col not in dataset_col_list]

## Define categorical predictors

In [None]:
# Define categorical data for correct model interpretation:
# print every selected 'pre_' column as a copy-pasteable list
print("categorical_columns = [")
for col in (name for name in dataset_col_list if name.startswith('pre_')):
  print(f'  "{col}",')
print("]")

In [None]:
categorical_columns = [
  "pre_beam",
]

# Cast the listed columns to pandas 'category' dtype on a copy of the dataset.
# astype() returns a new frame, so its result has to be assigned; calling it
# without assignment leaves the dtypes unchanged.
dataset_imported_cat = dataset_imported_covar.copy()
if len(categorical_columns) > 0:
  dataset_imported_cat = dataset_imported_cat.astype(
      {col: "category" for col in categorical_columns}
  )

## Descriptive parameters

In [None]:
# Check remaining variate columns and select parameters that may be useful for descriptive statistics.
# Every column not selected so far is printed as a copy-pasteable list entry.
print("descriptive_parameters = [")
unselected_columns = [col for col in dataset_imported_cat.columns if col not in dataset_col_list]
for col in unselected_columns:
  print(f'"{col}",')
print("]")

In [None]:
# Extra columns kept in the model dataset for descriptive statistics (may be empty)
descriptive_parameters = [
]

# Only add names that are not listed yet, so re-running the cell does not
# duplicate columns in the compiled dataset
dataset_col_list = dataset_col_list + [col for col in descriptive_parameters if col not in dataset_col_list]

## Filter dataset

In [None]:
# Toggle row filtering. NOTE: this name shadows the built-in filter(); it is
# kept because it is the notebook's existing switch.
filter = False

dataset_imported_filtered = dataset_imported_cat.copy()

if filter:
  filter_parameter = "var_sensitivity" # Example with sensitivity
  threshold = 0.98 # Greater than or above, change below if needed
  # Filter
  filter_values_to_include = f">={threshold}" # Descriptive for metadata. Need to change the filter manually if a different conditional.
  # Fail loudly if the column is missing: otherwise no rows are filtered while
  # the metadata still records a filter. A column selected as a covariate has
  # been renamed from 'var_' to 'pre_' by this point.
  if filter_parameter not in dataset_imported_filtered.columns:
    raise KeyError(
        f"filter_parameter '{filter_parameter}' is not a column of the dataset; "
        "covariates are renamed from 'var_' to 'pre_' before this cell"
    )
  # reset_index(drop=True) discards the old index directly instead of creating
  # an 'index' column and dropping it afterwards
  dataset_imported_filtered = dataset_imported_filtered.loc[
      dataset_imported_filtered[filter_parameter] >= threshold
  ].reset_index(drop=True)
else:
  filter_parameter, threshold, filter_values_to_include = None, None, None

## Sample size

In [None]:
# Report how many rows (data points) remain after filtering
n_rows_after_filter = len(dataset_imported_filtered)
print(f'Number of rows (data points): {n_rows_after_filter}')

In [None]:
# Randomly sampling the rows of a dataset to generate a smaller subset can
# make testing different parameters faster

sample_imported_dataset = False
sample_imported_dataset_by_percent = False # If False then by number
sample_imported_dataset_value = 500000 # Set to a percentage or number, or 'None' if not sampling

# Global
dataset_imported_sampled = dataset_imported_filtered.copy()

# Sample dataset for testing or HPO
if sample_imported_dataset:
  # Choose the DataFrame.sample() argument once: a fraction when sampling by
  # percent, otherwise an absolute number of rows
  if sample_imported_dataset_by_percent:
    sample_kwargs = {"frac": sample_imported_dataset_value / 100}
  else:
    sample_kwargs = {"n": sample_imported_dataset_value}
  # random_state=1 makes the sample reproducible. reset_index(drop=True)
  # matches the import cell and discards the old index directly, rather than
  # creating an 'index' column and dropping it afterwards.
  dataset_imported_sampled = dataset_imported_sampled.sample(
      random_state=1, **sample_kwargs
  ).reset_index(drop=True)

print(f'Number of rows (data points) after sampling: {len(dataset_imported_sampled)}')

## Name and compile

In [None]:
# Set model dataset name (date and time suffix will be added) and sample percent
dataset_name = "agbd"

# Keep only the selected columns (None entries are dropped from the list first)
dataset_col_list = [item for item in dataset_col_list if item is not None]
dataset_final = dataset_imported_sampled.copy()[dataset_col_list]
# Strip the 'pre_' / 'var_' prefixes from descriptive columns, then from the ID column
dataset_final.columns = [col.replace('pre_', '').replace('var_', '') if col in descriptive_parameters else col for col in dataset_final.columns]
dataset_final.columns = [col.replace('pre_', '').replace('var_', '') if col == id_column else col for col in dataset_final.columns]

# Model dataset file and directory (UTC timestamp suffix keeps names unique)
model_dataset_name = f"{dataset_name}_{datetime.utcnow().strftime('%y%m%d_%H%M%S')}"
model_dataset_dir = join(models_dir, model_dataset_name)
model_dataset_path = join(model_dataset_dir, model_dataset_name)
# NOTE: despite the '_dir' suffix this is the path of the description .json file
model_dataset_description_dir = join(model_dataset_dir, "model_dataset_description.json")

# Check if identical filtered model dataset exists.
# NOTE(review): every .pkl under models_dir is unpickled here; unpickling runs
# arbitrary code, so models_dir must only contain trusted files.
dataset_exists = False
for subdir, dirs, files in os.walk(models_dir):
  for file in files:
    if file.endswith('.pkl'):
      dataset_existing = pd.read_pickle(join(subdir,file))
      # Call equals() on dataset_final: it returns False for an object that is
      # not a DataFrame, whereas pd.DataFrame.equals(dataset_existing, ...)
      # raises if the pickle holds something else.
      dataset_equals = dataset_final.equals(dataset_existing)
      if dataset_equals:
        dataset_exists = True
        model_dataset = dataset_existing
        model_dataset_dir = subdir
        print(f'An identical model dataset already exists in: {subdir}')
        break
  if dataset_exists:
    break

# Create a model dataset filtered to selected variate and predictors, if it does not exist
if not dataset_exists:
  makedirs(model_dataset_dir, exist_ok=True)
  dataset_final.to_pickle(f"{model_dataset_path}.pkl")
  # Read the file back so model_dataset is what was actually written to disk
  model_dataset = pd.read_pickle(f"{model_dataset_path}.pkl")
  # Record every selection made above so the dataset can be re-loaded later
  model_dataset_description = {
      "model_dataset_name": model_dataset_name,
      "number_of_columns": len(model_dataset.columns),
      "number_of_rows": len(model_dataset),
      "id_column": id_column,
      "selected_variate": selected_variate,
      "uncertainty": uncertainty,
      "covariates_renamed": covariates_renamed,
      "covariates_categorised": categorised_covariates,
      "selected_predictors": selected_predictors,
      "categorical_columns": categorical_columns,
      "descriptive_parameters": descriptive_parameters,
      "filter_parameter": filter_parameter,
      "filter_values_to_include": filter_values_to_include,
      "sample_imported_dataset": sample_imported_dataset,
      "sample_imported_dataset_by_percent": sample_imported_dataset_by_percent,
      "sample_imported_dataset_value": sample_imported_dataset_value,
  }
  with open(model_dataset_description_dir, "w") as f:
    f.write(json.dumps(model_dataset_description))
  print(f"Model dataset compiled and exported to {model_dataset_dir}")

# Check dataset
model_dataset

# Select model dataset

In [None]:
# Print a copy-pasteable assignment for every entry found in models_dir
existing_model_datasets = os.listdir(models_dir)
for subdir in existing_model_datasets:
    print(f'selected_model_dataset = "{subdir}"')

In [None]:
# Name of the model dataset folder to load (one of the names printed by the previous cell)
selected_model_dataset = "agbd_240923_083124"

# Select model dataset
model_dataset_dir = join(models_dir,selected_model_dataset)

# Read description for model dataset attributes.
# The values below come from the stored .json, so this section can run
# without the earlier 'compile' cells having been executed.
with open(join(model_dataset_dir,"model_dataset_description.json")) as model_dataset_description_json:
  model_dataset_description = json.load(model_dataset_description_json)
model_dataset_name = model_dataset_description["model_dataset_name"]
number_of_columns = model_dataset_description["number_of_columns"]
number_of_rows = model_dataset_description["number_of_rows"]
id_column = model_dataset_description["id_column"]
selected_variate = model_dataset_description["selected_variate"]
uncertainty = model_dataset_description["uncertainty"]
covariates_renamed = model_dataset_description["covariates_renamed"]
covariates_categorised = model_dataset_description["covariates_categorised"]
# From here on selected_predictors holds the stored predictors followed by the renamed covariates
selected_predictors = model_dataset_description["selected_predictors"] + model_dataset_description["covariates_renamed"]
categorical_columns = model_dataset_description["categorical_columns"]
descriptive_parameters = model_dataset_description["descriptive_parameters"]
filter_parameter = model_dataset_description["filter_parameter"]
filter_values_to_include = model_dataset_description["filter_values_to_include"]
sample_imported_dataset = model_dataset_description["sample_imported_dataset"]
sample_imported_dataset_by_percent = model_dataset_description["sample_imported_dataset_by_percent"]
sample_imported_dataset_value = model_dataset_description["sample_imported_dataset_value"]

# Define model dataset description and .json directories
# (both names end in '_dir' but hold file paths inside model_dataset_dir)
model_description_dir = join(model_dataset_dir, "model_description.json")
final_model_dir = join(model_dataset_dir, "model.json")

# Model parameters

In [None]:
# WIP categorise_variate: when True the variate is binned into classes and a
# classifier is trained instead of a regressor
categorise_variate = False

# Load model dataset
model_dataset = pd.read_pickle(join(model_dataset_dir, f"{selected_model_dataset}.pkl"))

if categorise_variate:
  # Work on a real copy; plain assignment would only alias the loaded frame
  model_dataset_cat = model_dataset.copy()

  # Define categories (must be in order and start from 0)

  # # Categorise per 50 Mg/ha
  # multiclass = True # More than two classes
  # model_dataset_cat['var_agbd'] = np.where(model_dataset_cat.var_agbd < 50, 0, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 50) & (model_dataset_cat.var_agbd < 100), 1, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 100) & (model_dataset_cat.var_agbd < 150), 2, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 150) & (model_dataset_cat.var_agbd < 200), 3, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 200) & (model_dataset_cat.var_agbd < 250), 4, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 250) & (model_dataset_cat.var_agbd < 300), 5, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 300) & (model_dataset_cat.var_agbd < 350), 6, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat.var_agbd >= 350) & (model_dataset_cat.var_agbd < 400), 7, model_dataset_cat.var_agbd)
  # model_dataset_cat['var_agbd'] = np.where(model_dataset_cat.var_agbd >= 400, 8, model_dataset_cat.var_agbd)

  # # Categorise per 100 Mg/ha
  # multiclass = True # More than two classes
  # model_dataset_cat['var_agbd'] = np.where(model_dataset_cat['var_agbd'] < 100, 0, model_dataset_cat['var_agbd'])
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat['var_agbd'] >= 100) & (model_dataset_cat['var_agbd'] < 200), 1, model_dataset_cat['var_agbd'])
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat['var_agbd'] >= 200) & (model_dataset_cat['var_agbd'] < 300), 2, model_dataset_cat['var_agbd'])
  # model_dataset_cat['var_agbd'] = np.where((model_dataset_cat['var_agbd'] >= 300) & (model_dataset_cat['var_agbd'] < 400), 3, model_dataset_cat['var_agbd'])
  # model_dataset_cat['var_agbd'] = np.where(model_dataset_cat['var_agbd'] >= 400, 4, model_dataset_cat['var_agbd'])

  # Categorise above and below 150 Mg/ha (High Carbon Stock).
  # The column categorised is selected_variate, the same column used as the
  # training target below, so the two cannot drift apart.
  multiclass = False
  model_dataset_cat[selected_variate] = np.where(model_dataset_cat[selected_variate] < 150, 0, model_dataset_cat[selected_variate])
  model_dataset_cat[selected_variate] = np.where(model_dataset_cat[selected_variate] >= 150, 1, model_dataset_cat[selected_variate])

  # Set parameters: classifier, with loss / evaluation metric matching the
  # number of classes; accuracy is maximised
  XGBPredictor = xgb.XGBClassifier
  if multiclass:
      objective = 'multi:softprob'
      eval_metric = 'mlogloss'
  else:
      objective = 'binary:logistic'
      eval_metric = 'logloss'
  metric = 'accuracy'
  optimal_value = 'max'
  model_dataset_cat[selected_variate] = model_dataset_cat[selected_variate].astype("category")
  model_dataset = model_dataset_cat

else:
  # Regression on the continuous variate; RMSE is minimised
  XGBPredictor = xgb.XGBRegressor
  objective = 'reg:squarederror'
  eval_metric = 'rmse'
  metric = 'rmse'
  optimal_value = 'min'

# Define x and y axes for training
model_dataset_x = model_dataset[selected_predictors]
model_dataset_y = model_dataset[selected_variate]

In [None]:
# Select whether to use the weighted validation score for optimisation, mitigating overfitting.
use_score_weighted = True

# Create dictionary for optimal value and define the weighted score
dict_opt = {"min": min, "max": max}
def def_score_weighted(mean_validation, mean_training):
  """Return the validation score penalised by half of the train/validation gap.

  The direction is taken from the notebook-level `optimal_value`:
  - "max" (higher is better): subtract half of how far training exceeds validation.
  - "min" (lower is better): add half of how far validation exceeds training.
  Only a gap in the overfitting direction is penalised; otherwise the
  validation score is returned unchanged.

  Raises:
    ValueError: if `optimal_value` is neither "max" nor "min".
  """
  if optimal_value == "max":
    return mean_validation - 0.5*max(0, mean_training - mean_validation)
  if optimal_value == "min":
    return mean_validation + 0.5*max(0, mean_validation - mean_training)
  # Without this an unexpected value surfaced as an UnboundLocalError
  raise ValueError(f"optimal_value must be 'max' or 'min', got {optimal_value!r}")

In [None]:
# Exclude specified columns
columns_to_exclude = [
                      'pre_sensitivity',
                      # 'pre_latitude',
                      # 'pre_longitude'
                      ]
columns_to_consider = model_dataset_x.columns.drop(columns_to_exclude)

# Calculate max_bin hyperparameter based on predictor with maximum unique values.
# nunique() scans every considered column, so it is computed once and reused.
unique_value_counts = model_dataset_x[columns_to_consider].nunique()
max_unique_col = unique_value_counts.idxmax()
# Plain int rather than a numpy scalar, so the value reads and serialises cleanly
max_bin = int(unique_value_counts.max())
print(f"{max_unique_col} had the highest number of unique values, max_bin set to {max_bin}")

# Alternatively, set manually by uncommenting:
# max_bin = 1600

In [None]:
use_default_hp = False
use_random_seed = True

# Define baseline hyperparameters. Both variants carry the objective and
# evaluation metric chosen in the model-parameters cell, so the two stay
# consistent with each other.
if use_default_hp:
  # Library defaults apart from the settings needed to run at all
  baseline_hyperparameters = {
    "tree_method": "hist",
    "device": 'cuda',
    'enable_categorical': True,
    "objective": objective,
    "eval_metric": eval_metric,
    "max_bin": max_bin,
    "n_estimators": 100_000, # Will be limited by early stopping
    "early_stopping_rounds": 10
  }
else:
  baseline_hyperparameters = {
   'tree_method': 'hist',
   'device': 'cuda',
   'enable_categorical': True,
   'objective': objective,
   'eval_metric': eval_metric,
   'max_bin': max_bin,
   'n_estimators': 100000, # Will be limited by early stopping
   'learning_rate': 0.01,
   'early_stopping_rounds': 10,
   'min_child_weight': 39,
   'gamma': 0.12,
   'alpha': 9800,
   'lambda': 2000,
   'colsample_bytree': 0.82,
   'colsample_bylevel': 0.938,
   'colsample_bynode': 0.85
  }

# 'use_random_seed' fixes the seed (random_state=1) for reproducible runs
if use_random_seed: baseline_hyperparameters["random_state"] = 1

# Avoids issues using dataframe from CPU
xgb.set_config(verbosity=0, use_rmm=True)

# KFold cross-validation

In [None]:
# The k-fold count fixes the train / validation proportions: each fold is
# held out once while the remaining folds are used for training.
# 10 splits = 90 % training; 5 = 80 %; 4 = 75 %; 3 = 67 %; 2 = 50 %.

# Total number of k-folds used to generate train/valid splits.
n_splits = 10

# Folds used for HPO validation; the remaining folds are kept for final testing.
n_hpo_splits = 2

# At least one fold must be left over for final testing
assert n_splits > n_hpo_splits, "n_hpo_splits must be less than n_splits"

In [None]:
# Shuffled, seeded k-fold split; materialised as a list so the same folds can be sliced and reused
kf_split = list(KFold(n_splits=n_splits, shuffle=True, random_state=1).split(model_dataset_x, model_dataset_y))

# Return a dictionary full of metrics for calculating accuracy
def calculate_metrics(validation, prediction):
    """Compute accuracy metrics for one set of observed and predicted values.

    Args:
        validation: observed values. The callers in this notebook pass a pandas Series.
        prediction: predicted values (Series or array) of the same length.

    Returns:
        dict with keys "r2", "me" (mean of observed minus predicted), "rmse"
        and "rrmse" (percent), plus "accuracy" when the notebook-level
        `categorise_variate` flag is True.
    """
    # Convert all data to a unified type for metric calculation
    if not categorise_variate:
        validation = validation.astype("float32")
        prediction = prediction.astype("float32")

    metrics = {
        "r2": r2_score(validation, prediction),
        "me": np.mean(np.array(validation).squeeze() - np.array(prediction).squeeze()),
        # Root taken explicitly: the `squared=False` argument is deprecated in
        # scikit-learn and absent from newer releases. For the 1-D targets
        # used here the value is the same.
        "rmse": np.sqrt(mean_squared_error(validation, prediction)),
        "rrmse": np.sqrt(np.sum(np.square(validation - prediction), axis=0) / np.sum(np.square(prediction), axis=0)) * 100,
    }
    # Classification accuracy only applies when the variate has been categorised
    if categorise_variate:
        metrics["accuracy"] = accuracy_score(validation, prediction)
    return metrics

def run_kfold(model_dataset, kf_split, predictor, verbose=False, export_test_model=False, test_model_dir=None):
    """Train and evaluate one XGBoost model per fold and collect the metrics.

    Args:
        model_dataset: DataFrame containing the `selected_predictors` columns
            and the `selected_variate` column (both notebook-level names).
        kf_split: iterable of (train_index, valid_index) pairs of positional
            row indices, as produced by KFold.split().
        predictor: scikit-learn style XGBoost estimator; its get_params(),
            n_estimators and early_stopping_rounds configure the training.
        verbose: print the selected metric's scores after each fold.
        export_test_model: save each fold's model and the metrics so far.
        test_model_dir: output directory, required when export_test_model is True.

    Returns:
        DataFrame with one row per fold. For every metric returned by
        calculate_metrics there are four columns (training, validation,
        difference, weighted), followed by "n_estimators".

    Raises:
        ValueError: if export_test_model is True but test_model_dir is None.
    """
    if export_test_model and test_model_dir is None:
        raise ValueError("test_model_dir must be provided when export_test_model=True")

    metrics = []
    cols = []
    # Dummy call, used only to discover the metric names for the column layout
    junk_data = pd.Series([1, 1, 1, 1, 1, 1])
    for m in list(calculate_metrics(junk_data, junk_data).keys()):
        cols += [f"score_training ({m})", f"score_validation ({m})", f"score_difference ({m})", f"score_weighted ({m})"]
    cols.append("n_estimators")

    for fold_index, (train_index, valid_index) in enumerate(kf_split, start=1):
        # Compile datasets. KFold yields positional indices, so rows are taken
        # with .iloc; label-based .loc is only equivalent for a 0..n-1 index.
        train_rows = model_dataset.iloc[train_index]
        valid_rows = model_dataset.iloc[valid_index]
        x_train = train_rows[selected_predictors]
        x_valid = valid_rows[selected_predictors]
        y_train = train_rows[selected_variate]
        y_valid = valid_rows[selected_variate]

        # Create DMatrix objects
        dtrain = xgb.DMatrix(x_train, y_train, enable_categorical=True)
        dvalid = xgb.DMatrix(x_valid, y_valid, enable_categorical=True)

        # Set up parameters
        params = predictor.get_params()
        params['eval_metric'] = eval_metric

        # Set up early stopping on the validation set
        early_stopping = xgb.callback.EarlyStopping(
            rounds=predictor.early_stopping_rounds,
            metric_name=eval_metric,
            data_name='validation_1'
        )

        # Train model
        evals_result = {}
        model = xgb.train(params,
                          dtrain,
                          num_boost_round=predictor.n_estimators,
                          evals=[(dtrain, 'training'), (dvalid, 'validation_1')],
                          callbacks=[early_stopping],
                          evals_result=evals_result,
                          verbose_eval=False)

        # Collect metrics
        metrics_row = []
        training_prediction = model.predict(dtrain)
        validation_prediction = model.predict(dvalid)
        training_metrics = calculate_metrics(y_train, training_prediction)
        validation_metrics = calculate_metrics(y_valid, validation_prediction)
        for m in list(training_metrics.keys()):
            score_training = training_metrics[m]
            score_validation = validation_metrics[m]
            score_difference = score_training - score_validation
            # NOTE(review): the weighting direction comes from `optimal_value`,
            # which is set for the selected metric only; confirm it is meant
            # to apply to every metric here.
            score_weighted = def_score_weighted(score_validation, score_training)

            metrics_row += [score_training, score_validation, score_difference, score_weighted]

        # Number of boosting rounds actually evaluated before training stopped
        metrics_row.append(len(evals_result['validation_1'][eval_metric]))  # n_estimators
        metrics.append(metrics_row)

        # Export if enabled (the metrics file holds all folds completed so far)
        if export_test_model:
            makedirs(test_model_dir, exist_ok=True)
            model_name = f"model_test_fold_{fold_index}"
            metrics_dir = join(test_model_dir, f"{model_name}_metrics.csv")
            model.save_model(join(test_model_dir, f"{model_name}.json"))
            df_fold_metrics = pd.DataFrame(metrics, columns=cols)
            df_fold_metrics.to_csv(metrics_dir)
            print(f"Model export complete for test fold {fold_index}.")

        # Reset scores to selected metric
        score_training = training_metrics[metric]
        score_validation = validation_metrics[metric]
        score_difference = score_training - score_validation
        score_weighted = def_score_weighted(score_validation, score_training)

        if verbose:
            print(f"Validation {metric}: {score_validation}")
            print(f"Training {metric}: {score_training}")
            print(f"{metric} difference: {score_difference}")
            print(f"Weighted {metric}: {score_weighted}")
            print(f"--------------------")

    # Place metrics into a DataFrame to make it more readable
    df_kfold_metrics = pd.DataFrame(metrics, columns=cols)

    return df_kfold_metrics

def generate_statistics(df_kfold_metrics):
    """Print a summary of the k-fold scores for the selected metric.

    Reads the ``score_training``, ``score_validation`` and ``score_difference``
    columns for the notebook-level ``metric`` and prints the mean and standard
    deviation of each across folds, followed by the weighted score that
    ``def_score_weighted`` returns for the mean validation and mean training
    scores.

    Args:
        df_kfold_metrics: DataFrame with one row per fold and columns named
            ``"<score name> (<metric>)"``, as built by ``run_kfold``.

    Returns:
        None. The summary is only printed.
    """
    def _mean_and_std(score_name):
        # One lookup per score column; mean and std are taken across folds
        fold_scores = df_kfold_metrics[f"{score_name} ({metric})"]
        return fold_scores.mean(), fold_scores.std()

    mean_training, std_training = _mean_and_std("score_training")
    mean_validation, std_validation = _mean_and_std("score_validation")
    mean_difference, std_difference = _mean_and_std("score_difference")

    # The weighted score is derived from the fold means, not averaged per fold
    score_weighted = def_score_weighted(mean_validation, mean_training)

    # Print statistics
    print("----------Results Summary----------")
    print(f"Training {metric} mean: {mean_training}")
    print(f"Training {metric} std: {std_training}")
    print(f"Validation {metric} mean: {mean_validation}")
    print(f"Validation {metric} std: {std_validation}")
    print(f"{metric} difference mean: {mean_difference}")
    print(f"{metric} difference std: {std_difference}")
    print(f"Weighted {metric} score: {score_weighted}")

# Baseline model accuracy (optional)

In [None]:
# Baseline for HPO folds: score the baseline hyperparameters on the first n_hpo_splits folds
baseline_hpo_predictor = XGBPredictor(**baseline_hyperparameters)
baseline_hpo = run_kfold(
    model_dataset,
    kf_split[:n_hpo_splits],
    baseline_hpo_predictor,
    verbose=True,
)
generate_statistics(baseline_hpo)

# Export baseline_hpo_descr.json: run settings plus the mean and std of every metrics column
baseline_hpo_descr_dir = join(model_dataset_dir, "baseline_hpo_descr.json")
baseline_hpo_descr = dict(
    selected_variate=selected_variate,
    hyperparameters=str(baseline_hyperparameters),
    metric_used_for_training=metric,
)

for col in baseline_hpo.columns:
  col_scores = baseline_hpo[col]
  baseline_hpo_descr.update({
      f"avg_{col}": float(col_scores.mean()),
      f"std_{col}": float(col_scores.std()),
  })
with open(baseline_hpo_descr_dir, "w") as file:
  json.dump(baseline_hpo_descr, file)
print("baseline_hpo_descr.json generation and export complete\n")

In [None]:
# Baseline for final testing folds: score the baseline hyperparameters on the folds after n_hpo_splits
baseline_test_predictor = XGBPredictor(**baseline_hyperparameters)
baseline_test = run_kfold(
    model_dataset,
    kf_split[n_hpo_splits:],
    baseline_test_predictor,
    verbose=True,
)
generate_statistics(baseline_test)

# Export baseline_test_descr.json: run settings plus the mean and std of every metrics column
baseline_test_descr_dir = join(model_dataset_dir, "baseline_test_descr.json")
baseline_test_descr = dict(
    selected_variate=selected_variate,
    hyperparameters=str(baseline_hyperparameters),
    metric_used_for_training=metric,
)

for col in baseline_test.columns:
  col_scores = baseline_test[col]
  baseline_test_descr.update({
      f"avg_{col}": float(col_scores.mean()),
      f"std_{col}": float(col_scores.std()),
  })
with open(baseline_test_descr_dir, "w") as file:
  json.dump(baseline_test_descr, file)
print("baseline_test_descr.json generation and export complete\n")

# Hyperparameter optimisation

## Random search

In [None]:
# Hyperparameters and the ranges they are randomly sampled from.
# scipy's uniform takes (lower, width), so uniform (or loguniform) ranges are
# declared as (lower, upper) and the lower bound is subtracted from the upper.

def _uniform_between(lower, upper):
    """Return a frozen scipy ``uniform`` distribution declared as ``uniform(lower, upper - lower)``."""
    return uniform(lower, upper - lower)

# Key order matters: the random search seeds each sample by its position in this dict.
hp_distribution = dict(
    early_stopping_rounds=randint(10, 100),
    learning_rate=_uniform_between(0.01, 0.3),
    min_child_weight=randint(1, 10),
    gamma=_uniform_between(0, 0.5),
    reg_alpha=_uniform_between(0, 500), # If score does not change with HPs, try reducing reg_alpha
    reg_lambda=_uniform_between(1, 500),
    colsample_bytree=_uniform_between(0.95, 1.0),
    colsample_bylevel=_uniform_between(0.95, 1.0),
    colsample_bynode=_uniform_between(0.95, 1.0),
)

In [None]:
# Random search: repeatedly sample hp_distribution, score each sample with
# k-fold validation on the HPO folds, and persist every trial so an interrupted
# run can resume from the saved results.

# Select how many iterations (total number of trials, including any loaded from disk)
iterations = 1000

hpo_random_search_dir = join(model_dataset_dir, 'hpo_random_search')
Path(hpo_random_search_dir).mkdir(parents=True, exist_ok=True)
# Path stem only: ".pkl" and ".csv" are appended when reading and writing
hpo_random_search_results_filename = join(hpo_random_search_dir, 'iteration_results')
hpo_random_search_results = pd.DataFrame()
# Column of the results table that is used to rank trials
score_col = "score_weighted" if use_score_weighted else "mean_validation"
hpo_random_search_best_hyperparameters_dir = join(hpo_random_search_dir,"best_hyperparameters.csv")

# If a results file from a previous run exists, load it and continue from there
if exists(f"{hpo_random_search_results_filename}.pkl"):
  print("Reading results of the existing trials records...")
  hpo_random_search_results = pd.read_pickle(f"{hpo_random_search_results_filename}.pkl")
  print(f"{len(hpo_random_search_results)} records read.")

# Iteratively train models with sampled hyperparameters, saving the values and accuracy scores
while len(hpo_random_search_results) < iterations:

  # Randomly sample hyperparameter distribution.
  # The seed is derived from the trial number and the hyperparameter's position
  # in hp_distribution, so a given trial number always draws the same sample.
  hp_sample_random_state = max(1,len(hpo_random_search_results)+1)
  hp_sample = {k: v.rvs(random_state=hp_sample_random_state*(i+1000000)) for i, (k, v) in enumerate(hp_distribution.items())}

  # Start regression
  t_start = datetime.now()
  print(f"Trying parameters: {hp_sample}")

  # Sampled values are layered over a copy of the baseline hyperparameters
  rs_hyperparameters = dict(baseline_hyperparameters.copy())
  rs_hyperparameters.update(hp_sample)
  rs_predictor = XGBPredictor(**rs_hyperparameters)

  df_kfold_metrics = run_kfold(model_dataset, kf_split[:n_hpo_splits], rs_predictor, verbose=False)

  # Generate statistics (means across folds for the selected metric)
  mean_training = df_kfold_metrics[f"score_training ({metric})"].mean()
  mean_validation = df_kfold_metrics[f"score_validation ({metric})"].mean()
  mean_difference = df_kfold_metrics[f"score_difference ({metric})"].mean()
  score_weighted = def_score_weighted(mean_validation, mean_training)

  # One results row: each hyperparameter as its own entry, the scores, and the
  # full hyperparameter dict kept under "hyperparameters" for the export below
  interim_rs_results = pd.Series(rs_hyperparameters)
  interim_rs_results.loc["mean_validation"] = mean_validation
  interim_rs_results.loc["mean_training"] = mean_training
  interim_rs_results.loc["mean_difference"] = mean_difference
  interim_rs_results.loc["score_weighted"] = score_weighted
  interim_rs_results.loc["hyperparameters"] = rs_hyperparameters
  print(f"Trial completed in {datetime.now() - t_start}.")
  print(f"Mean validation {metric}: {mean_validation:.4f}, Mean training {metric}: {mean_training:.4f}, Mean {metric} difference: {mean_difference:.4f}, weighted score: {score_weighted:.4f}")

  # Compile results and save after every trial (pickle for resuming, CSV for reading)
  if len(hpo_random_search_results) == 0:
    hpo_random_search_results = pd.DataFrame([interim_rs_results])
  else:
    hpo_random_search_results = pd.concat([hpo_random_search_results, pd.DataFrame([interim_rs_results])], ignore_index=True)
  hpo_random_search_results.to_pickle(f"{hpo_random_search_results_filename}.pkl")
  hpo_random_search_results.to_csv(f"{hpo_random_search_results_filename}.csv")

  # Export best result so far; optimal_value decides whether lower or higher scores are better
  if optimal_value == "min":
    best_result_id = hpo_random_search_results[score_col].idxmin()
  else:
    best_result_id = hpo_random_search_results[score_col].idxmax()
  # Copy the stored dict so adding the score does not modify the results table
  best_rs_hyperparameters = hpo_random_search_results.loc[best_result_id]["hyperparameters"].copy()
  export_rs_hyperparameters = best_rs_hyperparameters
  export_rs_hyperparameters[score_col] = hpo_random_search_results.loc[best_result_id][score_col]
  pd.DataFrame(export_rs_hyperparameters, index=[0]).to_csv(hpo_random_search_best_hyperparameters_dir)

# Reached only after the loop has completed all iterations: unassign the Colab runtime
runtime.unassign()

In [None]:
# Hyperparameter SHAP plots
# Fit a surrogate model that predicts the trial score from the sampled
# hyperparameters, then plot the SHAP values of that surrogate.
hpo_x = hpo_random_search_results[list(hp_distribution.keys())].apply(pd.to_numeric)
hpo_y = hpo_random_search_results[score_col].apply(pd.to_numeric)

hp_predictor = XGBPredictor()
hp_predictor.fit(hpo_x, hpo_y)

explainer = shap.Explainer(hp_predictor)
shap_values = explainer(hpo_x)
shap.plots.beeswarm(shap_values, plot_size=(20,8))

# One scatter plot per hyperparameter, coloured by the hyperparameter's own value
for hp_name in (
    'early_stopping_rounds',
    'learning_rate',
    'min_child_weight',
    'gamma',
    'reg_lambda',
    'reg_alpha',
    'colsample_bytree',
    'colsample_bylevel',
    'colsample_bynode',
):
  hp_shap_values = shap_values[:,hp_name]
  shap.plots.scatter(hp_shap_values, color=hp_shap_values.data)

## Automated Random Search Evaluation

In [None]:
# Define directories and paths used by the ARSE cells
hpo_arse_dir = join(model_dataset_dir, "hpo_arse")
hpo_arse_optimisation_plots_dir = join(hpo_arse_dir, "optimisation_plots")

# Files written inside hpo_arse_dir (the *_dir names below are file paths)
hpo_arse_optimisation_results_dir = join(hpo_arse_dir, "optimisation_results.pkl")
hpo_arse_iteration_results_dir = join(hpo_arse_dir, "iteration_results.pkl")
hpo_arse_best_hyperparameters_dir = join(hpo_arse_dir, "best_hyperparameters.csv")
hpo_arse_cache_shap_dir = join(hpo_arse_dir, "cache_shap.pkl")
hpo_arse_line_graph_dir = join(hpo_arse_dir, "optimisation_results.png")

# Create the output directories (parents included) if they do not exist yet
for arse_output_dir in (hpo_arse_dir, hpo_arse_optimisation_plots_dir):
  Path(arse_output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# In each optimisation, an XGBoost regression model and its SHAP values are used to
# determine the hyperparameter with the largest effect on accuracy. The sampled range
# of that hyperparameter is then narrowed to a more optimal distribution based on the
# direction of the SHAP effect. If the new distribution touches the upper or lower
# bound of the range that was tested, that bound is shifted outwards.
# If the mean score of an optimisation is worse than the previous one, the 'conservative'
# option will retry the former HP range rather than moving to the next round.

# Distribution used in a previous ARSE round, kept for reference:
# hp_distribution = {
#     "early_stopping_rounds": randint(15, 21 + 1),
#     # "learning_rate": uniform(0.008, 0.012 - 0.008),
#     "min_child_weight": randint(26, 39 + 1),
#     # "gamma": uniform(0.125, 0.175 - 0.125),
#     "reg_alpha": uniform(5000, 15000 - 5000),
#     "reg_lambda": uniform(22000, 24000 - 22000),
#     "colsample_bytree": uniform(0.96, 1 - 0.96),
#     "colsample_bylevel": uniform(0.85, 0.92 - 0.85),
#     "colsample_bynode": uniform(0.85, 0.92 - 0.85),
# }

# Starting distributions for this ARSE round.
# randint(lower, upper + 1) includes `upper`; uniform(lower, upper - lower) takes a
# width as its second argument, so the value subtracted must be the same lower bound.
hp_distribution = {
    # "max_bin": randint(1000, 2000 + 1),
    # "early_stopping_rounds": randint(8, 12 + 1),
    # "learning_rate": uniform(0.01, 0.3 - 0.01),
    "min_child_weight": randint(49, 52 + 1),
    "gamma": uniform(0.1, 0.15 - 0.1),
    "reg_alpha": uniform(19000, 22000 - 19000),
    "reg_lambda": uniform(1, 500 - 1),
    # The two widths below previously subtracted 0.7 instead of their own lower bound
    "colsample_bytree": uniform(0.65, 0.72 - 0.65),
    "colsample_bylevel": uniform(0.74, 0.87 - 0.74),
    "colsample_bynode": uniform(0.8, 0.9 - 0.8),
}


# Distribution type and hard limits for each hyperparameter. The limits cap how far
# the ARSE loop may shift a sampled range when it corrects the bounds.
df_hp_types = pd.DataFrame(columns=["type", "min", "max"])
df_hp_types.index.name = "hyperparameter"
df_hp_types.loc["max_bin"]                = ["randint", 1, np.inf]
df_hp_types.loc["early_stopping_rounds"]  = ["randint", 1, np.inf]
df_hp_types.loc["learning_rate"]          = ["uniform", 0.005, 1.0]
df_hp_types.loc["min_child_weight"]       = ["randint", 0, np.inf]
df_hp_types.loc["gamma"]                  = ["uniform", 0.0, np.inf]
df_hp_types.loc["reg_alpha"]              = ["uniform", 0.0, np.inf]
df_hp_types.loc["reg_lambda"]             = ["uniform", 1.0, np.inf]
df_hp_types.loc["colsample_bytree"]       = ["uniform", 0.5, 1.0]
df_hp_types.loc["colsample_bylevel"]      = ["uniform", 0.5, 1.0]
df_hp_types.loc["colsample_bynode"]       = ["uniform", 0.5, 1.0]

# Maps the "type" column of df_hp_types to the scipy constructor used to rebuild a
# distribution from its saved arguments
function_map = {
    "uniform": uniform,
    "randint": randint,
}

In [None]:
# (optional) redefine baseline hyperparameters for new ARSE round
# The L1/L2 regularisation terms use the 'reg_alpha' / 'reg_lambda' names so that
# they share keys with hp_distribution: sampled values then replace these baseline
# values in dict.update() instead of being added next to 'alpha' / 'lambda' aliases.

baseline_hyperparameters = {
 'tree_method': 'hist',
 'device': 'cuda',
 'enable_categorical': True,
 'max_bin': max_bin,
 'n_estimators': 100000, # Will be limited by early stopping
 'learning_rate': 0.01,
 'early_stopping_rounds': 10,
 'min_child_weight': 39,
 'gamma': 0.12,
 'reg_alpha': 9800,
 'reg_lambda': 2000,
 'colsample_bytree': 0.82,
 'colsample_bylevel': 0.938,
 'colsample_bynode': 0.85
}

In [None]:
# ARSE run settings
iterations_per_optimisation = 20
total_optimisations = 5
bounds_correction_percent = 50 # Shifts the HP sample range to this % of this sample range if it is at the upper or lower extremes
conservative = False

assert iterations_per_optimisation != len(hp_distribution), (
    'The number of iterations cannot be the same as the number of hyperparameters sampled (a known issue with shap)'
)

# Filter warning "ntree_limit is deprecated, use `iteration_range` or model slicing instead."
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

# Initialise variables and directories
shap_data = []
if use_score_weighted:
  score_col = "score_weighted"
else:
  score_col = "mean_validation"

# Columns shown in the progress table, followed by columns only kept for resuming
display_cols = [
    f"best score ({metric} {score_col})",
    f"mean score ({metric} {score_col})",
    "most_important_hp",
    "new_hp_min",
    "new_hp_max",
    "time_taken (H:M:S)",
]
cache_cols = ["hp_distribution_old", "hp_distribution_new"]
hpo_arse_results = pd.DataFrame(columns=[*display_cols, *cache_cols]).rename_axis("optimisations")

# One row per trained hyperparameter sample
hpo_arse_iteration_results = pd.DataFrame(columns=[
    "optimisations",
    "iterations",
    "hyperparameters",
    "metric_used",
    "mean_validation",
    "mean_training",
    "mean_difference",
    "score_weighted",
    "time_taken",
    "hp_distribution",
])

# Both counters are 1-based
optimisations, iterations = 1, 1

# Marker written to "most_important_hp" when an optimisation was rolled back.
# With the conservative flag, get_last_valid_row skips those rows to find the latest
# row of hpo_arse_results where the distribution was actually updated.
roll_back_message = "Rolled back to last valid optimisation."
def get_last_valid_row(df):
  """Return the last row of ``df`` whose "most_important_hp" is not the roll-back marker.

  Rows are checked from the bottom up. Returns ``None`` when no such row exists.
  """
  position = len(df)
  while position > 0:
    position -= 1
    candidate = df.iloc[position]
    if candidate["most_important_hp"] != roll_back_message:
      return candidate
  return None

# Visualisation function
def display_visualisations(hpo_arse_results, shap_data):
  """Refresh the ARSE progress display and save its figures.

  Draws a line chart of the best and mean score per optimisation (saved to
  hpo_arse_line_graph_dir), clears the cell output, prints the results table,
  then plots the SHAP scatter and feature-importance bars for the latest entry
  of ``shap_data`` (saved to hpo_arse_optimisation_plots_dir).

  Args:
    hpo_arse_results: optimisation results table; must contain ``display_cols``.
    shap_data: list of ``[most_important_hp, shap_values]`` entries; only the
      last entry is plotted and the list length is used as the plot number.
  """
  # Generate main scoring graph
  score_series = [f"best score ({metric} {score_col})", f"mean score ({metric} {score_col})"]
  arse_results_plot = (
      px.line(
          hpo_arse_results,
          x=hpo_arse_results.index,
          y=score_series,
          title=f"{metric} score per optimisation",
          markers=True,
          width=800,
          height=400,
      )
      .update_xaxes(tick0=1, dtick=1) # It might be useful to specify the range to something like... range=[1,len(df) + 1]
      .update_layout(
          title_x=0.5,
          yaxis_title="score",
          margin=dict(l=0, r=0, t=50, b=0, pad=0),
      )
  )
  arse_results_plot.write_image(hpo_arse_line_graph_dir)

  # Update display
  clear_output(wait=True)
  print(tabulate(hpo_arse_results[display_cols], headers="keys", tablefmt="psql"))
  arse_results_plot.show()

  # Update iterative SHAP plots using the most recent optimisation
  plot_number = len(shap_data)
  most_important_hp, shap_values = shap_data[-1]

  # Mean absolute SHAP value per feature, sorted from most to least important
  mean_abs_shap = [
      np.mean(np.abs(shap_values.values[:, column]))
      for column in range(shap_values.values.shape[1])
  ]
  unsorted_importances = dict(zip(shap_values.feature_names, mean_abs_shap))
  feature_importances = dict(
      sorted(unsorted_importances.items(), key=lambda item: item[1], reverse=True)
  )

  # Setup matplotlib subplots for side-by-side display
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
  fig.suptitle(f"Optimisation #{plot_number}", fontsize=16)

  # First plot: SHAP value against sampled value for the most important hyperparameter
  top_hp_shap = shap_values[:,most_important_hp]
  shap.plots.scatter(top_hp_shap, color=top_hp_shap.data, ax=ax1, show=False)

  # Second plot (not using the default SHAP plot because they don't support passing an axis)
  bar_positions = np.arange(len(feature_importances))
  bar_widths = list(feature_importances.values())

  ax2.barh(bar_positions, bar_widths, color="#FF0051")
  ax2.set_yticks(bar_positions)
  ax2.set_yticklabels(list(feature_importances.keys()))
  ax2.invert_yaxis()
  ax2.set_xlabel("mean(|SHAP value|)")
  for side in ("top", "right"):
    ax2.spines[side].set_visible(False)

  # Dotted guide lines between the bars
  for row in range(len(feature_importances)):
    ax2.axhline(row + 1, color="#888888", lw=0.5, dashes=(1, 5), zorder=-1)
  # Value labels; widths are means of absolute values, so the "+" prefix always applies
  for row, bar_width in enumerate(bar_widths):
    ax2.text(bar_width + 0.001, row, f"+{bar_width:.2f}", color="#FF0051", horizontalalignment="left", verticalalignment="center", fontsize=12)

  # Save and display plots
  fig.tight_layout()
  fig.savefig(join(hpo_arse_optimisation_plots_dir, f"{plot_number}_shap.png"))

  print() # Add a small gap
  plt.show()

# Load cache: resume a previous ARSE run from the files saved by the main loop
if exists(hpo_arse_iteration_results_dir):
  # Load iteration results, set the hyperparameter distribution, and set current process indices
  hpo_arse_iteration_results = pd.read_pickle(hpo_arse_iteration_results_dir)
  last_iteration = hpo_arse_iteration_results.iloc[-1]
  hp_distribution_serialized = last_iteration["hp_distribution"]
  # Rebuild each frozen distribution from its two saved arguments
  hp_distribution = {
      k: function_map[df_hp_types.loc[k]["type"]](v[0], v[1])
      for k, v in hp_distribution_serialized.items()
  }

  # Continue with the iteration after the last saved one, rolling over to the
  # next optimisation when the last one had finished all of its iterations
  optimisations = last_iteration["optimisations"]
  iterations = last_iteration["iterations"] + 1
  if iterations > iterations_per_optimisation:
    iterations = 1
    optimisations += 1

  # Load optimisation results, display visualisations, and update the hyperparameter distribution
  optimisation_cache_available = exists(hpo_arse_optimisation_results_dir) and exists(hpo_arse_cache_shap_dir)
  if optimisation_cache_available: # May not exist if process stops before the first optimisation finishes
    # Load data
    hpo_arse_results = pd.read_pickle(hpo_arse_optimisation_results_dir)
    with open(hpo_arse_cache_shap_dir, "rb") as data:
      shap_data = pickle.load(data)

    # Update hyperparameter distribution to the one produced by the last finished optimisation
    hp_distribution_serialized = hpo_arse_results.iloc[-1]["hp_distribution_new"]
    hp_distribution = {
        k: function_map[df_hp_types.loc[k]["type"]](v[0], v[1])
        for k, v in hp_distribution_serialized.items()
    }

    # Display visualisations
    display_visualisations(hpo_arse_results, shap_data)

# Iteratively train models with sampled hyperparameters, saving the values and metric score.
# Each pass of the outer loop is one optimisation round: sample and score
# `iterations_per_optimisation` hyperparameter sets, fit a surrogate model on the
# results, and use its SHAP values to narrow the range of the most important hyperparameter.
while optimisations <= total_optimisations:
  iterations_progress_label = widgets.Label(value=f"Iteration of current optimisation: {iterations}/{iterations_per_optimisation}")
  display(iterations_progress_label)
  hp_distribution_old = hp_distribution.copy()
  hp_distribution_new = hp_distribution.copy() # Copy old distribution for now, update this value later
  # Store each frozen distribution as its positional args so it can be saved and rebuilt later
  hp_distribution_old_serialized = {k:v.args for k, v in hp_distribution_old.items()}

  while iterations <= iterations_per_optimisation:
    t_start = datetime.now()

    # Sample HP values. The seed combines the running trial count with a random
    # offset (no seeding of `random` is visible in this cell), so samples are not
    # expected to repeat between runs.
    hp_sample_random_state = len(hpo_arse_iteration_results)+1
    hp_sample = {k: v.rvs(random_state=hp_sample_random_state*(i+random.randrange(10000))) for i, (k, v) in enumerate(hp_distribution.items())}
    sampled_parameters = hp_sample
    # Sampled values are layered over a copy of the baseline hyperparameters
    arse_hyperparameters = dict(baseline_hyperparameters.copy())
    arse_hyperparameters.update(sampled_parameters)
    iterations_progress_label.value = f"Iteration of current optimisation: {iterations}/{iterations_per_optimisation}"

    arse_predictor = XGBPredictor(**arse_hyperparameters)
    df_kfold_metrics = run_kfold(model_dataset, kf_split[:n_hpo_splits], arse_predictor)

    # Generate statistics (means across folds for the selected metric)
    mean_training = df_kfold_metrics[f"score_training ({metric})"].mean()
    mean_validation = df_kfold_metrics[f"score_validation ({metric})"].mean()
    mean_difference = df_kfold_metrics[f"score_difference ({metric})"].mean()
    score_weighted = def_score_weighted(mean_validation, mean_training)

    # Save results after every iteration so an interrupted run can resume
    time_taken = str(datetime.now() - t_start)
    i = len(hpo_arse_iteration_results)
    hpo_arse_iteration_results.loc[i + 1] = [optimisations, iterations, arse_hyperparameters, metric, mean_validation, mean_training, mean_difference, score_weighted, time_taken, hp_distribution_old_serialized]
    hpo_arse_iteration_results.to_pickle(hpo_arse_iteration_results_dir)
    hpo_arse_iteration_results.to_csv(f"{hpo_arse_iteration_results_dir[:-4]}.csv")

    # Export best hyperparameters across all iterations so far
    if optimal_value == "min":
      best_result_id = hpo_arse_iteration_results[score_col].idxmin()
    else:
      best_result_id = hpo_arse_iteration_results[score_col].idxmax()

    # Copy the stored dict so adding the score does not modify the results table
    best_arse_hyperparameters = hpo_arse_iteration_results.loc[best_result_id]["hyperparameters"].copy()

    export_arse_hyperparameters = best_arse_hyperparameters
    export_arse_hyperparameters[score_col] = hpo_arse_iteration_results.loc[best_result_id][score_col]
    pd.DataFrame(export_arse_hyperparameters, index=[0]).to_csv(hpo_arse_best_hyperparameters_dir)

    # Update iteration index
    iterations += 1

  iterations = 1

  # Collect results of only this optimisation round
  interim_results = hpo_arse_iteration_results.loc[hpo_arse_iteration_results["optimisations"] == optimisations]
  # Expand the hyperparameter dicts into one column per hyperparameter
  interim_results = pd.concat([interim_results.drop(["hyperparameters"], axis=1), interim_results["hyperparameters"].apply(pd.Series)], axis=1)
  interim_results = interim_results[["time_taken", score_col] + list(hp_distribution.keys())] # Make sure we only collect hyperparameters being optimised

  # Generate SHAP values from a surrogate model that predicts the score from the sampled hyperparameters
  hpo_x = interim_results[list(hp_distribution.keys())]
  hpo_x = hpo_x.apply(pd.to_numeric)
  hpo_y = interim_results[score_col]
  hpo_y = hpo_y.apply(pd.to_numeric)
  hp_predictor = XGBPredictor()
  hp_predictor.fit(hpo_x, hpo_y)
  explainer = shap.TreeExplainer(hp_predictor)
  shap_values = explainer(hpo_x)

  # Calculate the feature importance (mean absolute shap value) for each feature, then select the most important
  importances = []
  for i in range(shap_values.values.shape[1]):
    importances.append(np.mean(np.abs(shap_values.values[:, i])))
  feature_importances = {fea: imp for imp, fea in zip(importances, hpo_x.columns)}
  feature_importances = {k: v for k, v in sorted(feature_importances.items(), key=lambda item: item[1], reverse = True)}

  most_important_hp = max(feature_importances, key=feature_importances.get)
  # Fall back to the transposed indexing order if column-wise indexing by name fails.
  # Only ordinary exceptions are caught, so KeyboardInterrupt still stops the run.
  try:
    most_important_hp_shap = shap_values[:,most_important_hp]
  except Exception:
    most_important_hp_shap = shap_values[most_important_hp,:]

  # Break loop if most important feature has no effect (optimisation complete)
  if len((most_important_hp_shap.values!=0).nonzero()[0]) == 0:
    print("Optimisation complete")
    break

  # Calculate 25th, 50th and 75th percentiles for the sampled range of the most important HP
  hp_p25, hp_p50, hp_p75 = np.percentile(hpo_x[most_important_hp], [25,50,75])

  # Create inclusive masks for quartiles (values on a boundary belong to both neighbours)
  hp_quartile1_mask = most_important_hp_shap.data <= hp_p25
  hp_quartile2_mask = (most_important_hp_shap.data <= hp_p50) & (most_important_hp_shap.data >= hp_p25)
  hp_quartile3_mask = (most_important_hp_shap.data <= hp_p75) & (most_important_hp_shap.data >= hp_p50)
  hp_quartile4_mask = most_important_hp_shap.data >= hp_p75

  # Calculate mean SHAP values for each quartile
  shap_quartile1_mean = np.average(most_important_hp_shap.values[hp_quartile1_mask])
  shap_quartile2_mean = np.average(most_important_hp_shap.values[hp_quartile2_mask])
  shap_quartile3_mean = np.average(most_important_hp_shap.values[hp_quartile3_mask])
  shap_quartile4_mean = np.average(most_important_hp_shap.values[hp_quartile4_mask])

  shap_quartile_means = np.array([shap_quartile1_mean,shap_quartile2_mean,shap_quartile3_mean,shap_quartile4_mean])

  # Globals - check if any optimal HP value is the max or min of tested values
  upper_limit_optimal = False
  lower_limit_optimal = False

  # Select the quartile whose mean SHAP value is best for the score direction
  # ('min' or 'max' optimal value). If two quartile means are exactly equal, the
  # later quartile's values are used, while a limit flag set by quartile 1 stays set.
  if optimal_value == "min":
    min_shap_quartile_mean = np.min(shap_quartile_means)
    if min_shap_quartile_mean == shap_quartile1_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile1_mask]
      lower_limit_optimal = True
    if min_shap_quartile_mean == shap_quartile2_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile2_mask]
    if min_shap_quartile_mean == shap_quartile3_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile3_mask]
    if min_shap_quartile_mean == shap_quartile4_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile4_mask]
      upper_limit_optimal = True
  else:
    max_shap_quartile_mean = np.max(shap_quartile_means)
    if max_shap_quartile_mean == shap_quartile1_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile1_mask]
      lower_limit_optimal = True
    if max_shap_quartile_mean == shap_quartile2_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile2_mask]
    if max_shap_quartile_mean == shap_quartile3_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile3_mask]
    if max_shap_quartile_mean == shap_quartile4_mean:
      optimal_hp_values = most_important_hp_shap.data[hp_quartile4_mask]
      upper_limit_optimal = True

  # Get new min and max optimal values (rounded outwards/truncated to whole numbers for randint)
  if df_hp_types.loc[most_important_hp]['type'] == "randint":
    new_hp_max = np.ceil(np.max(optimal_hp_values))
    new_hp_min = np.trunc(np.min(optimal_hp_values))
  else:
    new_hp_max = np.max(optimal_hp_values)
    new_hp_min = np.min(optimal_hp_values)

  # Calculate range_correction percentage of tested range of optimal HP
  range_correction = bounds_correction_percent * np.ptp(most_important_hp_shap.data) / 100

  # Adjust to range_correction if necessary, never beyond the limits in df_hp_types
  if df_hp_types.loc[most_important_hp]['type'] == "randint":
    if upper_limit_optimal:
      new_hp_max = np.ceil(min(new_hp_max + range_correction, df_hp_types.loc[most_important_hp]['max']))
    if lower_limit_optimal:
      new_hp_min = np.trunc(max(new_hp_min - range_correction, df_hp_types.loc[most_important_hp]['min']))
  else:
    if upper_limit_optimal:
      new_hp_max = min(new_hp_max + range_correction, df_hp_types.loc[most_important_hp]['max'])
    if lower_limit_optimal:
      new_hp_min = max(new_hp_min - range_correction, df_hp_types.loc[most_important_hp]['min'])

  # Update and export SHAP data
  shap_data.append([most_important_hp, shap_values])
  with open(hpo_arse_cache_shap_dir, "wb") as f:
    pickle.dump(shap_data, f)

  # Update and export optimisation results
  if optimal_value == "min":
    best_result_id = interim_results[score_col].idxmin()
  else:
    best_result_id = interim_results[score_col].idxmax()

  # Total time for this optimisation in whole seconds. The time_taken values are
  # the str(timedelta) strings saved per iteration; this format expects them to
  # include a fractional-seconds part.
  i = len(hpo_arse_results)
  time_taken = 0
  for index, value in interim_results["time_taken"].items():
    pt = datetime.strptime(value, "%H:%M:%S.%f")
    time_taken += pt.second + pt.minute*60 + pt.hour*3600
  time_taken = str(timedelta(seconds=time_taken))

  roll_back_hp = False
  if conservative and optimisations > 1:
    avg_score_col = [col for col in hpo_arse_results.columns if col.startswith("mean score")]
    current_mean = interim_results[score_col].mean()
    # Compare mean to the last valid optimisation, not necessarily the previous optimisation.
    # .iloc[0] takes the first matching column by position; plain [0] on a label-indexed
    # Series relies on a deprecated positional fallback.
    previous_mean = get_last_valid_row(hpo_arse_results)[avg_score_col].iloc[0]
    if (optimal_value == "min" and current_mean > previous_mean) or (optimal_value == "max" and current_mean < previous_mean):
      roll_back_hp = True

  if roll_back_hp:
    # Retry the distribution that the last valid optimisation sampled from
    hp_distribution_new_serialized = get_last_valid_row(hpo_arse_results)["hp_distribution_old"]
    hp_distribution_new = {k:function_map[df_hp_types.loc[k]["type"]](v[0], v[1]) for k, v in hp_distribution_new_serialized.items()}
    most_important_hp = roll_back_message
    new_hp_min = np.nan
    new_hp_max = np.nan
  else:
    # Narrow only the most important hyperparameter; the others keep their distribution
    if df_hp_types.loc[most_important_hp]['type'] == "randint":
      hp_distribution_new[most_important_hp] = randint(new_hp_min, new_hp_max + 1)
    else:
      hp_distribution_new[most_important_hp] = uniform(new_hp_min, new_hp_max - new_hp_min)

  hp_distribution = hp_distribution_new.copy()
  hp_distribution_new_serialized = {k:v.args for k, v in hp_distribution_new.items()}

  hpo_arse_results.loc[i + 1] = [interim_results.loc[best_result_id][score_col],
                                interim_results[score_col].mean(), most_important_hp, new_hp_min, new_hp_max, time_taken, hp_distribution_old_serialized, hp_distribution_new_serialized]
  hpo_arse_results.to_pickle(hpo_arse_optimisation_results_dir)
  hpo_arse_results.to_csv(f"{hpo_arse_optimisation_results_dir[:-4]}.csv") # Human readable

  # Update visualisations
  display_visualisations(hpo_arse_results, shap_data)

  # Update optimisations index
  optimisations += 1

### View plots for all results

In [None]:
# View plots for all ARSE results

# Can change to a previous ARSE round
# hpo_arse_dir = join(model_dataset_dir, "hpo_arse_r1")

# Results
hpo_arse_dir = join(model_dataset_dir, "hpo_arse")
hpo_arse_iteration_results_dir = join(hpo_arse_dir, "iteration_results.pkl")
hpo_arse_iteration_results = pd.read_pickle(hpo_arse_iteration_results_dir)

# Hyperparameters optimised in ARSE (names missing from the results are ignored by filter)
arse_hp_names = [
    'max_bin',
    'early_stopping_rounds',
    'learning_rate',
    'max_depth',
    'min_child_weight',
    'gamma',
    'reg_alpha',
    'reg_lambda',
    'colsample_bytree',
    'colsample_bylevel',
    'colsample_bynode',
]
arse_hp_table = pd.json_normalize(hpo_arse_iteration_results['hyperparameters'])
arse_hpo_x = arse_hp_table.filter(arse_hp_names)

# Score optimised in ARSE
arse_hpo_y = hpo_arse_iteration_results.filter(['score_weighted'])

# Filter warning "ntree_limit is deprecated, use `iteration_range` or model slicing instead."
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

# Predictor for SHAP interpretation: models the score as a function of the hyperparameters
arse_hpo_predictor = XGBPredictor()
arse_hpo_predictor.fit(arse_hpo_x, arse_hpo_y)

# SHAP explainer
arse_hpo_explainer = shap.Explainer(arse_hpo_predictor)
arse_hpo_shap_values = arse_hpo_explainer(arse_hpo_x)
shap.plots.beeswarm(arse_hpo_shap_values, plot_size=(20,8))

In [None]:
# Individual plots for hyperparameters
# One SHAP scatter plot per listed hyperparameter, coloured by the hyperparameter's
# own value. Comment entries in or out to choose what is plotted.
arse_hpo_scatter_names = [
    # 'max_bin',
    # 'early_stopping_rounds',
    # 'learning_rate',
    'min_child_weight',
    'gamma',
    'reg_alpha',
    'reg_lambda',
    'colsample_bytree',
    'colsample_bylevel',
    'colsample_bynode',
]

for hp_name in arse_hpo_scatter_names:
  hp_shap_values = arse_hpo_shap_values[:, hp_name]
  shap.plots.scatter(hp_shap_values, color=hp_shap_values.data)

# Final model

## Define and test

In [None]:
# (optional) redefine baseline hyperparameters for final model
# Key order is kept as-is because the dict is later serialised with str().
baseline_hyperparameters = {
    # Tree construction and hardware
    "tree_method": "hist",
    "device": "cuda",
    "enable_categorical": True,
    "max_bin": max_bin,  # Reuses the max_bin value already defined in the notebook
    # Boosting schedule
    "n_estimators": 100000,  # Will be limited by early stopping
    "learning_rate": 0.01,
    "early_stopping_rounds": 10,
    # Regularisation
    "min_child_weight": 52,
    "gamma": 0.125,
    "alpha": 22000,
    "lambda": 10,
    # Column subsampling
    "colsample_bytree": 0.667,
    "colsample_bylevel": 0.86,
    "colsample_bynode": 0.9,
}

In [None]:
# Test the chosen hyperparameters on the k-fold splits that were not used for HPO,
# then export the settings and summary metrics to model_description.json.
hpo_method = "hpo_arse" # Options: "hpo_arse", "hpo_random_search"
use_best_hyperparameters = False # Set to false to use baseline

# Test model (baseline hyperparameters unless overridden below)
test_model_dir = join(model_dataset_dir, "model_test")
test_hyperparameters = baseline_hyperparameters
predictor = XGBPredictor(**test_hyperparameters)

# If optimised hyperparameters exist, use them
if use_best_hyperparameters:
  best_hp_path = join(model_dataset_dir, hpo_method, "best_hyperparameters.csv")
  best_hp_record = pd.read_csv(best_hp_path, index_col=0).to_dict(orient="records")[0]
  # Remove scoring metric
  test_hyperparameters = {name: value for name, value in best_hp_record.items() if "score_" not in name}
  predictor = XGBPredictor(**test_hyperparameters)

# Refuse to overwrite an existing export
assert not (exists(model_description_dir) or exists(final_model_dir)), 'Remove both "model_description.json" and "model.json", before exporting a new model'

# Run model
df_kfold_metrics = run_kfold(
    model_dataset,
    kf_split[n_hpo_splits:],
    predictor,
    verbose=True,
    export_test_model=True,
    test_model_dir=test_model_dir,
)

generate_statistics(df_kfold_metrics)

model_description = {
    "metric_used_for_training": metric,
    "optimal_value": optimal_value,
    "use_score_weighted": use_score_weighted,
    "n_splits": n_splits, # Number of k-fold splits
    "splits_hpo": n_hpo_splits, # Number of k-fold splits set aside for HPO, the remaining are used for final testing.
    "hyperparameters": str(test_hyperparameters)
}

# Mean and standard deviation of every k-fold metric column
for metric_name, metric_values in df_kfold_metrics.items():
  model_description[f"{metric_name} mean"] = float(metric_values.mean())
  model_description[f"{metric_name} std"] = float(metric_values.std())

# Export model_description.json
with open(model_description_dir, "w") as description_file:
  json.dump(model_description, description_file)
print("model_description.json generation and export complete.")

## Descriptive plots

In [None]:
# Descriptive plots
#
# Retrains a model on the last k-fold split using the exported hyperparameters,
# then plots (1) feature importances and (2) a histogram comparing predicted and
# actual values on that split's validation rows.

# Assert model description exists
assert exists(model_description_dir), "\"model_description.json\" must exist to continue, run previous cells"

# Reload hyperparameters
with open(model_description_dir) as model_description_json:
  model_description = json.load(model_description_json)
# NOTE(review): eval() executes whatever expression is stored in the file. It is kept
# because the hyperparameters were written with str(dict); only load a
# model_description.json that this notebook produced.
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
if "early_stopping_rounds" in final_hyperparameters:
  final_hyperparameters = {k:v for k, v in final_hyperparameters.items() if k != "early_stopping_rounds"}
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Select the last k-split
train_index_plots, valid_index_plots = kf_split[n_splits-1]
x_train_plots = model_dataset.loc[train_index_plots][selected_predictors]
x_valid_plots = model_dataset.loc[valid_index_plots][selected_predictors]
y_train_plots = model_dataset.loc[train_index_plots][selected_variate]
y_valid_plots = model_dataset.loc[valid_index_plots][selected_variate]

# Create DMatrix objects
dtrain_plots = xgb.DMatrix(x_train_plots, y_train_plots, enable_categorical=True)
dvalid_plots = xgb.DMatrix(x_valid_plots, y_valid_plots, enable_categorical=True)

# Set up parameters
predictor = XGBPredictor(**final_hyperparameters)
params_plots = predictor.get_params()
params_plots['eval_metric'] = eval_metric
# Default fix for new XGBoost version.
# Remove keys that should not be forwarded to xgb.train(): the number of rounds is
# passed as num_boost_round and categorical support is set on the DMatrix objects.
# The keys must be removed from params_plots itself, the dict given to xgb.train().
for key in ['n_estimators', 'enable_categorical', 'missing']:
  params_plots.pop(key, None)

# Train model
evals_result = {}
model_plots = xgb.train(params_plots,
                        dtrain_plots,
                        num_boost_round=final_hyperparameters['n_estimators'],
                        evals=[(dtrain_plots, 'training'), (dvalid_plots, 'validation_1')],
                        evals_result=evals_result,
                        verbose_eval=False)

# Feature importances ('weight' importance type, sorted from highest to lowest)
feature_importances = model_plots.get_score(importance_type='weight')
sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)
features, importances = zip(*sorted_importances)

plt.figure(figsize=(10, 30))
plt.barh(features, importances, align='center')
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

# Prediction versus test data plot
validation_prediction = model_plots.predict(dvalid_plots)
validation_prediction_series = pd.Series(validation_prediction, index=y_valid_plots.index, name=y_valid_plots.name)

plt.figure(figsize=(12, 7))
# Shared bins spanning the actual value range so both histograms are comparable
bins = np.linspace(y_valid_plots.min(), y_valid_plots.max(), 100)
y_valid_plots.hist(bins=bins, label=f'{y_valid_plots.name} (actual)', alpha=0.6)
validation_prediction_series.hist(bins=bins, label=f'{y_valid_plots.name} (predicted)', alpha=0.6)
plt.legend()
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Prediction vs. Actual Data')
plt.show()

## Train and export with full dataset

In [None]:
# Generate final model
# Trains on the full dataset with the exported hyperparameters and saves the
# resulting booster to final_model_dir.

# Assert exists
assert exists(model_description_dir), "\"model_description.json\" must exist to continue, run previous cells"

# Reload hyperparameters
model_description = json.loads(Path(model_description_dir).read_text())
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
if "early_stopping_rounds" in final_hyperparameters:
  del final_hyperparameters["early_stopping_rounds"]
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Create DMatrix objects
dtrain_final = xgb.DMatrix(model_dataset_x, model_dataset_y, enable_categorical=True)

# Train model on full dataset
predictor = XGBPredictor(**final_hyperparameters)
params_final = predictor.get_params()
params_final['eval_metric'] = eval_metric
# Default fix for new XGBoost version
for dropped_key in ('n_estimators', 'enable_categorical', 'missing'):
  params_final.pop(dropped_key, None)

n_boost_rounds_final = final_hyperparameters['n_estimators']
model_final = xgb.train(
    params_final,
    dtrain_final,
    num_boost_round=n_boost_rounds_final,
    verbose_eval=True,
)

# Export model
model_final.save_model(final_model_dir)
print("Model training and 'model.json' export complete.")

# Model interpretation

In [None]:
# Model interpretation: train a model on a (sampled) copy of the dataset and
# export its SHAP values to "model_shap.pkl".

# Set sampling criteria. Set to 100 % for all data.
sample_model_dataset_by_percent = True  # If False then by number
sample_model_dataset_value = 100  # Set to a percentage or number, or 'None' if not sampling

# Assert model exists
assert exists(model_description_dir) and exists(final_model_dir), "\"model_description.json\" and \"model.json\" must exist to continue, run the 'Final model' section."

# Sample dataset for SHAP evaluation
if sample_model_dataset_value is None:
  # Not sampling: keep every row (equivalent to sampling 100 %)
  model_dataset_shap = model_dataset.sample(frac=1, random_state=1).reset_index(drop=True)
elif sample_model_dataset_by_percent:
  model_dataset_shap = model_dataset.sample(frac=sample_model_dataset_value/100, random_state=1).reset_index(drop=True)
else:
  model_dataset_shap = model_dataset.sample(n=sample_model_dataset_value, random_state=1).reset_index(drop=True)

# Split sample into x and y
model_dataset_shap_x = model_dataset_shap[selected_predictors]
model_dataset_shap_y = model_dataset_shap[selected_variate]

# Split into training and validation sets (the validation set is passed to
# xgb.train() as an evaluation set below)
kf_split_shap = list(KFold(n_splits=n_splits, shuffle=True, random_state=1).split(model_dataset_shap_x, model_dataset_shap_y))

# Run on the last split
train_index_shap, valid_index_shap = kf_split_shap[n_splits-1]
# The index was reset above, so the positional KFold indices are valid .loc labels
x_train_shap = model_dataset_shap.loc[train_index_shap][selected_predictors]
x_valid_shap = model_dataset_shap.loc[valid_index_shap][selected_predictors]
y_train_shap = model_dataset_shap.loc[train_index_shap][selected_variate]
y_valid_shap = model_dataset_shap.loc[valid_index_shap][selected_variate]

# Create DMatrix objects
dtrain_shap = xgb.DMatrix(x_train_shap, y_train_shap, enable_categorical=True, feature_names=selected_predictors)
dvalid_shap = xgb.DMatrix(x_valid_shap, y_valid_shap, enable_categorical=True, feature_names=selected_predictors)

# Reload hyperparameters
with open(model_description_dir) as model_description_json:
  model_description = json.load(model_description_json)
# NOTE(review): eval() executes whatever expression is stored in the file. It is kept
# because the hyperparameters were written with str(dict); only load a
# model_description.json that this notebook produced.
final_hyperparameters = eval(model_description["hyperparameters"])

# Remove early stopping and replace with mean n_estimators
if "early_stopping_rounds" in final_hyperparameters:
  final_hyperparameters = {k:v for k, v in final_hyperparameters.items() if k != "early_stopping_rounds"}
  final_hyperparameters["n_estimators"] = round(model_description["n_estimators mean"])

# Define model
predictor = XGBPredictor(**final_hyperparameters)
params_shap = predictor.get_params()
params_shap['eval_metric'] = eval_metric
# Default fix for new XGBoost version.
# Remove keys that should not be forwarded to xgb.train(): the number of rounds is
# passed as num_boost_round and categorical support is set on the DMatrix objects.
# The keys must be removed from params_shap itself, the dict given to xgb.train().
for key in ['n_estimators', 'enable_categorical', 'missing']:
  params_shap.pop(key, None)

# Train model
evals_result = {}
model_shap = xgb.train(params_shap,
                        dtrain_shap,
                        num_boost_round=final_hyperparameters['n_estimators'],
                        evals=[(dtrain_shap, 'training'), (dvalid_shap, 'validation_1')],
                        evals_result=evals_result,
                        verbose_eval=False)

# Create explainer and SHAP values
explainer = shap.TreeExplainer(model_shap)

# Prepare categorical data for SHAP analysis (categories replaced by their integer codes)
model_dataset_shap_x_for_shap = model_dataset_shap_x.copy()
for col in model_dataset_shap_x_for_shap.select_dtypes(include=['category']).columns:
  model_dataset_shap_x_for_shap[col] = model_dataset_shap_x_for_shap[col].cat.codes

# Convert features to DMatrix
dmatrix_for_shap = xgb.DMatrix(model_dataset_shap_x_for_shap, enable_categorical=True, feature_names=selected_predictors)

# Generate SHAP values
shap_values = explainer(dmatrix_for_shap, check_additivity=False) # Disabling additivity check due to floating point precision issues

# Export SHAP values
shap_dir = join(model_dataset_dir, "model_shap.pkl")
pd.to_pickle(shap_values, shap_dir)
print("SHAP generation and export complete\n")

In [None]:
# Load SHAP values
# Reads the pickle written to "model_shap.pkl" inside model_dataset_dir.
shap_dir = join(model_dataset_dir, "model_shap.pkl")

with Path(shap_dir).open("rb") as shap_file:
  shap_values = pickle.load(shap_file)

# Summary SHAP plots with feature names
shap_values.feature_names = selected_predictors
shap_max_display = 200  # Upper limit on the number of features drawn in each plot
shap.plots.beeswarm(shap_values, max_display=shap_max_display)
# Bar plot of the first entry (index 0) of the SHAP values
shap.plots.bar(shap_values[0], max_display=shap_max_display)

In [None]:
# Print one ready-to-paste SHAP scatter call per selected predictor
scatter_call_template = "shap.plots.scatter(shap_values[:, '{}'])"
for predictor_name in selected_predictors:
  print(scatter_call_template.format(predictor_name))

In [None]:
# SHAP scatter plot for a single predictor.
# Replace this line with any of the calls printed by the previous cell to plot another predictor.
shap.plots.scatter(shap_values[:, 'pre_topo_cor_unsmooth_elevation'])


# Disconnect runtime

In [None]:
# Useful for stopping background execution
# Calls unassign() on the `runtime` object imported from google.colab.
# NOTE(review): presumably this releases the Colab runtime, so any in-memory state
# that has not been exported is lost; confirm before running.
runtime.unassign()