# Final Report Prep Notebook

Note: This notebook only works in Deepnote, as it contains Deepnote-specific widgets and functions required for the final report, which is hosted as a Deepnote app.

## Import Libraries

In [2]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import geopandas as gpd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import shap

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

  from .autonotebook import tqdm as notebook_tqdm
Running on Deepnote with Env and S3 integrations, skipping dotenv


In [3]:
# Import Modeling Libraries
import lightgbm as lgb
import xgboost as xgb

## Import Environment

In [4]:
# Ask the env helper which environment we are in and which variables it provides
deepnote, env_vars = ef.load_env_vars()

# Publish every returned variable as a notebook-level global
globals().update(env_vars.items())

# Outside Deepnote the AWS credentials come from the environment file; bundle
#   them into one dict that is splatted into every s3 helper call.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

Running on Deepnote with Env and S3 integrations, skipping dotenv


## Setup Notebook Configurations

In [5]:
import warnings

# Pandas behaviour and display options
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence the warning categories that clutter the notebook output
for ignored_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=ignored_category)

# Plotly map figures read the MapBox token from the environment
px.set_mapbox_access_token(os.environ.get("MAPBOX_TOKEN"))

## Import Data

### Import GCB and MEOW

In [6]:
# Load the bleaching database (gcb) and the MEOW ecoregion shapes (meow):
#   from S3 when running outside Deepnote, from the mounted /work volume otherwise.
if not deepnote:
    gcb_buffer = sf.load_from_s3(file_path="data/Global_Coral_Bleaching_DB/gcb_v4.parquet", **aws_env_vars)
    gcb = pd.read_parquet(gcb_buffer)
    meow_buffer = sf.load_from_s3(file_path="data/MEOW/meow_ecos.zip", **aws_env_vars)
    meow = gpd.read_file(meow_buffer)
else:
    gcb = pd.read_parquet("/work/data/Global_Coral_Bleaching_DB/gcb_v4.parquet")
    meow = gpd.read_file("/work/data/MEOW/meow_ecos.shp")

In [7]:
# MEOW provinces that make up each study region
SEAA_PROVINCE = [
    'Western Coral Triangle',
    'Eastern Coral Triangle',
    'Sunda Shelf',
    'Tropical Southwestern Pacific',
    'Northeast Australian Shelf',
    'Sahul Shelf',
    'South China Sea',
    'Andaman',
    'Java Transitional',
]
CARB_PROVINCE = ['Tropical Northwestern Atlantic']

# Keep only the MEOW rows whose province belongs to each region
seaa_prov = meow.loc[meow['PROVINCE'].isin(SEAA_PROVINCE)]
carb_prov = meow.loc[meow['PROVINCE'].isin(CARB_PROVINCE)]

# GeoJSON-style mappings of the region geometries (used for the map overlays)
seaa_prov_geo = seaa_prov.geometry.__geo_interface__
carb_prov_geo = carb_prov.geometry.__geo_interface__

### Import Regional Data Splits

In [8]:
# Load the train / validation / holdout splits for each region
CARB_Xt, CARB_Xv, CARB_Xh, CARB_yt, CARB_yv, CARB_yh = cf.import_data(location_name='CARB')
SEAA_Xt, SEAA_Xv, SEAA_Xh, SEAA_yt, SEAA_yv, SEAA_yh = cf.import_data(location_name='SEAA')
GLOB_Xt, GLOB_Xv, GLOB_Xh, GLOB_yt, GLOB_yv, GLOB_yh = cf.import_data(location_name='GLOB')


def _unpickle_project_file(relative_path):
    """Unpickle a project artifact from local disk (Deepnote) or from S3.

    On Deepnote the artifact is read from ``/work/<relative_path>``; otherwise
    the same relative path is fetched through ``sf.load_from_s3``.
    """
    if deepnote:
        with open(f'/work/{relative_path}', 'rb') as handle:
            return pickle.load(handle)
    return pickle.load(sf.load_from_s3(file_path=relative_path, **aws_env_vars))


# Feature lists chosen for each regional model
CARB_feat_list = _unpickle_project_file('data/Feature_Selection/CARB_LGBM_feat_list.pkl')
SEAA_feat_list = _unpickle_project_file('data/Feature_Selection/SEAA_XGB_feat_list.pkl')
GLOB_feat_list = _unpickle_project_file('data/Feature_Selection/GLOB_LGBM_feat_list.pkl')

# Pickled model objects (LightGBM for CARB and GLOB, XGBoost for SEAA)
CARB_model = _unpickle_project_file('models/lightgbm_reg/CARB/20240413_195656_model.pkl')
SEAA_model = _unpickle_project_file('models/xgboost_reg/SEAA/20240413_191526_model.pkl')
GLOB_model = _unpickle_project_file('models/lightgbm_reg/GLOB/20240414_113645_model.pkl')

# Keep only the columns named in each region's feature list
CARB_Xtt, CARB_Xvt, CARB_Xht = [split[CARB_feat_list] for split in (CARB_Xt, CARB_Xv, CARB_Xh)]
SEAA_Xtt, SEAA_Xvt, SEAA_Xht = [split[SEAA_feat_list] for split in (SEAA_Xt, SEAA_Xv, SEAA_Xh)]
GLOB_Xtt, GLOB_Xvt, GLOB_Xht = [split[GLOB_feat_list] for split in (GLOB_Xt, GLOB_Xv, GLOB_Xh)]

## Create Predictions

### Predict on Validation and Holdout Data

In [9]:
# Validation predictions, clipped to the 0-100 range
CARB_val_preds = np.clip(CARB_model.predict(CARB_Xvt), 0, 100)
SEAA_val_preds = np.clip(SEAA_model.predict(SEAA_Xvt), 0, 100)
GLOB_val_preds = np.clip(GLOB_model.predict(GLOB_Xvt), 0, 100)

# Holdout predictions, clipped to the 0-100 range
CARB_holdout_preds = np.clip(CARB_model.predict(CARB_Xht), 0, 100)
SEAA_holdout_preds = np.clip(SEAA_model.predict(SEAA_Xht), 0, 100)
GLOB_holdout_preds = np.clip(GLOB_model.predict(GLOB_Xht), 0, 100)

### Create Validation Predictions DataFrame

In [10]:
def _validation_frame(features, target, preds, region):
    """Build one region's validation prediction frame joined onto ``gcb``.

    Returns a ``(frame, shared_cols)`` pair: ``frame`` is ``gcb`` inner-joined
    (on the index) with the features, ``y_val``, ``prediction``, ``difference``,
    ``abs_difference`` and ``Region`` columns; ``shared_cols`` are the column
    names that already exist in ``gcb`` and were dropped before the join.
    """
    frame = pd.concat([features, target], axis=1)
    frame['prediction'] = preds.tolist()
    frame['difference'] = frame['prediction'] - frame['y_val']
    frame['abs_difference'] = frame['difference'].abs()
    frame['Region'] = region
    shared_cols = gcb.columns.intersection(frame.columns)
    return gcb.join(frame.drop(columns=shared_cols), how='inner'), shared_cols


# One validation frame per region
CARB_val_df, CARB_common_cols = _validation_frame(CARB_Xvt, CARB_yv, CARB_val_preds, 'CARB')
SEAA_val_df, SEAA_common_cols = _validation_frame(SEAA_Xvt, SEAA_yv, SEAA_val_preds, 'SEAA')
GLOB_val_df, GLOB_common_cols = _validation_frame(GLOB_Xvt, GLOB_yv, GLOB_val_preds, 'GLOB')

# Stack the regions, unify the target column name and tag the split
val_df = (
    pd.concat([CARB_val_df, SEAA_val_df, GLOB_val_df], axis=0)
    .rename(columns={'y_val': 'y'})
    .assign(Split='Validation')
)

### Create Holdout Predictions DataFrame

In [11]:
def _holdout_frame(features, target, preds, region):
    """Build one region's holdout prediction frame joined onto ``gcb``.

    Returns a ``(frame, shared_cols)`` pair: ``frame`` is ``gcb`` inner-joined
    (on the index) with the features, ``y_holdout``, ``prediction``,
    ``difference``, ``abs_difference`` and ``Region`` columns; ``shared_cols``
    are the column names that already exist in ``gcb`` and were dropped before
    the join.
    """
    frame = pd.concat([features, target], axis=1)
    frame['prediction'] = preds.tolist()
    frame['difference'] = frame['prediction'] - frame['y_holdout']
    frame['abs_difference'] = frame['difference'].abs()
    frame['Region'] = region
    shared_cols = gcb.columns.intersection(frame.columns)
    return gcb.join(frame.drop(columns=shared_cols), how='inner'), shared_cols


# One holdout frame per region
CARB_holdout_df, CARB_common_cols = _holdout_frame(CARB_Xht, CARB_yh, CARB_holdout_preds, 'CARB')
SEAA_holdout_df, SEAA_common_cols = _holdout_frame(SEAA_Xht, SEAA_yh, SEAA_holdout_preds, 'SEAA')
GLOB_holdout_df, GLOB_common_cols = _holdout_frame(GLOB_Xht, GLOB_yh, GLOB_holdout_preds, 'GLOB')

# Stack the regions, unify the target column name and tag the split
holdout_df = (
    pd.concat([CARB_holdout_df, SEAA_holdout_df, GLOB_holdout_df], axis=0)
    .rename(columns={'y_holdout': 'y'})
    .assign(Split='Holdout')
)

### Create Combined Predictions DataFrame

In [12]:
# Stack validation and holdout rows; the shared target column becomes 'actual'
df = pd.concat([val_df, holdout_df], axis=0).rename(columns={'y': 'actual'})

In [None]:
# Output directory for the files written by this cell
apps_dir = "/work/data/apps"
os.makedirs(apps_dir, exist_ok=True)

# Consolidated prediction frame and the two region frames go to parquet
for parquet_frame, parquet_stem in (
    (df, 'vis_df'),
    (seaa_prov, 'seaa_prov'),
    (carb_prov, 'carb_prov'),
):
    parquet_frame.to_parquet(f'{apps_dir}/{parquet_stem}.parquet')

# The GeoJSON mappings are plain Python objects, so they are pickled
for geo_mapping, pickle_stem in (
    (seaa_prov_geo, 'seaa_prov_geo'),
    (carb_prov_geo, 'carb_prov_geo'),
):
    with open(f'{apps_dir}/{pickle_stem}.pickle', 'wb') as f:
        pickle.dump(geo_mapping, f)

## Create Visualizations

### Absolute Errors on Predictions 

In [93]:
# Region shown on the error map; the plotting cell handles 'CARB', 'SEAA' and 'GLOB'.
# Presumably bound to a Deepnote input widget — TODO confirm.
Region_Var = 'CARB'

In [95]:
# Column of `df` passed as `color=` to the scatter map.
Color_Var = 'abs_difference'

In [96]:
# Column of `df` passed as `size=` to the scatter map.
Size_Var = 'Percent_Bleached_Value'

In [97]:
# Value matched against df.Split ('Validation' or 'Holdout') when filtering the map data.
Data_Split_Var = 'Holdout'

In [98]:
# Lower bound (inclusive) of the abs_difference range shown on the map.
Min_Error_Var = 0

In [99]:
# Upper bound (inclusive) of the abs_difference range shown on the map.
Max_Error_Var = 100

In [100]:
# Map view for the selected region: ecoregion overlay (CARB / SEAA only),
# map centre and zoom level.
if Region_Var == 'CARB':
    region_prov = carb_prov
    region_prov_geo = carb_prov_geo
    center_lat = 20
    center_lon = -75
    z = 3
elif Region_Var == 'SEAA':
    region_prov = seaa_prov
    region_prov_geo = seaa_prov_geo
    center_lat = 0
    center_lon = 140
    z = 2
elif Region_Var == 'GLOB':
    # The global view has no ecoregion overlay, only the site scatter
    center_lat = 0
    center_lon = 0
    z = 1
else:
    # Fail loudly instead of silently reusing centre/zoom left over from a previous run
    raise ValueError(f"Unknown Region_Var: {Region_Var!r} (expected 'CARB', 'SEAA' or 'GLOB')")

# An inverted error range would select nothing; fall back to the widest upper bound
if Max_Error_Var < Min_Error_Var:
    Max_Error_Var = 100

# Sites of the chosen split/region whose absolute error falls inside the requested range
plot_rows = df[
    (df.Split == Data_Split_Var)
    & (df.Region == Region_Var)
    & (df['abs_difference'].between(Min_Error_Var, Max_Error_Var))
]

# Plot the sites
fig_scatter = px.scatter_mapbox(plot_rows,
                                lat="Latitude_Degrees",
                                lon="Longitude_Degrees",
                                hover_name="Country_Name",
                                color=Color_Var,
                                size=Size_Var,
                                size_max=15,
                                zoom=z,
                                hover_data=["actual", "prediction", "abs_difference"],
                                center={"lat": center_lat, "lon": center_lon},
                                title="")

if Region_Var in ('CARB', 'SEAA'):
    # Draw the ecoregion polygons first, then layer the site scatter on top
    fig_choropleth = px.choropleth_mapbox(region_prov,
                                          geojson=region_prov_geo,
                                          color='ECOREGION',
                                          opacity=0.15,
                                          locations=region_prov.index,
                                          center={"lat": center_lat, "lon": center_lon},
                                          mapbox_style="carto-positron",
                                          zoom=z,
                                          title=f'{Region_Var} Region')
    for trace in fig_scatter.data:
        fig_choropleth.add_trace(trace)
    fig_choropleth.update_layout(title_text=f"{Region_Var} Predictions and Errors")
    fig_choropleth.update_layout(showlegend=False)
    fig_choropleth.update_layout(coloraxis_colorbar=dict(title=f"{Color_Var}"))
    fig_choropleth.show()
else:
    # No choropleth exists for GLOB: the title belongs on the scatter figure.
    # (Previously this updated `fig_choropleth`, which is undefined on a fresh
    # kernel or left over from an earlier CARB/SEAA run.)
    fig_scatter.update_layout(title_text=f"{Region_Var} Predictions and Errors")
    fig_scatter.show()

## Create SHAP Visualizations

### Build all of the SHAP Explainer, Values and Interactions

In [None]:
%%time
# This process takes approximately 2.25 hours
# Builds the SHAP tree explainer for the CARB model and evaluates it on the
# validation and holdout features. All results are pickled to disk further down.

CARB_explainer = shap.TreeExplainer(CARB_model)
# Result of calling the explainer; indexed per row for the waterfall plots later on
CARB_val_shap_values = CARB_explainer(CARB_Xvt)
# Interaction values for the validation features
CARB_val_shap_interaction = CARB_explainer.shap_interaction_values(CARB_Xvt)
# Output of the explainer's shap_values() for the validation features
CARB_val_sv = CARB_explainer.shap_values(CARB_Xvt)
# Same three computations on the holdout features
CARB_holdout_shap_values = CARB_explainer(CARB_Xht)
CARB_holdout_shap_interaction = CARB_explainer.shap_interaction_values(CARB_Xht)
CARB_holdout_sv = CARB_explainer.shap_values(CARB_Xht)

CPU times: user 2h 11min 54s, sys: 6.7 s, total: 2h 12min 1s
Wall time: 1h 47min 49s


In [None]:
%%time
# This process takes approximately 0.25 hours
# Builds the SHAP tree explainer for the SEAA model and evaluates it on the
# validation and holdout features. All results are pickled to disk further down.

SEAA_explainer = shap.TreeExplainer(SEAA_model)
# Result of calling the explainer; indexed per row for the waterfall plots later on
SEAA_val_shap_values = SEAA_explainer(SEAA_Xvt)
# Interaction values for the validation features
SEAA_val_shap_interaction = SEAA_explainer.shap_interaction_values(SEAA_Xvt)
# Output of the explainer's shap_values() for the validation features
SEAA_val_sv = SEAA_explainer.shap_values(SEAA_Xvt)
# Same three computations on the holdout features
SEAA_holdout_shap_values = SEAA_explainer(SEAA_Xht)
SEAA_holdout_shap_interaction = SEAA_explainer.shap_interaction_values(SEAA_Xht)
SEAA_holdout_sv = SEAA_explainer.shap_values(SEAA_Xht)

CPU times: user 48min 7s, sys: 1.35 s, total: 48min 9s
Wall time: 12min 12s


In [None]:
%%time
# This process takes approximately 12.25 hours
# Builds the SHAP tree explainer for the GLOB model and evaluates it on the
# validation and holdout features. All results are pickled to disk further down.

GLOB_explainer = shap.TreeExplainer(GLOB_model)
# Result of calling the explainer; indexed per row for the waterfall plots later on
GLOB_val_shap_values = GLOB_explainer(GLOB_Xvt)
# Interaction values for the validation features
GLOB_val_shap_interaction = GLOB_explainer.shap_interaction_values(GLOB_Xvt)
# Output of the explainer's shap_values() for the validation features
GLOB_val_sv = GLOB_explainer.shap_values(GLOB_Xvt)
# Same three computations on the holdout features
GLOB_holdout_shap_values = GLOB_explainer(GLOB_Xht)
GLOB_holdout_shap_interaction = GLOB_explainer.shap_interaction_values(GLOB_Xht)
GLOB_holdout_sv = GLOB_explainer.shap_values(GLOB_Xht)

CPU times: user 12h 51min 30s, sys: 30.4 s, total: 12h 52min
Wall time: 11h 9min 45s


### Write out the SHAP Objects to Disk for Final Report App

In [None]:
# Persist every SHAP object for the visualization tool.
# Each object is pickled to <shap_dir>/<name>.pickle, in the order listed.
shap_dir = "/work/data/apps/shap"
os.makedirs(shap_dir, exist_ok=True)

shap_outputs = [
    # Explainer
    ('CARB_explainer', CARB_explainer),
    ('SEAA_explainer', SEAA_explainer),
    ('GLOB_explainer', GLOB_explainer),
    # SHAP values
    ('CARB_val_shap_values', CARB_val_shap_values),
    ('SEAA_val_shap_values', SEAA_val_shap_values),
    ('GLOB_val_shap_values', GLOB_val_shap_values),
    ('CARB_holdout_shap_values', CARB_holdout_shap_values),
    ('SEAA_holdout_shap_values', SEAA_holdout_shap_values),
    ('GLOB_holdout_shap_values', GLOB_holdout_shap_values),
    # SHAP interaction values
    ('CARB_val_shap_interaction', CARB_val_shap_interaction),
    ('SEAA_val_shap_interaction', SEAA_val_shap_interaction),
    ('GLOB_val_shap_interaction', GLOB_val_shap_interaction),
    ('CARB_holdout_shap_interaction', CARB_holdout_shap_interaction),
    ('SEAA_holdout_shap_interaction', SEAA_holdout_shap_interaction),
    ('GLOB_holdout_shap_interaction', GLOB_holdout_shap_interaction),
    # SHAP explainer values
    ('CARB_val_sv', CARB_val_sv),
    ('SEAA_val_sv', SEAA_val_sv),
    ('GLOB_val_sv', GLOB_val_sv),
    ('CARB_holdout_sv', CARB_holdout_sv),
    ('SEAA_holdout_sv', SEAA_holdout_sv),
    ('GLOB_holdout_sv', GLOB_holdout_sv),
]

for shap_name, shap_object in shap_outputs:
    with open(f'{shap_dir}/{shap_name}.pickle', 'wb') as f:
        pickle.dump(shap_object, f)

### Create Lists of Good/Bad Predictions for SHAP Waterfalls

In [None]:
def _closest_predictions(region, split):
    """Index labels of the five rows with the smallest absolute error.

    Only rows of the given region/split with an absolute error between 0 and 5
    (inclusive) and a prediction above 5 are considered.
    """
    candidates = df[
        (df.Region == region)
        & (df.Split == split)
        & (df["abs_difference"].between(0, 5))
        & (df.prediction > 5)
    ]
    return candidates.sort_values("abs_difference", ascending=True).head(5).index.tolist()


CARB_Hld_good = _closest_predictions("CARB", "Holdout")
CARB_Val_good = _closest_predictions("CARB", "Validation")

SEAA_Val_good = _closest_predictions("SEAA", "Validation")
SEAA_Hld_good = _closest_predictions("SEAA", "Holdout")

GLOB_Hld_good = _closest_predictions("GLOB", "Holdout")
GLOB_Val_good = _closest_predictions("GLOB", "Validation")


In [None]:
def _worst_predictions(region, split):
    """Index labels of the five rows with the largest absolute error.

    Only rows of the given region/split with an absolute error between 50 and
    100 (inclusive) and a prediction below 5 are considered.
    """
    candidates = df[
        (df.Region == region)
        & (df.Split == split)
        & (df["abs_difference"].between(50, 100))
        & (df.prediction < 5)
    ]
    return candidates.sort_values("abs_difference", ascending=False).head(5).index.tolist()


CARB_Hld_bad = _worst_predictions("CARB", "Holdout")
CARB_Val_bad = _worst_predictions("CARB", "Validation")

SEAA_Val_bad = _worst_predictions("SEAA", "Validation")
SEAA_Hld_bad = _worst_predictions("SEAA", "Holdout")

GLOB_Hld_bad = _worst_predictions("GLOB", "Holdout")
GLOB_Val_bad = _worst_predictions("GLOB", "Validation")

### Read SHAP Objects from Disk and Write Good/Bad Prediction Lists

In [None]:
def _read_shap_pickle(name):
    """Unpickle ``/work/data/apps/shap/<name>.pickle`` and return the object."""
    with open(f'/work/data/apps/shap/{name}.pickle', 'rb') as handle:
        return pickle.load(handle)


# SHAP values on the validation features
CARB_val_shap_values = _read_shap_pickle('CARB_val_shap_values')
SEAA_val_shap_values = _read_shap_pickle('SEAA_val_shap_values')
GLOB_val_shap_values = _read_shap_pickle('GLOB_val_shap_values')

# SHAP values on the holdout features
CARB_holdout_shap_values = _read_shap_pickle('CARB_holdout_shap_values')
SEAA_holdout_shap_values = _read_shap_pickle('SEAA_holdout_shap_values')
GLOB_holdout_shap_values = _read_shap_pickle('GLOB_holdout_shap_values')

# Interaction values on the validation features
CARB_val_shap_interaction = _read_shap_pickle('CARB_val_shap_interaction')
SEAA_val_shap_interaction = _read_shap_pickle('SEAA_val_shap_interaction')
GLOB_val_shap_interaction = _read_shap_pickle('GLOB_val_shap_interaction')

# Interaction values on the holdout features
CARB_holdout_shap_interaction = _read_shap_pickle('CARB_holdout_shap_interaction')
SEAA_holdout_shap_interaction = _read_shap_pickle('SEAA_holdout_shap_interaction')
GLOB_holdout_shap_interaction = _read_shap_pickle('GLOB_holdout_shap_interaction')

In [None]:
# Persist the good/bad prediction index lists.
# The target directory is created first: nothing else in this notebook creates
# it, so on a fresh workspace the writes would fail with FileNotFoundError.
lists_dir = '/work/data/apps/shap/lists'
os.makedirs(lists_dir, exist_ok=True)

prediction_lists = [
    # Smallest-error ("good") predictions
    ('CARB_Hld_good', CARB_Hld_good),
    ('CARB_Val_good', CARB_Val_good),
    ('SEAA_Hld_good', SEAA_Hld_good),
    ('SEAA_Val_good', SEAA_Val_good),
    ('GLOB_Hld_good', GLOB_Hld_good),
    ('GLOB_Val_good', GLOB_Val_good),
    # Largest-error ("bad") predictions
    ('CARB_Hld_bad', CARB_Hld_bad),
    ('CARB_Val_bad', CARB_Val_bad),
    ('SEAA_Hld_bad', SEAA_Hld_bad),
    ('SEAA_Val_bad', SEAA_Val_bad),
    ('GLOB_Hld_bad', GLOB_Hld_bad),
    ('GLOB_Val_bad', GLOB_Val_bad),
]

# Each list is pickled to <lists_dir>/<name>.pickle
for list_name, index_labels in prediction_lists:
    with open(f'{lists_dir}/{list_name}.pickle', 'wb') as f:
        pickle.dump(index_labels, f)

### Write SHAP Waterfalls to Disk

In [None]:
# Make sure the image directory exists before saving into it
os.makedirs('/work/data/apps/images', exist_ok=True)

# (region label, validation features, validation SHAP values, selected index labels)
validation_waterfalls = (
    ("CARB", CARB_Xvt, CARB_val_shap_values, CARB_Val_good),
    ("SEAA", SEAA_Xvt, SEAA_val_shap_values, SEAA_Val_good),
    ("GLOB", GLOB_Xvt, GLOB_val_shap_values, GLOB_Val_good),
)

# NOTE(review): the file name contains only the index label, so a label selected
# for more than one region overwrites the image written earlier.
for region_label, region_features, region_shap_values, selected_labels in validation_waterfalls:
    for i in selected_labels:
        # The SHAP values are indexed by row position, so translate the index label
        investigate = region_features.index.get_loc(i)
        plt.figure(figsize=(16,12))
        shap.plots.waterfall(region_shap_values[investigate], max_display=10, show=False)
        anno_text = 'Data from Global Coral Bleaching Database, 2024'
        plt.annotate(anno_text, (0.5, -0.2), xycoords='axes fraction', fontsize=12, color='gray', ha='center')
        # The title names the region actually plotted (it used to say CARB for every region)
        plt.title(f"Waterfall Plot for {i}, in {region_label} on Validation data", fontsize=16, y=1.08)
        plt.savefig(f'/work/data/apps/images/shap_waterfall_{i}.png', bbox_inches='tight')
        plt.close()

In [None]:
# Make sure the image directory exists before saving into it
os.makedirs('/work/data/apps/images', exist_ok=True)

# (region label, holdout features, holdout SHAP values, selected index labels)
holdout_waterfalls = (
    ("CARB", CARB_Xht, CARB_holdout_shap_values, CARB_Hld_good),
    ("SEAA", SEAA_Xht, SEAA_holdout_shap_values, SEAA_Hld_good),
    ("GLOB", GLOB_Xht, GLOB_holdout_shap_values, GLOB_Hld_good),
)

# NOTE(review): the file name contains only the index label, so a label selected
# for more than one region overwrites the image written earlier.
for region_label, region_features, region_shap_values, selected_labels in holdout_waterfalls:
    for i in selected_labels:
        # The SHAP values are indexed by row position, so translate the index label
        investigate = region_features.index.get_loc(i)
        plt.figure(figsize=(16,12))
        shap.plots.waterfall(region_shap_values[investigate], max_display=10, show=False)
        anno_text = 'Data from Global Coral Bleaching Database, 2024'
        plt.annotate(anno_text, (0.5, -0.2), xycoords='axes fraction', fontsize=12, color='gray', ha='center')
        # The title names the region and split actually plotted
        # (it used to say "CARB on Validation data" for every holdout plot)
        plt.title(f"Waterfall Plot for {i}, in {region_label} on Holdout data", fontsize=16, y=1.08)
        plt.savefig(f'/work/data/apps/images/shap_waterfall_{i}.png', bbox_inches='tight')
        plt.close()

## Abs Value of Errors Mean, StdDev and 95th%

In [86]:
# Signed prediction errors (prediction - actual) for the CARB holdout rows
hist_data = df.loc[(df["Split"] == "Holdout") & (df["Region"] == "CARB"), "difference"]

fig = px.histogram(
    hist_data,
    x="difference",
    nbins=20,
    title="Distribution of the difference in error between predictions and actuals",
    marginal="violin",
)

# Summary statistics, computed once and reused for the reference lines
error_mean = np.mean(hist_data)
error_std = np.std(hist_data)

# Solid red line at the mean error
fig.add_vline(x=error_mean, line_width=3, line_dash="solid", line_color="red")

# Dashed lines at +/- 1, 2 and 3 standard deviations.
# Note: these are measured from zero, not from the mean.
for multiple, width, colour in ((1, 2, "blue"), (2, 2, "darkorange"), (3, 1, "darkgrey")):
    fig.add_vline(x=multiple * error_std, line_width=width, line_dash="dash", line_color=colour)
    fig.add_vline(x=multiple * -error_std, line_width=width, line_dash="dash", line_color=colour)

# The layout title replaces the one passed to px.histogram above
fig.update_layout(title="Histogram with Mean and Standard Deviation", xaxis_title="Value", yaxis_title="Count")

# Show the plot
fig.show()

In [89]:
# Mean, then standard deviation, of the plotted errors (2 decimal places)
for summarize in (np.mean, np.std):
    print(summarize(hist_data).round(2))

0.63
16.64


In [112]:
# Signed holdout errors (prediction - actual) per region
holdout_rows = df["Split"] == "Holdout"
CARB_hist_data = df.loc[holdout_rows & (df["Region"] == "CARB"), "difference"]
SEAA_hist_data = df.loc[holdout_rows & (df["Region"] == "SEAA"), "difference"]
GLOB_hist_data = df.loc[holdout_rows & (df["Region"] == "GLOB"), "difference"]

# For each region print: |mean|, std, and 1.96 * std + |mean|
# (mean and std are rounded to 2 decimals before being combined)
regional_errors = (
    ("CARB", CARB_hist_data),
    ("SEAA", SEAA_hist_data),
    ("GLOB", GLOB_hist_data),
)
for position, (region_label, region_errors) in enumerate(regional_errors):
    if position > 0:
        print("-"*10)
    abs_mean = np.abs(np.mean(region_errors).round(2))
    rounded_std = np.std(region_errors).round(2)
    print(region_label)
    print(abs_mean)
    print(rounded_std)
    print(1.96*rounded_std + abs_mean)

CARB
0.63
16.64
33.244400000000006
----------
SEAA
0.02
14.76
28.9496
----------
GLOB
0.27
16.69
32.982400000000005


## Create Top Features List per Model

### Import Feature Selection DataFrames from Disk

In [13]:
def _read_feature_selection(model_name, region):
    """Read ``<model_name>_feature_selection_<region>.parquet`` from the Feature_Selection folder."""
    return pd.read_parquet(f'/work/data/Feature_Selection/{model_name}_feature_selection_{region}.parquet')


LightGBM_feature_selection_CARB = _read_feature_selection('LightGBM', 'CARB')
LightGBM_feature_selection_SEAA = _read_feature_selection('LightGBM', 'SEAA')
LightGBM_feature_selection_GLOB = _read_feature_selection('LightGBM', 'GLOB')
XGBoost_feature_selection_CARB = _read_feature_selection('XGBoost', 'CARB')
XGBoost_feature_selection_SEAA = _read_feature_selection('XGBoost', 'SEAA')
XGBoost_feature_selection_GLOB = _read_feature_selection('XGBoost', 'GLOB')

### Add Region and Model Columns And Consolidate DataFrames

In [28]:
# (model label, region label, feature-selection frame), in the order they are stacked below
feature_selection_runs = [
    ('LightGBM', 'CARB', LightGBM_feature_selection_CARB),
    ('LightGBM', 'SEAA', LightGBM_feature_selection_SEAA),
    ('LightGBM', 'GLOB', LightGBM_feature_selection_GLOB),
    ('XGBoost', 'CARB', XGBoost_feature_selection_CARB),
    ('XGBoost', 'SEAA', XGBoost_feature_selection_SEAA),
    ('XGBoost', 'GLOB', XGBoost_feature_selection_GLOB),
]

# Tag every frame (in place) with the region and model it belongs to
for model_label, region_label, selection_frame in feature_selection_runs:
    selection_frame['Region'] = region_label
    selection_frame['Model'] = model_label


def _lowest_val_mae_row(selection_frame):
    """Single row with the lowest Val_MAE among rows whose Val_MAE_Change is >= 0."""
    non_negative_change = selection_frame[selection_frame['Val_MAE_Change'] >= 0]
    return non_negative_change.sort_values(by=['Val_MAE'], ascending=True).head(1)


LC, LS, LG, XC, XS, XG = [_lowest_val_mae_row(run[2]) for run in feature_selection_runs]

# One row per model/region; Feature_Count is the number of comma-separated
# entries in the 'Features' string
TopFeatures = pd.concat([LC, LS, LG, XC, XS, XG]).round(4)
TopFeatures['Feature_Count'] = TopFeatures['Features'].str.count(",") + 1

### Output TopFeatures DataFrame to Disk

In [33]:
# Persist the per-model/region best-row summary next to the other app inputs
TopFeatures.to_parquet('/work/data/apps/TopFeatures.parquet')

## Model Complexity by Parameters

In [101]:
def _load_model_params(model_dir, region, run_stamp):
    """Unpickle ``/work/models/<model_dir>/<region>/<run_stamp>_params.pkl``."""
    with open(f"/work/models/{model_dir}/{region}/{run_stamp}_params.pkl", "rb") as handle:
        return pickle.load(handle)


# XGBoost parameter sets
CARB_XGB_params = _load_model_params("xgboost_reg", "CARB", "20240413_185012")
SEAA_XGB_params = _load_model_params("xgboost_reg", "SEAA", "20240413_191526")
GLOB_XGB_params = _load_model_params("xgboost_reg", "GLOB", "20240413_195305")

# LightGBM parameter sets
CARB_LGBM_params = _load_model_params("lightgbm_reg", "CARB", "20240413_195656")
SEAA_LGBM_params = _load_model_params("lightgbm_reg", "SEAA", "20240413_210139")
GLOB_LGBM_params = _load_model_params("lightgbm_reg", "GLOB", "20240414_113645")

### Model Parameters for LGBM and XGB

In [102]:
# Print the XGBoost parameter sets first, then the LightGBM ones, one per line
for param_set in (
    CARB_XGB_params,
    SEAA_XGB_params,
    GLOB_XGB_params,
    CARB_LGBM_params,
    SEAA_LGBM_params,
    GLOB_LGBM_params,
):
    print(param_set)

{'colsample_bytree': 0.6209743509625751, 'eta': 0.03293991516546757, 'gamma': 1.2580389662931162, 'max_depth': 18, 'min_child_weight': 4, 'n_estimators': 748, 'reg_alpha': 1, 'reg_lambda': 14.92242588306025, 'subsample': 0.8239997546209348}
{'colsample_bytree': 0.6683159350272164, 'eta': 0.016575408581581384, 'gamma': 1.9695204510137367, 'max_depth': 17, 'min_child_weight': 2, 'n_estimators': 659, 'reg_alpha': 2, 'reg_lambda': 1.9311806405277454, 'subsample': 0.9137226173823628}
{'colsample_bytree': 0.9968885635999238, 'eta': 0.017691085390877714, 'gamma': 2.566008685714379, 'max_depth': 20, 'min_child_weight': 1, 'n_estimators': 995, 'reg_alpha': 1, 'reg_lambda': 13.644947099684984, 'subsample': 0.345956447706317}
{'colsample_bytree': 0.5683579417159563, 'learning_rate': 0.03364215449628482, 'min_child_samples': 1, 'min_child_weight': 0.07346058876490211, 'min_split_gain': 0.4493524623847611, 'n_estimators': 1010, 'num_leaves': 1141, 'reg_alpha': 0.09248327804728351, 'reg_lambda': 0.7