# Holdout Testing

## Notebook Setup

In [1]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

Loading dotenv file


In [2]:
# Import Modeling Libraries
import lightgbm as lgb
import xgboost as xgb
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Import Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
# Resolve the runtime environment. `deepnote` is the flag later cells use to
# choose between reading from local disk and reading from S3.
deepnote, env_vars = ef.load_env_vars()

# Publish every loaded setting as a notebook-level global of the same name
globals().update(env_vars)

# Outside the DeepNote environment, bundle the AWS settings that came from
#   the environment file into a single dict.  It is unpacked into every
#   aws s3 helper call.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

Loading dotenv file


In [4]:
# Pandas Configs: disable the chained-assignment check, show floats with four
# decimals, and never truncate displayed rows or columns.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Ignore Warnings
# NOTE(review): this also hides library deprecation notices for the rest of
#   the notebook.
import warnings
for ignored_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=ignored_category)

# Load Features

In [5]:
# Per-region data splits as returned by cf.import_data, unpacked in the order
# X train, X validation, X holdout, y train, y validation, y holdout
# (naming taken from the variable suffixes tt / vt / ht / t / v / h).
CARB_Xtt, CARB_Xvt, CARB_Xht, CARB_yt, CARB_yv, CARB_yh = cf.import_data(location_name='CARB')
SEAA_Xtt, SEAA_Xvt, SEAA_Xht, SEAA_yt, SEAA_yv, SEAA_yh = cf.import_data(location_name='SEAA')
GLOB_Xtt, GLOB_Xvt, GLOB_Xht, GLOB_yt, GLOB_yv, GLOB_yh = cf.import_data(location_name='GLOB')

# Load the pickled feature lists for the XGB and LGBM models of every region
# into globals named `<REGION>_<MODEL>_feat_list`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise, so the two variants cannot drift apart.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
#   from a location you trust.
for model_code in ("XGB", "LGBM"):
    for region_code in ("CARB", "SEAA", "GLOB"):
        rel_path = f"data/Feature_Selection/{region_code}_{model_code}_feat_list.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", 'rb') as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_{model_code}_feat_list"] = loaded

# Load Models and Parameters

## XGBoost

In [6]:
# Timestamped run id of the saved XGBoost artifacts for each region
xgb_run_ids = {
    "CARB": "20240413_185012",
    "SEAA": "20240413_191526",
    "GLOB": "20240413_195305",
}

# Load the pickled parameters and model object of every region into globals
# named `<REGION>_XGB_params` and `<REGION>_XGB_model`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
#   from a location you trust.
for region_code, run_id in xgb_run_ids.items():
    for artifact in ("params", "model"):
        rel_path = f"models/xgboost_reg/{region_code}/{run_id}_{artifact}.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", "rb") as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_XGB_{artifact}"] = loaded

## LightGBM

In [7]:
# Timestamped run id of the saved LightGBM artifacts for each region
lgbm_run_ids = {
    "CARB": "20240413_195656",
    "SEAA": "20240413_210139",
    "GLOB": "20240414_113645",
}

# Load the pickled parameters and model object of every region into globals
# named `<REGION>_LGBM_params` and `<REGION>_LGBM_model`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
#   from a location you trust.
for region_code, run_id in lgbm_run_ids.items():
    for artifact in ("params", "model"):
        rel_path = f"models/lightgbm_reg/{region_code}/{run_id}_{artifact}.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", "rb") as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_LGBM_{artifact}"] = loaded

## RandomForest

In [8]:
# Timestamped run id of the saved RandomForest artifacts for each region
rf_run_ids = {
    "CARB": "20240413_111331",
    "SEAA": "20240413_115032",
    "GLOB": "20240413_135029",
}

# Load the pickled parameters and model object of every region into globals
# named `<REGION>_RF_params` and `<REGION>_RF_model`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
#   from a location you trust.
for region_code, run_id in rf_run_ids.items():
    for artifact in ("params", "model"):
        rel_path = f"models/randomforest_reg/{region_code}/{run_id}_{artifact}.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", "rb") as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_RF_{artifact}"] = loaded

## ElasticNet

In [9]:
# Timestamped run id of the saved ElasticNet artifacts for each region
enet_run_ids = {
    "CARB": "20240413_135638",
    "SEAA": "20240413_135843",
    "GLOB": "20240413_140053",
}

# Load the pickled parameters and model object of every region into globals
# named `<REGION>_ENET_params` and `<REGION>_ENET_model`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
#   from a location you trust.
for region_code, run_id in enet_run_ids.items():
    for artifact in ("params", "model"):
        rel_path = f"models/elasticnet_reg/{region_code}/{run_id}_{artifact}.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", "rb") as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_ENET_{artifact}"] = loaded

## HistGradientBoosting

In [10]:
# Timestamped run id of the saved HistGradientBoosting artifacts for each region
hgbm_run_ids = {
    "CARB": "20240413_154115",
    "SEAA": "20240413_165035",
    "GLOB": "20240413_202923",
}

# Load the pickled parameters and model object of every region into globals
# named `<REGION>_HGBM_params` and `<REGION>_HGBM_model`.
# One relative path serves both environments: it is prefixed with /work/ when
# reading from DeepNote's local disk and passed unchanged as `file_path` to
# the S3 loader otherwise.
# NOTE(review): pickle.load can execute arbitrary code -- only load artifacts
#   from a location you trust.
for region_code, run_id in hgbm_run_ids.items():
    for artifact in ("params", "model"):
        rel_path = f"models/histgradboost_reg/{region_code}/{run_id}_{artifact}.pkl"
        if deepnote:
            with open(f"/work/{rel_path}", "rb") as f:
                loaded = pickle.load(f)
        else:
            f = sf.load_from_s3(file_path=rel_path, **aws_env_vars)
            loaded = pickle.load(f)
        globals()[f"{region_code}_HGBM_{artifact}"] = loaded

# Holdout Evaluation

In [11]:
def model_prediction(model, X, y, region=None, features=None):
    """
    Score an already-fitted model on one data split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : pandas.DataFrame
        Feature matrix. It must be a DataFrame because ``X.columns`` is read
        to report which features were used.
    y : array-like
        True target values for the rows of ``X``.
    region : optional
        Not used by this function; kept so existing callers keep working.
    features : list, optional
        Column labels to select from ``X`` before predicting. ``None``
        (default) uses every column of ``X``.

    Returns
    -------
    tuple
        ``(y_pred, mae, rmse, rsq, feats)``: the predictions clipped to
        [0, 100], the mean absolute error, the root mean squared error, the
        R-squared score, and the list of column names passed to the model.
    """

    if features is not None:
        X = X[features]

    # Predictions are clamped to [0, 100] before any metric is computed.
    # NOTE(review): presumably the target is a percentage -- confirm.
    y_pred = np.clip(model.predict(X), 0, 100)

    mae = mean_absolute_error(y, y_pred)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    # The two forms agree for a single target column (assumed here).
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    rsq = r2_score(y, y_pred)
    feats = X.columns.tolist()

    return y_pred, mae, rmse, rsq, feats


def dummy_model_prediction(X, y, X_fit=None, y_fit=None):
    """
    Score a mean-predicting ``DummyRegressor`` baseline on one data split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of the split being scored. It must be a DataFrame
        because ``X.columns`` is read.
    y : array-like
        True target values for the rows of ``X``.
    X_fit, y_fit : optional
        Data to fit the baseline on, e.g. the training split. When both are
        omitted (default) the baseline is fit on ``X, y`` itself, which is
        the original behaviour; in that case the baseline sees the labels
        it is scored against.

    Returns
    -------
    tuple
        ``(y_pred, mae, rmse, rsq, feats)``: the predictions clipped to
        [0, 100], the mean absolute error, the root mean squared error, the
        R-squared score, and the column names of ``X``.

    Raises
    ------
    ValueError
        If only one of ``X_fit`` / ``y_fit`` is supplied.
    """
    if (X_fit is None) != (y_fit is None):
        raise ValueError("X_fit and y_fit must be provided together")
    if X_fit is None:
        X_fit, y_fit = X, y

    model = DummyRegressor(strategy='mean')
    model.fit(X_fit, y_fit)

    # Predictions are clamped to [0, 100] before any metric is computed
    y_pred = np.clip(model.predict(X), 0, 100)

    mae = mean_absolute_error(y, y_pred)
    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    rsq = r2_score(y, y_pred)
    feats = X.columns.tolist()

    return y_pred, mae, rmse, rsq, feats

In [12]:
# Model types and regions whose combinations are evaluated
model_types = ["XGB", "RF", "ENET", "HGBM", "LGBM"]
region_names = ["CARB", "SEAA", "GLOB"]

# One row per (split, model, region) evaluation
results_list = []

# Evaluate every standard model/region combination. Models, feature lists
# and data splits are looked up by name among the notebook globals created
# by the loading cells.
for m, r in itertools.product(model_types, region_names):
    # Only XGB and LGBM were saved with their own feature lists; the other
    # models are scored on every column of the split.
    if m in ("XGB", "LGBM"):
        feat_list = globals()[f"{r}_{m}_feat_list"]
    else:
        feat_list = None

    # Load the model from the global variable
    model = globals()[f"{r}_{m}_model"]

    # Loop over the validation and holdout sets
    for d in ['v', 'h']:
        X, y = globals()[f"{r}_X{d}t"], globals()[f"{r}_y{d}"]
        y_pred, mae, rmse, rsq, feats = model_prediction(model, X, y, features=feat_list)
        results_list.append([d, m, r, mae, rmse, rsq, feats])

# Add the dummy (mean) baseline for every region.
# NOTE(review): called this way the baseline is fit on the same split it is
#   scored on, so it sees the labels it is evaluated against.
for r in region_names:
    # Loop over the validation and holdout sets
    for d in ['v', 'h']:
        y_pred, mae, rmse, rsq, feats = dummy_model_prediction(globals()[f"{r}_X{d}t"], globals()[f"{r}_y{d}"])
        results_list.append([d, 'Dummy', r, mae, rmse, rsq, feats])


# Create a DataFrame with the results
results = pd.DataFrame(results_list, columns=["split","model", "region", "mae", "rmse", "rsq", "features"])
# Assign the relabelled column back instead of calling replace(inplace=True)
# on `results['split']`: under pandas Copy-on-Write the in-place call only
# changes a temporary, leaving the 'v'/'h' codes in place and making the
# later `== 'Holdout'` filters return nothing.
results['split'] = results['split'].replace({'t':'Training', 'v':'Validation', 'h':'Holdout'})

In [13]:
# Holdout-split metrics for every model/region pair, rounded to 4 decimals
results.loc[results['split'] == 'Holdout', ['model', 'region', 'mae', 'rmse', 'rsq']].round(4)

Unnamed: 0,model,region,mae,rmse,rsq
1,XGB,CARB,9.3033,16.3587,0.5615
3,XGB,SEAA,7.2761,14.7631,0.2714
5,XGB,GLOB,8.7163,15.7133,0.4923
7,RF,CARB,11.4058,17.2596,0.5119
9,RF,SEAA,7.983,13.7649,0.3666
11,RF,GLOB,10.4469,16.3797,0.4483
13,ENET,CARB,17.0999,22.9403,0.1377
15,ENET,SEAA,10.6908,16.7972,0.0568
17,ENET,GLOB,14.7235,20.9536,0.0971
19,HGBM,CARB,9.2632,15.9285,0.5843


In [14]:
# Lowest-MAE model for each region on the holdout split.
# Uses groupby(...).idxmin() + .loc instead of groupby.apply over the whole
# frame: apply operating on the grouping column is deprecated since
# pandas 2.2, and idxmin picks the same rows directly.
holdout_results = results[results['split'] == 'Holdout']
best_row_labels = holdout_results.groupby('region')['mae'].idxmin()
holdout_results.loc[best_row_labels, ['region', 'model', 'mae']].reset_index(drop=True).round(4)

Unnamed: 0,region,model,mae
0,CARB,LGBM,9.1064
1,GLOB,LGBM,8.5803
2,SEAA,XGB,7.2761


In [15]:
# Holdout-split metrics for every model/region pair, rounded to 4 decimals
results.loc[results['split'] == 'Holdout', ['model', 'region', 'mae', 'rmse', 'rsq']].round(4)

Unnamed: 0,model,region,mae,rmse,rsq
1,XGB,CARB,9.3033,16.3587,0.5615
3,XGB,SEAA,7.2761,14.7631,0.2714
5,XGB,GLOB,8.7163,15.7133,0.4923
7,RF,CARB,11.4058,17.2596,0.5119
9,RF,SEAA,7.983,13.7649,0.3666
11,RF,GLOB,10.4469,16.3797,0.4483
13,ENET,CARB,17.0999,22.9403,0.1377
15,ENET,SEAA,10.6908,16.7972,0.0568
17,ENET,GLOB,14.7235,20.9536,0.0971
19,HGBM,CARB,9.2632,15.9285,0.5843
