# Feature Selection

## Notebook Setup

### Import Libraries

In [1]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

Loading dotenv file


In [None]:
# Import Modeling Libraries
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Import Evaluation Libraries
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Ask the env helper whether we are running in DeepNote and fetch the
#   variables that belong to the detected environment
deepnote, env_vars = ef.load_env_vars()

# Publish every returned variable as a notebook-level global so later
#   cells can reference them by bare name
globals().update(env_vars)

# Outside of DeepNote the aws creds come from the environment file.
#   Bundle them in one dict that is unpacked into every aws s3 function call.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

In [None]:
# Pandas Configs: silence chained-assignment checks, show floats with two
#   decimals and never truncate displayed rows or columns
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Ignore Warnings (same categories, registered in the same order)
import warnings
for noisy_category in (FutureWarning, UserWarning, pd.errors.PerformanceWarning):
    warnings.simplefilter('ignore', category=noisy_category)

# MapBox Token for Plotly Maps (None is passed when the variable is unset)
mapbox_token = os.environ.get("MAPBOX_TOKEN")
px.set_mapbox_access_token(mapbox_token)

# Scikit Learn Configs: transformers return pandas objects
set_config(transform_output="pandas")

### Common Functions

In [None]:
def _print_split_metrics(model, X, y, label):
    """
    Score an already fitted model on one data split and print the results.

    params:
    model: Fitted regressor exposing predict()
    X: Feature dataframe of the split
    y: Target of the split
    label: Text that prefixes every printed line ('Train', 'Validation', 'Holdout')

    returns: None
    """
    # Predictions are clipped to the 0-100 range before scoring
    #   (presumably the target is a percentage - TODO confirm against data prep)
    pred = np.clip(model.predict(X), 0, 100)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    # Take the root of the MSE directly instead of passing squared=False,
    #   which newer scikit-learn releases deprecated and then removed.
    #   The two are identical for a single target column.
    rmse = np.sqrt(mse)
    rsq = r2_score(y, pred)
    print(" ")
    print(f"{label} Mean Absolute Error: {mae:.4f}")
    print(f"{label} Mean Squared Error: {mse:.4f}")
    print(f"{label} Root Mean Squared Error: {rmse:.4f}")
    print(f"{label} R^2 Score: {rsq:.4f}")


def _clipped_mae(model, X, y):
    """Return the MAE of the model's predictions on X (clipped to 0-100) against y."""
    return mean_absolute_error(y, np.clip(model.predict(X), 0, 100))


def feature_model(cols=None, Xtt=None, ytt=None, Xht=None, yht=None, Xvt=None, yvt=None, train=True, val=True, holdout=True, feature=False, model_type='xgb', min_features=2, max_features=9):
    """
    Train a model with the default hyperparameters, print its 5 fold CV MAE and
    its metrics on the requested splits.  If the "feature" parameter is true, the
    model is additionally re-fit on every combination of min_features to
    max_features columns of Xtt and the results are returned in a dataframe.
    If the "cols" parameter has a list of features, that will be used for the
    model and the feature combinations, instead of the full list of features
    from the Xtt (X_train_trans) dataframe.

    params:
    cols: List of columns to use (None or empty means all columns)
    Xtt: Training dataframe
    ytt: Training target
    Xht: Holdout dataframe (required when holdout is True)
    yht: Holdout target (required when holdout is True)
    Xvt: Validation dataframe (required when val or feature is True)
    yvt: Validation target (required when val or feature is True)
    train: If True, print metrics on the training data
    val: If True, print metrics on the validation data
    holdout: If True, print metrics on the holdout data
    feature: If True, test model on all feature combinations
    model_type: 'xgb', 'lgbm', 'rf' or 'hgb'
    min_features: Smallest number of features in a tested combination
    max_features: Largest number of features in a tested combination (inclusive)

    returns: Pandas Dataframe with model results for feature combinations when
             feature is True, otherwise None

    raises: ValueError for an unknown model_type, for data that is required by
            the requested evaluation but was not passed in, or for
            min_features < 1 when feature is True
    """

    # Fail fast on bad arguments, before any (potentially very long) training
    if model_type not in ('xgb', 'lgbm', 'rf', 'hgb'):
        raise ValueError("model_type: must be a string that is either 'xgb', 'lgbm', 'rf', or 'hgb'")

    needed = {'Xtt': Xtt, 'ytt': ytt}
    if val or feature:
        # The feature search is always scored against the validation split
        needed.update(Xvt=Xvt, yvt=yvt)
    if holdout:
        needed.update(Xht=Xht, yht=yht)
    missing = [name for name, value in needed.items() if value is None]
    if missing:
        raise ValueError(f"Missing required data for the requested evaluation: {', '.join(missing)}")

    if feature and min_features < 1:
        raise ValueError("min_features: must be at least 1")

    # If columns are passed in, use them.  Otherwise, use all columns.
    #   Splits that were not supplied are left as None.
    if cols is not None and len(cols) > 0:
        Xtt = Xtt[cols]
        Xvt = Xvt[cols] if Xvt is not None else None
        Xht = Xht[cols] if Xht is not None else None

    # Create the model
    if model_type == 'xgb':
        model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='mae')
    elif model_type == 'lgbm':
        model = lgb.LGBMRegressor(verbose=-1, random_state=42, n_jobs=-1)
    elif model_type == 'rf':
        model = RandomForestRegressor()
    else:
        model = HistGradientBoostingRegressor()

    # Get the cross_val_score.  The scorer returns negated MAE, so flip the sign.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = -cross_val_score(model, Xtt, ytt, cv=kf, scoring='neg_mean_absolute_error')
    mean_mae = mae_scores.mean()
    std_mae = mae_scores.std()

    # Print the results
    print(f"Model 5 Fold CV Mean MAE: {mean_mae:.4f}")
    print(f"Model 5 Fold CV Std MAE: {std_mae:.4f}")
    print(f"Model 5 Fold CV MAE Scores: {mae_scores}")

    # Fit the model on the full training data
    model.fit(Xtt, ytt)

    # Evaluate the model on the training, validation and holdout data
    if train:
        _print_split_metrics(model, Xtt, ytt, "Train")
    if val:
        _print_split_metrics(model, Xvt, yvt, "Validation")
    if holdout:
        _print_split_metrics(model, Xht, yht, "Holdout")

    if not feature:
        return None

    # Baseline MAE of the model that uses every available column
    baseline_mae_train = _clipped_mae(model, Xtt, ytt)
    baseline_mae_val = _clipped_mae(model, Xvt, yvt)

    # Build every combination of min_features..max_features columns.
    #   Itertools combinations() creates tuples, so convert each one to a
    #   list for dataframe column selection.
    feat_list = list(Xtt.columns)
    feat_combo_list = [list(combo)
                       for r in range(min_features, max_features + 1)
                       for combo in itertools.combinations(feat_list, r)]

    print("Number of Feature Combinations: ", len(feat_combo_list))

    feature_results_list = []

    # Feat loop: re-fit on each combination and compare against the baseline
    for combo_cols in feat_combo_list:
        modified_X_train_trans = Xtt[combo_cols]
        modified_X_val_trans = Xvt[combo_cols]

        # Fit the model with the selected features only
        model.fit(modified_X_train_trans, ytt)

        modified_mae_train = _clipped_mae(model, modified_X_train_trans, ytt)
        modified_mae_val = _clipped_mae(model, modified_X_val_trans, yvt)

        # Positive changes mean the combination beat the all-columns baseline
        feature_results_list.append({
            'Features': ", ".join(combo_cols),
            'Train_MAE': modified_mae_train,
            'Train_MAE_Change': baseline_mae_train - modified_mae_train,
            'Train_MAE_Pct_Change': 100 * (1 - (modified_mae_train / baseline_mae_train)),
            'Val_MAE': modified_mae_val,
            'Val_MAE_Change': baseline_mae_val - modified_mae_val,
            'Val_MAE_Pct_Change': 100 * (1 - (modified_mae_val / baseline_mae_val))
        })

    # Create a dataframe from the results list
    return pd.DataFrame(feature_results_list)

In [None]:
# Feature-selection folder as a key prefix inside the S3 bucket
features_s3_path = 'data/Feature_Selection'
# The same folder as seen through the DeepNote S3 integration
features_dns3_path = f"/datasets/s3/{features_s3_path}"
# The same folder on the DeepNote local work volume
features_dn_path = f"/work/{features_s3_path}"

# Base Model Comparisons

## Import the Data

In [None]:
# Load the six data splits per region via the shared import helper.
#   Names follow the feature_model convention: Xtt/ytt = training,
#   Xvt/yvt = validation, Xht/yht = holdout.
(CARB_Xtt, CARB_Xvt, CARB_Xht,
 CARB_ytt, CARB_yvt, CARB_yht) = cf.import_data(location_name='CARB')
(SEAA_Xtt, SEAA_Xvt, SEAA_Xht,
 SEAA_ytt, SEAA_yvt, SEAA_yht) = cf.import_data(location_name='SEAA')
(GLOB_Xtt, GLOB_Xvt, GLOB_Xht,
 GLOB_ytt, GLOB_yvt, GLOB_yht) = cf.import_data(location_name='GLOB')

## XGBoost

In [None]:
# Time the complete XGBoost baseline run across the three regions
start_time = datetime.datetime.now()
print("Base Model Comparisons - XGBoost")

# Same baseline fit for every region; only the data splits differ
for region_name, region_splits in (
    ("GLOB", dict(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht)),
    ("SEAA", dict(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht)),
    ("CARB", dict(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht)),
):
    print(region_name)
    feature_model(**region_splits, feature=False, model_type='xgb')
    print("-" * 20)

print("Time taken:", datetime.datetime.now() - start_time)

## LightGBM

In [None]:
# Time the complete LightGBM baseline run across the three regions
start_time = datetime.datetime.now()
print("Base Model Comparisons - LightGBM")

# Same baseline fit for every region; only the data splits differ
for region_name, region_splits in (
    ("GLOB", dict(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht)),
    ("SEAA", dict(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht)),
    ("CARB", dict(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht)),
):
    print(region_name)
    feature_model(**region_splits, feature=False, model_type='lgbm')
    print("-" * 20)

print("Time taken:", datetime.datetime.now() - start_time)

## RandomForest

In [None]:
# Time the complete RandomForest baseline run across the three regions
start_time = datetime.datetime.now()
print("Base Model Comparisons - RandomForest")

# Same baseline fit for every region; only the data splits differ
for region_name, region_splits in (
    ("GLOB", dict(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht)),
    ("SEAA", dict(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht)),
    ("CARB", dict(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht)),
):
    print(region_name)
    feature_model(**region_splits, feature=False, model_type='rf')
    print("-" * 20)

print("Time taken:", datetime.datetime.now() - start_time)

## HistGradientBoosting

In [None]:
# Time the complete HistGradientBoosting baseline run across the three regions
start_time = datetime.datetime.now()
print("Base Model Comparisons - HistGradientBoosting")

# Same baseline fit for every region; only the data splits differ
for region_name, region_splits in (
    ("GLOB", dict(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht)),
    ("SEAA", dict(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht)),
    ("CARB", dict(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht)),
):
    print(region_name)
    feature_model(**region_splits, feature=False, model_type='hgb')
    print("-" * 20)

print("Time taken:", datetime.datetime.now() - start_time)

# Results of Base Model Training on Regions

The results of the base model testing for the regions indicate that, of the 4 decision-tree-based regressors, we should not concentrate on building custom feature sets for RandomForestRegressor because of its long execution time.  At 3 minutes and 45 seconds to build and score 3 models (compared to 1.6 seconds for LightGBM), the time required to train nearly 600k models would be prohibitive. We should also not focus on HistGradientBoostingRegressor, though for different reasons.  While tuning would certainly improve its MAE scores, it did not perform better than XGBoost or LightGBM, and it took 324% longer than LightGBM and 235% longer than XGBoost.  Additionally, HistGradientBoostingRegressor does not have a feature importance function, making it more difficult to interpret.

For the purpose of continuing our analysis, we'll focus on either XGBoost or LightGBM, given these results.  Custom features for those models will be built out below for the Caribbean, South East Asia & Australia, and Global regions.

# Caribbean Region

## Build the Feature Selection DataFrames

In [None]:
# This cell takes approximately 30 hours to execute on a 32 core x 5GHz machine
# CARB_XGB_feature_df = feature_model(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht, feature=True, model_type='xgb')
# CARB_LGBM_feature_df = feature_model(Xtt=CARB_Xtt, Xvt=CARB_Xvt, Xht=CARB_Xht, ytt=CARB_ytt, yvt=CARB_yvt, yht=CARB_yht, feature=True, model_type='lgbm')

## Write out the DataFrames

In [None]:
# Uncomment if writing out data
# if deepnote:
#     # Deep Note Local
#     CARB_XGB_feature_df.to_parquet(f"{features_dn_path}/XGBoost_feature_selection_CARB.parquet")
#     CARB_LGBM_feature_df.to_parquet(f"{features_dn_path}/LightGBM_feature_selection_CARB.parquet")

#     # S3 Integration
#     CARB_XGB_feature_df.to_parquet(f"{features_dns3_path}/XGBoost_feature_selection_CARB.parquet")
#     CARB_LGBM_feature_df.to_parquet(f"{features_dns3_path}/LightGBM_feature_selection_CARB.parquet")

# else:
#     sf.write_to_s3(file_path=f"{features_s3_path}/XGBoost_feature_selection_CARB.parquet", data=CARB_XGB_feature_df, **aws_env_vars)
#     sf.write_to_s3(file_path=f"{features_s3_path}/LightGBM_feature_selection_CARB.parquet", data=CARB_LGBM_feature_df, **aws_env_vars)

## Read DataFrames from disk if available instead of building

In [None]:
# Load the saved CARB feature-selection results for both libraries:
#   from S3 when running outside DeepNote, otherwise from the local folder
if not deepnote:
    CARB_XGB_feature_df, CARB_LGBM_feature_df = (
        pd.read_parquet(sf.load_from_s3(file_path=f"{features_s3_path}/{library}_feature_selection_CARB.parquet", **aws_env_vars))
        for library in ("XGBoost", "LightGBM")
    )
else:
    CARB_XGB_feature_df, CARB_LGBM_feature_df = (
        pd.read_parquet(f"{features_dn_path}/{library}_feature_selection_CARB.parquet")
        for library in ("XGBoost", "LightGBM")
    )

In [None]:
# Keep only the feature combinations whose "Val_MAE_Change" is greater than
#   or equal to 0, then show the 10 rows with the lowest validation MAE,
#   i.e. the best performing combinations of features
(
    CARB_XGB_feature_df
    .loc[lambda results: results['Val_MAE_Change'] >= 0]
    .sort_values(by='Val_MAE')
    .head(10)
)

In [None]:

# For the same top 10 rows as the table above ("Val_MAE_Change" greater than
#   or equal to 0, lowest "Val_MAE" first), count the number of features that
#   were used to generate that MAE score.  The filter keeps these counts
#   aligned with the displayed table when fewer than 10 combinations qualify.
(
    CARB_XGB_feature_df[CARB_XGB_feature_df['Val_MAE_Change'] >= 0]
    .sort_values(by=['Val_MAE'], ascending=True)
    .head(10)['Features']
    .str.count(",") + 1
)

## Write out top features

In [None]:
# Pick the single best combination per library: lowest "Val_MAE" among the
#   rows with "Val_MAE_Change" >= 0.  "Features" holds the column names joined
#   with ", ", so each piece is stripped after splitting on the comma;
#   otherwise every name but the first would keep a leading space.
# An IndexError here means no combination has "Val_MAE_Change" >= 0.
CARB_XGB_feature_list = [
    name.strip()
    for name in (
        CARB_XGB_feature_df[CARB_XGB_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

CARB_LGBM_feature_list = [
    name.strip()
    for name in (
        CARB_LGBM_feature_df[CARB_LGBM_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

In [None]:
# Persist the selected CARB feature lists as pickle files
if deepnote:
    # Write to the DeepNote local folder first, then to the S3 integration
    #   folder.  The "/" separator is required: the path variables do not end
    #   with a slash.
    for out_dir in (features_dn_path, features_dns3_path):
        for file_name, feat_list in (
            ('CARB_XGB_feat_list.pkl', CARB_XGB_feature_list),
            ('CARB_LGBM_feat_list.pkl', CARB_LGBM_feature_list),
        ):
            with open(f'{out_dir}/{file_name}', 'wb') as f:
                pickle.dump(feat_list, f)
else:
    sf.write_to_s3(file_path=f"{features_s3_path}/CARB_XGB_feat_list.pkl", data=CARB_XGB_feature_list, pickle_file=True, **aws_env_vars)
    sf.write_to_s3(file_path=f"{features_s3_path}/CARB_LGBM_feat_list.pkl", data=CARB_LGBM_feature_list, pickle_file=True, **aws_env_vars)

# South East Asia and Australia Region

## Build the Feature Selection DataFrames

In [None]:
# This cell takes approximately 30 hours to execute on a 32 core x 5GHz machine
# SEAA_XGB_feature_df = feature_model(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht, feature=True, model_type='xgb')
# SEAA_LGBM_feature_df = feature_model(Xtt=SEAA_Xtt, Xvt=SEAA_Xvt, Xht=SEAA_Xht, ytt=SEAA_ytt, yvt=SEAA_yvt, yht=SEAA_yht, feature=True, model_type='lgbm')

## Write out the DataFrames

In [None]:
# Uncomment if writing out data
# if deepnote:
#     # Deep Note Local
#     SEAA_XGB_feature_df.to_parquet(f"{features_dn_path}/XGBoost_feature_selection_SEAA.parquet")
#     SEAA_LGBM_feature_df.to_parquet(f"{features_dn_path}/LightGBM_feature_selection_SEAA.parquet")

#     # S3 Integration
#     SEAA_XGB_feature_df.to_parquet(f"{features_dns3_path}/XGBoost_feature_selection_SEAA.parquet")
#     SEAA_LGBM_feature_df.to_parquet(f"{features_dns3_path}/LightGBM_feature_selection_SEAA.parquet")

# else:
#     sf.write_to_s3(file_path=f"{features_s3_path}/XGBoost_feature_selection_SEAA.parquet", data=SEAA_XGB_feature_df, **aws_env_vars)
#     sf.write_to_s3(file_path=f"{features_s3_path}/LightGBM_feature_selection_SEAA.parquet", data=SEAA_LGBM_feature_df, **aws_env_vars)

## Read DataFrames from disk if available instead of building

In [None]:
# Load the saved SEAA feature-selection results for both libraries:
#   from S3 when running outside DeepNote, otherwise from the local folder
if not deepnote:
    SEAA_XGB_feature_df, SEAA_LGBM_feature_df = (
        pd.read_parquet(sf.load_from_s3(file_path=f"{features_s3_path}/{library}_feature_selection_SEAA.parquet", **aws_env_vars))
        for library in ("XGBoost", "LightGBM")
    )
else:
    SEAA_XGB_feature_df, SEAA_LGBM_feature_df = (
        pd.read_parquet(f"{features_dn_path}/{library}_feature_selection_SEAA.parquet")
        for library in ("XGBoost", "LightGBM")
    )

In [None]:
# Keep only the feature combinations whose "Val_MAE_Change" is greater than
#   or equal to 0, then show the 10 rows with the lowest validation MAE,
#   i.e. the best performing combinations of features
(
    SEAA_XGB_feature_df
    .loc[lambda results: results['Val_MAE_Change'] >= 0]
    .sort_values(by='Val_MAE')
    .head(10)
)

In [None]:
# For the same top 10 rows as the table above ("Val_MAE_Change" greater than
#   or equal to 0, lowest "Val_MAE" first), count the number of features that
#   were used to generate that MAE score.  The filter keeps these counts
#   aligned with the displayed table when fewer than 10 combinations qualify.
(
    SEAA_XGB_feature_df[SEAA_XGB_feature_df['Val_MAE_Change'] >= 0]
    .sort_values(by=['Val_MAE'], ascending=True)
    .head(10)['Features']
    .str.count(",") + 1
)

## Write out top features

In [None]:
# Pick the single best combination per library: lowest "Val_MAE" among the
#   rows with "Val_MAE_Change" >= 0.  "Features" holds the column names joined
#   with ", ", so each piece is stripped after splitting on the comma;
#   otherwise every name but the first would keep a leading space.
# An IndexError here means no combination has "Val_MAE_Change" >= 0.
SEAA_XGB_feature_list = [
    name.strip()
    for name in (
        SEAA_XGB_feature_df[SEAA_XGB_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

SEAA_LGBM_feature_list = [
    name.strip()
    for name in (
        SEAA_LGBM_feature_df[SEAA_LGBM_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

In [None]:
# Persist the selected SEAA feature lists as pickle files
if deepnote:
    # Write to the DeepNote local folder first, then to the S3 integration
    #   folder.  The "/" separator is required: the path variables do not end
    #   with a slash.
    for out_dir in (features_dn_path, features_dns3_path):
        for file_name, feat_list in (
            ('SEAA_XGB_feat_list.pkl', SEAA_XGB_feature_list),
            ('SEAA_LGBM_feat_list.pkl', SEAA_LGBM_feature_list),
        ):
            with open(f'{out_dir}/{file_name}', 'wb') as f:
                pickle.dump(feat_list, f)
else:
    sf.write_to_s3(file_path=f"{features_s3_path}/SEAA_XGB_feat_list.pkl", data=SEAA_XGB_feature_list, pickle_file=True, **aws_env_vars)
    sf.write_to_s3(file_path=f"{features_s3_path}/SEAA_LGBM_feat_list.pkl", data=SEAA_LGBM_feature_list, pickle_file=True, **aws_env_vars)

# Global Region

## Build the Feature Selection DataFrames

In [None]:
# This cell takes approximately 30 hours to execute on a 32 core x 5GHz machine
# GLOB_XGB_feature_df = feature_model(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht, feature=True, model_type='xgb')
# GLOB_LGBM_feature_df = feature_model(Xtt=GLOB_Xtt, Xvt=GLOB_Xvt, Xht=GLOB_Xht, ytt=GLOB_ytt, yvt=GLOB_yvt, yht=GLOB_yht, feature=True, model_type='lgbm')

## Write out the DataFrames

In [None]:
# Uncomment if writing out data
# if deepnote:
#     # Deep Note Local
#     GLOB_XGB_feature_df.to_parquet(f"{features_dn_path}/XGBoost_feature_selection_GLOB.parquet")
#     GLOB_LGBM_feature_df.to_parquet(f"{features_dn_path}/LightGBM_feature_selection_GLOB.parquet")

#     # S3 Integration
#     GLOB_XGB_feature_df.to_parquet(f"{features_dns3_path}/XGBoost_feature_selection_GLOB.parquet")
#     GLOB_LGBM_feature_df.to_parquet(f"{features_dns3_path}/LightGBM_feature_selection_GLOB.parquet")

# else:
#     sf.write_to_s3(file_path=f"{features_s3_path}/XGBoost_feature_selection_GLOB.parquet", data=GLOB_XGB_feature_df, **aws_env_vars)
#     sf.write_to_s3(file_path=f"{features_s3_path}/LightGBM_feature_selection_GLOB.parquet", data=GLOB_LGBM_feature_df, **aws_env_vars)

## Read DataFrames from disk if available instead of building

In [None]:
# Load the saved GLOB feature-selection results for both libraries:
#   from S3 when running outside DeepNote, otherwise from the local folder
if not deepnote:
    GLOB_XGB_feature_df, GLOB_LGBM_feature_df = (
        pd.read_parquet(sf.load_from_s3(file_path=f"{features_s3_path}/{library}_feature_selection_GLOB.parquet", **aws_env_vars))
        for library in ("XGBoost", "LightGBM")
    )
else:
    GLOB_XGB_feature_df, GLOB_LGBM_feature_df = (
        pd.read_parquet(f"{features_dn_path}/{library}_feature_selection_GLOB.parquet")
        for library in ("XGBoost", "LightGBM")
    )

In [None]:
# Keep only the feature combinations whose "Val_MAE_Change" is greater than
#   or equal to 0, then show the 10 rows with the lowest validation MAE,
#   i.e. the best performing combinations of features
(
    GLOB_XGB_feature_df
    .loc[lambda results: results['Val_MAE_Change'] >= 0]
    .sort_values(by='Val_MAE')
    .head(10)
)

In [None]:
# For the same top 10 rows as the table above ("Val_MAE_Change" greater than
#   or equal to 0, lowest "Val_MAE" first), count the number of features that
#   were used to generate that MAE score.  The filter keeps these counts
#   aligned with the displayed table when fewer than 10 combinations qualify.
(
    GLOB_XGB_feature_df[GLOB_XGB_feature_df['Val_MAE_Change'] >= 0]
    .sort_values(by=['Val_MAE'], ascending=True)
    .head(10)['Features']
    .str.count(",") + 1
)

## Write out top features

In [None]:
# Pick the single best combination per library: lowest "Val_MAE" among the
#   rows with "Val_MAE_Change" >= 0.  "Features" holds the column names joined
#   with ", ", so each piece is stripped after splitting on the comma;
#   otherwise every name but the first would keep a leading space.
# An IndexError here means no combination has "Val_MAE_Change" >= 0.
GLOB_XGB_feature_list = [
    name.strip()
    for name in (
        GLOB_XGB_feature_df[GLOB_XGB_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

GLOB_LGBM_feature_list = [
    name.strip()
    for name in (
        GLOB_LGBM_feature_df[GLOB_LGBM_feature_df['Val_MAE_Change'] >= 0]
        .sort_values(by=['Val_MAE'], ascending=True)
        .iloc[0]['Features']
        .split(',')
    )
]

In [None]:
# Persist the selected GLOB feature lists as pickle files
if deepnote:
    # Write to the DeepNote local folder first, then to the S3 integration
    #   folder.  The "/" separator is required: the path variables do not end
    #   with a slash.
    for out_dir in (features_dn_path, features_dns3_path):
        for file_name, feat_list in (
            ('GLOB_XGB_feat_list.pkl', GLOB_XGB_feature_list),
            ('GLOB_LGBM_feat_list.pkl', GLOB_LGBM_feature_list),
        ):
            with open(f'{out_dir}/{file_name}', 'wb') as f:
                pickle.dump(feat_list, f)
else:
    sf.write_to_s3(file_path=f"{features_s3_path}/GLOB_XGB_feat_list.pkl", data=GLOB_XGB_feature_list, pickle_file=True, **aws_env_vars)
    sf.write_to_s3(file_path=f"{features_s3_path}/GLOB_LGBM_feat_list.pkl", data=GLOB_LGBM_feature_list, pickle_file=True, **aws_env_vars)