# Regression - ElasticNet

## Notebook Setup

### Import Libraries

In [None]:
# Import Standard Libraries
import os
import datetime
import pickle
import itertools
import pandas as pd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import shap

In [None]:
# Import Modeling Libraries
from sklearn import set_config
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
import neptune

# ElasticNet
from sklearn.linear_model import ElasticNet

In [None]:
# Pandas display / behaviour options:
#   - no SettingWithCopy warnings on chained assignment
#   - floats rendered with two decimals
#   - never truncate the columns or rows of a displayed frame
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress FutureWarning, UserWarning and pandas PerformanceWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Plotly maps: the MapBox token is read from the MAPBOX_TOKEN environment variable
# (os.environ.get yields None when the variable is not set)
px.set_mapbox_access_token(os.environ.get("MAPBOX_TOKEN"))

# scikit-learn: transformers output pandas objects
set_config(transform_output="pandas")

### Common Functions

In [None]:
def import_data(common_path="/work/data", xy_path="Xy_Data", location_name='CARB'):
    """
    Load the transformed feature frames and the targets for one geography.

    common_path: Root folder of the data, e.g. '/work/data'
    xy_path: Sub-folder of common_path that holds the parquet files
    location_name: Geography suffix used in the file names, e.g. 'CARB', 'SEAA', 'GLOB'

    Returns the 6-tuple (X_train_trans, X_val_trans, X_holdout_trans,
    y_train_trans, y_val_trans, y_holdout_trans). The targets are handed back
    exactly as read from disk -- y is not transformed.
    """
    data_path = os.path.join(common_path, xy_path)

    def _read(stem):
        # File naming convention: <stem>_<location_name>.parquet
        return pd.read_parquet(f"{data_path}/{stem}_{location_name}.parquet")

    splits = ("train", "val", "holdout")

    # Features first, then targets, each in train / val / holdout order
    X_frames = [_read(f"X_{split}_trans") for split in splits]
    y_frames = [_read(f"y_{split}") for split in splits]

    return (*X_frames, *y_frames)

# Load the Caribbean split up front so the notebook-level Xy names exist
# before the helper functions below are defined
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = import_data(location_name='CARB')

In [None]:
def objective(space):
    """
    Hyperopt objective function for the ElasticNet regressor.

    space: dict of sampled hyperparameters with the keys 'alpha' and 'l1_ratio';
      both entries are cast to float in place before they are used.

    An ElasticNet is built from the sampled values and scored with a shuffled
    5-fold cross validation on the notebook-level X_train_trans / y_train_trans.
    The per-fold MAE scores, their mean and their standard deviation are logged
    to the active Neptune `run`; the mean and std are returned to Hyperopt as
    'loss' and 'std'.

    NOTE: Lasso is the L1 penalty and Ridge is the L2 penalty
    DO NOT MAKE ALPHA = 0 WHEN USING LASSO
    """
    for name in ('alpha', 'l1_ratio'):
        space[name] = float(space[name])

    estimator = ElasticNet(
        alpha=space['alpha'],
        l1_ratio=space['l1_ratio'],
        fit_intercept=True,   # If False, data is assumed to already be centered
        precompute=False,     # For sparse matrices, set False
        max_iter=1000,        # Default is 1000
        copy_X=True,          # Copies the input data, else could be overwritten!
        tol=1e-4,             # A threshold for optimization
        warm_start=False,     # When True, use previous solution as the fit
        positive=False,       # When True, forces coefficients to be positive
        random_state=42,
        selection='cyclic',   # 'cyclic' or 'random' -- random could be faster but cyclic is methodical
    )

    # The scorer returns negated MAE values, so flip the sign back
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    neg_mae = cross_val_score(estimator, X_train_trans, y_train_trans, cv=folds, scoring='neg_mean_absolute_error')
    fold_mae = -neg_mae
    loss = fold_mae.mean()
    spread = fold_mae.std()

    # Model specific params first, then the standard params and metrics
    neptune_entries = (
        ('parameters/alpha', space['alpha']),
        ('parameters/l1_ratio', space['l1_ratio']),
        ('parameters/all_parameters', str(space)),
        ('metrics/mae_scores', str(fold_mae.tolist())),
        ('metrics/mean_mae', loss),
        ('metrics/std_mae', spread),
    )
    for field, value in neptune_entries:
        run[field].log(value)

    return {'loss': loss, 'status': STATUS_OK, 'std': spread}

In [None]:
def _report_split(label, model, X, y):
    """Print MAE, MSE, RMSE and R^2 of `model` on (X, y), with predictions clipped to [0, 100]."""
    pred = np.clip(model.predict(X), 0, 100)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    # RMSE is derived from the MSE instead of using `squared=False`, which newer
    # scikit-learn releases deprecate; for a single target the value is identical
    rmse = float(np.sqrt(mse))
    rsq = r2_score(y, pred)
    print(" ")
    print(f"{label} Mean Absolute Error: {mae:.4f}")
    print(f"{label} Mean Squared Error: {mse:.4f}")
    print(f"{label} Root Mean Squared Error: {rmse:.4f}")
    print(f"{label} R^2 Score: {rsq:.4f}")


def model_score(hps, val=True, train=False, holdout=False, Xtt=None, ytt=None, Xvt=None, yvt=None, Xht=None, yht=None):
    """
    This function rebuilds the model with the desired hyperparameters,
    fits it on the training data and prints the requested evaluations.

    hps: dict of ElasticNet keyword arguments (e.g. 'alpha', 'l1_ratio')
    val: True will evaluate the model on the validation data
    holdout: True will evaluate the model on the holdout data
    train: True will evaluate the model on the train data
    Xtt: X_train_trans
    ytt: y_train_trans
    Xvt: X_val_trans
    yvt: y_val_trans
    Xht: X_holdout_trans
    yht: y_holdout_trans

    Any data argument left as None falls back to the notebook-level variable of
    the same meaning *at call time*. (Using the frames themselves as default
    values would freeze whatever region was loaded when this function was
    defined, because Python evaluates defaults once, at definition.)

    Returns the fitted ElasticNet model.
    """
    # Resolve omitted data lazily, so the currently loaded region is used
    Xtt = X_train_trans if Xtt is None else Xtt
    ytt = y_train_trans if ytt is None else ytt

    # Rebuild model
    model = ElasticNet(**hps, random_state=42)
    model.fit(Xtt, ytt)

    # Evaluate model and print results
    if train:
        _report_split("Train", model, Xtt, ytt)

    if val:
        Xvt = X_val_trans if Xvt is None else Xvt
        yvt = y_val_trans if yvt is None else yvt
        _report_split("Validation", model, Xvt, yvt)

    if holdout:
        Xht = X_holdout_trans if Xht is None else Xht
        yht = y_holdout_trans if yht is None else yht
        _report_split("Holdout", model, Xht, yht)

    return model

In [None]:
def plot_feat_importance(model):
    """
    Plot the feature importance as a horizontal bar chart.

    model: The fitted model object itself. It must expose `feature_names_in_`
      and `coef_`; the importance of a feature is the absolute value of its
      coefficient.

    Shows the figure; returns nothing.
    """
    # np.ravel keeps a 1-D coefficient vector as is and flattens a (1, n_features) one
    fi_df = pd.DataFrame({
        'Feature': model.feature_names_in_,
        'Importance': np.abs(np.ravel(model.coef_)),
    }).sort_values(by='Importance', ascending=True)

    # Plot the feature importance
    fig = px.bar(fi_df, x="Importance", y="Feature", orientation='h', color_discrete_sequence=['darkorange'])
    # The features live on the y axis of a horizontal bar chart, so that is the
    # categorical axis to order (category ordering has no effect on the numeric x axis)
    fig.update_yaxes(categoryorder='total ascending')
    fig.update_layout(
        title={
            'text': "Feature Importance",
            'x': 0.5, 'xanchor': 'center',
            'y': 0.90, 'yanchor': 'top'},
        xaxis=dict(title="Feature Importance", title_standoff=2),
        yaxis=dict(title="Feature", title_standoff=0),
        height=400, width=650)
    # Plotly text uses <br> for line breaks; a plain "\n" is not rendered as one
    fig.add_annotation(
        text="Data Sources: Global Coral Bleaching Database, World Bank WDI<br>Marine Ecoregions of the World",
        x=0.5, y=-0.25, showarrow=False,
        font=dict(size=10, color="grey"),
        xref="paper", yref="paper", align="center",
    )

    fig.show()

In [None]:
def write_out(model, trials, params, feat_cols=None, common_path="/work/models", model_family="elasticnet_reg", location_name="CARB"):
    """
    Write out the model artifacts to disk as pickle files.

    model: The model object itself
    trials: The Hyperopt trials object
    params: The best model hyperparameters from the trials object
    feat_cols: Optional sequence of feature column names; written only when it
      is given and non-empty
    common_path: The common path for model artifacts e.g. '/work/models'
    model_family: The model family, e.g. 'xgboost_reg', 'lightgbm_reg', 'elasticnet_reg'
    location_name: The geography that the model is trained for, e.g. 'SEAA', 'CARB', 'GLOB'

    Files are written to <common_path>/<model_family>/<location_name>/ and are
    named <YYYYmmdd_HHMMSS>_<artifact>.pkl, all sharing one timestamp per call.
    """
    date_time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = os.path.join(common_path, model_family, location_name)

    # make sure the path exists
    os.makedirs(model_path, exist_ok=True)

    # Artifact suffix -> object to pickle
    artifacts = {
        'trials': trials,   # the HyperOpt Trials object
        'model': model,     # the fitted model object
        'params': params,   # the best hyperparameters
    }

    # Write out the feature columns only if they exist
    # (len() rather than truthiness, so a pandas Index is accepted as well)
    if feat_cols is not None and len(feat_cols) > 0:
        artifacts['feat_cols'] = feat_cols

    for suffix, payload in artifacts.items():
        file_path = os.path.join(model_path, f"{date_time_str}_{suffix}.pkl")
        with open(file_path, 'wb') as f:
            pickle.dump(payload, f)

In [None]:
def _clipped_mae(fitted_model, X, y):
    """Return the MAE of fitted_model on (X, y), with predictions clipped to [0, 100]."""
    predictions = np.clip(fitted_model.predict(X), 0, 100)
    return mean_absolute_error(y, predictions)


def feat_ablation(model, hps, Xtt, ytt, Xvt, yvt, Xht, yht, abl_list_to_combo=None):
    """
    Perform feature ablation analysis.

    Every column of Xtt is removed on its own, the model is refit and the train /
    validation MAE are compared with the all-features baseline. Optionally, every
    combination (of size 2 and up) of the names in abl_list_to_combo is ablated too.

    model: The model object itself (not used: a fresh model is built from hps)
    hps: dict of hyperparameters passed on to model_score
    Xtt: X_train_trans
    ytt: y_train_trans
    Xvt: X_val_trans
    yvt: y_val_trans
    Xht: X_holdout_trans (only forwarded to model_score; not evaluated here)
    yht: y_holdout_trans (only forwarded to model_score; not evaluated here)
    abl_list_to_combo: optional list of column names to ablate in combination

    Returns (feature_ablation_df, baseline_mae_val, baseline_mae_train).
    A positive *_MAE_Change / *_MAE_Pct_Change means the MAE went DOWN after the
    feature(s) were removed.
    """
    combo_source = [] if abl_list_to_combo is None else list(abl_list_to_combo)

    # Create the base model object with hyperparameters (fit on all features)
    # Note: this same model object is refit during the ablation loop
    abl_model = model_score(hps, Xtt=Xtt, ytt=ytt, Xvt=Xvt, yvt=yvt, Xht=Xht, yht=yht, train=False, val=False, holdout=False)

    # Baselines with all features present
    baseline_mae_train = _clipped_mae(abl_model, Xtt, ytt)
    baseline_mae_val = _clipped_mae(abl_model, Xvt, yvt)

    print(f"Baseline Train MAE: {baseline_mae_train:.4f}, Validation MAE: {baseline_mae_val:.4f}")

    # Features for ablation: start with every single feature ...
    abl_list = [[col] for col in Xtt.columns]

    # ... then add the combos of the features passed in as abl_list_to_combo.
    # Range starts at 2 to skip single columns; itertools.combinations() creates
    # tuples, so each one is converted to a list for DataFrame.drop
    abl_list += [
        list(combo)
        for r in range(2, len(combo_source) + 1)
        for combo in itertools.combinations(combo_source, r)
    ]

    # Feat ablation loop
    ablation_results_list = []
    for feature in abl_list:
        # drop ablated cols
        modified_X_train_trans = Xtt.drop(columns=feature)
        modified_X_val_trans = Xvt.drop(columns=feature)

        # Fit the model with ablated features
        abl_model.fit(modified_X_train_trans, ytt)

        modified_mae_train = _clipped_mae(abl_model, modified_X_train_trans, ytt)
        modified_mae_val = _clipped_mae(abl_model, modified_X_val_trans, yvt)

        ablation_results_list.append({
            'Removed_Feature': ", ".join(feature),
            'Train_MAE': modified_mae_train,
            'Train_MAE_Change': baseline_mae_train - modified_mae_train,
            'Train_MAE_Pct_Change': 100*(1-(modified_mae_train/baseline_mae_train)),
            'Val_MAE': modified_mae_val,
            'Val_MAE_Change': baseline_mae_val - modified_mae_val,
            'Val_MAE_Pct_Change': 100*(1-(modified_mae_val/baseline_mae_val)),
        })

    feature_ablation_df = pd.DataFrame(ablation_results_list)

    return feature_ablation_df, baseline_mae_val, baseline_mae_train

In [None]:
# Define the Hyperparameter space
# alpha: sampled log-uniformly between 1e-1 and 1e3 (never 0)
# l1_ratio: sampled uniformly between 0 and 1.0
space = {
    'alpha': hp.loguniform('alpha', np.log(1e-1), np.log(1e3)),
    'l1_ratio': hp.uniform('l1_ratio', 0, 1.0)
}

# Caribbean Region

## Import the Data

In [None]:
# Load the Caribbean (CARB) splits into the notebook-level Xy variables
# (the objective function reads X_train_trans / y_train_trans from these globals)
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = import_data(location_name='CARB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# The objective minimises the cross-validated MAE, so the run is tagged and described as MAE
run = neptune.init_run(
    name="ElasticNet Reg Caribbean",
    tags=["ElasticNet", "regression", "hyperopt", "MAE", "Caribbean", "CARB"],
    description="ElasticNet Hyperopt with MAE on Caribbean"
)

# Create the Trials object
CARB_trials = Trials()

# Run the search (at most 1000 evaluations, early stop via no_progress_loss(100))
# The Neptune run is always stopped, even if the search fails or is interrupted
try:
    CARB_best_hyperparams = fmin(fn=objective,
                                 space=space,
                                 algo=tpe.suggest,
                                 max_evals=1000,
                                 trials=CARB_trials,
                                 show_progressbar=True,
                                 early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance
    run.stop()

CARB_best_trial = CARB_trials.best_trial
CARB_best_hps = CARB_best_hyperparams.copy()

# Cast to plain Python floats before the values are reused as model arguments
CARB_best_hps['alpha'] = float(CARB_best_hps['alpha'])
CARB_best_hps['l1_ratio'] = float(CARB_best_hps['l1_ratio'])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE, 'std' the standard deviation across the folds
print(f"Best Mean Absolute Error: {CARB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {CARB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters
CARB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Recreate the model with the best hyperparameters
# The data is passed explicitly (rather than relying on model_score's argument
# defaults) so the model is fit and scored on the region loaded above
CARB_model = model_score(
    CARB_best_hps, holdout=True, val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans)

# Plot the feature importance
plot_feat_importance(CARB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# No combo list is passed, so each feature is ablated on its own; the call returns
# the results frame plus the baseline validation MAE and baseline train MAE
CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = feat_ablation(
    model=CARB_model, hps=CARB_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
CARB_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Generate the top 5 features to use in feature ablation combinations
# Only ablations that did not worsen the validation MAE qualify, best first.
# .to_list() keeps all (up to) five names; taking .values[0] would keep only the first one
CARB_abl_list_to_combo = CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

# Show the top 5 features
CARB_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# Besides every single feature, all combinations (size 2 and up) of the names in
# CARB_abl_list_to_combo are ablated; the combinations are built with itertools
CARB_feature_ablation_df, CARB_baseline_mae_val, CARB_baseline_mae_train = feat_ablation(
    model=CARB_model, hps=CARB_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=CARB_abl_list_to_combo)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
# (only the ablations whose validation MAE is no worse than the baseline)
CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Show the top 5 candidate removals, best validation improvement first
# Each entry is a 'Removed_Feature' string; a combo entry holds several names joined by ', '
CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

In [None]:
# Candidate removals: ablations whose validation MAE is no worse than the baseline, best first
CARB_improving_df = CARB_feature_ablation_df[CARB_feature_ablation_df['Val_MAE'] <= CARB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

# Define the top features to drop (a combo row holds several names joined by ', ')
# If no ablation qualified, drop nothing instead of failing on an empty selection
CARB_drop_cols = CARB_improving_df['Removed_Feature'].iloc[0].split(', ') if not CARB_improving_df.empty else []

# Create the feature columns list for saving out
CARB_feat_columns = X_train_trans.drop(columns=CARB_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
CARB_experiment_model = model_score(
    CARB_best_hps, val=True,
    Xtt=X_train_trans.drop(columns=CARB_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=CARB_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=CARB_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
plot_feat_importance(CARB_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# CARB_feat_columns is the reduced (post-ablation) column list, so the model saved
# alongside it must be the one fit on exactly those columns: CARB_experiment_model
write_out(model=CARB_experiment_model, trials=CARB_trials, params=CARB_best_hps, feat_cols=CARB_feat_columns, location_name='CARB')

## Review Validation Truth vs Predictions

In [None]:
# Build a comparison frame from the validation targets
eval_df = y_val_trans.copy(deep=True)
CARB_val_pred = CARB_model.predict(X_val_trans)
# Clip the predictions to [0, 100], as is done when the model is scored
CARB_val_pred = np.clip(CARB_val_pred, 0, 100)
eval_df['predictions'] = CARB_val_pred.tolist()
# diff = truth - prediction, so a positive diff means the model under-predicted
# NOTE(review): assumes the target column is named 'y_val' -- confirm against the parquet schema
eval_df['diff'] = eval_df['y_val'] - eval_df['predictions']

In [None]:
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Distribution of diff for the rows whose true value is above 5
# eval_df is built from the validation split, so the title names validation (not holdout)
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between validation truth and prediction where true value > 5')

# Show the plot
fig.show()

In [None]:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df[~(eval_df['diff'].between(-5,5))]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# CARB_X_val_trans = X_val_trans.copy(deep=True)
# CARB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(CARB_model)
# shap_values = explainer(CARB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(CARB_X_val_trans)
# sv = explainer.shap_values(CARB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = CARB_X_val_trans.index.get_loc(5809)
# print(CARB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(CARB_X_val_trans)[investigate], CARB_X_val_trans, feature_display_range=slice(-1,-51,-1))

# South East Asia and Australia Region

## Import the Data

In [None]:
# Load the South East Asia and Australia (SEAA) splits; this rebinds the notebook-level Xy variables
# (the objective function reads X_train_trans / y_train_trans from these globals)
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = import_data(location_name='SEAA')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
# The objective minimises the cross-validated MAE, so the run is tagged and described as MAE
run = neptune.init_run(
    name="ElasticNet Reg South East Asia and Australia",
    tags=["ElasticNet", "regression", "hyperopt", "MAE", "South East Asia and Australia", "SEAA"],
    description="ElasticNet Hyperopt with MAE on South East Asia and Australia"
)

# Create the Trials object
SEAA_trials = Trials()

# Run the search (at most 1000 evaluations, early stop via no_progress_loss(100))
# The Neptune run is always stopped, even if the search fails or is interrupted
try:
    SEAA_best_hyperparams = fmin(fn=objective,
                                 space=space,
                                 algo=tpe.suggest,
                                 max_evals=1000,
                                 trials=SEAA_trials,
                                 show_progressbar=True,
                                 early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance
    run.stop()

SEAA_best_trial = SEAA_trials.best_trial
SEAA_best_hps = SEAA_best_hyperparams.copy()

# Cast to plain Python floats before the values are reused as model arguments
SEAA_best_hps['alpha'] = float(SEAA_best_hps['alpha'])
SEAA_best_hps['l1_ratio'] = float(SEAA_best_hps['l1_ratio'])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE, 'std' the standard deviation across the folds
print(f"Best Mean Absolute Error: {SEAA_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {SEAA_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters
SEAA_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Recreate the model with the best hyperparameters
# The SEAA data is passed explicitly: argument defaults that hold data frames are
# evaluated once, when the function is defined, and would not follow the reload above
SEAA_model = model_score(
    SEAA_best_hps, holdout=True, val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans)

# Plot the feature importance
plot_feat_importance(SEAA_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# No combo list is passed, so each feature is ablated on its own; the call returns
# the results frame plus the baseline validation MAE and baseline train MAE
SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = feat_ablation(
    model=SEAA_model, hps=SEAA_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
SEAA_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Generate the top 5 features to use in feature ablation combinations
# Only ablations whose validation MAE is no worse than the baseline qualify;
# they are ordered by Val_MAE_Change (largest first) and at most 5 names are kept
SEAA_abl_list_to_combo = SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

# Review the list
SEAA_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# Besides every single feature, all combinations (size 2 and up) of the names in
# SEAA_abl_list_to_combo are ablated; the combinations are built with itertools
SEAA_feature_ablation_df, SEAA_baseline_mae_val, SEAA_baseline_mae_train = feat_ablation(
    model=SEAA_model, hps=SEAA_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=SEAA_abl_list_to_combo)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Show the top 1 feature, which we'll remove.  Might be multiple features, so we split the string
SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).values[0].split(', ')

In [None]:
# Candidate removals: ablations whose validation MAE is no worse than the baseline, best first
SEAA_improving_df = SEAA_feature_ablation_df[SEAA_feature_ablation_df['Val_MAE'] <= SEAA_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

# Define the top features to drop (a combo row holds several names joined by ', ')
# If no ablation qualified, drop nothing instead of failing on an empty selection
SEAA_drop_cols = SEAA_improving_df['Removed_Feature'].iloc[0].split(', ') if not SEAA_improving_df.empty else []

# Create the feature columns list for saving out
SEAA_feat_columns = X_train_trans.drop(columns=SEAA_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
SEAA_experiment_model = model_score(
    SEAA_best_hps, val=True,
    Xtt=X_train_trans.drop(columns=SEAA_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=SEAA_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=SEAA_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
plot_feat_importance(SEAA_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# SEAA_feat_columns is the reduced (post-ablation) column list, so the model saved
# alongside it must be the one fit on exactly those columns: SEAA_experiment_model
write_out(model=SEAA_experiment_model, trials=SEAA_trials, params=SEAA_best_hps, feat_cols=SEAA_feat_columns, location_name='SEAA')

## Review Validation Truth vs Predictions

In [None]:
# Build a comparison frame from the validation targets
eval_df = y_val_trans.copy(deep=True)
SEAA_val_pred = SEAA_model.predict(X_val_trans)
# Clip the predictions to [0, 100], as is done when the model is scored
SEAA_val_pred = np.clip(SEAA_val_pred, 0, 100)
eval_df['predictions'] = SEAA_val_pred.tolist()
# diff = truth - prediction, so a positive diff means the model under-predicted
# NOTE(review): assumes the target column is named 'y_val' -- confirm against the parquet schema
eval_df['diff'] = eval_df['y_val'] - eval_df['predictions']

In [None]:
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Distribution of diff for the rows whose true value is above 5
# eval_df is built from the validation split, so the title names validation (not holdout)
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between validation truth and prediction where true value > 5')

# Show the plot
fig.show()

In [None]:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df[~(eval_df['diff'].between(-5,5))]


In [None]:
eval_df[(eval_df['diff'] > 100) | (eval_df['diff'] < 0)].sort_values(by='diff', ascending=False).head(20)


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# SEAA_X_val_trans = X_val_trans.copy(deep=True)
# SEAA_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(SEAA_model)
# shap_values = explainer(SEAA_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(SEAA_X_val_trans)
# sv = explainer.shap_values(SEAA_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = SEAA_X_val_trans.index.get_loc(5809)
# print(SEAA_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(SEAA_X_val_trans)[investigate], SEAA_X_val_trans, feature_display_range=slice(-1,-51,-1))

# Global Region

## Import the Data

In [None]:
# Load the Global (GLOB) splits; this rebinds the notebook-level Xy variables
# (the objective function reads X_train_trans / y_train_trans from these globals)
X_train_trans, X_val_trans, X_holdout_trans, y_train_trans, y_val_trans, y_holdout_trans = import_data(location_name='GLOB')

## Hyperparameter Tuning

### Execute Hyperparameter search

In [None]:
# Instantiate Neptune instance
run = neptune.init_run(
    name="ElasticNet Reg Global",
    tags=["ElasticNet", "regression", "hyperopt", "MAE", "Global", "GLOB"],
    description="ElasticNet Hyperopt with MAE on Global"
)

# Create the Trials object
GLOB_trials = Trials()

# Run the search (at most 1000 evaluations, early stop via no_progress_loss(100))
# The Neptune run is always stopped, even if the search fails or is interrupted
try:
    GLOB_best_hyperparams = fmin(fn=objective,
                                 space=space,
                                 algo=tpe.suggest,
                                 max_evals=1000,
                                 trials=GLOB_trials,
                                 show_progressbar=True,
                                 early_stop_fn=no_progress_loss(100))
finally:
    # Stop Neptune instance
    run.stop()

GLOB_best_trial = GLOB_trials.best_trial
GLOB_best_hps = GLOB_best_hyperparams.copy()

# Cast to plain Python floats before the values are reused as model arguments
GLOB_best_hps['alpha'] = float(GLOB_best_hps['alpha'])
GLOB_best_hps['l1_ratio'] = float(GLOB_best_hps['l1_ratio'])

### Review the Hyperparameter Search Validation MAE Scores and Hyperparameters

In [None]:
# What's the best trial from the search? Report the MAE & StdDev
# 'loss' is the mean cross-validated MAE, 'std' the standard deviation across the folds
print(f"Best Mean Absolute Error: {GLOB_trials.best_trial['result']['loss']:.4f}")
print(f"Best Standard Deviation: {GLOB_trials.best_trial['result']['std']:.4f}")

In [None]:
# Display the best hyperparameters
GLOB_best_hps

## Recreate the model and get new MAE and Feature Importance

In [None]:
# Recreate the model with the best hyperparameters
# The GLOB data is passed explicitly: argument defaults that hold data frames are
# evaluated once, when the function is defined, and would not follow the reload above
GLOB_model = model_score(
    GLOB_best_hps, holdout=True, val=True,
    Xtt=X_train_trans, ytt=y_train_trans,
    Xvt=X_val_trans, yvt=y_val_trans,
    Xht=X_holdout_trans, yht=y_holdout_trans)

# Plot the feature importance
plot_feat_importance(GLOB_model)

## Feature Ablation

In [None]:
%%time
# First run through the feature ablation process
# This will determine which features can potentially be removed
# No combo list is passed, so each feature is ablated on its own; the call returns
# the results frame plus the baseline validation MAE and baseline train MAE
GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = feat_ablation(
    model=GLOB_model, hps=GLOB_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
GLOB_feature_ablation_df.sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Generate the top 5 features to use in feature ablation combinations
# Only ablations whose validation MAE is no worse than the baseline qualify;
# they are ordered by Val_MAE_Change (largest first) and at most 5 names are kept
GLOB_abl_list_to_combo = GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(5).to_list()

# Review the list
GLOB_abl_list_to_combo

In [None]:
%%time
# Second run through the feature ablation process
# Besides every single feature, all combinations (size 2 and up) of the names in
# GLOB_abl_list_to_combo are ablated; the combinations are built with itertools
GLOB_feature_ablation_df, GLOB_baseline_mae_val, GLOB_baseline_mae_train = feat_ablation(
    model=GLOB_model, hps=GLOB_best_hps, 
    Xtt=X_train_trans, ytt=y_train_trans, 
    Xvt=X_val_trans, yvt=y_val_trans, 
    Xht=X_holdout_trans, yht=y_holdout_trans, 
    abl_list_to_combo=GLOB_abl_list_to_combo)

In [None]:
# Show the feature ablation dataframe, sorted by the change in MAE to Validation data
GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

In [None]:
# Show the top 1 feature, which we'll remove.  Might be multiple features, so we split the string
GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)['Removed_Feature'].head(1).values[0].split(', ')

In [None]:
# Candidate removals: ablations whose validation MAE is no worse than the baseline, best first
GLOB_improving_df = GLOB_feature_ablation_df[GLOB_feature_ablation_df['Val_MAE'] <= GLOB_baseline_mae_val].sort_values('Val_MAE_Change', ascending=False)

# Define the top features to drop (a combo row holds several names joined by ', ')
# If no ablation qualified, drop nothing instead of failing on an empty selection
GLOB_drop_cols = GLOB_improving_df['Removed_Feature'].iloc[0].split(', ') if not GLOB_improving_df.empty else []

# Create the feature columns list for saving out
GLOB_feat_columns = X_train_trans.drop(columns=GLOB_drop_cols).columns.to_list()

# Recreate the model with the dropped columns
# It should have a lower MAE score than the original
GLOB_experiment_model = model_score(
    GLOB_best_hps, val=True,
    Xtt=X_train_trans.drop(columns=GLOB_drop_cols), ytt=y_train_trans,
    Xvt=X_val_trans.drop(columns=GLOB_drop_cols), yvt=y_val_trans,
    Xht=X_holdout_trans.drop(columns=GLOB_drop_cols), yht=y_holdout_trans)

# Plot the feature importance for this model
plot_feat_importance(GLOB_experiment_model)

### Output the Model, Trials, Parameters and Feature List to disk

In [None]:
# Write out the model artifacts to disk
# The full column list of the current X_train_trans is saved here, not the ablated GLOB_feat_columns
# NOTE(review): presumably because GLOB_model (not GLOB_experiment_model) is the model being written -- confirm
#write_out(model=GLOB_model, trials=GLOB_trials, params=GLOB_best_hps, feat_cols=GLOB_feat_columns, location_name='GLOB')
write_out(model=GLOB_model, trials=GLOB_trials, params=GLOB_best_hps, feat_cols=X_train_trans.columns.to_list(), location_name='GLOB')

## Review Validation Truth vs Predictions

In [None]:
# Build a comparison frame from the validation targets
eval_df = y_val_trans.copy(deep=True)
GLOB_val_pred = GLOB_model.predict(X_val_trans)
# Clip the predictions to [0, 100], as is done when the model is scored
GLOB_val_pred = np.clip(GLOB_val_pred, 0, 100)
eval_df['predictions'] = GLOB_val_pred.tolist()
# diff = truth - prediction, so a positive diff means the model under-predicted
# NOTE(review): assumes the target column is named 'y_val' -- confirm against the parquet schema
eval_df['diff'] = eval_df['y_val'] - eval_df['predictions']

In [None]:
eval_df.sort_values(by='diff', ascending=False).head(20)

In [None]:
# Distribution of diff for the rows whose true value is above 5
# eval_df is built from the validation split, so the title names validation (not holdout)
fig = px.histogram(eval_df[eval_df['y_val'] > 5], x='diff', nbins=100, title='Distribution of the diff between validation truth and prediction where true value > 5')

# Show the plot
fig.show()

In [None]:
#eval_df[(eval_df['diff'].between(-5,5))]
#eval_df[(eval_df['diff'].between(-5,5)) & (eval_df['y_val'] > 5)]
eval_df[~(eval_df['diff'].between(-5,5))]


## SHAP Analysis

### Create SHAP Objects

In [None]:
# %%time
# GLOB_X_val_trans = X_val_trans.copy(deep=True)
# GLOB_y_val_trans = y_val_trans.copy(deep=True)
# explainer = shap.TreeExplainer(GLOB_model)
# shap_values = explainer(GLOB_X_val_trans)
# shap_interaction = explainer.shap_interaction_values(GLOB_X_val_trans)
# sv = explainer.shap_values(GLOB_X_val_trans)
# shap.initjs()

In [None]:
# Beeswarm Importance
# plt.figure(figsize=(8,6))
# plt.title(f"SHAP Beeswarm Analysis")
# shap.plots.beeswarm(shap_values, max_display=32)
# plt.close()

### Investigate Individual Predictions

In [None]:
#investigate = GLOB_X_val_trans.index.get_loc(5809)
# print(GLOB_y_val_trans.iloc[investigate])

In [None]:
# Waterfall on a prediction
# plt.figure(figsize=(8,6))
# shap.plots.waterfall(shap_values[investigate], max_display=20) # , show=False
# plt.close()

In [None]:
# Decision plot on a prediction
# shap.decision_plot(explainer.expected_value, explainer.shap_values(GLOB_X_val_trans)[investigate], GLOB_X_val_trans, feature_display_range=slice(-1,-51,-1))