# Explainability Metrics for Regression

In [1]:
import os
import sys

# Resolve the directory three levels above the current working directory and
# put it first on the import path (presumably the repository root, so that the
# local `holisticai` package is importable -- confirm against the repo layout).
working_dir = os.getcwd()
parent_dir = os.path.dirname(working_dir)
grandparent_dir = os.path.dirname(parent_dir)
base_path = os.path.dirname(grandparent_dir)
sys.path.insert(0, base_path)

In [2]:
from holisticai.datasets import load_us_crime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from holisticai.explainability import Explainer
from holisticai.efficacy.metrics import regression_efficacy_metrics

ModuleNotFoundError: No module named 'holisticai'

## Data Preprocessing

In [None]:
# Load the US crime dataset as pandas objects, then join the feature frame and
# the target side by side into a single dataframe.
dataset = load_us_crime(return_X_y=False, as_frame=True)
frames = [dataset[key] for key in ("data", "target")]
df = pd.concat(frames, axis=1)
df.head()

In [None]:
def preprocess_us_crime_dataset(df, protected_feature, *, max_missing=1000, threshold=0.5):
    """Clean the US crime dataframe and derive the protected-group masks.

    Columns holding ``max_missing`` or more NaN values are discarded first;
    afterwards every row that still contains a NaN is removed. The group masks
    are computed on the remaining rows, so they share their index with the
    returned dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is not modified.
    protected_feature : str
        Name of the column used to split the rows into two groups. The column
        must survive the NaN column filter.
    max_missing : int, default 1000
        A column is kept only if its NaN count is strictly below this value.
    threshold : float, default 0.5
        Rows whose ``protected_feature`` value is strictly greater than this
        value form ``group_a``.

    Returns
    -------
    df_clean : pandas.DataFrame
        Cleaned data without the columns whose name starts with ``race`` or
        ``age`` and without the first three of the remaining columns.
    group_a : pandas.Series of bool
        True where ``protected_feature > threshold``.
    group_b : pandas.Series of bool
        Element-wise complement of ``group_a``.
    """
    # Positional boolean mask: True for columns with few enough missing values.
    keep_column = (df.isna().sum(axis=0) < max_missing).to_numpy()
    df_clean = df.loc[:, keep_column].dropna()

    # Protected attribute vectors (computed before the protected column is dropped).
    group_a = df_clean[protected_feature] > threshold
    group_b = ~group_a

    # Drop race/age related columns, then the first three remaining columns by
    # position (presumably identifier columns -- confirm against the dataset).
    kept_columns = [c for c in df_clean.columns if not c.startswith(("race", "age"))]
    df_clean = df_clean[kept_columns].iloc[:, 3:]
    return df_clean, group_a, group_b

In [None]:
# Preprocess with 'racePctWhite' as the protected attribute, then use the last
# column as the regression target and all preceding columns as features.
df_clean, group_a, group_b = preprocess_us_crime_dataset(df=df, protected_feature='racePctWhite')
X, y = df_clean.iloc[:, :-1], df_clean.iloc[:, -1]

In [None]:
from holisticai.bias.plots import correlation_matrix_plot

# Plot the correlation matrix of the feature frame X.
# NOTE(review): 'population' is presumably a column of X rather than the
# regression target y, and size=(12, 7) presumably the figure size -- confirm
# against the holisticai plotting API.
correlation_matrix_plot(X, target_feature='population', size = (12,7))

## Model Training

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# np.random.seed() returns None, so keep the seed as a plain integer and pass
# it explicitly to every component that draws random numbers. Relying on the
# global RNG alone gives different results when the cell is re-run.
seed = 42
np.random.seed(seed)  # still seed NumPy's global RNG for code that relies on it

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

model = GradientBoostingRegressor(random_state=seed)  # instantiate model
model.fit(X_train, y_train)  # fit model

y_pred = model.predict(X_test)  # compute predictions on the held-out rows

# Compute efficacy metrics; kept as the last expression so the notebook shows the result.
regression_efficacy_metrics(y_test, y_pred)

# Global Explainability Metrics (based on Permutation Feature Importance)

In [None]:
# Build an Explainer that uses the permutation feature-importance strategy for
# the fitted regression model.
# NOTE(review): x/y are the full dataset (X, y), not only the held-out test
# split -- confirm this is intended.
explainer = Explainer(based_on='feature_importance',
                      strategy_type='permutation',
                      model_type='regression',
                      model = model, 
                      x = X, 
                      y = y)

In [None]:
# Metrics of the permutation-based explainer; as the cell's last expression the
# result is shown by the notebook.
explainer.metrics()

In [None]:
# Bar plot from the permutation-based explainer (max_display=10 presumably
# caps the number of features drawn -- confirm against the Explainer API).
explainer.bar_plot(max_display=10)

In [None]:
# Create a 15x3 figure and draw the partial dependence plot on its axes.
# NOTE(review): `last=3` presumably selects three features by importance rank
# -- confirm against the Explainer API.
_,ax = plt.subplots(figsize=(15,3))
explainer.partial_dependence_plot(last=3, ax=ax)

## Conditional Feature Importance Metrics

In [None]:
# Same metrics call as for the global section.
# NOTE(review): presumably the returned table also contains the conditional
# feature-importance metrics discussed here -- confirm.
explainer.metrics()

In [None]:
# How does the feature importance within each quantile differ from the feature
# importance of the whole model?
explainer.contrast_visualization(show_connections=False)

In [None]:
# Feature-importance table of the permutation-based explainer, sorted by the
# 'Global' column (top_n=5 presumably limits it to five features -- confirm).
explainer.feature_importance_table(sorted_by='Global', top_n=5)

# Global Explainability Metrics (based on Surrogate Model)

In [None]:
# Rebind `explainer` to an Explainer that uses the surrogate feature-importance
# strategy for the same fitted regression model; the permutation-based explainer
# is no longer reachable under this name afterwards.
# NOTE(review): x/y are the full dataset (X, y), not only the held-out test
# split -- confirm this is intended.
explainer = Explainer(based_on='feature_importance',
                      strategy_type='surrogate',
                      model_type='regression',
                      model = model, 
                      x = X, 
                      y = y)

In [None]:
# Metrics of the surrogate-based explainer; as the cell's last expression the
# result is shown by the notebook.
explainer.metrics()

In [None]:
# Bar plot from the surrogate-based explainer (max_display=6 presumably caps
# the number of features drawn -- confirm against the Explainer API).
explainer.bar_plot(max_display=6)

In [None]:
# Create a 15x5 figure and draw the partial dependence plot on its axes.
# NOTE(review): `last=6` presumably selects six features by importance rank and
# kind='both' presumably overlays average and individual curves -- confirm
# against the Explainer API.
_,ax = plt.subplots(figsize=(15,5))
explainer.partial_dependence_plot(last=6, ax=ax, kind='both')

In [None]:
# Feature-importance table of the surrogate-based explainer, sorted by the
# 'Global' column (top_n=10 presumably limits it to ten features -- confirm).
explainer.feature_importance_table(sorted_by='Global', top_n=10)

In [None]:
# Draw the surrogate tree on a 15x3 figure using the 'sklearn' backend; the
# return value is bound to `_` so the notebook does not echo it.
_,ax = plt.subplots(figsize=(15,3))
_ = explainer.tree_visualization('sklearn', fontsize=7, ax=ax)

In [None]:
#!pip install --upgrade pillow

In [None]:
# Same tree visualization, requested with the 'pydotplus' backend.
# NOTE(review): presumably requires the optional pydotplus package -- confirm.
explainer.tree_visualization('pydotplus')

In [None]:
# Request the 'dtreeviz' backend, keep the result in `vis`, and make it the
# cell's last expression so the notebook displays it.
vis = explainer.tree_visualization('dtreeviz', scale=2)
vis

# Local Explainability Metrics (based on LIME)

In [None]:
# Build a separate Explainer that uses the 'lime' feature-importance strategy
# for the fitted regression model.
# NOTE(review): x/y are the full dataset (X, y), not only the held-out test
# split -- confirm this is intended.
lime_explainer = Explainer(based_on='feature_importance',
                      strategy_type='lime',
                      model_type='regression',
                      model = model, 
                      x = X, 
                      y = y)

In [None]:
# Metrics of the LIME-based explainer; as the cell's last expression the result
# is shown by the notebook.
lime_explainer.metrics()

In [None]:
# Same metrics call with detailed=True (presumably a finer-grained breakdown --
# confirm against the Explainer API).
lime_explainer.metrics(detailed=True)

In [None]:
#explainer.feature_importance_table(sorted_by='Global', top_n=10)

In [None]:
# Importance-stability visualization of the LIME-based explainer.
lime_explainer.show_importance_stability()

In [None]:
# Data-stability boundaries of the LIME-based explainer on a 15x5 figure
# (top_n=10 presumably limits the plot to ten features -- confirm).
lime_explainer.show_data_stability_boundaries(top_n=10, figsize=(15,5))

In [None]:
# Feature-stability boundaries of the LIME-based explainer on a 15x5 figure.
lime_explainer.show_features_stability_boundaries(figsize=(15,5))