In [None]:
pip install interpret

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from interpret import perf
from interpret import show
from interpret.provider import InlineProvider
from interpret import set_visualize_provider
from interpret.data import ClassHistogram

# Register an inline provider so show() renders into the notebook output.
inline_provider = InlineProvider()
set_visualize_provider(inline_provider)

from interpret import set_show_addr

# Workaround for when show() does not display anything: set the address
# explicitly. The argument is a tuple of (ip_addr: str, port: int).
# NOTE(review): this call runs unconditionally, not only when show() is blank --
# confirm it is meant to be combined with the inline provider registered above.
show_addr = ("127.0.0.1", 7080)
set_show_addr(show_addr)


The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


In [2]:
# Load the Adult census data, split it into train/test, and take a first look
# at the training split.
ADULT_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

adult_columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]

# The file is read without a header row, so the column names are supplied here.
data = pd.read_csv(ADULT_DATA_URL, header=None, names=adult_columns)

# "Income" is the target; every other column is a feature.
label_column = "Income"
y = data[label_column]
X = data.drop(columns=label_column)

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# ClassHistogram explanation of the training split.
hist = ClassHistogram().explain_data(X_train, y_train, name="Train Data")
show(hist)

In [3]:
# Candidate values for each EBM hyperparameter sampled by the search below.
param_test = dict(
    learning_rate=[0.001, 0.005, 0.01, 0.03],
    interactions=[5, 10, 15],
    max_interaction_bins=[10, 15, 20],
    max_rounds=[5000, 10000, 15000, 20000],
    min_samples_leaf=[2, 3, 5],
    max_leaves=[3, 5, 10],
)

# Number of hyperparameter combinations drawn from the candidates above.
n_HP_points_to_test = 10

EBM_clf = ExplainableBoostingClassifier(feature_names=X_train.columns)

# 3-fold cross-validated randomized search scored by ROC AUC; refit=True
# retrains the best configuration so best_estimator_ is available afterwards.
search_settings = {
    "n_iter": n_HP_points_to_test,
    "scoring": "roc_auc",
    "cv": 3,
    "refit": True,
    "random_state": 314,
    "verbose": False,
}
EBM_gs = RandomizedSearchCV(
    estimator=EBM_clf,
    param_distributions=param_test,
    **search_settings,
)

EBM_gs.fit(X_train, y_train)

RandomizedSearchCV(cv=3,
                   estimator=ExplainableBoostingClassifier(feature_names=Index(['Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum',
       'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
       'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry'],
      dtype='object')),
                   param_distributions={'interactions': [5, 10, 15],
                                        'learning_rate': [0.001, 0.005, 0.01,
                                                          0.03],
                                        'max_interaction_bins': [10, 15, 20],
                                        'max_leaves': [3, 5, 10],
                                        'max_rounds': [5000, 10000, 15000,
                                                       20000],
                                        'min_samples_leaf': [2, 3, 5]},
                   random_state=314, scoring='roc_auc', verbose=False)

In [4]:
# Print the estimator that the randomized search refit with its best parameters.
best_ebm = EBM_gs.best_estimator_
print(best_ebm)

ExplainableBoostingClassifier(feature_names=['Age', 'WorkClass', 'fnlwgt',
                                             'Education', 'EducationNum',
                                             'MaritalStatus', 'Occupation',
                                             'Relationship', 'Race', 'Gender',
                                             'CapitalGain', 'CapitalLoss',
                                             'HoursPerWeek', 'NativeCountry',
                                             'Relationship x HoursPerWeek',
                                             'Occupation x Relationship',
                                             'Age x Relationship',
                                             'MaritalStatus x HoursPerWeek',
                                             'Occupation x HoursPerWeek',
                                             'fnl...
                                             'continuous', 'categorical',
                                             'ca

In [5]:
# List the distinct label values of the Income column exactly as stored.
income_labels = data["Income"].unique()
print(income_labels)

[' <=50K' ' >50K']


In [6]:
# Encode the income labels as 0/1 for the ROC explainer. The keys carry the
# leading space that the raw label values have.
income_to_binary = {' >50K': 1, ' <=50K': 0}
y_test_roc = y_test.map(income_to_binary)

# Build the ROC performance explainer around the best model's predict_proba.
best_ebm_predict_proba = EBM_gs.best_estimator_.predict_proba
roc = perf.ROC(best_ebm_predict_proba, feature_names=X_train.columns)
roc_explanation = roc.explain_perf(X_test, y_test_roc)

show(roc_explanation)

In [7]:
# Global explanation of the best estimator found by the search.
best_ebm = EBM_gs.best_estimator_
ebm_global = best_ebm.explain_global()
show(ebm_global)

In [8]:
# Local explanations for five held-out rows (positions 10 through 14).
sample_rows = slice(10, 15)
ebm_local = EBM_gs.best_estimator_.explain_local(
    X_test.iloc[sample_rows],
    y_test.iloc[sample_rows],
)
show(ebm_local)

# Dashboard

In [9]:
# Pass the data, global, and local explanations to a single show() call.
dashboard_explanations = [hist, ebm_global, ebm_local]
show(dashboard_explanations)

In [13]:
from interpret.glassbox import LogisticRegression, ClassificationTree

seed = 123

# We have to transform categorical variables to use Logistic Regression and Decision Tree
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)

# Keep this split under dedicated *_enc names. Unpacking into y_train / y_test
# would overwrite the labels of the earlier 33% hold-out split while X_train /
# X_test still refer to it, so re-running the ROC or local-explanation cells
# would pair features with labels from a different split.
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
    X_enc, y, test_size=0.20, random_state=seed
)

# L1-penalised logistic regression; liblinear is the solver chosen for it here.
lr = LogisticRegression(random_state=seed, feature_names=feature_names, penalty='l1', solver='liblinear')
lr.fit(X_train_enc, y_train_enc)

tree = ClassificationTree()
tree.fit(X_train_enc, y_train_enc)

<interpret.glassbox.decisiontree.ClassificationTree at 0x7fa544f7c370>

In [14]:
# Global explanations for the two baseline glassbox models.
tree_global = tree.explain_global(name='Classification Tree')
lr_global = lr.explain_global(name='Logistic Regression')

# Show the baselines together with the EBM explanations in one show() call.
comparison_explanations = [hist, ebm_global, ebm_local, lr_global, tree_global]
show(comparison_explanations, shared_tables=True)

# Other bells and whistles

In [15]:
#if you need to keep your data private, use Differentially Private EBMs (see DP-EBMs)
#link to paper: http://proceedings.mlr.press/v139/nori21a/nori21a.pdf

from interpret.privacy import DPExplainableBoostingClassifier
from interpret.glassbox import ExplainableBoostingClassifier
import time
from sklearn.metrics import roc_auc_score, accuracy_score

# NOTE: this cell rebinds X, y, X_train, X_test, y_train and y_test to a 50%
# subsample with 0/1 labels. Re-run the data-loading cell before going back to
# the earlier sections.
seed = 1

# Seed the subsample as well as the split so the reported AUCs are reproducible.
df = data.sample(frac=0.50, random_state=seed)
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]
y = df[label]
# Label values carry a leading space in the raw data.
y = y.map({' >50K':1, ' <=50K':0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)


# Time only fit(): the clock stops before scoring so the "trained in" figure
# does not include prediction. perf_counter is a monotonic timer.
start = time.perf_counter()
dpebm = DPExplainableBoostingClassifier(epsilon=1, delta=1e-6)
dpebm.fit(X_train, y_train)
end = time.perf_counter()

dp_auroc = roc_auc_score(y_test, dpebm.predict_proba(X_test)[:, 1])

print(f"DP EBM with eps: {dpebm.epsilon} and delta: {dpebm.delta} trained in {end - start:.2f} seconds with a test AUC of {dp_auroc:.3f}")


# Same procedure for the standard (non-private) EBM on the identical split.
start = time.perf_counter()
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)
end = time.perf_counter()

ebm_auroc = roc_auc_score(y_test, ebm.predict_proba(X_test)[:, 1])
print(f"EBM trained in {end - start:.2f} seconds with a test AUC of {ebm_auroc:.3f}")





DP EBM with eps: 1 and delta: 1e-06 trained in 1.61 seconds with a test AUC of 0.880
EBM trained in 6.03 seconds with a test AUC of 0.925


In [16]:
# Render the global explanation of the standard EBM first, then the DP-EBM.
for fitted_model in (ebm, dpebm):
    show(fitted_model.explain_global())

In [None]:
#There's also a merge command
#
#The example below is kept as comments rather than a bare string literal (which
#the notebook would echo as cell output). ebm1, ebm2 and ebm3 are placeholders
#for fitted models; this notebook does not define them.
#NOTE(review): the import path is taken from the original snippet -- confirm it
#exists in the installed interpret version before uncommenting.

# from interpret.glassbox.ebm.utils import EBMUtils
# models = [ebm1, ebm2, ebm3]
# merged_ebm = EBMUtils.merge_models(models=models)
#
# ebm_global = merged_ebm.explain_global(name='EBM')
# show(ebm_global)