## Setup a classification experiment

In [1]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None)
df.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]
# df = df.sample(frac=0.01, random_state=1)
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]
y = df[label].apply(lambda x: 0 if x == " <=50K" else 1) #Turning response into 0 and 1

# We have to transform categorical variables to use sklearn models
X_enc = pd.get_dummies(X, prefix_sep='.')
feature_names = list(X_enc.columns)

seed = 1  
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.20, random_state=seed)

## Train a blackbox classification system

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

#Blackbox system can include preprocessing, not just a classifier!
pca = PCA()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, wa

## Show blackbox model performance

In [3]:
from interpret import show
from interpret.perf import ROC

blackbox_perf = ROC(blackbox_model.predict_proba).explain_perf(X_test, y_test, name='Blackbox')
show(blackbox_perf)

## Local Explanations: How an individual prediction was made

In [4]:
from interpret.blackbox import LimeTabular
from interpret import show

#Blackbox explainers need a predict function, and optionally a dataset
lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train, random_state=1)

#Pick the instances to explain, optionally pass in labels if you have them
lime_local = lime.explain_local(X_test[:5], y_test[:5], name='LIME')

show(lime_local)

In [5]:
from interpret.blackbox import ShapKernel
import numpy as np

background_val = np.median(X_train, axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=background_val, feature_names=feature_names)
shap_local = shap.explain_local(X_test[:5], y_test[:5], name='SHAP')
show(shap_local)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!






## Global Explanations: How the model behaves overall

In [6]:
from interpret.blackbox import MorrisSensitivity

sensitivity = MorrisSensitivity(predict_fn=blackbox_model.predict_proba, data=X_train)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity")

show(sensitivity_global)

In [7]:
from interpret.blackbox import PartialDependence

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global(name='Partial Dependence')

show(pdp_global)

sort_indexes: 
[ 3  0  2  5  1  4 33 54 58 35 31 95 71 79 56]
key: 
names
data_dict[key]: 
['Age', 'fnlwgt', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'WorkClass. ?', 'WorkClass. Federal-gov', 'WorkClass. Local-gov', 'WorkClass. Never-worked', 'WorkClass. Private', 'WorkClass. Self-emp-inc', 'WorkClass. Self-emp-not-inc', 'WorkClass. State-gov', 'WorkClass. Without-pay', 'Education. 10th', 'Education. 11th', 'Education. 12th', 'Education. 1st-4th', 'Education. 5th-6th', 'Education. 7th-8th', 'Education. 9th', 'Education. Assoc-acdm', 'Education. Assoc-voc', 'Education. Bachelors', 'Education. Doctorate', 'Education. HS-grad', 'Education. Masters', 'Education. Preschool', 'Education. Prof-school', 'Education. Some-college', 'MaritalStatus. Divorced', 'MaritalStatus. Married-AF-spouse', 'MaritalStatus. Married-civ-spouse', 'MaritalStatus. Married-spouse-absent', 'MaritalStatus. Never-married', 'MaritalStatus. Separated', 'MaritalStatus. Widowed', 'Occupation. ?', 'Occ

## Compare them all in the Dashboard

In [8]:
show([blackbox_perf, lime_local, shap_local, sensitivity_global, pdp_global])

In [9]:
#from importlib import reload
#import azureml.explain.model

In [10]:
# reload(azureml.explain.model)

In [11]:
from interpret.ext.blackbox import TabularExplainer

In [12]:
tabular_explainer = TabularExplainer(blackbox_model, X_train)


The default value for feature_dependence has been changed to "independent"!



In [13]:
global_explanation = tabular_explainer.explain_global(X_test[0:10])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to




In [14]:
show(global_explanation)

In [15]:
names = ['Age', 'fnlwgt', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'WorkClass. ?', 'WorkClass. Federal-gov', 'WorkClass. Local-gov', 'WorkClass. Never-worked', 'WorkClass. Private', 'WorkClass. Self-emp-inc', 'WorkClass. Self-emp-not-inc', 'WorkClass. State-gov', 'WorkClass. Without-pay', 'Education. 10th', 'Education. 11th', 'Education. 12th', 'Education. 1st-4th', 'Education. 5th-6th', 'Education. 7th-8th', 'Education. 9th', 'Education. Assoc-acdm', 'Education. Assoc-voc', 'Education. Bachelors', 'Education. Doctorate', 'Education. HS-grad', 'Education. Masters', 'Education. Preschool', 'Education. Prof-school', 'Education. Some-college', 'MaritalStatus. Divorced', 'MaritalStatus. Married-AF-spouse', 'MaritalStatus. Married-civ-spouse', 'MaritalStatus. Married-spouse-absent', 'MaritalStatus. Never-married', 'MaritalStatus. Separated', 'MaritalStatus. Widowed', 'Occupation. ?', 'Occupation. Adm-clerical', 'Occupation. Armed-Forces', 'Occupation. Craft-repair', 'Occupation. Exec-managerial', 'Occupation. Farming-fishing', 'Occupation. Handlers-cleaners', 'Occupation. Machine-op-inspct', 'Occupation. Other-service', 'Occupation. Priv-house-serv', 'Occupation. Prof-specialty', 'Occupation. Protective-serv', 'Occupation. Sales', 'Occupation. Tech-support', 'Occupation. Transport-moving', 'Relationship. Husband', 'Relationship. Not-in-family', 'Relationship. Other-relative', 'Relationship. Own-child', 'Relationship. Unmarried', 'Relationship. Wife', 'Race. Amer-Indian-Eskimo', 'Race. Asian-Pac-Islander', 'Race. Black', 'Race. Other', 'Race. White', 'Gender. Female', 'Gender. Male', 'NativeCountry. ?', 'NativeCountry. Cambodia', 'NativeCountry. Canada', 'NativeCountry. China', 'NativeCountry. Columbia', 'NativeCountry. Cuba', 'NativeCountry. Dominican-Republic', 'NativeCountry. Ecuador', 'NativeCountry. El-Salvador', 'NativeCountry. England', 'NativeCountry. France', 'NativeCountry. Germany', 'NativeCountry. Greece', 'NativeCountry. Guatemala', 'NativeCountry. Haiti', 'NativeCountry. Holand-Netherlands', 'NativeCountry. Honduras', 'NativeCountry. Hong', 'NativeCountry. Hungary', 'NativeCountry. India', 'NativeCountry. Iran', 'NativeCountry. Ireland', 'NativeCountry. Italy', 'NativeCountry. Jamaica', 'NativeCountry. Japan', 'NativeCountry. Laos', 'NativeCountry. Mexico', 'NativeCountry. Nicaragua', 'NativeCountry. Outlying-US(Guam-USVI-etc)', 'NativeCountry. Peru', 'NativeCountry. Philippines', 'NativeCountry. Poland', 'NativeCountry. Portugal', 'NativeCountry. Puerto-Rico', 'NativeCountry. Scotland', 'NativeCountry. South', 'NativeCountry. Taiwan', 'NativeCountry. Thailand', 'NativeCountry. Trinadad&Tobago', 'NativeCountry. United-States', 'NativeCountry. Vietnam', 'NativeCountry. Yugoslavia']

In [16]:
names[np.array([1,2,3])]

TypeError: only integer scalar arrays can be converted to a scalar index

sort_indexes: 
[ 3 53  0 35  5 33 26  1 46  2 48 16 20 45 30]
key: 
names
data_dict[key]: 
['Age', 'fnlwgt', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'WorkClass. ?', 'WorkClass. Federal-gov', 'WorkClass. Local-gov', 'WorkClass. Never-worked', 'WorkClass. Private', 'WorkClass. Self-emp-inc', 'WorkClass. Self-emp-not-inc', 'WorkClass. State-gov', 'WorkClass. Without-pay', 'Education. 10th', 'Education. 11th', 'Education. 12th', 'Education. 1st-4th', 'Education. 5th-6th', 'Education. 7th-8th', 'Education. 9th', 'Education. Assoc-acdm', 'Education. Assoc-voc', 'Education. Bachelors', 'Education. Doctorate', 'Education. HS-grad', 'Education. Masters', 'Education. Preschool', 'Education. Prof-school', 'Education. Some-college', 'MaritalStatus. Divorced', 'MaritalStatus. Married-AF-spouse', 'MaritalStatus. Married-civ-spouse', 'MaritalStatus. Married-spouse-absent', 'MaritalStatus. Never-married', 'MaritalStatus. Separated', 'MaritalStatus. Widowed', 'Occupation. ?', 'Occ

E0912 18:13:25.435784 23868 udash.py:186] index 107 is out of bounds for axis 1 with size 10
Traceback (most recent call last):
  File "C:\Users\ilmat\AppData\Local\Continuum\Miniconda3\envs\sh\lib\site-packages\flask\app.py", line 1832, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\ilmat\AppData\Local\Continuum\Miniconda3\envs\sh\lib\site-packages\flask\app.py", line 1818, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "C:\Users\ilmat\AppData\Local\Continuum\Miniconda3\envs\sh\lib\site-packages\dash\dash.py", line 1287, in dispatch
    response.set_data(self.callback_map[output]['callback'](*args))
  File "C:\Users\ilmat\AppData\Local\Continuum\Miniconda3\envs\sh\lib\site-packages\dash\dash.py", line 1171, in add_context
    output_value = func(*args, **kwargs)
  File "c:\interpret\python\interpret\visual\udash.py", line 180, in update_viz_container
    output_div = gen_plot(explanation, int(value), 0, 0)
  File "