# SHAP Demo

Note:
The explanations in the final subsection are not rendered when viewing this notebook on GitHub. Please run the code to see the SHAP demo.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# The UCI Adult file ships without a header row, so the column names are
# supplied directly to the reader instead of being patched on afterwards.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income",
    ],
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Prepare Data for Modeling

In [3]:
# Predictors are every column except the last; the last column is the target.
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]

# Encode the response as 0 ("<=50K") or 1 (anything else). The raw file puts a
# space after each comma, so values arrive as " <=50K"; stripping first keeps
# the encoding correct whether or not that padding is present.
y = df[label].str.strip().ne("<=50K").astype("int64")

# sklearn models need numeric inputs, so one-hot encode the categorical columns.
# The dummy dtype is pinned: pandas >= 2.0 defaults to bool dummies, and a mixed
# int/bool frame becomes an object array when handed to NumPy (e.g. np.median).
X_enc = pd.get_dummies(X, prefix_sep='.', dtype="uint8")
feature_names = list(X_enc.columns)

# Fixed seed so the 80/20 train/test split is the same on every run.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.20, random_state=seed)

X_train.head()

Unnamed: 0,Age,fnlwgt,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,WorkClass. ?,WorkClass. Federal-gov,WorkClass. Local-gov,WorkClass. Never-worked,...,NativeCountry. Portugal,NativeCountry. Puerto-Rico,NativeCountry. Scotland,NativeCountry. South,NativeCountry. Taiwan,NativeCountry. Thailand,NativeCountry. Trinadad&Tobago,NativeCountry. United-States,NativeCountry. Vietnam,NativeCountry. Yugoslavia
16465,39,188571,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5625,54,105010,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
30273,32,156464,9,0,1902,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3136,45,32172,10,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4521,60,146674,6,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


After preparing the data, we see that there are 108 unique features.

## Train Blackbox Model

A user applying SHAP doesn't need to know how the model was built. We only need the model's prediction function, the names of the input features, and a small background dataset (here, the per-feature median of the training data).

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# The blackbox system can include preprocessing, not just a classifier.
pca = PCA()
# Seed the forest (with the `seed` set in the data-preparation cell) so the
# fitted model is the same on every run instead of changing with each refit.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=seed)

# Chain PCA and the forest into a single estimator and fit it on the training split.
blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA()), ('rf', RandomForestClassifier(n_jobs=-1))])

## SHAP: How Was an Individual Prediction Made?

In [9]:
# First five test rows; these are the instances explained below.
X_test.head(5)

Unnamed: 0,Age,fnlwgt,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,WorkClass. ?,WorkClass. Federal-gov,WorkClass. Local-gov,WorkClass. Never-worked,...,NativeCountry. Portugal,NativeCountry. Puerto-Rico,NativeCountry. Scotland,NativeCountry. South,NativeCountry. Taiwan,NativeCountry. Thailand,NativeCountry. Trinadad&Tobago,NativeCountry. United-States,NativeCountry. Vietnam,NativeCountry. Yugoslavia
9646,62,26911,4,0,0,66,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
709,18,208103,7,0,0,25,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7385,25,102476,13,27828,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16671,33,511517,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
21932,36,292570,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# True labels for the same five test rows.
y_test.head(5)

9646     0
709      0
7385     1
16671    0
21932    0
Name: Income, dtype: int64

In [7]:
from interpret.blackbox import ShapKernel
from interpret import show

import numpy as np

In [8]:
# Background data for the explainer: the per-feature median of the training
# set, shaped as a single row (1, n_features).
background_val = np.median(X_train, axis=0)[np.newaxis, :]

# Wrap the fitted pipeline's probability function in a kernel SHAP explainer.
shap = ShapKernel(
    predict_fn=blackbox_model.predict_proba,
    data=background_val,
    feature_names=feature_names,
)

# Explain the first five test rows, passing their true labels alongside,
# then render the explanation.
shap_local = shap.explain_local(X_test.iloc[:5], y_test.iloc[:5], name='SHAP')
show(shap_local)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!



