## Application of Dalex
#### Dalex is available at https://github.com/ModelOriented/DALEX

In [1]:
import dalex as dx
import pandas as pd
import numpy as np
import sklearn
import pickle

#### Retrieve train and test set, train the model 

In [2]:
nome = 'adult'
title = "../datasets/train_set_"+nome+"_strat.p"
train = open(title,"rb")
train_set = pickle.load(train)
title = "../datasets/train_label_"+nome+"_strat.p"
train_l = open(title,"rb")
train_label = pickle.load(train_l)
title = "../datasets/test_set_" + nome + "_strat.p"
test = open(title, "rb")
test_set = pickle.load(test)
title = "../datasets/test_label_" + nome + "_strat.p"
test_l = open(title, "rb")
test_label = pickle.load(test_l)
train_set = train_set.apply(pd.to_numeric)
test_set = test_set.apply(pd.to_numeric)

In [3]:
train_set.describe()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
count,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0,21113.0
mean,38.406243,2.110122,190180.5,10.113863,3.326481,6.357126,2.773504,1.209444,0.321982,1099.897504,88.369962,40.918392,38.611851
std,13.135741,0.93678,105069.2,2.543448,2.706099,4.129759,1.675771,0.607902,0.467247,7426.180557,404.943602,11.97349,8.36254
min,17.0,0.0,14878.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,28.0,2.0,117789.0,9.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,40.0,41.0
50%,37.0,2.0,179171.0,10.0,2.0,5.0,3.0,1.0,0.0,0.0,0.0,40.0,41.0
75%,47.0,2.0,238397.0,12.0,7.0,10.0,5.0,1.0,1.0,0.0,0.0,45.0,41.0
max,90.0,6.0,1455435.0,16.0,7.0,14.0,5.0,5.0,1.0,99999.0,4356.0,99.0,41.0


In [4]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
bb = XGBClassifier(C= 1, penalty='l2', objective='binary:logistic', seed = 42, bootstrap=True, max_depth=90,learning_rate=0.1, n_estimators=500, tree_method='auto')
bb.fit(train_set.values, train_label.values)



Parameters: { C, bootstrap, penalty } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(C=1, base_score=0.5, booster='gbtree', bootstrap=True,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=90, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=12,
              num_parallel_tree=1, penalty='l2', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, subsample=1,
              tree_method='auto', validate_parameters=1, verbosity=None)

In [5]:
bb.predict(test_set.iloc[58].values.reshape(1, -1))

array([1])

### Create the explainer 
#### We have to pass the black box model, the train set and the train label

In [6]:
import time
start = time.time()
exp = dx.Explainer(bb, train_set, train_label)
end = time.time()
print('Time for creation ', end - start)

Preparation of a new explainer is initiated

  -> data              : 21113 rows 13 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 21113 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x1198d1dd0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 1.02e-06, mean = 0.25, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.504, mean = -2.94e-06, max = 0.496
  -> model_info        : package xgboost

A new explainer has been created!
Time for creation  0.23270106315612793


### Dalex allows for different kinds of feature importance based explanations. Here we report the shap explanation of a record 

In [11]:
#the record we are going to explain 
test_set.iloc[58, :]

age                   40
workclass              2
fnlwgt            144995
education-num         11
marital-status         1
occupation             9
relationship           1
race                   1
sex                    0
capital-gain        4386
capital-loss           0
hours-per-week        40
native-country        41
Name: 713, dtype: int64

In [13]:
bb.predict(test_set.iloc[58, :].values.reshape(1, -1))

array([1])

In [14]:
exp.predict_parts(test_set.iloc[58, :], type='shap').plot()

#### Another plot available from Dalex is the ceteris paribus.

In [15]:
test_set.iloc[4, :]

age                   38
workclass              2
fnlwgt            260997
education-num         10
marital-status         1
occupation             8
relationship           1
race                   2
sex                    0
capital-gain           0
capital-loss           0
hours-per-week        40
native-country        41
Name: 19683, dtype: int64

In [9]:
ceteris_paribus_ex = exp.predict_profile(test_set.iloc[4, :])
ceteris_paribus_ex.result

Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 112.17it/s]


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,_original_,_yhat_,_vname_,_ids_,_label_
19683,17.00,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,41.0,38,0.000309,age,19683,XGBClassifier
19683,17.73,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,41.0,38,0.000309,age,19683,XGBClassifier
19683,18.46,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,41.0,38,0.000309,age,19683,XGBClassifier
19683,19.19,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,41.0,38,0.000309,age,19683,XGBClassifier
19683,19.92,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,41.0,38,0.000309,age,19683,XGBClassifier
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19683,38.00,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,39.4,41,0.006204,native-country,19683,XGBClassifier
19683,38.00,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,39.8,41,0.006271,native-country,19683,XGBClassifier
19683,38.00,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,40.2,41,0.005429,native-country,19683,XGBClassifier
19683,38.00,2.0,260997.0,10.0,1.0,8.0,1.0,2.0,0.0,0.0,0.0,40.0,40.6,41,0.005429,native-country,19683,XGBClassifier


In [10]:
ceteris_paribus_ex.plot(variables = ['capital-gain', 'capital-loss'])

#### We can visualize more records at the same time

In [27]:
ceteris_58 = exp.predict_profile(test_set.iloc[58, :])
ceteris_paribus_ex.plot(ceteris_58, variables = ['hours-per-week', 'capital-gain'])

Calculating ceteris paribus: 100%|██████████| 13/13 [00:00<00:00, 154.60it/s]


#### Another interesting plot is the break down plot

In [21]:
import matplotlib
from matplotlib.pyplot import figure
exp.predict_parts(test_set.iloc[2, :], type='shap').plot(min_max=[0,1])

In [22]:
import matplotlib
from matplotlib.pyplot import figure
figure(num=None, figsize=(3.5, 7), dpi=100)
exp.predict_parts(test_set.iloc[2, :], type='break_down').plot(min_max=[0,1])

<Figure size 350x700 with 0 Axes>

In [23]:
exp.predict_parts(test_set.iloc[9, :], type='shap').plot(min_max=[0,1])

In [24]:
exp.predict_parts(test_set.iloc[9, :], type='break_down').plot(min_max=[0,1])

### Dalex also offers a number of plots to visualize the overall behaviour of the black box model

In [25]:
exp.model_parts().plot()

In [26]:
exp.model_performance('classification').plot()