In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xailib.data_loaders.dataframe_loader import prepare_dataframe

from xailib.explainers.lime_explainer import LimeXAITabularExplainer
from xailib.explainers.lore_explainer import LoreTabularExplainer
from xailib.explainers.shap_explainer_tab import ShapXAITabularExplainer

from xailib.models.sklearn_classifier_wrapper import sklearn_classifier_wrapper

### Load the dataset and prepare it

In [2]:
# Target column and location of the German credit CSV.
class_field = 'default'
source_file = 'datasets/german_credit.csv'

# Read the raw table: spaces following a delimiter are skipped and '?' is
# treated as a missing value in addition to pandas' default NA markers.
df = pd.read_csv(
    source_file,
    na_values='?',
    keep_default_na=True,
    skipinitialspace=True,
)

In [3]:
# prepare_dataframe returns seven values; unpack them one per line.
# Note that `df` is rebound to the prepared frame here.
(
    df,
    feature_names,
    class_values,
    numeric_columns,
    rdf,
    real_feature_names,
    features_map,
) = prepare_dataframe(df, class_field)

In [4]:
# Largest value found in the 'duration_in_month' column of the prepared frame.
df['duration_in_month'].max()

72

### Split the dataset into train and test sets (scaling is applied in the next section, when the model is trained)

In [5]:
# Hold-out configuration: seed for a reproducible split and the test fraction.
random_state = 42
test_size = 0.3

# Split features and labels, stratifying on the class column.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[feature_names],
    df[class_field],
    test_size=test_size,
    random_state=random_state,
    stratify=df[class_field],
)



### Define and train a logistic regression model

In [None]:
# Standardise the training features (the scaler is fitted on the training split only).
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

bb = LogisticRegression(C=1, penalty='l2')
bb.fit(X_scaled, Y_train.values)

# The model above was fitted on standardised features, but the rest of the
# notebook (bb.predict on `inst`, and the SHAP / LORE / LIME explainers through
# `bbox`) feeds it raw, unscaled feature values. Fold the scaler into the linear
# model so it accepts raw features and returns exactly what the scaled model would:
#   w . ((x - mean) / scale) + b  ==  (w / scale) . x + (b - sum(w * mean / scale))
scaled_scores = bb.decision_function(X_scaled)
scaled_coef = bb.coef_
bb.coef_ = scaled_coef / scaler.scale_
bb.intercept_ = bb.intercept_ - (scaled_coef * scaler.mean_ / scaler.scale_).sum(axis=1)

# Sanity check: the folded model on raw features reproduces the scaled model's scores.
assert np.allclose(bb.decision_function(X_train.values), scaled_scores)

# pass the model to the wrapper to use it in the XAI lib
bbox = sklearn_classifier_wrapper(bb)

In [None]:
# select a record to explain
inst = X_train.iloc[188].values
print('Instance ',inst[0:10])
print('True class ',Y_train.iloc[18])
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

### SHAP explainer
##### We first define the explainer we want to use: SHAP, which produces feature importances. We then define a dictionary containing the SHAP parameters we want to set.
##### Explainer can be: linear, tree, kernel, deep
##### X_train is the data the explainer is fitted on. It can be the whole training set used to train the target model, just a part of it, or the result of applying a clustering algorithm to it.

In [None]:
# Pass feature_names as well, matching the constructor call used for the
# random-forest SHAP explainer further down in this notebook.
explainer = ShapXAITabularExplainer(bbox, feature_names)

# SHAP settings: linear explainer, fitted on the first 100 training rows.
config = {
    'explainer': 'linear',
    'X_train': X_train.iloc[0:100].values,
    'feature_pert': 'interventional',
}
explainer.fit(config)

#### We explain the instance and obtain the SHAP values

In [None]:
print('building an explanation')
exp = explainer.explain(inst)
# Print the SHAP values stored in `.exp` rather than the explanation object
# itself, as done for the random-forest SHAP explanation later in this notebook.
print(exp.exp)

##### We can use the method plot_shap_values to get a visual representation. 
##### Parameters: feature_names, the list of features; exp, the explanation we want to plot; start and end, which denote the range of features we want to plot.

In [None]:
# Plot the SHAP values for the feature range 4..15. `explain` returns an
# explanation object whose raw values are in `.exp` (see the printed output of
# the random-forest SHAP cell), so pass the values rather than the wrapper.
# NOTE(review): assumes plot_shap_values slices a plain array — confirm against the installed xailib.
explainer.plot_shap_values(feature_names, exp.exp, 4, 15)

##### We can use the method plot_shap_values_alt (or plot_shap_values_alt2) to get an interactive visual representation.
##### Parameters: 
- feature_names: the list of features;
- exp: the explanation we want to plot
- fontDimension: font size (which affects entire plot size)

In [None]:
# Interactive plot of the SHAP values. Pass the raw values held in `.exp`,
# in line with the random-forest cell below, which hands this method one array.
# NOTE(review): assumes plot_shap_values_alt expects a plain array — confirm against the installed xailib.
explainer.plot_shap_values_alt(feature_names, exp.exp)

### LORE explainer
##### We first define the explainer we want to use: LORE, which produces rules and counterfactual rules. We then define a dictionary containing the LORE parameters we want to set.
##### neigh_type can be: random, genetic, rndgen, geneticp, rndgenp

In [None]:
# LORE on the current black box: configure the neighbourhood generation,
# fit on the prepared dataframe and explain the selected instance.
explainer = LoreTabularExplainer(bbox)
config = {
    'neigh_type': 'geneticp',
    'size': 1000,
    'ocr': 0.1,
    'ngen': 10,
}
explainer.fit(df, class_field, config)

exp = explainer.explain(inst)
print(exp)

### LIME explainer
##### We first define the explainer we want to use: LIME, which produces feature importances. We then define a dictionary containing the LIME parameters we want to set.
##### There are several parameters we can define: feature selection, discretize continuous, discretizer, sample_around_instance, kernel width, kernel.

In [None]:
# LIME on the current black box: fit on the prepared dataframe, explain the
# selected instance and print the (feature, weight) pairs.
limeExplainer = LimeXAITabularExplainer(bbox)
config = {
    'feature_selection': 'lasso_path',
}
limeExplainer.fit(df, class_field, config)

lime_exp = limeExplainer.explain(inst)
print(lime_exp.as_list())

##### To plot the explanation, we pass it as a list of tuples (the first element is the feature name, the second is its importance), followed by the start and end of the range of features we want to visualize.

In [None]:
# Materialise the (feature, weight) pairs, then plot the 4..15 range of them.
lime_weights = lime_exp.as_list()
limeExplainer.plot_lime_values(lime_weights, 4, 15)

In [None]:
 # Display the feature-name list obtained from prepare_dataframe.
 feature_names

### Define and train a RFClassifier

In [6]:
# Random-forest black box: 20 trees, seeded with the same random_state as the
# split, trained on the raw feature values.
bb = RandomForestClassifier(
    n_estimators=20,
    random_state=random_state,
)
bb.fit(X_train.values, Y_train.values)

# Wrap the fitted forest so the xailib explainers can use it.
bbox = sklearn_classifier_wrapper(bb)

In [7]:
# SHAP with the kernel explainer, fitted on the first 100 training rows.
explainer = ShapXAITabularExplainer(bbox, feature_names)
config = {
    'explainer': 'kernel',
    'X_train': X_train.iloc[0:100].values,
}
explainer.fit(config)

In [8]:
# Select the record to explain. Use a single positional index for both the
# features and the label, so the printed "True class" belongs to the record
# being explained (the label was previously read from row 8 while the
# instance came from row 147).
record_idx = 147
inst = X_train.iloc[record_idx].values
print('Instance ',inst[0:10])
print('True class ',Y_train.iloc[record_idx])
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

Instance  [ 15 975   2   3  25   2   1   0   1   0]
True class  0
Predicted class  [0]


In [9]:
print('building an explanation')

# Explain the selected instance and print the raw SHAP values stored in `.exp`.
exp = explainer.explain(inst)
shap_values = exp.exp
print(shap_values)

building an explanation
[array([ 0.04364305,  0.00233334,  0.02687993,  0.00044366,  0.02921614,
        0.        ,  0.        ,  0.0236707 , -0.02075349,  0.        ,
       -0.0293711 ,  0.00114501,  0.0632803 ,  0.        ,  0.00826698,
        0.        ,  0.007053  ,  0.        ,  0.01553522,  0.        ,
        0.00207688,  0.        ,  0.        , -0.0087651 ,  0.        ,
        0.        , -0.01230946,  0.        ,  0.        ,  0.        ,
        0.        ,  0.00847158,  0.01616477,  0.00875184,  0.00096531,
        0.00253366,  0.        ,  0.01243686,  0.        ,  0.00572106,
        0.        ,  0.00403564,  0.        ,  0.02340512,  0.        ,
       -0.01101317,  0.01310039,  0.01607476,  0.01834455,  0.00446476,
        0.        ,  0.01860371,  0.00551734,  0.00273426,  0.00470391,
        0.        ,  0.        , -0.00638797,  0.01052657,  0.        ,
        0.        ]), array([-0.04364305, -0.00233334, -0.02687993, -0.00044366, -0.02921614,
        0.       

In [10]:
# Plot the feature importances using the explanation object's own plotting method.
exp.plot_features_importance()

##### We can use the method plot_shap_values to get a visual representation. 
##### Parameters: feature_names, the list of features; exp, the explanation we want to plot; start and end, which denote the range of features we want to plot. When using the Tree explainer (or, as here, the Kernel explainer) on a classifier, for each record we obtain n feature importance arrays (where n is the number of classes). For the plot, we have to select only one array (in this case we selected the one of the true class).

In [None]:
# `exp` is the explanation object; the per-class SHAP arrays are the list in
# `exp.exp` (as printed above), so index that list rather than the wrapper.
# Index 0 selects the array for class 0.
explainer.plot_shap_values_alt(feature_names, exp.exp[0])

### LORE explainer

In [None]:
# LORE on the random-forest black box, this time with the 'rndgen' neighbourhood.
explainer = LoreTabularExplainer(bbox)
config = {
    'neigh_type': 'rndgen',
    'size': 1000,
    'ocr': 0.1,
    'ngen': 10,
}
explainer.fit(df, class_field, config)

exp = explainer.explain(inst)
print(exp)

### LIME explainer

In [None]:
# LIME on the random-forest black box: fit on the prepared dataframe, explain
# the selected instance and print the (feature, weight) pairs.
limeExplainer = LimeXAITabularExplainer(bbox)
config = {
    'feature_selection': 'lasso_path',
}
limeExplainer.fit(df, class_field, config)

lime_exp = limeExplainer.explain(inst)
print(lime_exp.as_list())

In [None]:
# Materialise the (feature, weight) pairs, then plot the 5..10 range of them.
lime_weights = lime_exp.as_list()
limeExplainer.plot_lime_values(lime_weights, 5, 10)