In [3]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xailib.data_loaders.dataframe_loader import prepare_dataframe

from xailib.explainers.lime_explainer import LimeXAITabularExplainer
from xailib.explainers.lore_explainer import LegacyLoreTabularExplainer
from xailib.explainers.lore_explainer import LoreTabularExplainer
from xailib.explainers.shap_explainer_tab import ShapXAITabularExplainer

from xailib.models.sklearn_classifier_wrapper import sklearn_classifier_wrapper

# Learning and explaining German Credit Dataset

## Loading and preparation of data

We start by reading from a CSV file the dataset to analyze. The table is loaded by means of the ```DataFrame``` class from the ```pandas``` library.

Among all the attributes of the table, we select the ```class_field``` column that contains the observed class for the corresponding row.

In [4]:
source_file = '../datasets/german_credit.csv'
class_field = 'default'
# Load and transform dataset 
df = pd.read_csv(source_file, skipinitialspace=True, na_values='?', keep_default_na=True)

After the data is loaded in memory, we need to extract metadata information to automatically handle the content withint the table.

The method ```prepare_dataframe``` scans the table and extract the following information:
 * ```df```: is a trasformed version of the original dataframe, where discrete attributes are transformed into numerical attributes by using one hot encoding strategy;
 * ```feature_names```: is a list containint the names of the features after the transformation;
 * ```class_values```: the list of all the possible values for the ```class_field``` column;
 * ```numeric_columns```: a list of the original features that contain numeric (i.e. continuous) values;
 * ```rdf```: the original dataframe, before the transformation;
 * ```real_feature_names```: the list of the features of the dataframe before the transformation;
 * ```features_map```: it is a dictionary pointing each feature to the original one before the transformation.

In [5]:
df, feature_names, class_values, numeric_columns, rdf, real_feature_names, features_map = prepare_dataframe(df, class_field)

### Learning a Random Forest classfier

We train a RF classifier by using the ```sklearn``` library. We start by splitting the dataset into a train and test subsets. 

In [6]:
test_size = 0.3
random_state = 42
X_train, X_test, Y_train, Y_test = train_test_split(df[feature_names], df[class_field],
                                                        test_size=test_size,
                                                        random_state=random_state,
                                                        stratify=df[class_field])



Then we train the model on the training set. 
Once the model has been learned, we use a wrapper class to get access to the model for ```XAI lib```

In [7]:
bb = RandomForestClassifier(n_estimators=20, random_state=random_state)
bb.fit(X_train.values, Y_train.values)
bbox = sklearn_classifier_wrapper(bb)   

Select a new instance to be classfied by the model and print the predicted class.

In [8]:
inst = X_train.iloc[147].values
print('Instance ',inst)
print('True class ',Y_train.iloc[8])
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

Instance  [ 15 975   2   3  25   2   1   0   1   0   0   0   1   0   0   0   0   0
   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   1   0   0
   0   1   0   0   0   0   1   1   0   0   0   0   1   0   0   1   0   0
   1   0   0   1   0   0   1]
True class  0
Predicted class  [0]


## Explaining the prediction
We use the explanators of ```XAI lib``` to provide an explantion for the classified instance ```inst```.
Every explainer of ```XAI lib``` takes in input the blackbox to be explained with the corresponding feature names, and a configuration object to initialize the explainer.

### SHAP explainer

In [9]:
explainer = ShapXAITabularExplainer(bbox, feature_names)
config = {'explainer' : 'tree', 'X_train' : X_train.iloc[0:100].values}
explainer.fit(config)

In [10]:
exp = explainer.explain(inst)
# print(exp.exp)

In [11]:
exp.plot_features_importance()

### LORE explainer

In [12]:
explainer = LegacyLoreTabularExplainer(bbox)
config = {'neigh_type':'rndgen', 'size':1000, 'ocr':0.1, 'ngen':10}
explainer.fit(df, class_field, config)
exp = explainer.explain(inst)
print(exp)

<xailib.explainers.lore_explainer.LoreTabularExplanation object at 0x7a952c3c4ee0>


In [13]:
exp.plotRules()

In [14]:
exp.plotCounterfactualRules()

### LIME explainer

In [15]:
limeExplainer = LimeXAITabularExplainer(bbox)
config = {'feature_selection': 'lasso_path'}
limeExplainer.fit(df, class_field, config)
lime_exp = limeExplainer.explain(inst)
print(lime_exp.exp.as_list())

[('account_check_status=no checking account', -0.031955012192442574), ('duration_in_month', 0.030839782539954508), ('account_check_status=< 0 DM', 0.027467865053919967), ('credit_history=critical account/ other credits existing (not at this bank)', -0.026434593932261205), ('other_installment_plans=bank', 0.02294961869095158), ('age', -0.022178867490814862), ('property=real estate', -0.020171926658649243), ('savings=... < 100 DM', 0.01793407873368487), ('installment_as_income_perc', 0.015581579840944696), ('property=unknown / no property', 0.015253540546335109)]


In [16]:
# limeExplainer.plot_lime_values(lime_exp.as_list(), 5, 10)
lime_exp.plot_features_importance()

## Learning a different model

### Learning a Logistic Regressor

We train a Logistic Regression by using the ```sklearn``` library. We transform the dataset by using a ```Scaler``` to normalize all the attributes.


In [17]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

bb = LogisticRegression(C=1, penalty='l2')
bb.fit(X_scaled, Y_train.values)
# pass the model to the wrapper to use it in the XAI lib
bbox = sklearn_classifier_wrapper(bb)

In [18]:
# select a record to explain
inst = X_scaled[182]
print('Instance ',inst)
print('Predicted class ',bb.predict(inst.reshape(1, -1)))

Instance  [ 2.27797454  3.35504085  0.94540357  1.07634233  0.04854891 -0.72456474
 -0.43411405  1.65027399 -0.61477862 -0.25898489 -0.80681063  4.17385345
 -0.6435382  -0.32533856 -1.03489416 -0.20412415 -0.22941573 -0.33068147
  1.75885396 -0.34899122 -0.60155441 -0.15294382 -0.09298136 -0.46852129
 -0.12038585 -0.08481889 -0.23623492 -1.21387736 -0.36174054 -0.24943031
  2.15526362 -0.59715086 -0.45485883 -0.73610476 -0.43875307  4.23307441
 -0.65242771 -0.23958675 -0.32533856  0.90192655  4.72581563 -0.2259448
 -3.15238005 -0.54212562 -0.70181003 -0.63024248  2.30354212 -0.40586384
  0.49329429 -0.23958675  2.88675135 -1.59227935 -0.46170508  2.46388049
 -1.33747696 -0.13206764 -0.5        -1.21387736  1.21387736 -0.20412415
  0.20412415]
Predicted class  [1]


## Explaining the prediction
We use the same explainators as for the previous model. In this case, a few adjustments are necessary for the initialization of the explanators. For example, SHAP needs a specific configuration for the linear model we are using.
### SHAP Explainer

In [19]:
explainer = ShapXAITabularExplainer(bbox, feature_names)
config = {'explainer' : 'linear', 'X_train' : X_scaled[0:100], 'feature_pert' : 'interventional'}
explainer.fit(config)

In [20]:
exp = explainer.explain(inst)
print(exp)

<xailib.explainers.shap_explainer_tab.ShapXAITabularExplanation object at 0x7a9433b80d30>


In [21]:
exp.plot_features_importance()

### LORE explainer

In [22]:
explainer = LegacyLoreTabularExplainer(bbox)
config = {'neigh_type':'geneticp', 'size':1000, 'ocr':0.1, 'ngen':10}
explainer.fit(df, class_field, config)
exp = explainer.explain(inst)
print(exp)

<xailib.explainers.lore_explainer.LoreTabularExplanation object at 0x7a9433ac41c0>


In [27]:
explainer2 = LoreTabularExplainer(bbox)
config = {'neigh_type':'geneticp', 'size':1000, 'ocr':0.1, 'ngen':10}
explainer2.fit(df, class_field, config)
exp2 = explainer2.explain(inst)
print(exp2)

Exception: ERR: target column cannot be continuous. Please, set a categorical column as target.You can force the content of target column by discretize it.

In [45]:
exp.plotRules()

In [22]:
exp.plotCounterfactualRules()

### LIME explainer

In [23]:
limeExplainer = LimeXAITabularExplainer(bbox)
config = {'feature_selection': 'lasso_path'}
limeExplainer.fit(df, class_field, config)
lime_exp = limeExplainer.explain(inst)
print(lime_exp.exp.as_list())

[('other_debtors=co-applicant', -1.2099484483255807e-09), ('credit_history=all credits at this bank paid back duly', -9.807292855860647e-10), ('present_emp_since=unemployed', -8.905493791597945e-10), ('other_debtors=none', 6.501490037885828e-10), ('housing=for free', -4.510074423165729e-10), ('credit_amount', 3.4347399311566785e-10), ('job=management/ self-employed/ highly qualified employee/ officer', -3.072084086389376e-10), ('property=unknown / no property', -3.047260454092542e-10), ('savings=unknown/ no savings account', -3.0122747825366714e-10), ('savings=... < 100 DM', 2.405511062391693e-10), ('credits_this_bank', 2.371877264255231e-10), ('housing=own', 2.1981825790193429e-10), ('property=real estate', 1.9425271631667495e-10), ('account_check_status=0 <= ... < 200 DM', -1.8341777236605377e-10), ('foreign_worker=yes', 1.735145952766157e-10), ('purpose=car (new)', -1.6299442716106328e-10), ('account_check_status=no checking account', 1.6227117790108462e-10), ('people_under_maintena

In [46]:
lime_exp.plot_features_importance()