In [2]:
import warnings
warnings.filterwarnings("ignore")

from xai_agg.agg_exp import *

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import dill

# Data Loading and Preprocessing

In [4]:
original_data = pd.read_csv('data/german_credit_data_updated.csv')

# Dataset overview - German Credit Risk (from Kaggle):
# 1. Age (numeric)
# 2. Sex (text: male, female)
# 3. Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
# 4. Housing (text: own, rent, or free)
# 5. Saving accounts (text - little, moderate, quite rich, rich)
# 6. Checking account (numeric, in DM - Deutsch Mark)
# 7. Credit amount (numeric, in DM)
# 8. Duration (numeric, in month)
# 9. Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)

display(original_data.head())
display(original_data.describe())
display(original_data.info())

# Display the unique values of the categorical features:
print('Unique values of the categorical features:')
for col in original_data.select_dtypes(include='object'):
    print(f'\t- {col}: {original_data[col].unique()}')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Credit Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,2,49,male,1,own,little,,2096,12,education,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,4,53,male,2,free,little,little,4870,24,car,2


Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Credit Risk
count,954.0,954.0,954.0,954.0,954.0,954.0
mean,476.5,35.501048,1.909853,3279.112159,20.780922,1.302935
std,275.540378,11.379668,0.649681,2853.315158,12.046483,0.459768
min,0.0,19.0,0.0,250.0,4.0,1.0
25%,238.25,27.0,2.0,1360.25,12.0,1.0
50%,476.5,33.0,2.0,2302.5,18.0,1.0
75%,714.75,42.0,2.0,3975.25,24.0,2.0
max,953.0,75.0,3.0,18424.0,72.0,2.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        954 non-null    int64 
 1   Age               954 non-null    int64 
 2   Sex               954 non-null    object
 3   Job               954 non-null    int64 
 4   Housing           954 non-null    object
 5   Saving accounts   779 non-null    object
 6   Checking account  576 non-null    object
 7   Credit amount     954 non-null    int64 
 8   Duration          954 non-null    int64 
 9   Purpose           954 non-null    object
 10  Credit Risk       954 non-null    int64 
dtypes: int64(6), object(5)
memory usage: 82.1+ KB


None

Unique values of the categorical features:
	- Sex: ['male' 'female']
	- Housing: ['own' 'free' 'rent']
	- Saving accounts: [nan 'little' 'quite rich' 'rich' 'moderate']
	- Checking account: ['little' 'moderate' nan 'rich']
	- Purpose: ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']


In [5]:
preprocessed_data = original_data.copy()

# For savings and checking accounts, we will replace the missing values with 'none':
preprocessed_data['Saving accounts'].fillna('none', inplace=True)
preprocessed_data['Checking account'].fillna('none', inplace=True)

# Dropping index column:
preprocessed_data.drop(columns=['Unnamed: 0'], inplace=True)

# Using pd.dummies to one-hot-encode the categorical features
preprocessed_data["Job"] = preprocessed_data["Job"].map({0: 'unskilled_nonresident', 1: 'unskilled_resident',
                                                         2: 'skilled', 3: 'highlyskilled'})

categorical_features = preprocessed_data.select_dtypes(include='object').columns
numerical_features = preprocessed_data.select_dtypes(include='number').columns.drop('Credit Risk')
print(f'Categorical features: {categorical_features}')
print(f'Numerical features: {numerical_features}')

preprocessed_data = pd.get_dummies(preprocessed_data, columns=categorical_features, dtype='int64')

# Remapping the target variable to 0 and 1:
preprocessed_data['Credit Risk'] = preprocessed_data['Credit Risk'].map({1: 0, 2: 1})

# Make sure all column names are valid python identifiers (important for pd.query() calls):
preprocessed_data.columns = preprocessed_data.columns.str.replace(' ', '_')
preprocessed_data.columns = preprocessed_data.columns.str.replace('/', '_')

# Normalizing the data
scaler = StandardScaler()
scaled_preprocessed_data = scaler.fit_transform(preprocessed_data)

display(preprocessed_data.head())
display(preprocessed_data.info())

display(scaled_preprocessed_data)

Categorical features: Index(['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Purpose'],
      dtype='object')
Numerical features: Index(['Age', 'Credit amount', 'Duration'], dtype='object')


Unnamed: 0,Age,Credit_amount,Duration,Credit_Risk,Sex_female,Sex_male,Job_highlyskilled,Job_skilled,Job_unskilled_nonresident,Job_unskilled_resident,...,Checking_account_none,Checking_account_rich,Purpose_business,Purpose_car,Purpose_domestic_appliances,Purpose_education,Purpose_furniture_equipment,Purpose_radio_TV,Purpose_repairs,Purpose_vacation_others
0,67,1169,6,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,22,5951,48,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,49,2096,12,0,0,1,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,45,7882,42,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,53,4870,24,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          954 non-null    int64
 1   Credit_amount                954 non-null    int64
 2   Duration                     954 non-null    int64
 3   Credit_Risk                  954 non-null    int64
 4   Sex_female                   954 non-null    int64
 5   Sex_male                     954 non-null    int64
 6   Job_highlyskilled            954 non-null    int64
 7   Job_skilled                  954 non-null    int64
 8   Job_unskilled_nonresident    954 non-null    int64
 9   Job_unskilled_resident       954 non-null    int64
 10  Housing_free                 954 non-null    int64
 11  Housing_own                  954 non-null    int64
 12  Housing_rent                 954 non-null    int64
 13  Saving_accounts_little       954 non-null    int64

None

array([[ 2.7694545 , -0.7399179 , -1.22763429, ...,  1.62518349,
        -0.14633276, -0.11286653],
       [-1.18704073,  0.93690642,  2.26068929, ...,  1.62518349,
        -0.14633276, -0.11286653],
       [ 1.18685641, -0.41486224, -0.72930235, ..., -0.61531514,
        -0.14633276, -0.11286653],
       ...,
       [-1.0111965 , -0.39768023,  1.26402541, ..., -0.61531514,
        -0.14633276, -0.11286653],
       [-0.65950803,  0.29240557,  0.26736153, ..., -0.61531514,
        -0.14633276, -0.11286653],
       [-0.83535227,  2.69823821,  1.26402541, ..., -0.61531514,
        -0.14633276, -0.11286653]])

In [6]:
y = preprocessed_data['Credit_Risk']
X = preprocessed_data.drop(columns='Credit_Risk')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.7696335078534031
ROC AUC: 0.6830357142857143


### Checking metrics behaviour for LIME, SHAP and Anchor

In [8]:
shap_exp = ShapTabularTreeWrapper(clf, X_train, categorical_features)
lime_exp = LimeWrapper(clf, X_train, categorical_features)
anchor_exp = AnchorWrapper(clf, X_train, categorical_features)

evaluator = ExplanationModelEvaluator(clf, X_train, categorical_features, noise_gen_args={'encoding_dim': 5, 'epochs': 500})
evaluator.init()

Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.2682 - val_loss: 1.2560
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2667 - val_loss: 1.2388
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2304 - val_loss: 1.2231
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2241 - val_loss: 1.2082
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2245 - val_loss: 1.1937
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1595 - val_loss: 1.1795
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1863 - val_loss: 1.1654
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1718 - val_loss: 1.1516
Epoch 9/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━

In [None]:
metric_runs = {shap_exp: [], lime_exp: [], anchor_exp: []} # Arrays of the format [[faithfulness1, sensitivity1, complexity1], [faithfulness2, sensitivity2, complexity2], ...]

num_instances_to_check = 100
# Chosse num_isntances_to_check unique indexes from X_test size
indexes = np.random.choice(X_test.index, num_instances_to_check, replace=False)

for idx in indexes:
    print (f'Checking instance {idx}')
    instance_data_row = X_test.loc[idx]
    for exp in [shap_exp, lime_exp, anchor_exp]:
        row = [
            idx,
            evaluator.faithfullness_correlation(exp, instance_data_row),
            evaluator.sensitivity(exp, instance_data_row),
            evaluator.complexity(exp, instance_data_row)
        ]
        metric_runs[exp].append(row)

Checking instance 527
Checking instance 235
Checking instance 398
Checking instance 948
Checking instance 633
Checking instance 692
Checking instance 918
Checking instance 695
Checking instance 296
Checking instance 689
Checking instance 890
Checking instance 424
Checking instance 412
Checking instance 63
Checking instance 634
Checking instance 66
Checking instance 826


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Checking instance 346
Checking instance 67
Checking instance 788
Checking instance 481
Checking instance 660
Checking instance 751
Checking instance 342
Checking instance 209
Checking instance 789
Checking instance 306
Checking instance 468
Checking instance 513


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Checking instance 870
Checking instance 786
Checking instance 580
Checking instance 745
Checking instance 30
Checking instance 357
Checking instance 707
Checking instance 33
Checking instance 666
Checking instance 547
Checking instance 928
Checking instance 827
Checking instance 910
Checking instance 858
Checking instance 485
Checking instance 881
Checking instance 714
Checking instance 684
Checking instance 70
Checking instance 567
Checking instance 732
Checking instance 39
Checking instance 841
Checking instance 568
Checking instance 370
Checking instance 174
Checking instance 139
Checking instance 218
Checking instance 522


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Checking instance 625
Checking instance 917
Checking instance 882
Checking instance 493
Checking instance 158
Checking instance 78
Checking instance 800
Checking instance 903
Checking instance 949
Checking instance 49
Checking instance 530
Checking instance 694
Checking instance 621
Checking instance 321
Checking instance 617
Checking instance 601


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Checking instance 819
Checking instance 199
Checking instance 137
Checking instance 596
Checking instance 603
Checking instance 286
Checking instance 529
Checking instance 381
Checking instance 318
Checking instance 260
Checking instance 314
Checking instance 593
Checking instance 213
Checking instance 422


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


Checking instance 302
Checking instance 679
Checking instance 120
Checking instance 500
Checking instance 436
Checking instance 850
Checking instance 31
Checking instance 88
Checking instance 280
Checking instance 728
Checking instance 265
Checking instance 382


In [29]:
# Change metric_runs keys to ["SHAP", "LIME", "Anchor"]
metric_runs1 = {"shap": metric_runs[shap_exp], "lime": metric_runs[lime_exp], "anchor": metric_runs[anchor_exp]}

In [30]:
metric_runs1

{'shap': [[0.5723255128372227, 0.9681694998768169, 2.3930018139207707],
  [0.4455079719457512, 0.7932906283588386, 2.5643626036115066],
  [0.11866524032382038, 0.5698036691722417, 2.4865124394765457],
  [0.3321909828950127, 0.5973491832952431, 2.267543704339637],
  [0.3880845549517916, 0.651052978785058, 2.1921237565267475],
  [0.045062567054379186, 0.7173494575917096, 2.4287078477545365],
  [0.3533635648657668, 0.8600620161683944, 2.55966172035827],
  [0.6466996145829886, 0.6259177137225918, 2.3808836586188544],
  [0.38288184555107013, 0.7660893228293795, 2.5972896124354863],
  [0.28946033910655344, 0.6925758450380424, 2.4304895479349486],
  [0.4108290471891962, 0.6841527247710804, 2.539228234839168],
  [0.4595011656489133, 0.7233812655654359, 2.7126702640784655],
  [0.11344736925449553, 0.6884734997750285, 2.4962205489144815],
  [0.912693063962245, 0.7772242998504544, 2.2466135681909547],
  [0.5627141313984826, 0.8864549119769549, 2.4792100295543626],
  [0.47097624826839524, 0.888735

In [None]:
# pickle metric_runs
with open('./pickles/experiments_org_metric_runs.pkl', 'wb') as f:
    dill.dump(metric_runs1, f)

In [8]:
# read pickle
with open('./pickles/experiments_org_metric_runs.pkl', 'rb') as f:
    metric_runs1 = dill.load(f)

metric_runs1

{'shap': [[0.5723255128372227, 0.9681694998768169, 2.3930018139207707],
  [0.4455079719457512, 0.7932906283588386, 2.5643626036115066],
  [0.11866524032382038, 0.5698036691722417, 2.4865124394765457],
  [0.3321909828950127, 0.5973491832952431, 2.267543704339637],
  [0.3880845549517916, 0.651052978785058, 2.1921237565267475],
  [0.045062567054379186, 0.7173494575917096, 2.4287078477545365],
  [0.3533635648657668, 0.8600620161683944, 2.55966172035827],
  [0.6466996145829886, 0.6259177137225918, 2.3808836586188544],
  [0.38288184555107013, 0.7660893228293795, 2.5972896124354863],
  [0.28946033910655344, 0.6925758450380424, 2.4304895479349486],
  [0.4108290471891962, 0.6841527247710804, 2.539228234839168],
  [0.4595011656489133, 0.7233812655654359, 2.7126702640784655],
  [0.11344736925449553, 0.6884734997750285, 2.4962205489144815],
  [0.912693063962245, 0.7772242998504544, 2.2466135681909547],
  [0.5627141313984826, 0.8864549119769549, 2.4792100295543626],
  [0.47097624826839524, 0.888735

In [None]:
# Create a pandas dataframe from the metric_runs dictionary
lime_metric_runs = pd.DataFrame(metric_runs1['lime'], columns=['idx', 'faithfulness', 'sensitivity', 'complexity'])
shap_metric_runs = pd.DataFrame(metric_runs1['shap'], columns=['idx', 'faithfulness', 'sensitivity', 'complexity'])
anchor_metric_runs = pd.DataFrame(metric_runs1['anchor'], columns=['idx', 'faithfulness', 'sensitivity', 'complexity'])

In [13]:
lime_metric_runs.describe()

Unnamed: 0,faithfulness,sensitivity,complexity
count,100.0,100.0,100.0
mean,0.390486,0.486279,2.583746
std,0.203001,0.095099,0.044482
min,0.017157,0.134483,2.457439
25%,0.248143,0.438337,2.553871
50%,0.383037,0.505665,2.58389
75%,0.538127,0.54899,2.6118
max,0.786396,0.672167,2.724201


In [15]:
shap_metric_runs.describe()

Unnamed: 0,faithfulness,sensitivity,complexity
count,100.0,100.0,100.0
mean,0.45949,0.713927,2.430877
std,0.217598,0.125869,0.156041
min,0.003457,0.391567,2.003556
25%,0.325217,0.629465,2.337372
50%,0.460242,0.716669,2.458565
75%,0.647138,0.789604,2.527835
max,0.912693,0.982225,2.758602


In [16]:
anchor_metric_runs.describe()

Unnamed: 0,faithfulness,sensitivity,complexity
count,100.0,100.0,100.0
mean,0.395468,0.902281,1.384464
std,0.230227,0.041055,0.550394
min,0.006044,0.801397,0.604767
25%,0.19084,0.874697,1.025438
50%,0.442871,0.903915,1.282263
75%,0.572965,0.932445,1.601507
max,0.811126,1.0,2.764107
