In [54]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from xai_agg.agg_exp import *
from xai_agg.utils import *

# Data reading and preprocessing

In [55]:
raw = pd.read_excel("../data/taiwan.xls")
display(raw)

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29997,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29998,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29999,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [56]:
# Trim columns and fix header
raw.columns = raw.iloc[0]
raw = raw.drop(0).reset_index(drop=True)
raw = raw.drop(columns=['ID'])
display(raw)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [57]:
preprocessed_data = raw.copy()

# Mapping categorical veriables:
preprocessed_data['SEX'] = preprocessed_data['SEX'].map({1: 'male', 2: 'female'})
preprocessed_data['EDUCATION'] = preprocessed_data['EDUCATION'].map({1: 'graduate_school', 2: 'university', 3: 'high_school', 4: 'others'})
preprocessed_data['MARRIAGE'] = preprocessed_data['MARRIAGE'].map({1: 'married', 2: 'single', 3: 'others'})

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

# Set all other columns to float
for column in preprocessed_data.columns:
    if column not in categorical_features:
        preprocessed_data[column] = preprocessed_data[column].astype(int)

# One-hot encoding
preprocessed_data = pd.get_dummies(preprocessed_data, columns=categorical_features, dtype='int64')

display(preprocessed_data.describe())
display(preprocessed_data.info())


Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,default payment next month,SEX_female,SEX_male,EDUCATION_graduate_school,EDUCATION_high_school,EDUCATION_others,EDUCATION_university,MARRIAGE_married,MARRIAGE_others,MARRIAGE_single
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,51223.3309,49179.075167,...,0.2212,0.603733,0.396267,0.352833,0.1639,0.0041,0.467667,0.4553,0.010767,0.532133
std,129747.661567,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,73635.860576,71173.768783,...,0.415062,0.489129,0.489129,0.477859,0.370191,0.063901,0.498962,0.498006,0.103204,0.498975
min,10000.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-165580.0,-69777.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3558.75,2984.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,140000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,22381.5,21200.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,240000.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,67091.0,64006.25,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
max,1000000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,964511.0,983931.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   30000 non-null  int64
 1   AGE                         30000 non-null  int64
 2   PAY_0                       30000 non-null  int64
 3   PAY_2                       30000 non-null  int64
 4   PAY_3                       30000 non-null  int64
 5   PAY_4                       30000 non-null  int64
 6   PAY_5                       30000 non-null  int64
 7   PAY_6                       30000 non-null  int64
 8   BILL_AMT1                   30000 non-null  int64
 9   BILL_AMT2                   30000 non-null  int64
 10  BILL_AMT3                   30000 non-null  int64
 11  BILL_AMT4                   30000 non-null  int64
 12  BILL_AMT5                   30000 non-null  int64
 13  BILL_AMT6                   30000 non-null  int64
 14  PAY_AM

None

# Fitting Classifier

In [58]:
y = preprocessed_data['default payment next month']
X = preprocessed_data.drop(columns='default payment next month')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.8155
ROC AUC: 0.6524612079464663


In [60]:
# Check for dtype('O') in the dataframe
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24000 entries, 21753 to 23654
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   LIMIT_BAL                  24000 non-null  int64
 1   AGE                        24000 non-null  int64
 2   PAY_0                      24000 non-null  int64
 3   PAY_2                      24000 non-null  int64
 4   PAY_3                      24000 non-null  int64
 5   PAY_4                      24000 non-null  int64
 6   PAY_5                      24000 non-null  int64
 7   PAY_6                      24000 non-null  int64
 8   BILL_AMT1                  24000 non-null  int64
 9   BILL_AMT2                  24000 non-null  int64
 10  BILL_AMT3                  24000 non-null  int64
 11  BILL_AMT4                  24000 non-null  int64
 12  BILL_AMT5                  24000 non-null  int64
 13  BILL_AMT6                  24000 non-null  int64
 14  PAY_AMT1               

# Experiments

In [61]:
agg_explainer = AggregatedExplainer(
    explainer_types=[LimeWrapper, ShapTabularTreeWrapper, AnchorWrapper],       # Wrapped explainers whose explanations will be aggregated
    clf=clf, X_train=X_train, categorical_feature_names=categorical_features,   # Model and training data
    metrics=['nrc', 'sensitivity_spearman', 'faithfulness_corr'],               # Metrics to be considered for the aggregation
    noise_gen_args={'epochs': 50},                                              # Arguments passed to the autoencoder noisy data generator
    evaluator_args={"debug": False}                                             # Arguments passed to the evaluator class 
)  

Epoch 1/50




[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 629us/step - loss: 1.0797 - val_loss: 0.8666
Epoch 2/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403us/step - loss: 0.8188 - val_loss: 0.8108
Epoch 3/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step - loss: 0.7805 - val_loss: 0.7910
Epoch 4/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 579us/step - loss: 0.7534 - val_loss: 0.7796
Epoch 5/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 519us/step - loss: 0.7505 - val_loss: 0.7703
Epoch 6/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458us/step - loss: 0.7237 - val_loss: 0.7625
Epoch 7/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 412us/step - loss: 0.7183 - val_loss: 0.7560
Epoch 8/50
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 575us/step - loss: 0.7675 - val_loss: 0.7508
Epoch 9/50
[1m600/600[0m [32m━━━

In [62]:
agg_explainer.explain_instance(X_test.iloc[0])

  normalized_run[i] = _min_max_norm(run[q_ids[i]], invert)


Unnamed: 0,feature,score
0,PAY_0,1.158684
1,LIMIT_BAL,0.929684
2,BILL_AMT3,0.698323
3,PAY_AMT6,0.639182
4,PAY_AMT1,0.590935
5,PAY_AMT2,0.498228
6,PAY_2,0.47071
7,PAY_AMT3,0.470066
8,PAY_3,0.346045
9,PAY_4,0.277739
