In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from xai_agg.agg_exp import *
from xai_agg.utils import *

2024-12-17 19:01:02.041523: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-17 19:01:02.316389: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data reading and preprocessing

In [3]:
raw = pd.read_excel("../data/taiwan.xls", header=1)
display(raw)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [4]:
# Trim columns and fix header
raw = raw.drop(columns=['ID'])
display(raw)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [5]:
preprocessed_data = raw.copy()

# Mapping categorical veriables:
preprocessed_data['SEX'] = preprocessed_data['SEX'].map({1: 'male', 2: 'female'})
preprocessed_data['EDUCATION'] = preprocessed_data['EDUCATION'].map({1: 'graduate_school', 2: 'university', 3: 'high_school', 4: 'others'})
preprocessed_data['MARRIAGE'] = preprocessed_data['MARRIAGE'].map({1: 'married', 2: 'single', 3: 'others'})

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

# Set all other columns to float
for column in preprocessed_data.columns:
    if column not in categorical_features:
        preprocessed_data[column] = preprocessed_data[column].astype(int)

# One-hot encoding
preprocessed_data = pd.get_dummies(preprocessed_data, columns=categorical_features, dtype='int64', drop_first=True)

display(preprocessed_data.head())
display(preprocessed_data.info())


Unnamed: 0,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,SEX_male,EDUCATION_high_school,EDUCATION_others,EDUCATION_university,MARRIAGE_others,MARRIAGE_single
0,20000,24,2,2,-1,-1,-2,-2,3913,3102,...,0,0,0,1,0,0,0,1,0,0
1,120000,26,-1,2,0,0,0,2,2682,1725,...,1000,0,2000,1,0,0,0,1,0,1
2,90000,34,0,0,0,0,0,0,29239,14027,...,1000,1000,5000,0,0,0,0,1,0,1
3,50000,37,0,0,0,0,0,0,46990,48233,...,1100,1069,1000,0,0,0,0,1,0,0
4,50000,57,-1,0,-1,0,0,0,8617,5670,...,9000,689,679,0,1,0,0,1,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   30000 non-null  int64
 1   AGE                         30000 non-null  int64
 2   PAY_0                       30000 non-null  int64
 3   PAY_2                       30000 non-null  int64
 4   PAY_3                       30000 non-null  int64
 5   PAY_4                       30000 non-null  int64
 6   PAY_5                       30000 non-null  int64
 7   PAY_6                       30000 non-null  int64
 8   BILL_AMT1                   30000 non-null  int64
 9   BILL_AMT2                   30000 non-null  int64
 10  BILL_AMT3                   30000 non-null  int64
 11  BILL_AMT4                   30000 non-null  int64
 12  BILL_AMT5                   30000 non-null  int64
 13  BILL_AMT6                   30000 non-null  int64
 14  PAY_AM

None

# Fitting Classifier

In [6]:
y = preprocessed_data['default payment next month']
X = preprocessed_data.drop(columns='default payment next month')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.8166666666666667
ROC AUC: 0.655126859126969


In [8]:
# Check for dtype('O') in the dataframe
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24000 entries, 21753 to 23654
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   LIMIT_BAL              24000 non-null  int64
 1   AGE                    24000 non-null  int64
 2   PAY_0                  24000 non-null  int64
 3   PAY_2                  24000 non-null  int64
 4   PAY_3                  24000 non-null  int64
 5   PAY_4                  24000 non-null  int64
 6   PAY_5                  24000 non-null  int64
 7   PAY_6                  24000 non-null  int64
 8   BILL_AMT1              24000 non-null  int64
 9   BILL_AMT2              24000 non-null  int64
 10  BILL_AMT3              24000 non-null  int64
 11  BILL_AMT4              24000 non-null  int64
 12  BILL_AMT5              24000 non-null  int64
 13  BILL_AMT6              24000 non-null  int64
 14  PAY_AMT1               24000 non-null  int64
 15  PAY_AMT2               24000 non-null

# Experiments

In [9]:
results = evaluate_aggregate_explainer(clf, X_train, X_test, categorical_features, n_instances = 1)

Epoch 1/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 626us/step - loss: 1.1305 - val_loss: 0.8835
Epoch 2/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464us/step - loss: 0.8293 - val_loss: 0.8242
Epoch 3/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - loss: 0.7498 - val_loss: 0.8002
Epoch 4/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 405us/step - loss: 0.7500 - val_loss: 0.7851
Epoch 5/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - loss: 0.7388 - val_loss: 0.7712
Epoch 6/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458us/step - loss: 0.7230 - val_loss: 0.7630
Epoch 7/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463us/step - loss: 0.7684 - val_loss: 0.7578
Epoch 8/500
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step - loss: 0.7126 - val_loss: 0.7537
Epoch 9/500
[1m

In [10]:
results

[[                              nrc  sensitivity_spearman  faithfulness_corr
  LimeWrapper             49.552339              0.939145           0.005559
  ShapTabularTreeWrapper  46.772160              0.988376           0.246278
  AnchorWrapper           33.698242              0.241264           0.221249
  AggregateExplainer      48.031281              0.928752           0.271152]]