In [1]:
import warnings
warnings.filterwarnings("ignore")

from xai_agg import *

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

import dill

2025-01-19 14:57:31.239948: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-19 14:57:31.460462: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Loading and Preprocessing

In [2]:
original_data = pd.read_csv('../data/german_credit_data_updated.csv')

# Dataset overview - German Credit Risk (from Kaggle):
# 1. Age (numeric)
# 2. Sex (text: male, female)
# 3. Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
# 4. Housing (text: own, rent, or free)
# 5. Saving accounts (text - little, moderate, quite rich, rich)
# 6. Checking account (numeric, in DM - Deutsch Mark)
# 7. Credit amount (numeric, in DM)
# 8. Duration (numeric, in month)
# 9. Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)

display(original_data.head())
display(original_data.describe())
display(original_data.info())

# Display the unique values of the categorical features:
print('Unique values of the categorical features:')
for col in original_data.select_dtypes(include='object'):
    print(f'\t- {col}: {original_data[col].unique()}')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Credit Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,2,49,male,1,own,little,,2096,12,education,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,4,53,male,2,free,little,little,4870,24,car,2


Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Credit Risk
count,954.0,954.0,954.0,954.0,954.0,954.0
mean,476.5,35.501048,1.909853,3279.112159,20.780922,1.302935
std,275.540378,11.379668,0.649681,2853.315158,12.046483,0.459768
min,0.0,19.0,0.0,250.0,4.0,1.0
25%,238.25,27.0,2.0,1360.25,12.0,1.0
50%,476.5,33.0,2.0,2302.5,18.0,1.0
75%,714.75,42.0,2.0,3975.25,24.0,2.0
max,953.0,75.0,3.0,18424.0,72.0,2.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        954 non-null    int64 
 1   Age               954 non-null    int64 
 2   Sex               954 non-null    object
 3   Job               954 non-null    int64 
 4   Housing           954 non-null    object
 5   Saving accounts   779 non-null    object
 6   Checking account  576 non-null    object
 7   Credit amount     954 non-null    int64 
 8   Duration          954 non-null    int64 
 9   Purpose           954 non-null    object
 10  Credit Risk       954 non-null    int64 
dtypes: int64(6), object(5)
memory usage: 82.1+ KB


None

Unique values of the categorical features:
	- Sex: ['male' 'female']
	- Housing: ['own' 'free' 'rent']
	- Saving accounts: [nan 'little' 'quite rich' 'rich' 'moderate']
	- Checking account: ['little' 'moderate' nan 'rich']
	- Purpose: ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']


In [3]:
preprocessed_data = original_data.copy()

# For savings and checking accounts, we will replace the missing values with 'none':
preprocessed_data['Saving accounts'].fillna('none', inplace=True)
preprocessed_data['Checking account'].fillna('none', inplace=True)

# Dropping index column:
preprocessed_data.drop(columns=['Unnamed: 0'], inplace=True)

# Using pd.dummies to one-hot-encode the categorical features
preprocessed_data["Job"] = preprocessed_data["Job"].map({0: 'unskilled_nonresident', 1: 'unskilled_resident',
                                                         2: 'skilled', 3: 'highlyskilled'})

categorical_features = preprocessed_data.select_dtypes(include='object').columns
numerical_features = preprocessed_data.select_dtypes(include='number').columns.drop('Credit Risk')
print(f'Categorical features: {categorical_features}')
print(f'Numerical features: {numerical_features}')

preprocessed_data = pd.get_dummies(preprocessed_data, columns=categorical_features, dtype='int64')

# Remapping the target variable to 0 and 1:
preprocessed_data['Credit Risk'] = preprocessed_data['Credit Risk'].map({1: 0, 2: 1})

# Make sure all column names are valid python identifiers (important for pd.query() calls):
preprocessed_data.columns = preprocessed_data.columns.str.replace(' ', '_')
preprocessed_data.columns = preprocessed_data.columns.str.replace('/', '_')

display(preprocessed_data.head())
display(preprocessed_data.info())

Categorical features: Index(['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Purpose'],
      dtype='object')
Numerical features: Index(['Age', 'Credit amount', 'Duration'], dtype='object')


Unnamed: 0,Age,Credit_amount,Duration,Credit_Risk,Sex_female,Sex_male,Job_highlyskilled,Job_skilled,Job_unskilled_nonresident,Job_unskilled_resident,...,Checking_account_none,Checking_account_rich,Purpose_business,Purpose_car,Purpose_domestic_appliances,Purpose_education,Purpose_furniture_equipment,Purpose_radio_TV,Purpose_repairs,Purpose_vacation_others
0,67,1169,6,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,22,5951,48,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,49,2096,12,0,0,1,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,45,7882,42,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,53,4870,24,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          954 non-null    int64
 1   Credit_amount                954 non-null    int64
 2   Duration                     954 non-null    int64
 3   Credit_Risk                  954 non-null    int64
 4   Sex_female                   954 non-null    int64
 5   Sex_male                     954 non-null    int64
 6   Job_highlyskilled            954 non-null    int64
 7   Job_skilled                  954 non-null    int64
 8   Job_unskilled_nonresident    954 non-null    int64
 9   Job_unskilled_resident       954 non-null    int64
 10  Housing_free                 954 non-null    int64
 11  Housing_own                  954 non-null    int64
 12  Housing_rent                 954 non-null    int64
 13  Saving_accounts_little       954 non-null    int64

None

In [4]:
y = preprocessed_data['Credit_Risk']
X = preprocessed_data.drop(columns='Credit_Risk')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.7696335078534031
ROC AUC: 0.6830357142857143


# Experiments

# Evaluating Old and New Complexity Metrics
Running the current setup: wsum, topsis, ['nrc', 'sensitiviy_spearman', 'faithfulness_corr']

### Execution

In [8]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    metrics_sets=[
        ['complexity', 'sensitivity_spearman', 'faithfulness_corr'],
        ['nrc', 'sensitivity_spearman', 'faithfulness_corr']
    ],
    indexes=[629, 213, 485, 218, 703]
)

metadata["description"] = "compares entropy complexity with nrc metric sets"

with open('pickles/german/COMPARE_entropy-nrc_metricsets_wsum-topsis.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)


Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2904 - val_loss: 1.2596
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2678 - val_loss: 1.2416
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2638 - val_loss: 1.2253
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2183 - val_loss: 1.2097
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2324 - val_loss: 1.1943
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1837 - val_loss: 1.1791
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1742 - val_loss: 1.1635
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1404 - val_loss: 1.1481
Epoc

### Analysis

In [36]:
with open('pickles/german/COMPARE_entropy-nrc_metricsets_wsum-topsis.pkl', 'rb') as f:
    exp = dill.load(f)

In [37]:
for i, metrics in enumerate([['complexity', 'sensitivity_spearman', 'faithfulness_corr'], 
                             ['nrc', 'sensitivity_spearman', 'faithfulness_corr']]):
    print(f"With metrics: {metrics}\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))


With metrics: ['complexity', 'sensitivity_spearman', 'faithfulness_corr']



[                        complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.607385              0.923645           0.504278
 ShapTabularTreeWrapper    2.639087              0.967607           0.038512
 AnchorWrapper             0.692943              0.473214           0.894271
 AggregateExplainer        2.462704              0.964286           0.576280,
                         complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.538366              0.837833           0.248742
 ShapTabularTreeWrapper    2.513429              0.975094           0.558859
 AnchorWrapper             0.887787              0.564325           0.502479
 AggregateExplainer        2.620076              0.933645           0.232050,
                         complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.542261              0.853744           0.169501
 ShapTabularTreeWrapper    2.284778              0.964992           0.3814

Worst case avoidances:
	- for all metrics: 4
	- for 2/3 metrics: 4
AVG:


Unnamed: 0,complexity,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.463323,0.930829,0.43176
AnchorWrapper,1.275002,0.62948,0.433368
LimeWrapper,2.575076,0.866256,0.423207
ShapTabularTreeWrapper,2.451083,0.968438,0.435356




Avg rank:


Unnamed: 0,complexity,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.8,2.0,2.8
AnchorWrapper,1.0,4.0,2.4
LimeWrapper,3.6,3.0,2.8
ShapTabularTreeWrapper,2.6,1.0,2.0


With metrics: ['nrc', 'sensitivity_spearman', 'faithfulness_corr']



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.177761              0.870246           0.096595
 ShapTabularTreeWrapper  48.583731              0.977644           0.655846
 AnchorWrapper           37.743216              0.617716           0.952868
 AggregateExplainer      46.923408              0.957438           0.441459,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.906524              0.847389           0.063032
 ShapTabularTreeWrapper  47.987182              0.982287           0.201590
 AnchorWrapper           37.743216              0.500799           0.089648
 AggregateExplainer      45.203013              0.953744           0.293751,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.016105              0.823153           0.144989
 ShapTabularTreeWrapper  43.120370              0.967062           0.170781
 AnchorWra

Worst case avoidances:
	- for all metrics: 3
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,45.502635,0.930181,0.399033
AnchorWrapper,38.521171,0.664532,0.563576
LimeWrapper,46.13345,0.845744,0.24089
ShapTabularTreeWrapper,46.630478,0.975466,0.356992




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.6,2.0,2.6
AnchorWrapper,1.4,3.8,1.6
LimeWrapper,2.8,3.2,3.2
ShapTabularTreeWrapper,3.2,1.0,2.6


### Discussion
This experiment sought to evaluate the performance of the NRC metric when it substitutes the entropy based "complexity" metric.

With both metric sets, the worst metric value was avoided in all samples for sensitivity_spearman and faithfulness_corr. Both NRC and complexity had the same percentage of worst-metric-value-avoidance, with both metrics having 80% (1/5) of the samples avoiding the worst value.
The worst value for the entropy metric is avoided in the average of its samples, while the worst value for the NRC metric is not avoided in the average of its samples.

# Evaluating Score and Rank Based Faithfulness Metrics
### Execution

In [6]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    metrics_sets=[
        ['nrc', 'sensitivity_spearman', 'faithfulness_corr'],
        ['nrc', 'sensitivity_spearman', 'rb_faithfulness_corr']
    ],
    n_instances=5
)

metadata["description"] = "compares score and rank-based faithfulness metrics"

with open('pickles/german/COMPARE_score_rb_faithfulness_wsum-topsis.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)


Selected indexes: [231 141 789  76 830]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2931 - val_loss: 1.2307
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2165 - val_loss: 1.2132
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2299 - val_loss: 1.1961
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1758 - val_loss: 1.1787
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1940 - val_loss: 1.1617
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1621 - val_loss: 1.1445
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1326 - val_loss: 1.1270
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1109 - val_loss: 1.1094
Epoch 9/

Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

	 Running instance 789
	 Running instance 76


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

	 Running instance 830
Running evaluation for settings 2/2
Explainer components: [<class 'xai_agg.explainers.LimeWrapper'>, <class 'xai_agg.explainers.ShapTabularTreeWrapper'>, <class 'xai_agg.explainers.AnchorWrapper'>], Metrics: ['nrc', 'sensitivity_spearman', 'rb_faithfulness_corr'], MCDM algorithm: <pymcdm.methods.topsis.TOPSIS object at 0x75f2f2156320>, Aggregation algorithm: wsum
	 Running instance 231
	 Running instance 141


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

	 Running instance 789
	 Running instance 76


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

	 Running instance 830


### Analysis

In [7]:
with open('pickles/german/COMPARE_score_rb_faithfulness_wsum-topsis.pkl', 'rb') as f:
    exp = dill.load(f)

In [8]:
for i, metrics in enumerate([['nrc', 'sensitivity_spearman', 'faithfulness_corr'],
                             ['nrc', 'sensitivity_spearman', 'rb_faithfulness_corr']]):
    print(f"With metrics: {metrics}\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))

With metrics: ['nrc', 'sensitivity_spearman', 'faithfulness_corr']



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.587759              0.884236           0.054802
 ShapTabularTreeWrapper  47.331073              0.987800           0.482357
 AnchorWrapper           42.893203              0.615160           0.151465
 AggregateExplainer      37.965262              0.909754           0.623834,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.943338              0.866453           0.055590
 ShapTabularTreeWrapper  42.794726              0.968157           0.797068
 AnchorWrapper           45.258019              0.551023           0.411756
 AggregateExplainer      36.882437              0.946995           0.580465,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.243168              0.856059           0.738596
 ShapTabularTreeWrapper  41.751409              0.964016           0.485469
 AnchorWra

Worst case avoidances:
	- for all metrics: 5
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,37.15662,0.924365,0.599878
AnchorWrapper,42.150094,0.621403,0.397402
LimeWrapper,47.283227,0.862059,0.325113
ShapTabularTreeWrapper,43.454516,0.972598,0.569414




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,1.4,2.0,1.8
AnchorWrapper,2.2,4.0,2.8
LimeWrapper,3.6,3.0,3.2
ShapTabularTreeWrapper,2.8,1.0,2.2


With metrics: ['nrc', 'sensitivity_spearman', 'rb_faithfulness_corr']



[                              nrc  sensitivity_spearman  rb_faithfulness_corr
 LimeWrapper             44.816967              0.826700              0.461244
 ShapTabularTreeWrapper  47.331073              0.977409              0.717630
 AnchorWrapper           42.893203              0.734768              0.651884
 AggregateExplainer      36.882437              0.925517              0.701199,
                               nrc  sensitivity_spearman  rb_faithfulness_corr
 LimeWrapper             44.735289              0.846453              0.313002
 ShapTabularTreeWrapper  42.794726              0.978417              0.916702
 AnchorWrapper           49.000591              0.712833              0.644447
 AggregateExplainer      41.408068              0.956897              0.416047,
                               nrc  sensitivity_spearman  rb_faithfulness_corr
 LimeWrapper             46.647057              0.825172              0.320223
 ShapTabularTreeWrapper  41.751409              0.

Worst case avoidances:
	- for all metrics: 5
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,rb_faithfulness_corr
AggregateExplainer,38.020042,0.929409,0.428296
AnchorWrapper,44.986444,0.681538,0.523103
LimeWrapper,45.772424,0.844936,0.274942
ShapTabularTreeWrapper,43.454516,0.972807,0.82604




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,rb_faithfulness_corr
AggregateExplainer,1.0,2.0,2.6
AnchorWrapper,2.8,4.0,2.6
LimeWrapper,3.4,3.0,3.8
ShapTabularTreeWrapper,2.8,1.0,1.0


# Evaluating Rank aggregation algorithms

### Execution

In [6]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    aggregation_algs=["wsum", "w_bordafuse", "w_condorcet"],
    indexes=[629, 213, 485, 218, 703],
    n_instances=5
)

metadata["description"] = "compares wsum, w_bordafuse, w_condorcet aggregation algorithms"

with open('pickles/german/COMPARE_wsum-w_bordafuse-w_condorcet.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.2351 - val_loss: 1.2167
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2313 - val_loss: 1.2015
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1810 - val_loss: 1.1869
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1933 - val_loss: 1.1722
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1816 - val_loss: 1.1577
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1672 - val_loss: 1.1432
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1085 - val_loss: 1.1282
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1483 - val_loss: 1.1135
Epoc

In [6]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    aggregation_algs=["wsum", "w_bordafuse", "w_condorcet"],
    n_instances=5
)

metadata["description"] = "compares wsum, w_bordafuse, w_condorcet aggregation algorithms"

with open('pickles/german/COMPARE_wsum-w_bordafuse-w_condorcetRANKED.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [158 707 882 784 580]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.2334 - val_loss: 1.2260
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2099 - val_loss: 1.2101
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2449 - val_loss: 1.1947
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1829 - val_loss: 1.1797
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2017 - val_loss: 1.1648
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2126 - val_loss: 1.1498
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1511 - val_loss: 1.1352
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1488 - val_loss: 1.1206
Epoch 9/

KeyboardInterrupt: 

In [7]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    aggregation_algs=["wsum", "w_bordafuse"],
    n_instances=5
)

metadata["description"] = "compares wsum, w_bordafuse, w_condorcet aggregation algorithms on RANKED variation"

with open('pickles/german/COMPARE_wsum-w_bordafuse-w_condorcetRANKED-INVERSE.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [850 784  78 342 513]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2458 - val_loss: 1.2326
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2351 - val_loss: 1.2153
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2218 - val_loss: 1.1986
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2068 - val_loss: 1.1826
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1978 - val_loss: 1.1664
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1828 - val_loss: 1.1506
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1322 - val_loss: 1.1349
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1145 - val_loss: 1.1198
Epoch 9/

Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

Running evaluation for settings 2/2
Explainer components: [<class 'xai_agg.explainers.LimeWrapper'>, <class 'xai_agg.explainers.ShapTabularTreeWrapper'>, <class 'xai_agg.explainers.AnchorWrapper'>], Metrics: ['nrc', 'sensitivity_spearman', 'faithfulness_corr'], MCDM algorithm: <pymcdm.methods.topsis.TOPSIS object at 0x7a503ffbd060>, Aggregation algorithm: w_bordafuse
	 Running instance 850
	 Running instance 784
	 Running instance 78
	 Running instance 342
	 Running instance 513


Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 0.95 precision constraint. Now ret

### Analysis

In [8]:
with open('pickles/german/COMPARE_wsum-w_bordafuse-w_condorcet.pkl', 'rb') as f:
    exp = dill.load(f)

In [13]:
for i, method in enumerate(["wsum", "w_bordafuse", "w_condorcet"]):
    print(f"{method}:\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))


wsum:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.673679              0.895074           0.042806
 ShapTabularTreeWrapper  48.583731              0.975856           0.265131
 AnchorWrapper           37.743216              0.716941           0.516036
 AggregateExplainer      46.009461              0.951773           0.483324,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.301422              0.838128           0.650223
 ShapTabularTreeWrapper  47.987182              0.980981           0.150510
 AnchorWrapper           37.743216              0.560795           0.073738
 AggregateExplainer      43.851568              0.914581           0.416718,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.363319              0.860443           0.032314
 ShapTabularTreeWrapper  43.120370              0.973885           0.434708
 AnchorWra

Worst case avoidances:
	- for all metrics: 4
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,45.088396,0.922414,0.46524
AnchorWrapper,37.144285,0.697907,0.387216
LimeWrapper,45.960229,0.858601,0.380173
ShapTabularTreeWrapper,46.630478,0.975974,0.340436




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.6,2.0,2.2
AnchorWrapper,1.0,4.0,2.6
LimeWrapper,3.0,3.0,2.4
ShapTabularTreeWrapper,3.4,1.0,2.8


w_bordafuse:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.033764              0.844680           0.372345
 ShapTabularTreeWrapper  48.583731              0.979059           0.048094
 AnchorWrapper           42.893203              0.739947           0.152028
 AggregateExplainer      65.058328              0.926502           0.554303,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.664858              0.819458           0.478736
 ShapTabularTreeWrapper  47.987182              0.984873           0.198131
 AnchorWrapper           37.743216              0.531792           0.590036
 AggregateExplainer      77.000265              0.922611           0.180661,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.566517              0.858818           0.646299
 ShapTabularTreeWrapper  43.120370              0.979502           0.740686
 AnchorWra

Worst case avoidances:
	- for all metrics: 0
	- for 2/3 metrics: 2
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,72.996655,0.932207,0.37751
AnchorWrapper,37.289621,0.664321,0.460374
LimeWrapper,46.485508,0.830335,0.525115
ShapTabularTreeWrapper,46.630478,0.977029,0.331673




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,4.0,2.0,3.2
AnchorWrapper,1.0,4.0,2.0
LimeWrapper,2.6,3.0,1.8
ShapTabularTreeWrapper,2.4,1.0,3.0


w_condorcet:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.837925              0.886305           0.683597
 ShapTabularTreeWrapper  48.583731              0.967972           0.523176
 AnchorWrapper           42.893203              0.648405           0.229834
 AggregateExplainer      69.841686              0.922611           0.300632,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.377541              0.825419           0.172140
 ShapTabularTreeWrapper  47.987182              0.982607           0.750568
 AnchorWrapper           37.743216              0.464710           0.438473
 AggregateExplainer      69.841686              0.883941           0.169785,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.624818              0.820837           0.703933
 ShapTabularTreeWrapper  43.120370              0.977433           0.158063
 AnchorWra

Worst case avoidances:
	- for all metrics: 0
	- for 2/3 metrics: 4
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,69.841686,0.914956,0.276549
AnchorWrapper,37.917072,0.671072,0.348035
LimeWrapper,45.145219,0.840099,0.441532
ShapTabularTreeWrapper,46.630478,0.975955,0.455529




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,4.0,2.0,3.0
AnchorWrapper,1.0,3.8,3.2
LimeWrapper,2.4,3.2,2.0
ShapTabularTreeWrapper,2.6,1.0,1.8


### Discussion
This experiment sought to evaluate the performance of the different rank aggregation algorithms.

wsum is the best among the algorithms tested. all the others did not avoid the worst value in all metrics not even once.

In [8]:
with open('pickles/german/COMPARE_wsum-w_bordafuse-w_condorcetRANKED-INVERSE.pkl', 'rb') as f:
    exp = dill.load(f)

In [10]:
for i, method in enumerate(["wsum", "w_bordafuse"]):
    print(f"{method}:\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))


wsum:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.223368              0.824631           0.630092
 ShapTabularTreeWrapper  44.596431              0.989531           0.038333
 AnchorWrapper           38.321940              0.663263           0.748669
 AggregateExplainer      43.463904              0.915123           0.603937,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.097208              0.817734           0.289902
 ShapTabularTreeWrapper  41.977105              0.953688           0.779571
 AnchorWrapper           37.743216              0.589859           0.433861
 AggregateExplainer      46.882211              0.913498           0.220383,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.431268              0.859852           0.652862
 ShapTabularTreeWrapper  51.874824              0.989531           0.608134
 AnchorWra

Worst case avoidances:
	- for all metrics: 4
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,46.403381,0.91664,0.385317
AnchorWrapper,39.205767,0.580374,0.287249
LimeWrapper,46.622359,0.83602,0.345628
ShapTabularTreeWrapper,46.220877,0.97776,0.410048




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.6,2.0,2.4
AnchorWrapper,1.6,4.0,2.6
LimeWrapper,3.0,3.0,3.0
ShapTabularTreeWrapper,2.8,1.0,2.0


w_bordafuse:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.954931              0.810394           0.532727
 ShapTabularTreeWrapper  44.596431              0.982510           0.908344
 AnchorWrapper           39.643100              0.811175           0.789034
 AggregateExplainer      67.294704              0.908374           0.415784,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             53.881510              0.871330           0.020053
 ShapTabularTreeWrapper  41.977105              0.945176           0.488603
 AnchorWrapper           35.008968              0.724897           0.277124
 AggregateExplainer      66.619801              0.900000           0.165278,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.056325              0.860640           0.605165
 ShapTabularTreeWrapper  51.874824              0.992166           0.802619
 AnchorWra

Worst case avoidances:
	- for all metrics: 0
	- for 2/3 metrics: 3
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,63.26721,0.912463,0.333337
AnchorWrapper,42.055658,0.679829,0.481195
LimeWrapper,47.233467,0.844118,0.3141
ShapTabularTreeWrapper,46.220877,0.974449,0.646213




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,4.0,2.0,3.0
AnchorWrapper,1.4,3.8,2.4
LimeWrapper,2.4,3.2,3.4
ShapTabularTreeWrapper,2.2,1.0,1.2


# Evaluating MCDM Algs

### Execution

In [11]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    mcdm_algs=[pymcdm.methods.TOPSIS(), pymcdm.methods.COPRAS(),
               pymcdm.methods.PROMETHEE_II(preference_function="usual"),
               pymcdm.methods.ARAS(), pymcdm.methods.COCOSO(),
               pymcdm.methods.CODAS(), pymcdm.methods.EDAS(), pymcdm.methods.MABAC()],
    indexes=[629, 213, 485, 218, 703],
    n_instances=5
)

metadata["description"] = "compares TOPSIS, COPRAS, PROMETHEE_II, ARAS, COCOSO, CODAS, EDAS, MABAC MCDM algorithms"

with open('pickles/german/COMPARE_mcdm_algs.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2696 - val_loss: 1.2614
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2702 - val_loss: 1.2440
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2771 - val_loss: 1.2278
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2216 - val_loss: 1.2127
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2056 - val_loss: 1.1975
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2211 - val_loss: 1.1823
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1848 - val_loss: 1.1667
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1662 - val_loss: 1.1512
Epoc

### Analysis

In [29]:
with open('pickles/german/COMPARE_mcdm_algs.pkl', 'rb') as f:
    exp = dill.load(f)

In [30]:
methods = ["TOPSIS", "COPRAS", "PROMETHEE_II", "ARAS", "COCOSO", "CODAS", "EDAS", "MABAC"]
for i, method in enumerate(methods):
    print(f"{method}:\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))


TOPSIS:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             40.706734              0.873892           0.542231
 ShapTabularTreeWrapper  48.583731              0.968120           0.633913
 AnchorWrapper           37.743216              0.588246           0.613756
 AggregateExplainer      43.599339              0.964581           0.391358,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.498196              0.834532           0.316616
 ShapTabularTreeWrapper  47.987182              0.984775           0.475040
 AnchorWrapper           35.618034              0.487391           0.027308
 AggregateExplainer      43.738128              0.929458           0.189650,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.419713              0.849360           0.084178
 ShapTabularTreeWrapper  43.120370              0.972604           0.448373
 AnchorWra

Worst case avoidances:
	- for all metrics: 4
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,43.700813,0.933721,0.325955
AnchorWrapper,37.993482,0.67107,0.440208
LimeWrapper,44.387491,0.837015,0.347298
ShapTabularTreeWrapper,46.630478,0.975708,0.444287




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.2,2.0,3.2
AnchorWrapper,1.6,4.0,2.4
LimeWrapper,2.8,3.0,2.4
ShapTabularTreeWrapper,3.4,1.0,2.0


COPRAS:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.349954              0.868424           0.142493
 ShapTabularTreeWrapper  48.583731              0.964770           0.668430
 AnchorWrapper           37.743216              0.560894           0.587868
 AggregateExplainer      44.451048              0.963103           0.294378,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.286657              0.874581           0.189881
 ShapTabularTreeWrapper  47.987182              0.985317           0.218931
 AnchorWrapper           35.618034              0.547521           0.122014
 AggregateExplainer      59.244717              0.939360           0.391785,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.706862              0.875567           0.519856
 ShapTabularTreeWrapper  43.120370              0.962996           0.377721
 AnchorWra

Worst case avoidances:
	- for all metrics: 2
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,47.879488,0.920581,0.356725
AnchorWrapper,36.583852,0.666095,0.392215
LimeWrapper,44.076179,0.862867,0.352646
ShapTabularTreeWrapper,46.630478,0.971195,0.40911




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,3.0,2.0,2.4
AnchorWrapper,1.0,4.0,2.4
LimeWrapper,2.6,3.0,2.8
ShapTabularTreeWrapper,3.4,1.0,2.4


PROMETHEE_II:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.181229              0.844877           0.030663
 ShapTabularTreeWrapper  48.583731              0.963094           0.534762
 AnchorWrapper           42.893203              0.688851           0.564153
 AggregateExplainer      58.764071              0.538402           0.467273,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.937560              0.848276           0.337298
 ShapTabularTreeWrapper  47.987182              0.985169           0.276967
 AnchorWrapper           37.743216              0.528617           0.326098
 AggregateExplainer      54.768723              0.547841           0.448123,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.100060              0.860739           0.046934
 ShapTabularTreeWrapper  43.120370              0.962996           0.555777
 AnchorWra

Worst case avoidances:
	- for all metrics: 0
	- for 2/3 metrics: 3
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,56.255191,0.543122,0.347303
AnchorWrapper,39.570329,0.702991,0.463546
LimeWrapper,46.145285,0.851192,0.253927
ShapTabularTreeWrapper,46.630478,0.971766,0.506486




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,4.0,3.5,2.6
AnchorWrapper,1.4,3.0,2.4
LimeWrapper,2.0,2.2,2.6
ShapTabularTreeWrapper,2.6,1.0,2.4


ARAS:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.223001              0.879163           0.008710
 ShapTabularTreeWrapper  48.583731              0.968564           0.074819
 AnchorWrapper           42.893203              0.509784           0.218931
 AggregateExplainer      46.872105                   NaN           0.321547,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.466355              0.839951           0.227091
 ShapTabularTreeWrapper  47.987182              0.989899           0.375013
 AnchorWrapper           37.743216              0.690145           0.103038
 AggregateExplainer      57.703650              0.951429           0.437721,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.400819              0.832315           0.225714
 ShapTabularTreeWrapper  43.120370              0.966987           0.148438
 AnchorWra

Worst case avoidances:
	- for all metrics: 3
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,47.899106,0.922451,0.380356
AnchorWrapper,38.455621,0.67858,0.384576
LimeWrapper,45.738267,0.849921,0.333765
ShapTabularTreeWrapper,46.630478,0.975107,0.272771




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,3.2,2.0,1.8
AnchorWrapper,1.0,3.8,2.2
LimeWrapper,2.8,2.8,3.0
ShapTabularTreeWrapper,3.0,1.0,3.0


COCOSO:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.166015              0.882266           0.674065
 ShapTabularTreeWrapper  48.583731              0.971766           0.805313
 AnchorWrapper           37.743216              0.451751           0.557636
 AggregateExplainer      46.358792              0.944631           0.348020,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.092841              0.870837           0.713508
 ShapTabularTreeWrapper  47.987182              0.983641           0.486455
 AnchorWrapper           37.743216              0.517334           0.222136
 AggregateExplainer      46.015875              0.938768           0.272427,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.847482              0.852217           0.368960
 ShapTabularTreeWrapper  43.120370              0.967085           0.204128
 AnchorWra

Worst case avoidances:
	- for all metrics: 2
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,46.149413,0.926034,0.361123
AnchorWrapper,37.511655,0.630963,0.490063
LimeWrapper,45.235229,0.859606,0.441265
ShapTabularTreeWrapper,46.630478,0.973077,0.522835




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,3.0,2.0,2.8
AnchorWrapper,1.0,3.8,2.4
LimeWrapper,2.8,2.8,2.8
ShapTabularTreeWrapper,3.2,1.0,2.0


CODAS:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             49.006978              0.829015           0.449095
 ShapTabularTreeWrapper  48.583731              0.961222           0.740708
 AnchorWrapper           42.893203              0.685479           0.892637
 AggregateExplainer      60.847304              0.523162           0.333326,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.410974              0.778473           0.025545
 ShapTabularTreeWrapper  47.987182              0.980833           0.720092
 AnchorWrapper           35.618034              0.529188           0.307509
 AggregateExplainer      43.957436              0.368867           0.432910,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             42.075812              0.857635           0.389370
 ShapTabularTreeWrapper  43.120370              0.964523           0.625543
 AnchorWra

Worst case avoidances:
	- for all metrics: 0
	- for 2/3 metrics: 2
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,52.417471,0.418583,0.255632
AnchorWrapper,37.613849,0.700365,0.391035
LimeWrapper,44.775418,0.828719,0.370423
ShapTabularTreeWrapper,46.630478,0.969076,0.629526




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,3.2,4.0,3.2
AnchorWrapper,1.0,2.8,2.8
LimeWrapper,2.6,2.2,2.6
ShapTabularTreeWrapper,3.2,1.0,1.4


EDAS:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.275626              0.845911           0.324015
 ShapTabularTreeWrapper  48.583731              0.966642           0.169981
 AnchorWrapper           42.893203              0.653178           0.132105
 AggregateExplainer      50.589081              0.921429           0.543379,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.378436              0.801576           0.799896
 ShapTabularTreeWrapper  47.987182              0.986647           0.189788
 AnchorWrapper           37.743216              0.517906           0.278018
 AggregateExplainer      49.714171              0.895517           0.126758,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.174290              0.859803           0.498583
 ShapTabularTreeWrapper  43.120370              0.972752           0.083270
 AnchorWra

Worst case avoidances:
	- for all metrics: 1
	- for 2/3 metrics: 4
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,44.989723,0.879265,0.385484
AnchorWrapper,38.35582,0.661122,0.414054
LimeWrapper,45.940605,0.838552,0.468458
ShapTabularTreeWrapper,46.630478,0.97353,0.356728




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.8,2.6,2.8
AnchorWrapper,1.2,3.8,2.2
LimeWrapper,2.8,2.6,2.6
ShapTabularTreeWrapper,3.2,1.0,2.4


MABAC:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             42.770543              0.801281           0.295258
 ShapTabularTreeWrapper  48.583731              0.972506           0.039473
 AnchorWrapper           37.743216              0.606942           0.424705
 AggregateExplainer      46.824018              0.639951           0.555988,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.500975              0.834877           0.041312
 ShapTabularTreeWrapper  47.987182              0.983740           0.590600
 AnchorWrapper           35.618034              0.542512           0.070940
 AggregateExplainer      45.442618              0.587833           0.229744,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             41.369189              0.869212           0.267952
 ShapTabularTreeWrapper  43.120370              0.967283           0.456386
 AnchorWra

Worst case avoidances:
	- for all metrics: 2
	- for 2/3 metrics: 5
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,43.697125,0.500719,0.402868
AnchorWrapper,36.515712,0.659859,0.254054
LimeWrapper,42.877432,0.834236,0.370733
ShapTabularTreeWrapper,46.630478,0.975452,0.34016




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.8,3.6,1.8
AnchorWrapper,1.0,3.4,3.0
LimeWrapper,2.2,2.0,2.8
ShapTabularTreeWrapper,4.0,1.0,2.4


### Discussion

Most of them didn't significantly outperformed the TOPSIS. EDAS showed promising results!

# RAE-T vs. RAE-E | 10 samples
### Execution

In [6]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    metrics_sets=[['nrc', 'sensitivity_spearman', 'faithfulness_corr']],
    mcdm_algs=[pymcdm.methods.TOPSIS(), pymcdm.methods.EDAS()],
    n_instances=10
)

metadata["description"] = "RAE-T vs RAE-S, 10 samples"

with open('pickles/german/RAE-T_vs_RAE-S_10-RANKED.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [745 332  59 928 893 493 910 940 870 213]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.2787 - val_loss: 1.2390
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2530 - val_loss: 1.2224
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2172 - val_loss: 1.2065
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2202 - val_loss: 1.1906
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1614 - val_loss: 1.1746
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1532 - val_loss: 1.1587
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1602 - val_loss: 1.1428
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1689 - val_l

### Analysis

In [7]:
with open('pickles/german/RAE-T_vs_RAE-S_10-RANKED.pkl', 'rb') as f:
    exp = dill.load(f)

In [8]:
methods = ["RAE-T", "RAE-E"]
for i, method in enumerate(methods):
    print(f"{method}:\n")
    display(exp.results[i])
    wca = count_worst_case_avoidances(exp.results[i], [False, True, True], 1)
    print(f"Worst case avoidances:\n\t- for all metrics: {wca[0]}\n\t- for 2/3 metrics: {wca[1]}")
    print("AVG:")
    display(get_expconfig_mean_results(exp, i))
    print("\n")
    print("Avg rank:")
    display(get_average_metric_rank(exp.results[i], [False, True, True]))

RAE-T:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             52.959308              0.800493           0.152183
 ShapTabularTreeWrapper  54.840345              0.988667           0.580538
 AnchorWrapper           35.618034              0.687871           0.282737
 AggregateExplainer      37.288109              0.854335           0.230025,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.003947              0.871724           0.167361
 ShapTabularTreeWrapper  58.810492              0.991586           0.173186
 AnchorWrapper           36.104963              0.650424           0.253174
 AggregateExplainer      37.396762              0.908522           0.332136,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.608743              0.864187           0.108648
 ShapTabularTreeWrapper  46.793985              0.986043           0.658543
 AnchorWra

Worst case avoidances:
	- for all metrics: 7
	- for 2/3 metrics: 10
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,38.404812,0.900394,0.405411
AnchorWrapper,38.169462,0.636306,0.405655
LimeWrapper,46.155689,0.848133,0.311262
ShapTabularTreeWrapper,49.084641,0.98323,0.43632




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,1.7,2.0,2.4
AnchorWrapper,1.3,4.0,2.5
LimeWrapper,3.2,3.0,2.8
ShapTabularTreeWrapper,3.8,1.0,2.3


RAE-E:



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.711824              0.878916           0.516962
 ShapTabularTreeWrapper  54.840345              0.984294           0.466814
 AnchorWrapper           35.618034              0.732773           0.734726
 AggregateExplainer      39.334892              0.828030           0.265690,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.933355              0.852611           0.614771
 ShapTabularTreeWrapper  58.810492              0.992671           0.750279
 AnchorWrapper           42.906130              0.630380           0.872320
 AggregateExplainer      38.541156              0.868227           0.349046,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.408475              0.841724           0.441467
 ShapTabularTreeWrapper  46.793985              0.975362           0.602831
 AnchorWra

Worst case avoidances:
	- for all metrics: 7
	- for 2/3 metrics: 10
AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,37.936895,0.864911,0.493251
AnchorWrapper,37.529196,0.68483,0.575317
LimeWrapper,45.713571,0.854069,0.368931
ShapTabularTreeWrapper,49.084641,0.980092,0.558692




Avg rank:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,1.6,2.3,2.5
AnchorWrapper,1.4,4.0,2.1
LimeWrapper,3.2,2.7,3.3
ShapTabularTreeWrapper,3.8,1.0,2.1
