In [1]:
import warnings
warnings.filterwarnings("ignore")

from xai_agg.agg_exp import *
from xai_agg.utils import *

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

import dill

2024-12-19 10:42:51.069906: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 10:42:51.094045: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Loading and Preprocessing

In [2]:
original_data = pd.read_csv('../data/german_credit_data_updated.csv')

# Dataset overview - German Credit Risk (from Kaggle):
# 1. Age (numeric)
# 2. Sex (text: male, female)
# 3. Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
# 4. Housing (text: own, rent, or free)
# 5. Saving accounts (text - little, moderate, quite rich, rich)
# 6. Checking account (numeric, in DM - Deutsch Mark)
# 7. Credit amount (numeric, in DM)
# 8. Duration (numeric, in month)
# 9. Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)

display(original_data.head())
display(original_data.describe())
display(original_data.info())

# Display the unique values of the categorical features:
print('Unique values of the categorical features:')
for col in original_data.select_dtypes(include='object'):
    print(f'\t- {col}: {original_data[col].unique()}')

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Credit Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,2,49,male,1,own,little,,2096,12,education,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,4,53,male,2,free,little,little,4870,24,car,2


Unnamed: 0.1,Unnamed: 0,Age,Job,Credit amount,Duration,Credit Risk
count,954.0,954.0,954.0,954.0,954.0,954.0
mean,476.5,35.501048,1.909853,3279.112159,20.780922,1.302935
std,275.540378,11.379668,0.649681,2853.315158,12.046483,0.459768
min,0.0,19.0,0.0,250.0,4.0,1.0
25%,238.25,27.0,2.0,1360.25,12.0,1.0
50%,476.5,33.0,2.0,2302.5,18.0,1.0
75%,714.75,42.0,2.0,3975.25,24.0,2.0
max,953.0,75.0,3.0,18424.0,72.0,2.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        954 non-null    int64 
 1   Age               954 non-null    int64 
 2   Sex               954 non-null    object
 3   Job               954 non-null    int64 
 4   Housing           954 non-null    object
 5   Saving accounts   779 non-null    object
 6   Checking account  576 non-null    object
 7   Credit amount     954 non-null    int64 
 8   Duration          954 non-null    int64 
 9   Purpose           954 non-null    object
 10  Credit Risk       954 non-null    int64 
dtypes: int64(6), object(5)
memory usage: 82.1+ KB


None

Unique values of the categorical features:
	- Sex: ['male' 'female']
	- Housing: ['own' 'free' 'rent']
	- Saving accounts: [nan 'little' 'quite rich' 'rich' 'moderate']
	- Checking account: ['little' 'moderate' nan 'rich']
	- Purpose: ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']


In [3]:
preprocessed_data = original_data.copy()

# For savings and checking accounts, we will replace the missing values with 'none':
preprocessed_data['Saving accounts'].fillna('none', inplace=True)
preprocessed_data['Checking account'].fillna('none', inplace=True)

# Dropping index column:
preprocessed_data.drop(columns=['Unnamed: 0'], inplace=True)

# Using pd.dummies to one-hot-encode the categorical features
preprocessed_data["Job"] = preprocessed_data["Job"].map({0: 'unskilled_nonresident', 1: 'unskilled_resident',
                                                         2: 'skilled', 3: 'highlyskilled'})

categorical_features = preprocessed_data.select_dtypes(include='object').columns
numerical_features = preprocessed_data.select_dtypes(include='number').columns.drop('Credit Risk')
print(f'Categorical features: {categorical_features}')
print(f'Numerical features: {numerical_features}')

preprocessed_data = pd.get_dummies(preprocessed_data, columns=categorical_features, dtype='int64')

# Remapping the target variable to 0 and 1:
preprocessed_data['Credit Risk'] = preprocessed_data['Credit Risk'].map({1: 0, 2: 1})

# Make sure all column names are valid python identifiers (important for pd.query() calls):
preprocessed_data.columns = preprocessed_data.columns.str.replace(' ', '_')
preprocessed_data.columns = preprocessed_data.columns.str.replace('/', '_')

# Normalizing the data
scaler = StandardScaler()
scaled_preprocessed_data = scaler.fit_transform(preprocessed_data)

display(preprocessed_data.head())
display(preprocessed_data.info())

display(scaled_preprocessed_data)

Categorical features: Index(['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Purpose'],
      dtype='object')
Numerical features: Index(['Age', 'Credit amount', 'Duration'], dtype='object')


Unnamed: 0,Age,Credit_amount,Duration,Credit_Risk,Sex_female,Sex_male,Job_highlyskilled,Job_skilled,Job_unskilled_nonresident,Job_unskilled_resident,...,Checking_account_none,Checking_account_rich,Purpose_business,Purpose_car,Purpose_domestic_appliances,Purpose_education,Purpose_furniture_equipment,Purpose_radio_TV,Purpose_repairs,Purpose_vacation_others
0,67,1169,6,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,22,5951,48,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,49,2096,12,0,0,1,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,45,7882,42,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,53,4870,24,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954 entries, 0 to 953
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Age                          954 non-null    int64
 1   Credit_amount                954 non-null    int64
 2   Duration                     954 non-null    int64
 3   Credit_Risk                  954 non-null    int64
 4   Sex_female                   954 non-null    int64
 5   Sex_male                     954 non-null    int64
 6   Job_highlyskilled            954 non-null    int64
 7   Job_skilled                  954 non-null    int64
 8   Job_unskilled_nonresident    954 non-null    int64
 9   Job_unskilled_resident       954 non-null    int64
 10  Housing_free                 954 non-null    int64
 11  Housing_own                  954 non-null    int64
 12  Housing_rent                 954 non-null    int64
 13  Saving_accounts_little       954 non-null    int64

None

array([[ 2.7694545 , -0.7399179 , -1.22763429, ...,  1.62518349,
        -0.14633276, -0.11286653],
       [-1.18704073,  0.93690642,  2.26068929, ...,  1.62518349,
        -0.14633276, -0.11286653],
       [ 1.18685641, -0.41486224, -0.72930235, ..., -0.61531514,
        -0.14633276, -0.11286653],
       ...,
       [-1.0111965 , -0.39768023,  1.26402541, ..., -0.61531514,
        -0.14633276, -0.11286653],
       [-0.65950803,  0.29240557,  0.26736153, ..., -0.61531514,
        -0.14633276, -0.11286653],
       [-0.83535227,  2.69823821,  1.26402541, ..., -0.61531514,
        -0.14633276, -0.11286653]])

In [4]:
y = preprocessed_data['Credit_Risk']
X = preprocessed_data.drop(columns='Credit_Risk')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.7696335078534031
ROC AUC: 0.6830357142857143


# Metadata setup

In [6]:
from dataclasses import dataclass

@dataclass
class ExperimentRun:
    metadata: dict
    results: any

In [10]:
def get_expconfig_mean_results(exp: ExperimentRun, config: int):
    config_results = exp.results[config]
    return pd.concat(config_results).groupby(level=0).mean()

# Tuning

# Evaluating Old and New Metric sets
Running the current setup: wsum, topsis, ['nrc', 'sensitiviy_spearman', 'faithfulness_corr']

### Execution

In [8]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    metrics_sets=[
        ['complexity', 'sensitivity_spearman', 'faithfulness_corr'],
        ['nrc', 'sensitivity_spearman', 'faithfulness_corr']
    ],
    indexes=[629, 213, 485, 218, 703]
)

metadata["description"] = "compares entropy complexity with nrc metric sets"

with open('pickles/COMPARE_entropy-nrc_metricsets_wsum-topsis.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)


Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.2904 - val_loss: 1.2596
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2678 - val_loss: 1.2416
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2638 - val_loss: 1.2253
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2183 - val_loss: 1.2097
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2324 - val_loss: 1.1943
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1837 - val_loss: 1.1791
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1742 - val_loss: 1.1635
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1404 - val_loss: 1.1481
Epoc

### Analysis

In [13]:
with open('pickles/COMPARE_entropy-nrc_metricsets_wsum-topsis.pkl', 'rb') as f:
    run = dill.load(f)

In [18]:
print("With metrics: ['complexity', 'sensitivity_spearman', 'faithfulness_corr']\n")
display(run.results[0])

print("AVG:")
display(get_expconfig_mean_results(run, 0))

With metrics: ['complexity', 'sensitivity_spearman', 'faithfulness_corr']



[                        complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.607385              0.923645           0.504278
 ShapTabularTreeWrapper    2.639087              0.967607           0.038512
 AnchorWrapper             0.692943              0.473214           0.894271
 AggregateExplainer        2.462704              0.964286           0.576280,
                         complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.538366              0.837833           0.248742
 ShapTabularTreeWrapper    2.513429              0.975094           0.558859
 AnchorWrapper             0.887787              0.564325           0.502479
 AggregateExplainer        2.620076              0.933645           0.232050,
                         complexity  sensitivity_spearman  faithfulness_corr
 LimeWrapper               2.542261              0.853744           0.169501
 ShapTabularTreeWrapper    2.284778              0.964992           0.3814

AVG:


Unnamed: 0,complexity,sensitivity_spearman,faithfulness_corr
AggregateExplainer,2.463323,0.930829,0.43176
AnchorWrapper,1.275002,0.62948,0.433368
LimeWrapper,2.575076,0.866256,0.423207
ShapTabularTreeWrapper,2.451083,0.968438,0.435356


In [19]:
print("\n\nWith metrics: ['nrc', 'sensitivity_spearman', 'faithfulness_corr']\n")
display(run.results[1])

print("AVG:")
display(get_expconfig_mean_results(run, 1))



With metrics: ['nrc', 'sensitivity_spearman', 'faithfulness_corr']



[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             48.177761              0.870246           0.096595
 ShapTabularTreeWrapper  48.583731              0.977644           0.655846
 AnchorWrapper           37.743216              0.617716           0.952868
 AggregateExplainer      46.923408              0.957438           0.441459,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.906524              0.847389           0.063032
 ShapTabularTreeWrapper  47.987182              0.982287           0.201590
 AnchorWrapper           37.743216              0.500799           0.089648
 AggregateExplainer      45.203013              0.953744           0.293751,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.016105              0.823153           0.144989
 ShapTabularTreeWrapper  43.120370              0.967062           0.170781
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,45.502635,0.930181,0.399033
AnchorWrapper,38.521171,0.664532,0.563576
LimeWrapper,46.13345,0.845744,0.24089
ShapTabularTreeWrapper,46.630478,0.975466,0.356992


### Discussion
This experiment sought to evaluate the performance of the NRC metric when it substitutes the entropy based "complexity" metric.

With both metric sets, the worst metric value was avoided in all samples for sensitivity_spearman and faithfulness_corr. Both NRC and complexity had the same percentage of worst-metric-value-avoidance, with both metrics having 80% (1/5) of the samples avoiding the worst value.
The worst value for the entropy metric is avoided in the average of its samples, while the worst value for the NRC metric is not avoided in the average of its samples.

# Evaluating Rank aggregation algorithms

### Execution

In [7]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    aggregation_algs=["wsum", "w_bordafuse", "w_condorcet"],
    indexes=[629, 213, 485, 218, 703],
    n_instances=5
)

metadata["description"] = "compares wsum, w_bordafuse, w_condorcet aggregation algorithms"

with open('pickles/COMPARE_wsum-w_bordafuse-w_condorcet.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.2603 - val_loss: 1.2438
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2752 - val_loss: 1.2257
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2272 - val_loss: 1.2082
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2311 - val_loss: 1.1911
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1949 - val_loss: 1.1744
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1433 - val_loss: 1.1574
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1481 - val_loss: 1.1400
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1358 - val_loss: 1.1229
Epo

### Analysis

In [20]:
with open('pickles/COMPARE_wsum-w_bordafuse-w_condorcet.pkl', 'rb') as f:
    run = dill.load(f)

In [21]:
print("WSUM:")
display(run.results[0])

print("AVG:")
display(get_expconfig_mean_results(run, 0))

print("\n\nW_BORDAFUSE:")
display(run.results[1])

print("AVG:")
display(get_expconfig_mean_results(run, 1))

print("\n\nW_CONDORCET:")
display(run.results[2])

print("AVG:")
display(get_expconfig_mean_results(run, 2))

WSUM:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.834639              0.856158           0.299596
 ShapTabularTreeWrapper  48.583731              0.972407           0.109894
 AnchorWrapper           37.743216              0.731589           0.379407
 AggregateExplainer      49.183939              0.950394           0.593073,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.562959              0.813547           0.614179
 ShapTabularTreeWrapper  47.987182              0.986082           0.283443
 AnchorWrapper           37.743216              0.528527           0.267040
 AggregateExplainer      45.872785              0.929951           0.277345,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.777669              0.851429           0.464864
 ShapTabularTreeWrapper  43.120370              0.964353           0.434860
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,47.219324,0.919724,0.390187
AnchorWrapper,38.812264,0.676657,0.29958
LimeWrapper,44.946276,0.83798,0.512961
ShapTabularTreeWrapper,46.630478,0.971879,0.349105




W_BORDAFUSE:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.302442              0.890394           0.478538
 ShapTabularTreeWrapper  48.583731              0.972334           0.365453
 AnchorWrapper           42.893203              0.693191           0.437305
 AggregateExplainer      62.759800              0.932956           0.383190,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.489937              0.864729           0.026440
 ShapTabularTreeWrapper  47.987182              0.983580           0.674341
 AnchorWrapper           42.893203              0.554394           0.078678
 AggregateExplainer      60.879271              0.883645           0.232789,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.539125              0.860148           0.002892
 ShapTabularTreeWrapper  43.120370              0.966695           0.447880
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,64.67612,0.915498,0.293481
AnchorWrapper,38.651371,0.714872,0.38759
LimeWrapper,44.617306,0.865626,0.171087
ShapTabularTreeWrapper,46.630478,0.972968,0.419234




W_CONDORCET:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.166888              0.878719           0.167372
 ShapTabularTreeWrapper  48.583731              0.965991           0.127322
 AnchorWrapper           42.893203              0.492117           0.487369
 AggregateExplainer      69.841686              0.869064           0.244482,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.445727              0.870443           0.648649
 ShapTabularTreeWrapper  47.987182              0.982633           0.084455
 AnchorWrapper           42.893203              0.531224           0.046006
 AggregateExplainer      69.841686              0.859261           0.128161,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             41.955835              0.852217           0.337086
 ShapTabularTreeWrapper  43.120370              0.957544           0.337494
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,69.841686,0.878355,0.257825
AnchorWrapper,39.517977,0.650304,0.34208
LimeWrapper,44.567679,0.859704,0.341113
ShapTabularTreeWrapper,46.630478,0.969409,0.216884


### Discussion
This experiment sought to evaluate the performance of the different rank aggregation algorithms.

wsum is the best among the algorithms tested.

# Evaluating MCDM Algs

### Execution

In [7]:
results, metadata = evaluate_aggregate_explainer(
    clf, X_train, X_test, categorical_features,
    mcdm_algs=[pymcdm.methods.ARAS(), pymcdm.methods.COCOSO(),
               pymcdm.methods.CODAS(), pymcdm.methods.EDAS(), pymcdm.methods.MABAC()],
    indexes=[629, 213, 485, 218, 703],
    n_instances=5
)

metadata["description"] = "compares ARAS, COCOSO, CODAS, EDAS, MABAC MCDM algorithms. wsum aggregation"

with open('pickles/COMPARE_mcdm_algs.pkl', 'wb') as f:
    dill.dump(ExperimentRun(metadata, results), f)

Selected indexes: [629, 213, 485, 218, 703]
Epoch 1/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 1.2512 - val_loss: 1.2543
Epoch 2/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2806 - val_loss: 1.2385
Epoch 3/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2602 - val_loss: 1.2233
Epoch 4/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2319 - val_loss: 1.2087
Epoch 5/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.2083 - val_loss: 1.1944
Epoch 6/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1966 - val_loss: 1.1800
Epoch 7/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1829 - val_loss: 1.1657
Epoch 8/500
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1623 - val_loss: 1.1510
Epo

### Analysis

In [22]:
with open('pickles/COMPARE_mcdm_algs.pkl', 'rb') as f:
    run = dill.load(f)

In [23]:
print("ARAS:")
display(run.results[0])
print("AVG:")
display(get_expconfig_mean_results(run, 0))

print("\n\nCOCOSO:")
display(run.results[1])
print("AVG:")
display(get_expconfig_mean_results(run, 1))

print("\n\nCODAS:")
display(run.results[2])
print("AVG:")
display(get_expconfig_mean_results(run, 2))

print("\n\nEDAS:")
display(run.results[3])
print("AVG:")
display(get_expconfig_mean_results(run, 3))

print("\n\nMABAC:")
display(run.results[4])
print("AVG:")
display(get_expconfig_mean_results(run, 4))

ARAS:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.000905              0.860591           0.590592
 ShapTabularTreeWrapper  48.583731              0.938786           0.158710
 AnchorWrapper           37.743216              0.494133           0.140858
 AggregateExplainer      46.186138              0.961921           0.399411,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.236179              0.832709           0.361183
 ShapTabularTreeWrapper  47.987182              0.976658           0.795198
 AnchorWrapper           35.618034              0.569675           0.185134
 AggregateExplainer      48.246215              0.942512           0.379164,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.360684              0.810837           0.225060
 ShapTabularTreeWrapper  43.120370              0.959218           0.293173
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,45.527185,0.919015,0.394515
AnchorWrapper,36.462039,0.661363,0.386532
LimeWrapper,44.531795,0.833596,0.389892
ShapTabularTreeWrapper,46.630478,0.961248,0.438199




COCOSO:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.300581              0.797685           0.469643
 ShapTabularTreeWrapper  48.583731              0.956533           0.830980
 AnchorWrapper           37.743216              0.617751           0.216845
 AggregateExplainer      48.420191                   NaN           0.444600,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.234268              0.841478           0.159024
 ShapTabularTreeWrapper  47.987182              0.978654           0.358037
 AnchorWrapper           35.618034              0.640770           0.289978
 AggregateExplainer      45.770429              0.930148           0.323217,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.816373              0.848621           0.307514
 ShapTabularTreeWrapper  43.120370              0.951101           0.339676
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,50.15745,0.917266,0.387149
AnchorWrapper,37.086619,0.65547,0.406126
LimeWrapper,44.275771,0.82732,0.341626
ShapTabularTreeWrapper,46.630478,0.962937,0.615126




CODAS:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.945926              0.850690           0.764673
 ShapTabularTreeWrapper  48.583731              0.954525           0.508137
 AnchorWrapper           37.743216              0.681888           0.666562
 AggregateExplainer      58.815256              0.433204           0.428492,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.364049              0.843153           0.251064
 ShapTabularTreeWrapper  47.987182              0.978999           0.579272
 AnchorWrapper           37.743216              0.484337           0.141530
 AggregateExplainer      47.081246              0.345517           0.533534,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.828514              0.852808           0.585474
 ShapTabularTreeWrapper  43.120370              0.955040           0.817945
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,54.994235,0.430306,0.38033
AnchorWrapper,39.465033,0.689521,0.478277
LimeWrapper,45.940264,0.839537,0.368377
ShapTabularTreeWrapper,46.630478,0.965477,0.507442




EDAS:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             43.243764              0.904433           0.018943
 ShapTabularTreeWrapper  48.583731              0.951455           0.705132
 AnchorWrapper           37.743216              0.598090           0.218162
 AggregateExplainer      47.017873              0.911182           0.570695,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             45.179792              0.848079           0.538377
 ShapTabularTreeWrapper  47.987182              0.980945           0.249669
 AnchorWrapper           37.743216              0.540141           0.276989
 AggregateExplainer      45.264170              0.899803           0.242529,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.070436              0.809507           0.395494
 ShapTabularTreeWrapper  43.120370              0.960154           0.027698
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,44.788101,0.902416,0.472689
AnchorWrapper,38.812264,0.654595,0.347215
LimeWrapper,45.056925,0.848207,0.33465
ShapTabularTreeWrapper,46.630478,0.965664,0.444585




MABAC:


[                              nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             47.513136              0.866158           0.088058
 ShapTabularTreeWrapper  48.583731              0.965488           0.471271
 AnchorWrapper           42.893203              0.738253           0.859405
 AggregateExplainer      49.485649              0.649409           0.709135,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             44.920788              0.838128           0.719453
 ShapTabularTreeWrapper  47.987182              0.976499           0.290031
 AnchorWrapper           35.618034              0.509306           0.062415
 AggregateExplainer      47.057979              0.500148           0.466299,
                               nrc  sensitivity_spearman  faithfulness_corr
 LimeWrapper             46.359020              0.844236           0.467506
 ShapTabularTreeWrapper  43.120370              0.968616           0.160671
 AnchorWra

AVG:


Unnamed: 0,nrc,sensitivity_spearman,faithfulness_corr
AggregateExplainer,48.094897,0.46467,0.559872
AnchorWrapper,38.382833,0.673541,0.453359
LimeWrapper,45.832835,0.848424,0.36516
ShapTabularTreeWrapper,46.630478,0.967455,0.378882


### Discussion

Most of them didn't significantly outperformed the TOPSIS. EDAS showed promising results!

- ARAS: very close to topsis, slightly worse sensitivity, but this might have been noise;
- COCOSO: higher complexity, but similar sensitivity and faithfulness to topsis;
- CODAS: very bad complexity and sensitivity, but similar faithfulness -> DISCARD;
- EDAS: similar complexity and sensitivity, with better faithfulness -> CONSIDER;
- MABAC: significantly better faithfulness_corr, but bad complexity and sensitivity;