In [1]:
import warnings

# Suppress every Python warning for the rest of the kernel session so that the
# cell outputs below stay uncluttered.
# NOTE(review): a blanket "ignore" also hides warnings raised by the metric and
# correlation calls in later cells (e.g. undefined-metric or constant-input
# warnings, if any occur) — consider narrowing the filter by category/module.
warnings.filterwarnings("ignore")

In [2]:
from utils import (
    compute_shap_similarity_pearson,
    compute_ndcg_similarity,
    compute_pred_jaccard,
    compute_auc_roc,
    compute_score_correlations,
    load_nested_results,
)
import numpy as np

In [3]:
# Load all experiment results from the "results" location.
# As used by the cells below, the returned mapping is indexed as
# all_results[dataset]["ground_truth"][fold] for labels and
# all_results[dataset][model][fold]["predictions"] for model outputs.
all_results = load_nested_results("results")

In [4]:
# Datasets whose median AUC-ROC falls outside [AUC_ROC_MIN, AUC_ROC_MAX] are
# dropped from the study: below the lower bound or above the upper bound the
# dataset is excluded and its name/median is printed.
AUC_ROC_MIN = 0.55
AUC_ROC_MAX = 0.95

datasets_to_del = []

for name, results in all_results.items():
    # compute_auc_roc returns a pair; only the first element is used here.
    # presumably it holds one AUC-ROC value per model and/or fold — TODO confirm
    auc_roc, _ = compute_auc_roc(results)

    median_auc_roc = np.median(auc_roc)
    # Two explicit comparisons (rather than a negated chained comparison) so
    # that a NaN median is kept, exactly as before.
    if median_auc_roc < AUC_ROC_MIN or median_auc_roc > AUC_ROC_MAX:
        datasets_to_del.append(name)
        print(f"{name} - {median_auc_roc}")

# Deletion happens in a second pass: the mapping must not change size while it
# is being iterated above.
for d in datasets_to_del:
    del all_results[d]

4_breastw - 0.9687693877915169
21_Lymphography - 0.9964285714285716
38_thyroid - 0.9526445655921953
39_vertebral - 0.4117186302070023
42_WBC - 0.98656330749354
44_Wilt - 0.4107742873751187
47_yeast - 0.41243399747230003


In [5]:
# Names of the datasets still present in `all_results`, in mapping order.
dataset_names = [*all_results.keys()]
# Model names are read from the first dataset only (every key except the
# "ground_truth" entry), sorted alphabetically.
# NOTE(review): assumes every dataset holds the same set of models — confirm.
models_names = sorted(set(all_results[dataset_names[0]].keys()) - {"ground_truth"})

In [6]:
import numpy as np
from itertools import combinations
from collections import defaultdict
from sklearn.metrics import f1_score


def aggreg_f1_relative(y_true, y_preds):
    """Return the F1 gain of a union ensemble over its median member.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels for one fold.
    y_preds : numpy.ndarray
        Predictions stacked along the first axis, one row per model.
        Presumably binary 0/1 labels — TODO confirm against the results files.

    Returns
    -------
    float
        F1 of the ensemble prediction minus the median of the individual
        models' F1 scores. The ensemble flags a sample as soon as the
        per-sample sum of the member predictions is at least 1.
    """
    # F1 of every member taken on its own.
    member_f1 = [f1_score(y_true, member_pred) for member_pred in y_preds]
    # The reference is the median member score, not the maximum.
    median_member_f1 = np.median(np.array(member_f1))

    # Union rule: positive when the summed predictions reach 1.
    union_pred = np.sum(y_preds, axis=0) >= 1
    ensemble_f1 = f1_score(y_true, union_pred)

    return ensemble_f1 - median_member_f1


# Accumulator: scores[dataset][column] is a list with one entry per model
# combination. Note that the "mcc" column stores the relative F1 gain returned
# by aggreg_f1_relative (the key name is historical).
scores = defaultdict(lambda: defaultdict(list))
n_models_ensemblist = 2  # number of models combined in each ensemble

for dataset in dataset_names:
    print(dataset)

    dataset_results = all_results[dataset]
    y_true_folds = dataset_results["ground_truth"]

    # Distance matrices (1 - similarity), computed once per dataset on the
    # full set of models and indexed below by model position.
    # NOTE(review): assumes rows/columns follow the order of `models_names`
    # — confirm against the helpers in utils.
    dist_shap_sim = 1 - compute_shap_similarity_pearson(dataset_results)[0]
    dist_ndcg_sim = 1 - compute_ndcg_similarity(dataset_results)[0]
    dist_scores_sim = 1 - compute_score_correlations(dataset_results)[0]
    dist_jaccard_sim = 1 - compute_pred_jaccard(dataset_results)[0]

    for comb in combinations(range(len(models_names)), n_models_ensemblist):
        member_names = [models_names[m] for m in comb]

        # Relative F1 gain of this ensemble on every fold.
        fold_gains = [
            aggreg_f1_relative(
                y_true_folds[fold],
                np.array(
                    [
                        dataset_results[member][fold]["predictions"]
                        for member in member_names
                    ]
                ),
            )
            for fold in range(len(y_true_folds))
        ]

        # Every unordered pair of members; distances are averaged over pairs.
        member_pairs = list(combinations(comb, 2))

        row = scores[dataset]
        row["name"].append("-".join(member_names))
        # nanmean: folds whose gain is NaN are left out of the average.
        row["mcc"].append(np.nanmean(fold_gains))
        row["ndcg"].append(np.mean([dist_ndcg_sim[a, b] for a, b in member_pairs]))
        row["shap"].append(np.mean([dist_shap_sim[a, b] for a, b in member_pairs]))
        row["scores"].append(
            np.mean([dist_scores_sim[a, b] for a, b in member_pairs])
        )
        row["jaccard"].append(
            np.mean([dist_jaccard_sim[a, b] for a, b in member_pairs])
        )

2_annthyroid
14_glass
15_Hepatitis
23_mammography
27_PageBlocks
29_Pima
37_Stamps
40_vowels
45_wine


In [7]:
import pandas as pd
from scipy.stats import pearsonr

# (output column label, key in scores[dataset]) pairs, in table column order.
distance_columns = (
    ("shap", "shap"),
    ("NDCG", "ndcg"),
    ("Scores", "scores"),
    ("Jaccard", "jaccard"),
)

data = []

for dataset in dataset_names:
    # Ensemble gain per model combination (stored under the "mcc" key).
    mcc = scores[dataset]["mcc"]

    # Pearson correlation between the ensemble gain and each distance measure;
    # the p-value is discarded.
    row = {"Dataset": dataset}
    for label, key in distance_columns:
        coefficient, _ = pearsonr(mcc, scores[dataset][key])
        row[label] = coefficient

    data.append(row)

df = pd.DataFrame(data).set_index("Dataset")

# Clean display
print(df.round(3))

                 shap   NDCG  Scores  Jaccard
Dataset                                      
2_annthyroid    0.757  0.379   0.509    0.756
14_glass        0.290  0.346   0.234    0.227
15_Hepatitis    0.522  0.482   0.053    0.423
23_mammography -0.051  0.542  -0.036   -0.251
27_PageBlocks   0.550  0.163   0.229    0.437
29_Pima         0.488  0.403   0.802    0.886
37_Stamps       0.405  0.078   0.359    0.657
40_vowels       0.659  0.584   0.198    0.227
45_wine         0.226  0.257   0.467    0.452
