In [41]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [42]:
# Each pair is (key used in the '../Results/' file names, label shown in the tables).
# Keeping key and label side by side makes it hard to get the two lists out of step.
datasets, dataset_names = map(list, zip(
    ("logistic", "Simulated"),
    ("bank", "Portuguese Bank"),
    ("brca", "BRCA"),
    ("census", "Census Income"),
    ("credit", "German Credit"),
))

# Same layout for the fitted models: (file-name key, printed heading).
methods, method_names = map(list, zip(
    ("glm", "GLM"),
    ("nn", "Neural Net"),
    ("rf", "RF"),
))

# Row labels of the summary tables, in the order the result rows are filled in.
cvshap_methods = [
    "Indep, SS",
    "Indep, KSHAP",
    "Dep, SS",
    "Dep, KSHAP",
]

# How many of the top-ranked features the variance-reduction summary is taken over.
num_most_important = 5


## Variance Reductions

In [46]:
# Rows along the "#outputs" axis of the saved arrays that enter the variance ratio
# 1 - var(adjusted) / var(baseline): SS uses row 0 over row 1, KSHAP uses row 6 over row 0.
# NOTE(review): "adjusted" is presumably the control-variate estimate — confirm against
# the script that writes ../Results/*.npy.
SS_BASE_ROW, SS_CV_ROW = 1, 0
KSHAP_BASE_ROW, KSHAP_CV_ROW = 0, 6


def _median_variance_reductions(ss_runs, kshap_runs, n_pts):
    """Median (over points) variance reduction per feature for SS and KSHAP.

    Parameters
    ----------
    ss_runs, kshap_runs : ndarray, #pts x #iters x #outputs x #features
        Repeated estimates; the variance is taken over the #iters axis, ignoring NaNs.
    n_pts : int
        Number of leading points to include.

    Returns
    -------
    ndarray, #features x 2
        Column 0 is the SS reduction, column 1 the KSHAP reduction.
    """
    ss_var = np.nanvar(ss_runs, axis=1)
    kshap_var = np.nanvar(kshap_runs, axis=1)
    reductions = 1 - np.array([
        ss_var[:n_pts, SS_CV_ROW] / ss_var[:n_pts, SS_BASE_ROW],
        kshap_var[:n_pts, KSHAP_CV_ROW] / kshap_var[:n_pts, KSHAP_BASE_ROW],
    ])
    # NOTE(review): np.quantile is not NaN-aware, unlike the np.nanvar calls above.
    return np.quantile(reductions, 0.50, axis=1).T


def _top_feature_order(ss_runs, k):
    """Indices of the ``k`` features with the largest summed |mean SS estimate|.

    The mean is taken over iterations (NaNs ignored) on the SS baseline row, and the
    absolute values are then summed over points.
    """
    global_shap = np.sum(np.abs(np.nanmean(ss_runs, axis=1)[:, SS_BASE_ROW, :]), axis=0)
    return np.argsort(global_shap)[::-1][:k]


for method, method_name in zip(methods, method_names):
    print("\n" + method_name)
    results_mat = np.empty((len(cvshap_methods), len(datasets)))
    for idx, dataset in enumerate(datasets):
        fname = '../Results/' + dataset + '_' + method
        # #pts x #iters x #outputs x #features
        kshaps_indep = np.load(fname+'_kshap_indep.npy')
        sss_indep = np.load(fname+'_ss_indep.npy')
        kshaps_dep = np.load(fname+'_kshap_dep.npy')
        sss_dep = np.load(fname+'_ss_dep.npy')

        n_pts = kshaps_indep.shape[0]

        # Rows are filled in the order of cvshap_methods: Indep (SS, KSHAP), Dep (SS, KSHAP).
        column = []
        for ss_runs, kshap_runs in ((sss_indep, kshaps_indep), (sss_dep, kshaps_dep)):
            medians = _median_variance_reductions(ss_runs, kshap_runs, n_pts)
            top = _top_feature_order(ss_runs, num_most_important)
            # Median over the most important features only.
            column.append(np.median(medians[top], axis=0))
        results_mat[:, idx] = np.concatenate(column)

    results_df = pd.DataFrame(results_mat, index=cvshap_methods, columns=dataset_names)
    # Report as whole percentages; nullable Int64 keeps NaN entries as <NA>.
    results_df_int = (results_df * 100).round().astype('Int64')
    print(results_df_int)
    results_df_int.to_csv("../PaperFigs/var_reducs_" + method + ".csv")
    # Reload with pd.read_csv("NAME.csv", index_col=0)


GLM
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS            10               76    75             33             83
Indep, KSHAP         24               84    87              3             94
Dep, SS              59               67    92             71             85
Dep, KSHAP           51               69    94             59             87

Neural Net
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS             9                8    76              4             55
Indep, KSHAP         22               25    86             -1             80
Dep, SS              58               58    92             61             84
Dep, KSHAP           51               40    92             51             87

RF
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS            31               18    19              4             15
Indep, KSHAP         37                4    14         

## Ranking Changes

In [39]:
def calc_num_pairs(n):
    """Return the number of unordered pairs among ``n`` items, i.e. n*(n-1)/2.

    True division is used, so an integer ``n`` yields a float.
    """
    ordered_pairs = n * (n - 1)
    return ordered_pairs / 2
# Rows along the "#outputs" axis of the saved arrays that are compared: for SS the
# baseline ranking comes from row 1 and the adjusted one from row 0; for KSHAP the
# baseline comes from row 0 and the adjusted one from row 6.
# NOTE(review): "adjusted" is presumably the control-variate estimate — confirm against
# the script that writes ../Results/*.npy.
SS_BASE_ROW, SS_CV_ROW = 1, 0
KSHAP_BASE_ROW, KSHAP_CV_ROW = 0, 6


def _pairwise_rank_change(runs, row, n_pts, n_sims):
    """Per-point total absolute rank difference between repeated runs.

    Parameters
    ----------
    runs : ndarray, #pts x #iters x #outputs x #features
        Repeated estimates for every point.
    row : int
        Index along the #outputs axis whose feature values are ranked.
    n_pts, n_sims : int
        Number of points and of repeated runs per point to use.

    Returns
    -------
    ndarray of length ``n_pts``
        For each point, the sum of |rank difference| over all features and all
        ordered run pairs, divided by ``calc_num_pairs(n_sims)``.
    """
    # The sum visits every ordered pair (a, b), so each unordered pair is counted twice,
    # while n_pairs counts unordered pairs. The constant factor cancels in _pct_reduction.
    n_pairs = calc_num_pairs(n_sims)
    totals = np.empty(n_pts)
    for pt in range(n_pts):
        rankmat = np.array([rankdata(runs[pt][sim][row]) for sim in range(n_sims)])
        totals[pt] = np.sum(np.abs(rankmat[:, None, :] - rankmat[None, :, :])) / n_pairs
    return totals


def _pct_reduction(base, adjusted):
    """Mean (NaNs ignored) relative drop from ``base`` to ``adjusted``, in percent."""
    return np.nanmean((base - adjusted) / base) * 100


for method, method_name in zip(methods, method_names):
    print("\n" + method_name)
    results_mat = np.empty((len(cvshap_methods), len(datasets)))
    for idx, dataset in enumerate(datasets):
        fname = '../Results/' + dataset + '_' + method
        # #pts x #iters x #outputs x #features
        kshaps_indep = np.load(fname+'_kshap_indep.npy')
        sss_indep = np.load(fname+'_ss_indep.npy')
        kshaps_dep = np.load(fname+'_kshap_dep.npy')
        sss_dep = np.load(fname+'_ss_dep.npy')

        n_pts, nsim_per_point = kshaps_indep.shape[:2]

        # Rows are filled in the order of cvshap_methods: Indep (SS, KSHAP), Dep (SS, KSHAP).
        column = []
        for ss_runs, kshap_runs in ((sss_indep, kshaps_indep), (sss_dep, kshaps_dep)):
            for runs, base_row, cv_row in ((ss_runs, SS_BASE_ROW, SS_CV_ROW),
                                           (kshap_runs, KSHAP_BASE_ROW, KSHAP_CV_ROW)):
                base = _pairwise_rank_change(runs, base_row, n_pts, nsim_per_point)
                adjusted = _pairwise_rank_change(runs, cv_row, n_pts, nsim_per_point)
                column.append(_pct_reduction(base, adjusted))
        results_mat[:, idx] = column

    results_df = pd.DataFrame(results_mat, index=cvshap_methods, columns=dataset_names)
    # Values are already percentages; nullable Int64 keeps NaN entries as <NA>.
    results_df_int = results_df.round().astype('Int64')
    print(results_df_int)
    results_df_int.to_csv("../PaperFigs/rank_chgs_" + method + ".csv")
    # Reload with pd.read_csv("NAME.csv", index_col=0)



GLM
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS             6               57    57             13             60
Indep, KSHAP         17               43    54              2             67
Dep, SS              32               42    68             58             64
Dep, KSHAP           27               29    71             30             59

Neural Net
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS             6                2    52              3             36
Indep, KSHAP         15                8    55              1             45
Dep, SS              31               37    65             42             61
Dep, KSHAP           28               13    68             24             60

RF
              Simulated  Portuguese Bank  BRCA  Census Income  German Credit
Indep, SS            15                3     7              2              6
Indep, KSHAP         19                3     6         

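`rankdata`: the i-th element of the array returned by `rankdata` is the rank (ascending, lowest value first) of the i-th element of the original list.<br>
- Each element (i,j,k) of `np.abs(rankmat[:,None,:]-rankmat[None,:,:])` is the absolute difference in the rank of feature k between runs i and j, i.e. the number of rank positions feature k moved between the two runs.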