In [27]:
import numpy as np
import pickle
import pathlib
from matplotlib.ticker import ScalarFormatter
import matplotlib.pyplot as plt

import os
from os.path import join
path_to_file = str(pathlib.Path().resolve())
dir_path = join(path_to_file, "../../")
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(join(dir_path, "HelperFiles"))
from load_data import *
from helper import *

# SHAP Results
## Failure cases for maximum are at indices that converge infrequently

In [None]:

# alpha = 0.2
# # alpha = 0.1

# # guarantee = "rank"
# guarantee = "set"

datasets = ["census", "bank", "brca", "credit", "breast_cancer"]
Ks = [2,5]
# method = "rankshap"
# method = "sprtshap"
methods = ["rankshap", "sprtshap"]
for method in methods:
    for i, K in enumerate(Ks):
        for alpha in [0.1, 0.2]:
            max_mat = np.empty((len(Ks), len(datasets)))
            for guarantee in ["rank", "set"]:
                data_dir = join(dir_path, "Experiments", "Results", "Top_K", guarantee, "alpha_"+str(alpha)) 
                for j, dataset in enumerate(datasets):
                    max_mat[i,j] = np.nan
                    fname = method + "_" + dataset + "_K" + str(K) 
                    path = join(data_dir, fname)
                    if os.path.exists(path):
                        with open(path, "rb") as fp:
                            results = pickle.load(fp)
                        # fwers = results['fwers']
                        try:
                            top_K_all = results['top_K']
                            rejection_idx = results['rejection_idx']
                            fwers_all = np.array([calc_fwer(top_K, rejection_idx=idx) for top_K, idx in zip(top_K_all, rejection_idx)])
                            relevant_idx = [len(idx)>50*0.95 for idx in rejection_idx]
                            fwers = fwers_all[relevant_idx]
                            # if len(fwers)==30:
                            if len(fwers)>=10:
                                max_fwer = np.max(fwers)
                                max_mat[i, j] = np.round(max_fwer*100, 1).item()
                            else:
                                print(f"Only ran on {len(fwers)} inputs: ", fname, guarantee, alpha)
                        except:
                            print(f"OUTDATED: ", fname, guarantee, alpha)
                    else:
                        print("File not found: ", fname, guarantee, alpha)
            print("#"*20)
            print("Method: ", method, "\tK: ", str(K), "\tAlpha: ", alpha)

            # print("Average FWER (%)")
            # print(avg_mat)
            print("Max FWER (%)")
            print(max_mat)
            # print("Proportion of FWERs below alpha")
            # print(control_mat)

rank
Only ran on 1 inputs:  rankshap_breast_cancer_K2 rank 0.1
set
####################
Method:  rankshap 	Guarantee:  set 	Alpha:  0.1
Max FWER (%)
[[2. 2. 6. 2. 4.]
 [4. 2. 2. 2. 0.]]
rank
Only ran on 1 inputs:  rankshap_breast_cancer_K2 rank 0.2
set
####################
Method:  rankshap 	Guarantee:  set 	Alpha:  0.2
Max FWER (%)
[[16.  0. 10.  2. 12.]
 [ 4.  2.  2.  2.  0.]]
rank
File not found:  rankshap_breast_cancer_K5 rank 0.1
set
####################
Method:  rankshap 	Guarantee:  set 	Alpha:  0.1
Max FWER (%)
[[16.  0. 10.  2. 12.]
 [ 2. 10.  6.  4.  4.]]
rank
File not found:  rankshap_breast_cancer_K5 rank 0.2
set
####################
Method:  rankshap 	Guarantee:  set 	Alpha:  0.2
Max FWER (%)
[[16.  0. 10.  2. 12.]
 [14. 10.  8.  8.  4.]]
rank
Only ran on 0 inputs:  sprtshap_brca_K2 rank 0.1
set
Only ran on 1 inputs:  sprtshap_brca_K2 set 0.1
####################
Method:  sprtshap 	Guarantee:  set 	Alpha:  0.1
Max FWER (%)
[[ 0.  0. nan  0.  0.]
 [14. 10.  8.  8.  4.]]
ran

## Check for completeness. Which ran correctly?

In [21]:
# # method = "rankshap"
# for method in methods:
#     for guarantee in ["rank", "set"]:
#         for alpha in [0.1, 0.2]:
#             max_mat = np.empty((len(Ks), len(datasets)))
#             data_dir = join(dir_path, "Experiments", "Results", "Top_K", guarantee, "alpha_"+str(alpha))
#             for i, K in enumerate(Ks):
#                 for j, dataset in enumerate(datasets):
#                     max_mat[i,j] = np.nan
#                     fname = method + "_" + dataset + "_K" + str(K) 
#                     path = join(data_dir, fname)
#                     if os.path.exists(path):
#                         with open(path, "rb") as fp:
#                             results = pickle.load(fp)
#                         fwers = results['fwers']
#                         if "rejection_idx" not in results.keys():
#                             print(f"Doesn't have number of samples per feature.", fname, guarantee, alpha)
#                     else:
#                         print("File not found: ", fname)


# FWERs were calculated improperly. n

In [28]:
# np.all(np.array([3,1,2])!=np.array([3,1,22]))
top_K_all = results['top_K']
fwers = [calc_fwer(top_K) for top_K in top_K_all]
fwers

[0.02,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.02,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.02,
 0.0]

## LIME Results

In [122]:
method = "lime"
guarantee = "rank"
for alpha in [0.1, 0.2]:
    avg_mat = np.empty((len(Ks), len(datasets)))
    control_mat = np.empty((len(Ks), len(datasets)))
    max_mat = np.empty((len(Ks), len(datasets)))
    data_dir = join(dir_path, "Experiments", "Results", "Top_K", guarantee, "alpha_"+str(alpha))
    for i, K in enumerate(Ks):
        for j, dataset in enumerate(datasets):
            avg_mat[i, j] = np.nan
            control_mat[i, j] = np.nan
            max_mat[i,j] = np.nan
            fname = method + "_" + dataset + "_K" + str(K) 
            path = join(data_dir, fname)
            if os.path.exists(path):
                with open(path, "rb") as fp:
                    results = pickle.load(fp)
                # fwers = results['fwers']
                top_K_all = results['top_K']
                if top_K_all.shape[0]==30:
                    rejection_idx = results['rejection_idx']
                    # fwers = [calc_fwer(top_K, rejection_idx=rejection_idx[i]) for i, top_K in enumerate(top_K_all)]
                    fwers_all = np.array([calc_fwer(top_K, rejection_idx=idx) for top_K, idx in zip(top_K_all, rejection_idx)])
                    relevant_idx = [len(idx)>50*0.95 for idx in rejection_idx]
                    fwers = fwers_all[relevant_idx]

                    avg_fwer = np.mean(fwers)
                    max_fwer = np.max(fwers)
                    max_mat[i, j] = np.round(max_fwer*100, 1)
                    avg_mat[i, j] = np.round(avg_fwer*100, 1)

    print("#"*20)
    print("Method: LIME", "\tAlpha: ", alpha)

    print("Max FWER (%)")
    print(max_mat)

    print("Avg FWER (%)")
    print(avg_mat)


####################
Method: LIME 	Alpha:  0.1
Max FWER (%)
[[ 0.  2.  2.  6. nan]
 [nan  0. 68. nan nan]]
Avg FWER (%)
[[0.  0.1 0.1 0.3 nan]
 [nan 0.  4.5 nan nan]]
####################
Method: LIME 	Alpha:  0.2
Max FWER (%)
[[ 0.  2.  2.  6. nan]
 [nan  0. 68. nan nan]]
Avg FWER (%)
[[0.  0.1 0.1 0.3 nan]
 [nan 0.  4.5 nan nan]]


In [115]:
dataset = "brca"
K = 2
fname = method + "_" + dataset + "_K" + str(K) 
path = join(data_dir, fname)
if os.path.exists(path):
    with open(path, "rb") as fp:
        results = pickle.load(fp)
    # fwers = results['fwers']
    top_K_all = results['top_K']
    rejection_idx = results['rejection_idx']
    # fwers = [calc_fwer(top_K) for top_K in top_K_all]
    fwers_all = np.array([calc_fwer(top_K, rejection_idx=idx) for top_K, idx in zip(top_K_all, rejection_idx)])
    relevant_idx = [len(idx)>50*0.95 for idx in rejection_idx]
    fwers = fwers_all[relevant_idx]

    print(fwers)
idx_of_failure = np.argmax(fwers)
x_idx_failure = results['x_indices'][idx_of_failure]
print(x_idx_failure)

[0.]
3


In [95]:
alphas = [0.1, 0.2]
arrs = []
method = "lime"
dataset = "credit"
fname = method + "_" + dataset + "_K" + str(K) 
for alpha in alphas:
    data_dir = join(dir_path, "Experiments", "Results", "Top_K", guarantee, "alpha_"+str(alpha))
    path = join(data_dir, fname)
    with open(path, "rb") as fp:
        results = pickle.load(fp)
    # fwers = results['fwers']
    top_K_all = results['top_K']
    arrs.append(top_K_all)
np.array_equal(arrs[0], arrs[1])

True