In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('../HelperFiles/')
from helper import *
from helper_dep import *
from helper_indep import *
from helper_shapley_sampling import *
from helper_kshap import *
from os.path import join

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including any emitted while fitting the model; narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Directory holding the preprocessed Bank dataset files read in the next cell.
data_path = "../Simulations/Data/bank"

# The estimators below are Monte Carlo procedures; seed NumPy's global RNG so
# that Restart Kernel -> Run All reproduces the same numbers.
# NOTE(review): this only has an effect if the helper samplers draw from
# np.random's global state -- confirm against the helper modules.
SEED = 0
np.random.seed(SEED)

### Load Bank dataset

In [2]:

# Tabular companions of the design matrices (used below to build mapping_dict).
df_orig = pd.read_csv(join(data_path, "df_orig.csv"))
X_df = pd.read_csv(join(data_path, "X_df.csv"))

# Pre-split feature matrices and labels stored as .npy arrays.
X_train_raw, X_test_raw, y_train, y_test = (
    np.load(join(data_path, fname))
    for fname in ("X_train.npy", "X_test.npy", "Y_train.npy", "Y_test.npy")
)
full_dim = X_train_raw.shape[1]  # dimension including all binarized categorical columns

# Column-wise mean and standard deviation of the raw training matrix;
# these are the statistics used to standardize both splits.
trainmean = X_train_raw.mean(axis=0)
trainstd = X_train_raw.std(axis=0)
def rescale(x, trainmean, trainstd):
    """Standardize ``x`` with precomputed training statistics.

    Parameters
    ----------
    x : array-like
        Values to standardize; must broadcast against ``trainmean``/``trainstd``.
    trainmean : array-like or scalar
        Per-feature mean subtracted from ``x``.
    trainstd : array-like or scalar
        Per-feature standard deviation ``x`` is divided by.

    Returns
    -------
    ``(x - trainmean) / trainstd``, except that features whose ``trainstd`` is
    exactly zero are divided by 1 instead, so a constant training column maps
    to finite values rather than NaN/inf.
    """
    # A zero standard deviation means the feature was constant in training;
    # dividing by it would yield 0/0 = NaN (or +/-inf on unseen values).
    safe_std = np.where(np.asarray(trainstd) == 0, 1.0, trainstd)
    return (x - trainmean) / safe_std
# Standardize both splits with the *training* statistics so the test split
# is transformed exactly like the data the model is fit on.
X_train = rescale(X_train_raw, trainmean, trainstd)
X_test = rescale(X_test_raw, trainmean, trainstd)

# Replace spaces in df_orig's column names with underscores so they match
# the names listed in categorical_cols below.
df_orig.columns = df_orig.columns.str.replace(' ', '_')
categorical_cols = ['Job', 'Marital', 'Education', 'Default', 'Housing',
                    'Loan', 'Contact', 'Month', 'Prev_Outcome']
# NOTE(review): presumably maps each original categorical column to its
# binarized columns in X_train_raw -- confirm against helper.get_mapping_dict.
mapping_dict = get_mapping_dict(df_orig, X_df, X_train_raw, categorical_cols)

d = X_df.shape[1]  # number of columns in X_df

# Compute mean and covariance of the standardized training data (done once;
# an identical earlier computation was redundant and has been removed).
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)
cov2 = correct_cov(cov_mat) # Recondition


In [3]:
# Fit a logistic regression (default settings) on the standardized training data.
logreg = LogisticRegression().fit(X_train, y_train)

# Share of the larger class in the test labels, as a baseline for the accuracy
# printed next. Taking the mean of y_test as a class rate assumes 0/1 labels.
print("Class imbalance: {}".format(100*(max(np.mean(y_test), 1-np.mean(y_test)))))

# predict() already returns class labels, so compare them with y_test directly;
# thresholding the labels at 0.5 was only correct when the labels are {0, 1}.
print("Estimation accuracy: {}".format(np.mean(logreg.predict(X_test)==y_test)*100))

def fmodel(x):
    """Model output explained by the notebook: a 1-D array of probabilities.

    Evaluates ``logreg.predict_proba`` on ``x`` and keeps only column 1,
    i.e. the probability assigned to the class at ``logreg.classes_[1]``.
    """
    class_probs = logreg.predict_proba(x)
    return class_probs[:, 1]

Class imbalance: 88.29904888299049
Estimation accuracy: 90.22340190223403


# Compute CV-SHAP values, assuming independent features
### Compute true SHAP values of the second-order Taylor approximation $\tilde f$ around $x$, and verify $\sum_{j=1}^d \phi_j^\text{approx}(x) \approx f(x)-\mathbb{E}[\tilde f(X)]$

In [4]:
# Point to explain: the first test observation, sliced so it stays 2-D.
xloc = X_test[:1]

# Flattened coefficient vector of the fitted model, then the gradient and
# hessian of fmodel at xloc as computed by the project helpers.
BETA = np.ravel(logreg.coef_)
gradient = logreg_gradient(fmodel, xloc, BETA)
hessian = logreg_hessian(fmodel, xloc, BETA)

# Closed-form SHAP values of the Taylor approximation (independent features).
shap_CV_true_indep = compute_true_shap_cv_indep(
    xloc, gradient, hessian, feature_means, cov_mat,
    mapping_dict=mapping_dict,
)
sum_shap_CV_true = np.sum(shap_CV_true_indep)

# Feasibility check: the SHAP values should sum to the prediction at xloc
# minus the approximation's average over the training data.
pred = fmodel(xloc)[0]
avg_CV_empirical = np.mean(
    f_second_order_approx(fmodel(xloc), X_train, xloc, gradient, hessian)
)
exp_CV_sum_empirical = pred - avg_CV_empirical

# The two printed numbers are expected to agree closely.
print(sum_shap_CV_true)
print(exp_CV_sum_empirical)


-0.06352270605618326
-0.06352134275952509


## Shapley Sampling

In [5]:
# Control-variate Shapley sampling under the independent-features assumption.
independent_features = True
obj_ss = cv_shapley_sampling(
    fmodel, X_train, xloc,
    independent_features,
    gradient, hessian,
    mapping_dict=mapping_dict,
    M=100,  # M is number of permutations
    n_samples_per_perm=10,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_ss

# Order features from largest to smallest |final estimate|, then report
# 100 * max(corr, 0)^2 per feature in that order.
order = np.flip(np.argsort(np.abs(final_ests)))
print(np.round(100 * np.maximum(corr_ests[order], 0) ** 2))  # Variance reductions


[95. 97. 91. 84. 90. 90. 92. 91. 75. 41. 97. 68. 90. 92. 96. 93.]


## KernelSHAP

In [6]:
# Control-variate KernelSHAP under the independent-features assumption.
independent_features = True
obj_kshap = cv_kshap(
    fmodel, X_train, xloc,
    independent_features,
    gradient, hessian,
    mapping_dict=mapping_dict,
    var_method="ls",
    M=1000,
    n_samples_per_perm=10,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap

# Order features from largest to smallest |final estimate|, then report
# 100 * max(corr, 0)^2 per feature in that order.
order = np.flip(np.argsort(np.abs(final_ests)))
print(np.round(100 * np.maximum(corr_ests[order], 0) ** 2))  # Variance reductions

[94. 94. 90. 62. 92. 94. 90. 75. 95. 92. 92. 50. 58. 75. 93. 63.]


# Compute CV-SHAP values, assuming dependent features
First, prepare for dependent sampling by precomputing matrices. Note that we use more permutations to estimate the D matrices, so as to obtain a reliable estimate of $\phi^\text{approx}(x)$. While this computation is somewhat expensive, we can reuse the resulting matrices for as many local points $x$ as we like. 

In [7]:
# Number of permutations used to estimate the D matrices (deliberately larger
# than the M=100 used by the dependent-feature estimators below).
M_linear = 500
# Built from cov2, the reconditioned training covariance. The result does not
# depend on xloc, so it can be reused for any local point.
D_matrices = make_all_lundberg_matrices(M_linear, cov2)

## Shapley Sampling

In [8]:
# Control-variate Shapley sampling with dependent features.
independent_features = False

# SHAP values of the linear approximation at xloc, from the precomputed D matrices.
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient)

obj_dep = cv_shapley_sampling(
    fmodel, X_train, xloc,
    independent_features,
    gradient,
    mapping_dict=mapping_dict,
    shap_CV_true=shap_CV_true_dep,  # Equivalently, can give D_matrices instead
    M=100,
    n_samples_per_perm=10,
    cov_mat=cov2,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_dep

# Order features from largest to smallest |final estimate|, then report
# 100 * max(corr, 0)^2 per feature in that order.
order = np.flip(np.argsort(np.abs(final_ests)))
print(np.round(100 * np.maximum(corr_ests[order], 0) ** 2))  # Variance reductions

[41. 73. 71. 56. 59. 69. 59. 56. 43. 52. 52. 56. 48. 50. 50. 50.]


## KernelSHAP

In [9]:
# Control-variate KernelSHAP with dependent features; reuses the
# shap_CV_true_dep values computed in the Shapley-sampling cell.
independent_features = False
obj_kshap_dep = cv_kshap(
    fmodel, X_train, xloc,
    independent_features,
    gradient,
    mapping_dict=mapping_dict,
    shap_CV_true=shap_CV_true_dep,
    M=100,
    n_samples_per_perm=10,
    var_method="ls",
    cov_mat=cov2,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap_dep

# Order features from largest to smallest |final estimate|, then report
# 100 * max(corr, 0)^2 per feature in that order.
order = np.flip(np.argsort(np.abs(final_ests)))
print(np.round(100 * np.maximum(corr_ests[order], 0) ** 2))  # Variance reductions

[63. 55. 84. 70. 73. 71. 74. 69. 73. 79. 70. 63. 71. 82. 74. 82.]
