In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import sage

import sys
sys.path.append('../HelperFiles/')
from helper import *
from helper_dep import *
from helper_indep import *
from helper_shapley_sampling import *
from helper_kshap import *
from os.path import join

import warnings

# Install a blanket "ignore" filter so no warnings are shown by later cells.
warnings.filterwarnings(action='ignore')

# Relative data directory string.
# NOTE(review): not referenced by any cell visible in this notebook, and the
# path names a "bank" folder -- confirm it is still needed.
data_path = "../Simulations/Data/bank"

  from .autonotebook import tqdm as notebook_tqdm


### German Credit 

In [2]:
# Load the German Credit data and separate the target column.
df = sage.datasets.credit()
# Property, other installment, housing, job, status of checking act, credit history, purpose, savings, employment since, marital status, old debtors
n = df.shape[0]
X_df = df.drop(["Good Customer"], axis=1)
y = df["Good Customer"]

# Columns that get one-hot encoded; every other column is passed through as-is.
categorical_columns = [
    'Checking Status', 'Credit History', 'Purpose', #'Credit Amount', # It's listed but has 923 unique values
    'Savings Account/Bonds', 'Employment Since', 'Personal Status',
    'Debtors/Guarantors', 'Property Type', 'Other Installment Plans',
    'Housing Ownership', 'Job', #'Telephone', 'Foreign Worker' # These are just binary
]
# dtype=float: on pandas >= 2.0 get_dummies emits bool columns by default, and
# the min-max scaling below would then fail on boolean subtraction.
X_binarized = pd.get_dummies(X_df, columns=categorical_columns, dtype=float)

# mapping_dict[i] lists the positions in X_binarized that belong to the i-th
# original feature. Encoded features own the columns named "<feature>_<level>";
# passthrough features own the single column with their exact name. Matching on
# the bare feature name (plain startswith) would mis-assign columns whenever one
# feature name is a prefix of another.
categorical_set = set(categorical_columns)
mapping_dict = {}
for i, col in enumerate(X_df.columns):
    if col in categorical_set:
        dummy_prefix = col + "_"
        bin_cols = [j for j, bin_col in enumerate(X_binarized.columns)
                    if bin_col.startswith(dummy_prefix)]
    else:
        bin_cols = [j for j, bin_col in enumerate(X_binarized.columns)
                    if bin_col == col]
    mapping_dict[i] = bin_cols

# Every encoded column must belong to exactly one original feature.
assigned_cols = sorted(j for cols in mapping_dict.values() for j in cols)
assert assigned_cols == list(range(X_binarized.shape[1])), \
    "mapping_dict does not partition the binarized columns"

# Seed the global NumPy RNG; the split below (and any later consumer of the
# global RNG) depends on it.
np.random.seed(1)
# Min-max scale each column to [0, 1].
# NOTE(review): min/max are taken over all rows, including those that end up in
# the test split below.
X_norm = (X_binarized-X_binarized.min())/(X_binarized.max()-X_binarized.min())

# 80/20 train/test split by sampling row positions without replacement.
n_train = round(n*0.8)
train_idx = np.random.choice(n, n_train, replace=False)
X_train, y_train = X_norm.iloc[train_idx].to_numpy(), y.iloc[train_idx].to_numpy()
test_idx = np.setdiff1d(np.arange(n),train_idx)
X_test, y_test = X_norm.iloc[test_idx].to_numpy(), y.iloc[test_idx].to_numpy()
assert len(train_idx) + len(test_idx) == n
d = X_train.shape[1] # dimension of binarized data

In [3]:
# Build a (d, 2) array with one pair per binarized column:
#   * a column with exactly two distinct training values stores those two values;
#   * any other column stores its training standard deviation, repeated twice.
# The array is later handed to difference_gradient / difference_hessian.
sds = []
for col_idx in range(d):
    column = X_train[:, col_idx]
    levels = np.unique(column)
    pair = levels if len(levels) == 2 else np.repeat(np.std(column), 2)
    sds.append(pair)
sds = np.array(sds)

# First and second moments of the training data.
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)
# Reconditioned covariance, used by the dependent-feature cells.
cov2 = correct_cov(cov_mat)

### Train Random Forest Classifier

In [4]:
# Fit the forest. No random_state is passed, so scikit-learn draws from NumPy's
# global RNG; the fit is only repeatable when the cells are run in order after
# the np.random.seed call.
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train, y_train)

# Share of the majority class in the test split, in percent.
positive_share = np.mean(y_test)
majority_share = max(positive_share, 1-positive_share)
print("Class imbalance: {}".format(100*majority_share))

# Test accuracy in percent. predict() returns class labels; the > 0.5 threshold
# presumably relies on 0/1 (or boolean) labels -- verify against the dataset.
test_hits = (rf.predict(X_test) > 0.5)==y_test
print("Estimation accuracy: {}".format(np.mean(test_hits)*100))

def fmodel(x):
    """Return the second column of the forest's predict_proba output for each row of x."""
    class_probs = rf.predict_proba(x)
    return class_probs[:,1]

Class imbalance: 72.0
Estimation accuracy: 76.0


# Compute ControlSHAP values, assuming independent features
### Compute true SHAP values of the Taylor approximation around $x$, and verify $\sum_{j=1}^d \phi_j^{\text{approx}}(x) \approx f(x) - \mathbb{E}[f^{\text{approx}}(X)]$

In [5]:
# Point to explain: the first test row, sliced so it stays two-dimensional.
xloc = X_test[:1]

# Derivative estimates of the model at xloc, using the per-feature pairs in sds.
gradient = difference_gradient(fmodel, xloc, sds)
hessian = difference_hessian(fmodel, xloc, sds)

# One side of the check: the summed SHAP values of the approximation.
shap_CV_true_indep = compute_true_shap_cv_indep(
    xloc, gradient, hessian, feature_means, cov_mat,
    mapping_dict=mapping_dict,
)
sum_shap_CV_true = np.sum(shap_CV_true_indep)

# Other side: model prediction at xloc minus the approximation's mean over the
# training rows.
approx_on_train = f_second_order_approx(fmodel(xloc), X_train, xloc, gradient, hessian)
avg_CV_empirical = np.mean(approx_on_train)
pred = fmodel(xloc)
exp_CV_sum_empirical = (pred - avg_CV_empirical)[0]

# The two printed numbers should be close.
print(sum_shap_CV_true)
print(exp_CV_sum_empirical)


0.33475336209479073
0.33464059710197225


## Shapley Sampling

In [6]:
# Shapley sampling with the control variate, independent-features setting.
independent_features = True
obj_ss = cv_shapley_sampling(
    fmodel, X_train, xloc,
    independent_features,
    gradient, hessian,
    mapping_dict=mapping_dict,
    M=100,  # M is number of permutations
    n_samples_per_perm=10,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_ss

# Features ordered from largest to smallest absolute final estimate.
order = np.argsort(np.abs(final_ests))[::-1]

# Variance reductions: squared correlation (negatives clipped to 0), in percent.
clipped_corr = np.maximum(corr_ests, 0)
print(np.round(100 * (clipped_corr ** 2)[order]))


[32. 31. 74. 65. 49. 45. 29. 22. 26. 60. 28. 53. 56. 27. 34.  6.  6.  0.
 22. 60.]


## KernelSHAP

In [7]:
# KernelSHAP with the control variate, independent-features setting.
independent_features = True
obj_kshap = cv_kshap(
    fmodel, X_train, xloc,
    independent_features,
    gradient, hessian,
    mapping_dict=mapping_dict,
    var_method="ls",
    M=1000,
    n_samples_per_perm=10,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap

# Features ordered from largest to smallest absolute final estimate.
order = np.argsort(np.abs(final_ests))[::-1]

# Variance reductions: squared correlation (negatives clipped to 0), in percent.
clipped_corr = np.maximum(corr_ests, 0)
print(np.round(100 * (clipped_corr ** 2)[order]))

[45. 35. 47. 28. 32. 40. 43. 31. 27. 41. 42. 34. 31. 37. 35. 42. 36. 37.
 36. 33.]


# Compute ControlSHAP values, assuming dependent features
First, prepare for dependent sampling by precomputing matrices. Note that we use more permutations to estimate the D matrices than in the sampling runs below, so as to obtain a reliable estimate of $\phi^{\text{approx}}(x)$. While this computation is somewhat expensive, the resulting matrices can be reused for as many local points $x$ as we like.

In [16]:
# Prepare for dependent sampling
# M_linear is the count handed to make_all_lundberg_matrices; the accompanying
# notebook text describes it as the number of permutations used for the D matrices.
M_linear = 500
# Built from cov2, the reconditioned covariance returned by correct_cov.
D_matrices = make_all_lundberg_matrices(M_linear, cov2)
# Reference SHAP values of the control variate at xloc; passed to the
# dependent-feature estimators below as shap_CV_true.
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient, mapping_dict=mapping_dict)

## Shapley Sampling

In [17]:
# Shapley sampling with the control variate, dependent-features setting.
independent_features = False
obj_dep = cv_shapley_sampling(
    fmodel, X_train, xloc,
    independent_features,
    gradient,
    mapping_dict=mapping_dict,
    shap_CV_true=shap_CV_true_dep,  # Equivalently, can give D_matrices instead
    M=100,
    n_samples_per_perm=10,
    cov_mat=cov2,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_dep

# Features ordered from largest to smallest absolute final estimate.
order = np.argsort(np.abs(final_ests))[::-1]

# Variance reductions: squared correlation (negatives clipped to 0), in percent.
clipped_corr = np.maximum(corr_ests, 0)
print(np.round(100 * (clipped_corr ** 2)[order]))

[71. 74. 78. 74. 70. 69. 73. 74. 69. 82. 85. 85. 80. 79. 73. 79. 80. 80.
 80. 74.]


In [18]:
# Print three feature rankings (largest absolute value first), one per line:
# the plain model estimate, the final estimate, and the reference SHAP values
# of the control variate.
for estimates in (vshap_ests_model, final_ests, shap_CV_true_dep):
    ranking = np.argsort(np.abs(estimates))[::-1]
    print(ranking)

[ 2  1 11  8  4  3 19 14  7  5 13  9 17 12 16 10  6 15 18  0]
[ 2  1 11  0 19 12  7 13 10 14  6  4  8 16 17  9  3 18  5 15]
[ 2 11 19  6  1 13  8  3 16 14  7 17 15  4  9 10 12  0  5 18]


## KernelSHAP

In [19]:
# KernelSHAP with the control variate, dependent-features setting.
independent_features = False
obj_kshap_dep = cv_kshap(
    fmodel, X_train, xloc,
    independent_features,
    gradient,
    mapping_dict=mapping_dict,
    shap_CV_true=shap_CV_true_dep,
    M=100,
    n_samples_per_perm=10,
    var_method="ls",
    cov_mat=cov2,
)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap_dep

# Features ordered from largest to smallest absolute final estimate.
order = np.argsort(np.abs(final_ests))[::-1]

# Variance reductions: squared correlation (negatives clipped to 0), in percent.
clipped_corr = np.maximum(corr_ests, 0)
print(np.round(100 * (clipped_corr ** 2)[order]))

[16. 34. 36. 60. 36. 28. 47. 16. 28. 40. 34. 43. 67. 41. 19. 37. 55. 25.
 69. 45.]
