In [291]:
import numpy as np
import shap # The original SHAP package contains the dataset
import pandas as pd
from sklearn.linear_model import LogisticRegression
# from helper import *
# from helper_dep import *
# from helper_indep import *
# from helper_shapley_sampling import *
# from helper_kshap import *
import sys
sys.path.append('../GilesCode/')
from helper2 import *
from helper2_dep import *
from helper2_indep import *
from helper2_shapley_sampling import *
from helper4_kshap import *
import matplotlib.pyplot as plt
import sage

German Credit dataset has 1000 samples, 20 covariates; response is "good customer" (binary); ~70% of Ys are 1. On UCI ML Repo.

Problem: can't load as already categorical. The Gradient Boosting model they used allowed you to just input the (numerical) data w/ a list of the categorical indices -- we can't do that with sklrean logistic regression.
- FWIW, I think we had to convert things manually for the bank dataset. I just don't want to have to deal with that again.
- Actually, it might have been OK to begin with - I just made things more complicated than necessary. Wouldn't be the first time.

In [42]:
df = sage.datasets.credit()
df.head()


KeyboardInterrupt: 

In [228]:
# Property, other installment, housing, job, status of checking act, credit history, purpose, savings, employment since, marital status, old debtors
n = df.shape[0]
X_df = df.drop(["Good Customer"], axis=1)
y = df["Good Customer"]

categorical_columns = [
    'Checking Status', 'Credit History', 'Purpose', #'Credit Amount', # It's listed but has 923 unique values
    'Savings Account/Bonds', 'Employment Since', 'Personal Status',
    'Debtors/Guarantors', 'Property Type', 'Other Installment Plans',
    'Housing Ownership', 'Job', #'Telephone', 'Foreign Worker' # These are just binary
]
# feature_names = df.columns.tolist()[:-1]
# categorical_inds = [feature_names.index(col) for col in categorical_columns]
# print([pd.unique(df[colname]).shape[0] for colname in categorical_columns])
X_binarized = pd.get_dummies(X_df, columns=categorical_columns)
d_bin = X_binarized.shape[1]
X_binarized.head()

Unnamed: 0,Duration,Credit Amount,Installment Rate,Residence Duration,Age,Number Existing Credits,Number Liable,Telephone,Foreign Worker,Checking Status_1,...,Other Installment Plans_1,Other Installment Plans_2,Other Installment Plans_3,Housing Ownership_1,Housing Ownership_2,Housing Ownership_3,Job_1,Job_2,Job_3,Job_4
0,18,1049,4,4,21,1,2,1,2,1,...,0,0,1,1,0,0,0,0,1,0
1,9,2799,2,2,36,2,1,1,2,1,...,0,0,1,1,0,0,0,0,1,0
2,12,841,2,4,23,1,2,1,2,0,...,0,0,1,1,0,0,0,1,0,0
3,12,2122,3,2,39,2,1,1,1,1,...,0,0,1,1,0,0,0,1,0,0
4,12,2171,4,4,38,2,2,1,1,1,...,1,0,0,0,1,0,0,1,0,0


In [229]:
mapping_dict = {}
for i, col in enumerate(X_df.columns):
    bin_cols = []
    for j, bin_col in enumerate(X_binarized.columns):
        if bin_col.startswith(col):
            bin_cols.append(j)
    mapping_dict[i] = bin_cols
mapping_dict

{0: [9, 10, 11, 12],
 1: [0],
 2: [13, 14, 15, 16, 17],
 3: [18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
 4: [1],
 5: [28, 29, 30, 31, 32],
 6: [33, 34, 35, 36, 37],
 7: [2],
 8: [38, 39, 40, 41],
 9: [42, 43, 44],
 10: [3],
 11: [45, 46, 47, 48],
 12: [4],
 13: [49, 50, 51],
 14: [52, 53, 54],
 15: [5],
 16: [55, 56, 57, 58],
 17: [6],
 18: [7],
 19: [8]}

In [230]:
np.random.seed(1)
X_norm = (X_binarized-X_binarized.min())/(X_binarized.max()-X_binarized.min())
n_train = round(n*0.8)
train_idx = np.random.choice(n, n_train, replace=False)
X_train, y_train = X_norm.iloc[train_idx].to_numpy(), y.iloc[train_idx].to_numpy()
test_idx = np.setdiff1d(np.arange(n),train_idx)
X_test, y_test = X_norm.iloc[test_idx].to_numpy(), y.iloc[test_idx].to_numpy()

Fit logistic regression model

In [231]:
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
print("Logistic regression accuracy: {}".format(np.mean(logreg.predict(X_test)==y_test)*100))
print("Class imbalance: {}".format(100*np.mean(y_train))) # 70% --> 74%. not amazing... but could be worse

BETA = logreg.coef_.reshape(d_bin)
INTERCEPT = logreg.intercept_

def model(x):
    yhat = sigmoid(np.dot(x, BETA) + INTERCEPT)
    return yhat.item() if x.shape[0]==1 else yhat


Logistic regression accuracy: 74.0
Class imbalance: 69.5


Recondition covariance

In [232]:
np.random.seed(1)
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)
# Recondition covariance
u, s, vh = np.linalg.svd(cov_mat, full_matrices=True)
K = 10000
if np.max(s)/np.min(s) < K:
    cov2 = cov_mat
else:
    s_max = s[0]
    min_acceptable = s_max/K
    s2 = np.copy(s)
    s2[s <= min_acceptable] = min_acceptable
    cov2 = np.matmul(u, np.matmul(np.diag(s2), vh))

# Prepare for dependent sampling
M_linear = 1000 # 20 seconds/1000 perms
D_matrices = make_all_lundberg_matrices(M_linear, cov2)

Choose test point and compute gradient and hessian

In [248]:
xloc = X_test[20:21]
print(logreg.predict_proba(xloc))
print(model(xloc)) # Yes, our function matches sklearn
print(y_test[0]) # Correct classification

gradient = logreg_gradient(model, xloc, BETA)
hessian = logreg_hessian(model, xloc, BETA)

[[0.05712682 0.94287318]]
0.9428731769700575
1


# Compute SHAP values, assuming independent features
#### Sanity check: Verify true SHAP values of the quadratic approximation add up to $f(x)-Ef(X)$
#### They're close (but not perfect...)

In [249]:
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)

avg_CV_empirical = np.mean(f_second_order_approx(model(xloc),X_train, xloc, gradient, hessian))
pred = model(xloc)
exp_CV_sum_empirical = pred - avg_CV_empirical
shap_CV_true_indep = compute_true_shap_cv_indep(xloc, gradient, hessian, feature_means, cov2, mapping_dict=mapping_dict)
sum_shap_CV_true = np.sum(shap_CV_true_indep)
print(sum_shap_CV_true)
print(exp_CV_sum_empirical)

0.2043844422880463
0.2043137720490672


## Shapley Sampling
### Amazing! 95+% Variance reductions
- Weird that we get a correlation of slightly over 1.

In [292]:
np.random.seed(13)
independent_features = True
obj_ss = cv_shapley_sampling(model, X_train, xloc, 
                        independent_features,
                        gradient, hessian,
                        mapping_dict=mapping_dict,
                        M=100, n_samples_per_perm=10) # M is number of permutations
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_ss

order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(corr_ests[order], 4)) # Correlations
print(np.round(100*(corr_ests**2)[order])) # Variance reductions


[0.9967 0.9843 0.9811 0.9881 0.9937 0.9737 0.9842 0.9844 0.9972 0.9793
 0.9885 0.9948 0.9773 0.9812 1.0041 0.9766 0.9936 0.9933 0.9726 0.9771]
[ 99.  97.  96.  98.  99.  95.  97.  97.  99.  96.  98.  99.  96.  96.
 101.  95.  99.  99.  95.  95.]


In [293]:
print(np.argsort(np.abs(shap_CV_true_indep))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(vshap_ests_CV))[::-1])

[ 0  5  7 11  1  3  8 18 12  2 13 15  6 10  9  4 19 16 14 17]
[ 0  5  7 11  1  3  8 18  2 12 13 15  6 10  9  4 19 16 14 17]
[ 0  5  7 11  1  3  8  2 18 12 13  6 15 10  9 19 16  4 14 17]
[ 0  5  7 11  1  3  8  2 18 12 13  6 15 10  9 19 16 14 17  4]


## KernelSHAP
### BAD: CV-adjusted KernelSHAP values are nonsensical
### The correlations close to 1 must somehow be throwing the (co)variance off. 
### However, using many bootstrapped samples or samples per perm (for WLS) doesn't fix.
Holds for all local x.
The features that blow up are the ones with correlations closest to 1.
When corr(A, B)=1, cov(A, B) = sA*sB*corr(A, B) =ish sA sB; cov(A, B)/var(B) = sA/sB
- Small standard deviation of the quadratic model --> blows up
- Not sure what the pattern is here 

0.95*corr(A,B) = 0.95*(Cov(A,B)/(sqrt(VarA)*sqrt(VarA))) = Cov(A,B)/(sqrt(VarA)*(sqrt(VarB)/0.95))

In [294]:
np.random.seed(1)
independent_features = True
obj_kshap = cv_kshap(model, X_train, xloc, 
            independent_features,
            gradient, hessian,
            mapping_dict=mapping_dict,
            M=1000, n_samples_per_perm=10, 
            var_method='wls') # n_boot=1000
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap
order = np.argsort(np.abs(final_ests))[::-1]
# print(np.round(final_ests[order], 3)) # Final SHAP estimates, ordered
print(np.round(corr_ests[order], 4)) # Correlations
print(np.round(100*(corr_ests**2)[order])) # Variance reductions

[0.9954 0.9937 0.9915 0.99   0.99   0.993  0.9902 0.9971 0.9936 0.9885
 0.9891 0.997  0.9905 0.9962 0.9937 0.9886 0.984  0.9888 0.9873 0.9868]
[99. 99. 98. 98. 98. 99. 98. 99. 99. 98. 98. 99. 98. 99. 99. 98. 97. 98.
 97. 97.]


In [295]:
print(np.argsort(np.abs(shap_CV_true_indep))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(vshap_ests_CV))[::-1])

[ 0  5  7 11  1  3  8 18 12  2 13 15  6 10  9  4 19 16 14 17]
[12  0  2  9 17  4  7 18 10 14 19  5  3 13 11  1  8  6 15 16]
[ 0  5  7  1 11  3  8  2 12 18 13  9 15 16 19 10  6 14 17  4]
[ 0  5  7  1 11  3  8 12  2 18 13  9 15 19 10  6 16 17  4 14]


In [290]:
print(np.round(np.sort(np.abs(vshap_ests_model))[::-1],3))
print(np.round(np.sort(np.abs(final_ests))[::-1],3)) 

print(np.round(np.sort(np.abs(shap_CV_true_indep))[::-1],3))
print(np.round(np.sort(np.abs(vshap_ests_CV))[::-1],3)) 
# gradient[mapping_dict[15]]

[0.11  0.068 0.044 0.041 0.038 0.033 0.024 0.021 0.019 0.016 0.015 0.014
 0.01  0.008 0.007 0.006 0.005 0.003 0.003 0.001]
[0.746 0.334 0.317 0.293 0.217 0.206 0.197 0.195 0.143 0.123 0.116 0.104
 0.094 0.057 0.055 0.04  0.032 0.025 0.012 0.006]
[0.091 0.061 0.032 0.03  0.029 0.019 0.018 0.017 0.016 0.016 0.009 0.009
 0.008 0.005 0.003 0.003 0.002 0.002 0.002 0.001]
[0.091 0.059 0.038 0.037 0.033 0.027 0.022 0.017 0.016 0.015 0.014 0.011
 0.008 0.006 0.005 0.005 0.004 0.002 0.001 0.   ]


# Correlated Features

## Shapley Sampling
#### Looks great. Variance reductions around 90%.

In [196]:
np.random.seed(1)
independent_features = False
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient, mapping_dict=mapping_dict)
obj_dep = cv_shapley_sampling(model, X_train, xloc,
                    independent_features,
                    gradient,
                    shap_CV_true=shap_CV_true_dep, # Equivalently, can give D_matrices instead
                    M=100,n_samples_per_perm=10,
                    mapping_dict=mapping_dict,
                    cov_mat=cov2)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(final_ests[order]) # Final SHAP estimates, ordered
print(np.round(corr_ests[order], 2)) # Correlations
print(np.round(100*(corr_ests**2)[order])) # Variance reductions



[ 0.18033836  0.10851251  0.07696052  0.05581577 -0.0313031   0.02713382
  0.02296224  0.0226137   0.02195412  0.01540157  0.01336367  0.01297492
 -0.01225083  0.00664529  0.00626478  0.00592896  0.00554915  0.00325047
  0.00240749  0.00071319]
[0.96 0.95 0.95 0.95 0.96 0.92 0.94 0.97 0.96 0.94 0.95 0.94 0.96 0.96
 0.95 0.95 0.96 0.97 0.94 0.93]
[92. 90. 91. 90. 93. 85. 88. 95. 91. 88. 91. 88. 92. 92. 90. 90. 92. 94.
 88. 87.]


In [197]:
print(np.argsort(np.abs(shap_CV_true_dep))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(vshap_ests_CV))[::-1])

[ 2  0  5 12  1  4 18  8  3  7 14 13 11 16  6 10 17 19 15  9]
[ 2  0  5 12  1  4  8  3 18 14 13 16  7  9 10 17 19  6 11 15]
[ 0  5 12  3  6  8  4 16 17 10 14 18 19 11 13  9 15  2  1  7]
[ 0 12  5  3  8  6  4 16 17 14 10 19 13 18 15 11  9  7  1  2]


## KernelSHAP
#### Now nothing blows up, thankfully. Around 90% variance reduction.

In [254]:
%run helper_dep
np.random.seed(1)
independent_features = False
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient, mapping_dict=mapping_dict)
obj_kshap_dep = cv_kshap(model, X_train, xloc,
                    independent_features,
                    gradient,
                    shap_CV_true=shap_CV_true_dep,
                    M=1000,n_samples_per_perm=10,
                    mapping_dict=mapping_dict,
                    cov_mat=cov2)

final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(final_ests[order]) # Final SHAP estimates, ordered
print(np.round(corr_ests[order], 2)) # Correlations
print(np.round(100*(corr_ests**2)[order])) # Variance reductions



covariance is not positive-semidefinite.


[-3.47256830e-01  3.18186521e-01  1.13698761e-01 -7.91640923e-02
  7.48499196e-02  3.57076996e-02  2.68944096e-02 -2.25914226e-02
  2.04085562e-02  1.97896558e-02  1.75542718e-02  1.53139486e-02
  1.39719736e-02 -1.23491749e-02  4.76000784e-03 -2.77565483e-03
  2.40854014e-03  1.04371454e-03  6.25556805e-04 -2.10860440e-04]
[0.94 0.92 0.9  0.97 0.84 0.92 0.91 0.97 0.94 0.97 0.92 0.93 0.97 0.95
 0.96 0.97 0.9  0.94 0.9  0.94]
[88. 85. 82. 93. 70. 86. 82. 95. 88. 93. 84. 86. 94. 90. 92. 94. 81. 89.
 81. 89.]


In [203]:
print(np.argsort(np.abs(shap_CV_true_dep))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(vshap_ests_CV))[::-1])

[ 2  0  5 12  1  4 18  8  3  7 14 13 11 16  6 10 17 19 15  9]
[ 2  0  5 12  1  3  8 14  7 18  4 13  6 16 11  9 10 19 15 17]
[ 0  5 12  1  3  4 11 13  7 14  2  9 10  6 17 18  8 19 16 15]
[ 0  5 12  1  2  4 11  3 13  7 18 14 10 17  8  9 16 19 15  6]
