In [103]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sys
sys.path.append("../GilesCode")
from helper import *
from helper2_dep import *
from helper2_indep import *
from helper2_shapley_sampling import *
from helper4_kshap import *
import matplotlib.pyplot as plt
from scipy import stats

from os.path import join
import warnings 

warnings.filterwarnings('ignore')


In [104]:
# Load data
np.random.seed(1)
data = pd.read_csv('../Data/brca_small.csv')
X = data.values[:, :-1][:,:20]
Y = data.values[:, -1]

# Split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=100, random_state=0)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=100, random_state=1)

# Normalize
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
X_val = (X_val - mean) / std
X_test = (X_test - mean) / std


In [105]:

def fit_model(x, y, x_val, y_val):
    # Cross validate for C
    C_list = np.arange(0.1, 1.0, 0.05)
    best_loss = np.inf
    best_C = None

    for C in C_list:
        # Fit model
        model = LogisticRegression(C=C, penalty='l1', multi_class='multinomial',
                                   solver='saga', max_iter=20000)
        model.fit(x, y)

        # Calculate loss
        val_loss = log_loss(y_val, model.predict_proba(x_val))

        # See if best
        if val_loss < best_loss:
            best_loss = val_loss
            best_C = C
            
    # Train model with all data
    model = LogisticRegression(C=best_C, penalty='l1', multi_class='multinomial',
                               solver='saga', max_iter=10000)
    model.fit(np.concatenate((x, x_val), axis=0),
              np.concatenate((y, y_val), axis=0))
    
    return model

# Train model
model = fit_model(X_train, Y_train, X_val, Y_val)

# OK, now let's get a gradient and hessian

d = X_train.shape[1]

BETA = model.coef_
A = model.intercept_.reshape(4,1)


def modelf(x):
    yhat = np.exp(A+np.dot(BETA,x.T))
    #return yhat.item() if x.shape[0]==1 else yhat
    return yhat[1]/np.sum(yhat)


def modelg(x):
    yhat = np.exp(A+np.dot(BETA,x.T))
    yhat = yhat/np.sum(yhat)
    
    return BETA[1]*yhat[1] - yhat[1]*np.dot(yhat.T,BETA)
    
def modelH(x):
    yhat = np.exp(A+np.dot(BETA,x.T))
    yhat = yhat/np.sum(yhat)
    
    return yhat[1]*(np.outer(BETA[1],BETA[1].T) -
        np.dot(np.dot(BETA.T,np.diagflat(yhat)),BETA) +
        np.outer(np.dot(yhat.T,BETA),np.dot(yhat.T,BETA)))

xloc = X_train[1].reshape(1,-1)
modelf(xloc)
gradient = modelg(xloc).T
hessian = modelH(xloc) # Not H?

In [106]:
independent_features = True
obj_ss = cv_shapley_sampling(modelf, X_train, xloc, 
                        independent_features,
                        gradient, hessian,
                        M=100, n_samples_per_perm=10) # M is number of permutations
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_ss

order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(np.maximum(corr_ests, 0)**2**2)[order])) # Variance reductions


[55. 22. 41. 43.  0. 28. 57. 37.  7. 53. 62. 25.  0.  0. 71. 31.  0.  0.
 14.  0.]


In [107]:
independent_features = True
obj_kshap = cv_kshap(modelf, X_train, xloc, 
            independent_features,
            gradient, hessian,
            var_method="wls",
            M=1000, n_samples_per_perm=10)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap

order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(corr_ests**2)[order])) # Variance reductions

[41. 64. 54. 65. 62. 50. 62. 60. 66. 67. 36. 64. 68. 63. 38. 45. 56. 54.
 64. 68.]


Dependent features

In [108]:
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)

# Now to set up D matrices
u, s, vh = np.linalg.svd(cov_mat, full_matrices=True)
K = 10000
s_max = s[0]
min_acceptable = s_max/K
s2 = np.copy(s)
s2[s <= min_acceptable] = min_acceptable
cov2 = np.matmul(u, np.matmul(np.diag(s2), vh))

M_linear = 1000 # 10 seconds/1000 perms or so
D_matrices = make_all_lundberg_matrices(M_linear, cov2)

In [109]:
independent_features = False
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient)
obj_dep = cv_shapley_sampling(modelf, X_train, xloc,
                    independent_features,
                    gradient,
                    shap_CV_true=shap_CV_true_dep, # Equivalently, can give D_matrices instead
                    M=50,n_samples_per_perm=1,
                    cov_mat=cov2)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(corr_ests**2)[order])) # Variance reductions

[42. 32. 27. 14. 35. 15. 54. 54. 30. 54. 43. 25. 51. 43. 33. 43. 48. 48.
  7. 37.]


In [110]:
np.random.seed(2)
independent_features = False
obj_kshap_dep = cv_kshap(modelf, X_train, xloc,
                    independent_features,
                    gradient,
                    shap_CV_true=shap_CV_true_dep,
                    M=1000,n_samples_per_perm=10, var_method="wls",
                    cov_mat=cov2)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(corr_ests**2)[order])) # Variance reductions

[37. 42. 34. 42. 33. 30. 44. 36. 48. 32. 40. 28. 34. 41. 34. 36. 37. 33.
 36. 34.]
