In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append('../HelperFiles/')
from helper import *
from helper_dep import *
from helper_indep import *
from helper_shapley_sampling import *
from helper_kshap import *
from os.path import join
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
data_path = "../Simulations/Data"

  from .autonotebook import tqdm as notebook_tqdm


### Load BRCA dataset

In [2]:
# Load data
np.random.seed(1)
data = pd.read_csv(join(data_path, "brca_small.csv"))
X = data.values[:, :-1][:,:20]
Y = data.values[:, -1]
Y = (Y==2).astype(int) # Formulate as binary classification problem

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=100, random_state=0)

# Normalize
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std
d = X_train.shape[1]
mapping_dict = None

# Compute mean and covariance of training data
feature_means = np.mean(X_train, axis=0)
cov_mat = np.cov(X_train, rowvar=False)
cov2 = correct_cov(cov_mat) # Recondition

### Define and train neural network

In [3]:
class TwoLayerNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.tanh = nn.Tanh()
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.linear1(x)
        out = self.tanh(out)
        out = self.linear2(out)
        out = self.softmax(out)
        return out

# Convert the input and label data to PyTorch tensors
inputs = torch.tensor(X_train, dtype=torch.float32)
labels = torch.tensor(y_train, dtype=torch.long)

# Compute the class weights
class_counts = torch.bincount(labels)
num_samples = len(labels)
class_weights = 1.0 / class_counts.float()
sample_weights = class_weights[labels]

# Create a sampler with balanced weights
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=num_samples, replacement=True)

# Create a DataLoader with the sampler
dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)


torch.manual_seed(0)

# Create an instance
net = TwoLayerNet(input_size=d, hidden_size=50, output_size=2)
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()#weight=torch.tensor(weights)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)#.01

# Iterate over the training data in batches
num_epochs = 5

# Train the network for the specified number of epochs
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        # Zero the gradients for this batch
        optimizer.zero_grad()

        # Compute the forward pass of the network
        outputs = net(inputs)

        # Compute the loss for this batch
        loss = criterion(outputs, labels)

        # Compute the gradients of the loss with respect to the parameters
        loss.backward()

        # Update the parameters using the optimizer
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

Epoch 1/5, Loss: 0.6165
Epoch 2/5, Loss: 0.6549
Epoch 3/5, Loss: 0.5868
Epoch 4/5, Loss: 0.5764
Epoch 5/5, Loss: 0.5998


#### Move to numpy & evaluate predictive accuracy

In [4]:
def neural_net(x):
    output = net(x)[0,1] if x.shape[0]==1 else net(x)[:,1]
    return output

def compute_hessian(x):
    if not torch.is_tensor(x):
        x = torch.tensor(x, dtype=torch.float32)
    hessian = torch.autograd.functional.hessian(neural_net, x)
    hessian = hessian.reshape((d,d)).detach().numpy()
    return hessian

def fmodel(x):
    if not torch.is_tensor(x):
        x = torch.tensor(x, dtype=torch.float32)
    return neural_net(x).detach().numpy()


print("Class imbalance: {}%".format(round(100*(max(np.mean(y_test), 1-np.mean(y_test))))))
Y_preds = (fmodel(X_test) > 0.5).astype("int")
print("NN with balanced sampling: {}% accuracy".format(round(np.mean(Y_preds == y_test)*100)))



Class imbalance: 55%
NN with balanced sampling: 73% accuracy


# Compute CV-SHAP values, assuming independent features
### Compute true SHAP values of Taylor approximation around $x$, and verify $\sum_{j=1}^d \phi_j(x) \approx f(x)-Ef(X)$

In [5]:
# Compute gradient and hessian around local x
xloc = X_test[0:1]
xloc_torch = torch.tensor(xloc, dtype=torch.float32).requires_grad_(True)
y_pred = net(xloc_torch)[0,1]
y_pred.backward()
gradient = xloc_torch.grad.detach().numpy().reshape((d, 1))
hessian = compute_hessian(xloc)

In [6]:
# Obtain true SHAP values and verify their feasibility
shap_CV_true_indep = compute_true_shap_cv_indep(xloc, gradient, hessian, feature_means, cov_mat, mapping_dict=mapping_dict)
sum_shap_CV_true = np.sum(shap_CV_true_indep)
avg_CV_empirical = np.mean(f_second_order_approx(fmodel(xloc),X_train, xloc, gradient, hessian))
pred = fmodel(xloc)#[0]
exp_CV_sum_empirical = pred - avg_CV_empirical
print(sum_shap_CV_true)
print(exp_CV_sum_empirical)


-0.34031945
-0.34014217331407215


## Shapley Sampling

In [7]:
independent_features = True
obj_ss = cv_shapley_sampling(fmodel, X_train, xloc, 
                        independent_features,
                        gradient, hessian,
                        mapping_dict=mapping_dict,
                        M=100, n_samples_per_perm=10) # M is number of permutations
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_ss

order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(np.maximum(corr_ests, 0)**2)[order])) # Variance reductions


[75. 79. 86. 73. 80. 71. 74. 89. 83. 80. 66. 75. 38. 91. 47. 94. 86.  0.
 56. 53.]


Compare predicted rankings. ControlSHAP estimates are slightly closer to those of the control variate.

In [8]:
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(shap_CV_true_indep).reshape(-1))[::-1])

[14  9  6  5 18  8 19  4 11 12  2 15 13  0 17  1  7 16 10  3]
[14  9  6  5  8 18 19  4 11 12  2 15 13  0  1 17  7 16 10  3]
[14  9  5 19  6 18  8  4 13 12  2 11  0 15 16  7  1 17 10  3]


## KernelSHAP

In [9]:
independent_features = True
obj_kshap = cv_kshap(fmodel, X_train, xloc, 
            independent_features,
            gradient, hessian,
            mapping_dict=mapping_dict,var_method="ls",
            M=1000, n_samples_per_perm=10)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap

order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(np.maximum(corr_ests,0)**2)[order])) # Variance reductions

[90. 83. 86. 90. 90. 84. 88. 72. 77. 87. 84. 84. 81. 90. 84. 87. 83. 86.
 78. 88.]


# Compute CV-SHAP values, assuming dependent features
First, prepare for dependent sampling by precomputing matrices. Note that we use more permutations to estimate the D matrices, so as to obtain a reliable estimate of $\phi^\text{approx}(x)$. While this computation is somewhat expensive, we can reuse the resulting matrices for as many local points $x$ as we like. 

In [10]:
M_linear = 5000
D_matrices = make_all_lundberg_matrices(M_linear, cov2)

## Shapley Sampling

In [11]:
independent_features = False
shap_CV_true_dep = linear_shap_vals(xloc, D_matrices, feature_means, gradient, mapping_dict=mapping_dict)
obj_dep = cv_shapley_sampling(fmodel, X_train, xloc,
                    independent_features,
                    gradient,
                    mapping_dict=mapping_dict,
                    shap_CV_true=shap_CV_true_dep, # Equivalently, can give D_matrices instead
                    M=100,n_samples_per_perm=10,
                    cov_mat=cov2)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(np.maximum(corr_ests,0)**2)[order])) # Variance reductions

[85. 83. 64. 85. 85. 85. 85. 84. 89. 93. 79. 86. 83. 87. 87. 86. 88. 89.
 86. 92.]


In [12]:
print(np.argsort(np.abs(vshap_ests_model))[::-1])
print(np.argsort(np.abs(final_ests))[::-1])
print(np.argsort(np.abs(shap_CV_true_dep))[::-1])

[14 19 18  4  8  9  5 11  6 12 17 10  2  3 13  1 15  7  0 16]
[14 19 18  4  8  9  5 11  6 12 17  2 15  0 10  3  7  1 13 16]
[19 14 18  9  4  8  5 12 11 13 17  6 16 15  1  2 10  0  3  7]


## KernelSHAP

In [14]:
independent_features = False
obj_kshap_dep = cv_kshap(fmodel, X_train, xloc,
                    independent_features,
                    gradient,
                    mapping_dict=mapping_dict,
                    shap_CV_true=shap_CV_true_dep,
                    M=500,n_samples_per_perm=10, var_method="ls",
                    cov_mat=cov2)
final_ests, vshap_ests_model, vshap_ests_CV, corr_ests = obj_kshap_dep
order = np.argsort(np.abs(final_ests))[::-1]
print(np.round(100*(np.maximum(corr_ests,0)**2)[order])) # Variance reductions

[85. 91. 93. 85. 92. 93. 91. 88. 92. 93. 90. 91. 91. 91. 93. 94. 89. 91.
 93. 94.]
