# Load data

In [1]:
import sage
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the bank dataset exposed by the sage package
df = sage.datasets.bank()

# Convert binary features to 0/1
binary_cols = ['Default', 'Housing', 'Loan']
for col in binary_cols:
    df[col] = (df[col] == 'yes').astype(float)

# Convert string-coded ordered features (education level, month) to integers.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: an in-place edit on a column selection
# is a chained assignment, which warns on recent pandas and leaves `df`
# untouched when Copy-on-Write is active.
ordinal_maps = {
    'Education': {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3},
    'Month': {'jan': 0, 'feb': 1, 'mar': 2, 'apr': 3, 'may': 4, 'jun': 5,
              'jul': 6, 'aug': 7, 'sep': 8, 'oct': 9, 'nov': 10, 'dec': 11},
}
for col, mapping in ordinal_maps.items():
    df[col] = df[col].replace(mapping)

# Convert categorical features to one-hot columns named '<column>-<value>'.
# Columns are processed in this order, and the indicator columns are appended
# in np.unique (sorted) order, before the source column is dropped.
onehot_cols = ['Marital', 'Contact', 'Prev Outcome', 'Job']
for col in onehot_cols:
    for value in np.unique(df[col].values):
        df['{}-{}'.format(col, value)] = (df[col] == value).astype(float)
    df = df.drop(columns=col)

# Split into X, Y ('Success' is the label column; Y keeps a trailing axis of size 1)
values = df.values.astype(float)
X_cols = np.array(df.columns) != 'Success'
X, Y = values[:, X_cols], values[:, ~X_cols]

# Get feature names, groups.
# Columns sharing the text before the first '-' form one group, so the
# one-hot indicators of a categorical feature are grouped together.
feature_names = np.array(df.columns)[X_cols]
prefixes = np.array([name.split('-')[0] for name in feature_names])
groups = []
group_names = []
for prefix in np.unique(prefixes):
    groups.append(np.where(prefixes == prefix)[0])
    group_names.append(prefix)

# Train/val/test split: 10% held out for test, then 10% of the rest for validation
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=123)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=123)

In [3]:
# Standardize continuous columns using statistics from the training split only
feature_names = list(feature_names)
num_features = len(feature_names)
continuous_cols = ['Age', 'Balance', 'Day', 'Duration', 'Campaign',
                   'Month', 'Prev Days', 'Prev Contacts']

# Positions of the continuous columns inside the feature matrix
continuous_inds = list(map(feature_names.index, continuous_cols))

# Fit on the training rows, then rescale each split in place
ss = StandardScaler().fit(X_train[:, continuous_inds])
for split in (X_train, X_val, X_test):
    split[:, continuous_inds] = ss.transform(split[:, continuous_inds])

# Set up imputer

In [4]:
import torch
import fastshap_torch

In [5]:
# GPU used for every surrogate evaluation in this notebook
device = torch.device('cuda', 2)

# map_location places the checkpoint's tensors directly on `device`; without it
# torch.load first restores them onto whatever device they were saved from,
# which fails if that device does not exist here.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
surrogate = torch.load('../models/bank_surrogate.pt', map_location=device).eval().to(device)

In [6]:
def imputer(x, S):
    """Evaluate the surrogate on masked inputs and return class probabilities.

    Args:
      x: array-like of inputs, converted to a float32 tensor on `device`.
         Presumably shaped (batch, num_features) -- TODO confirm against caller.
      S: array-like subset indicator passed alongside `x`, converted to a
         float32 tensor on `device`. Presumably (batch, num_groups) -- TODO confirm.

    Returns:
      numpy array holding the softmax (over the last axis) of the surrogate's
      output for the pair `(x, S)`.
    """
    x = torch.tensor(x, dtype=torch.float32, device=device)
    S = torch.tensor(S, dtype=torch.float32, device=device)

    # Inference only: skip autograd bookkeeping, since this function is called
    # many times per explained instance and no gradients are ever used.
    with torch.no_grad():
        pred = surrogate((x, S)).softmax(dim=-1)
    return pred.detach().cpu().numpy()

# Ground truth SHAP values

In [7]:
import pickle
import matplotlib.pyplot as plt
from shapreg import games, shapley, shapley_sampling

In [8]:
# Number of test instances to explain
num_examples = 2 ** 8

# Threshold handed to ShapleyRegression for the reference run
# (a tighter alternative, thresh = 0.001, was left disabled in the original cell)
thresh = 1e-2

In [12]:
# Reference SHAP values: one entry per explained test instance
shap_values = []

for i in range(num_examples):
    # Instance to explain
    x = X_test[i]

    # Game over the feature groups, evaluated through the surrogate imputer
    game = games.PredictionGame(imputer, x, groups)

    # Run the regression estimator with threshold `thresh`; keep the
    # transposed values of the resulting explanation
    reference = shapley.ShapleyRegression(game, bar=False, thresh=thresh)
    shap_values.append(reference.values.T)
    print('Done with sample = {}'.format(i))

Done with sample = 0
Done with sample = 1
Done with sample = 2
Done with sample = 3
Done with sample = 4
Done with sample = 5
Done with sample = 6
Done with sample = 7
Done with sample = 8
Done with sample = 9
Done with sample = 10
Done with sample = 11
Done with sample = 12
Done with sample = 13
Done with sample = 14
Done with sample = 15
Done with sample = 16
Done with sample = 17
Done with sample = 18
Done with sample = 19
Done with sample = 20
Done with sample = 21
Done with sample = 22
Done with sample = 23
Done with sample = 24
Done with sample = 25
Done with sample = 26
Done with sample = 27
Done with sample = 28
Done with sample = 29
Done with sample = 30
Done with sample = 31
Done with sample = 32
Done with sample = 33
Done with sample = 34
Done with sample = 35
Done with sample = 36
Done with sample = 37
Done with sample = 38
Done with sample = 39
Done with sample = 40
Done with sample = 41
Done with sample = 42
Done with sample = 43
Done with sample = 44
Done with sample = 4

In [13]:
# Persist the reference SHAP values
shap_path = '../results/bank_shap.pkl'
with open(shap_path, mode='wb') as out_file:
    pickle.dump(shap_values, out_file)

# Estimation curves

In [9]:
# Base sample budget shared by the estimation-curve cells
samples = 2 ** 13

# Number of columns in the training feature matrix
num_features = X_train.shape[-1]

In [10]:
# Estimation curves for the regression estimator without paired sampling
kernelshap_curves = []

for i in range(num_examples):
    # Instance to explain
    x = X_test[i]

    # Game over the feature groups, evaluated through the surrogate imputer
    game = games.PredictionGame(imputer, x, groups)

    # Fixed budget (convergence detection off); return_all=True makes the
    # second element of `results` carry the tracked 'values' and 'iters'
    results = shapley.ShapleyRegression(
        game,
        batch_size=32,
        n_samples=2*samples,
        detect_convergence=False,
        bar=False,
        paired_sampling=False,
        return_all=True)

    # Stack the tracked estimates (each transposed) into one array
    tracked = results[1]['values']
    curve = np.array([estimate.T for estimate in tracked])
    kernelshap_curves.append(curve)
    print('Done with sample = {}'.format(i))

# Iteration counts are taken from the last instance's run
kernelshap_iters = results[1]['iters']

Done with sample = 0
Done with sample = 1
Done with sample = 2
Done with sample = 3
Done with sample = 4
Done with sample = 5
Done with sample = 6
Done with sample = 7
Done with sample = 8
Done with sample = 9
Done with sample = 10
Done with sample = 11
Done with sample = 12
Done with sample = 13
Done with sample = 14
Done with sample = 15
Done with sample = 16
Done with sample = 17
Done with sample = 18
Done with sample = 19
Done with sample = 20
Done with sample = 21
Done with sample = 22
Done with sample = 23
Done with sample = 24
Done with sample = 25
Done with sample = 26
Done with sample = 27
Done with sample = 28
Done with sample = 29
Done with sample = 30
Done with sample = 31
Done with sample = 32
Done with sample = 33
Done with sample = 34
Done with sample = 35
Done with sample = 36
Done with sample = 37
Done with sample = 38
Done with sample = 39
Done with sample = 40
Done with sample = 41
Done with sample = 42
Done with sample = 43
Done with sample = 44
Done with sample = 4

In [13]:
# Estimation curves for the regression estimator with paired sampling
paired_curves = []

for i in range(num_examples):
    # Instance to explain
    x = X_test[i]

    # Game over the feature groups, evaluated through the surrogate imputer
    game = games.PredictionGame(imputer, x, groups)

    # Fixed budget (convergence detection off); return_all=True makes the
    # second element of `results` carry the tracked 'values' and 'iters'.
    # n_samples is deliberately left as the original float expression.
    results = shapley.ShapleyRegression(
        game,
        batch_size=32,
        n_samples=2 * (samples / 2),
        detect_convergence=False,
        bar=False,
        paired_sampling=True,
        return_all=True)

    # Stack the tracked estimates (each transposed) into one array
    tracked = results[1]['values']
    curve = np.array([estimate.T for estimate in tracked])
    paired_curves.append(curve)
    print('Done with sample = {}'.format(i))

# Iteration counts are taken from the last instance's run
paired_iters = results[1]['iters']

Done with sample = 0
Done with sample = 1
Done with sample = 2
Done with sample = 3
Done with sample = 4
Done with sample = 5
Done with sample = 6
Done with sample = 7
Done with sample = 8
Done with sample = 9
Done with sample = 10
Done with sample = 11
Done with sample = 12
Done with sample = 13
Done with sample = 14
Done with sample = 15
Done with sample = 16
Done with sample = 17
Done with sample = 18
Done with sample = 19
Done with sample = 20
Done with sample = 21
Done with sample = 22
Done with sample = 23
Done with sample = 24
Done with sample = 25
Done with sample = 26
Done with sample = 27
Done with sample = 28
Done with sample = 29
Done with sample = 30
Done with sample = 31
Done with sample = 32
Done with sample = 33
Done with sample = 34
Done with sample = 35
Done with sample = 36
Done with sample = 37
Done with sample = 38
Done with sample = 39
Done with sample = 40
Done with sample = 41
Done with sample = 42
Done with sample = 43
Done with sample = 44
Done with sample = 4

In [14]:
# Estimation curves for the permutation-sampling estimator
sampling_curves = []

for i in range(num_examples):
    # Instance to explain
    x = X_test[i]

    # Game over the feature groups, evaluated through the surrogate imputer
    game = games.PredictionGame(imputer, x, groups)

    # Fixed budget (convergence detection off); return_all=True makes the
    # second element of `results` carry the tracked 'values' and 'iters'.
    # NOTE(review): the budget divides by num_features (raw column count)
    # while the game is built over `groups` -- confirm the intended denominator.
    results = shapley_sampling.ShapleySampling(
        game,
        batch_size=1,
        n_samples=int(np.ceil(samples / num_features)),
        detect_convergence=False,
        bar=False,
        return_all=True)

    # Stack the tracked estimates into one array
    curve = np.array(list(results[1]['values']))
    sampling_curves.append(curve)
    print('Done with sample = {}'.format(i))

# Iteration counts are taken from the last instance's run
sampling_iters = results[1]['iters']

Done with sample = 0
Done with sample = 1
Done with sample = 2
Done with sample = 3
Done with sample = 4
Done with sample = 5
Done with sample = 6
Done with sample = 7
Done with sample = 8
Done with sample = 9
Done with sample = 10
Done with sample = 11
Done with sample = 12
Done with sample = 13
Done with sample = 14
Done with sample = 15
Done with sample = 16
Done with sample = 17
Done with sample = 18
Done with sample = 19
Done with sample = 20
Done with sample = 21
Done with sample = 22
Done with sample = 23
Done with sample = 24
Done with sample = 25
Done with sample = 26
Done with sample = 27
Done with sample = 28
Done with sample = 29
Done with sample = 30
Done with sample = 31
Done with sample = 32
Done with sample = 33
Done with sample = 34
Done with sample = 35
Done with sample = 36
Done with sample = 37
Done with sample = 38
Done with sample = 39
Done with sample = 40
Done with sample = 41
Done with sample = 42
Done with sample = 43
Done with sample = 44
Done with sample = 4

In [15]:
# Estimation curves for the permutation-sampling estimator with antithetical=True
antithetical_curves = []

for i in range(num_examples):
    # Instance to explain
    x = X_test[i]

    # Game over the feature groups, evaluated through the surrogate imputer
    game = games.PredictionGame(imputer, x, groups)

    # Fixed budget (convergence detection off); return_all=True makes the
    # second element of `results` carry the tracked 'values' and 'iters'.
    # NOTE(review): the budget divides by num_features (raw column count)
    # while the game is built over `groups` -- confirm the intended denominator.
    results = shapley_sampling.ShapleySampling(
        game,
        batch_size=2,
        n_samples=int(np.ceil(samples / num_features)),
        detect_convergence=False,
        bar=False,
        antithetical=True,
        return_all=True)

    # Stack the tracked estimates into one array
    curve = np.array(list(results[1]['values']))
    antithetical_curves.append(curve)
    print('Done with sample = {}'.format(i))

# Iteration counts are taken from the last instance's run
antithetical_iters = results[1]['iters']

Done with sample = 0
Done with sample = 1
Done with sample = 2
Done with sample = 3
Done with sample = 4
Done with sample = 5
Done with sample = 6
Done with sample = 7
Done with sample = 8
Done with sample = 9
Done with sample = 10
Done with sample = 11
Done with sample = 12
Done with sample = 13
Done with sample = 14
Done with sample = 15
Done with sample = 16
Done with sample = 17
Done with sample = 18
Done with sample = 19
Done with sample = 20
Done with sample = 21
Done with sample = 22
Done with sample = 23
Done with sample = 24
Done with sample = 25
Done with sample = 26
Done with sample = 27
Done with sample = 28
Done with sample = 29
Done with sample = 30
Done with sample = 31
Done with sample = 32
Done with sample = 33
Done with sample = 34
Done with sample = 35
Done with sample = 36
Done with sample = 37
Done with sample = 38
Done with sample = 39
Done with sample = 40
Done with sample = 41
Done with sample = 42
Done with sample = 43
Done with sample = 44
Done with sample = 4

In [16]:
# Persist every estimator's curves together with its iteration counts
with open('../results/bank_curves.pkl', mode='wb') as out_file:
    save_dict = dict(
        # Regression estimator, unpaired
        kernelshap=kernelshap_curves,
        kernelshap_iters=kernelshap_iters,
        # Regression estimator, paired sampling
        paired_sampling=paired_curves,
        paired_sampling_iters=paired_iters,
        # Permutation sampling
        sampling_curves=sampling_curves,
        sampling_iters=sampling_iters,
        # Permutation sampling, antithetical
        antithetical_curves=antithetical_curves,
        antithetical_iters=antithetical_iters,
    )
    pickle.dump(save_dict, out_file)