# Bank

In [None]:
import sage
import shap_sampling
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the bank dataset shipped with sage
df = sage.datasets.bank()

# All columns but the last are model inputs; the columns named below are the
# categorical ones, kept as positional indices (for CatBoost model)
feature_names = df.columns[:-1].tolist()
categorical_cols = [
    'Job', 'Marital', 'Education', 'Default', 'Housing',
    'Loan', 'Contact', 'Month', 'Prev Outcome',
]
categorical_inds = list(map(feature_names.index, categorical_cols))

In [None]:
# Carve out a test set and a validation set, each 10% of the full data
data = df.values
n_holdout = int(0.1 * len(data))
train, test = train_test_split(data, test_size=n_holdout, random_state=123)
train, val = train_test_split(train, test_size=n_holdout, random_state=123)

# The last column holds the integer label; everything before it is a feature
Y_train, Y_val, Y_test = (
    part[:, -1].copy().astype(int) for part in (train, val, test))
train, val, test = (part[:, :-1].copy() for part in (train, val, test))

In [None]:
# Unpickle the model stored under trained_models/
with open('trained_models/bank model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Saved SAGE result; its values serve as the reference in the correlation
# check further down
sage_path = 'results/bank_sage.pkl'
sage_final = sage.load(sage_path)
final_values = sage_final.values

In [None]:
# Marginal imputer built from the first 512 test rows, wrapped in a SHAP
# estimator configured with the cross entropy loss
background = test[:512]
imputer = sage.MarginalImputer(model, background)
estimator = shap_sampling.SHAPEstimator(imputer, 'cross entropy')

In [None]:
# Explain the first 16 test rows with a convergence threshold and record how
# many permutations each run reports
progress_msg = 'Done with {} ({} permutations)'
permutations_list = []
for idx in range(16):
    # Slice (rather than index) so a leading axis of length 1 is kept
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=16, thresh=0.01, verbose=False, bar=True)
    permutations_list.append(result.n_permutations)
    print(progress_msg.format(idx, permutations_list[-1]))

In [None]:
# Summarise how many permutations the threshold-based runs needed
mean_permutations = np.mean(permutations_list)
median_permutations = np.median(permutations_list)

# Fixed permutation budget: the mean rounded down to a multiple of 32 (the
# batch size used when this budget is consumed). np.mean returns a float, so
# cast to a plain int because this is a count, and keep at least one batch so
# a mean below 32 cannot round the budget down to zero.
num_permutations = max(32, int(mean_permutations // 32) * 32)

In [None]:
# Explain the first 512 test rows with a fixed permutation budget
# (convergence detection off) and track how well the running mean of the
# collected values correlates with final_values
values_list = []
for idx in range(512):
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=32, n_permutations=num_permutations,
        detect_convergence=False, verbose=False, bar=False)
    values_list.append(result.values)

    # Pearson correlation between the mean so far and the reference values
    running_mean = np.mean(np.array(values_list), axis=0)
    corr = np.corrcoef(running_mean, final_values)[0, 1]
    print(idx, corr)

In [None]:
# Gather the collected SHAP values plus bookkeeping numbers and pickle them
# NOTE(review): 'evals' is derived from median_permutations here, while the
# values were computed with num_permutations and the credit section stores
# num_permutations * len(feature_names) — confirm which is intended
results_dict = {}
results_dict['values'] = np.array(values_list)
results_dict['evals'] = median_permutations * len(feature_names)
# presumably the number of rows handed to the imputer — TODO confirm
results_dict['inner_samples'] = 512
with open('results/bank shap convergence.pkl', 'wb') as out_file:
    pickle.dump(results_dict, out_file)

# Bike

In [None]:
import sage
import shap_sampling
import pickle
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the bike dataset shipped with sage; the last three columns are not
# used as model inputs
df = sage.datasets.bike()
feature_names = df.columns[:-3].tolist()

In [None]:
# Carve out a test set and a validation set, each 10% of the full data
data = df.values
n_holdout = int(0.1 * len(data))
train, test = train_test_split(data, test_size=n_holdout, random_state=123)
train, val = train_test_split(train, test_size=n_holdout, random_state=123)

# The last column (total count) is the regression target; the final three
# columns are dropped from the feature matrices
Y_train, Y_val, Y_test = (part[:, -1].copy() for part in (train, val, test))
train, val, test = (part[:, :-3].copy() for part in (train, val, test))

In [None]:
# Unpickle the model stored under trained_models/
with open('trained_models/bike model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Saved SAGE result; its values serve as the reference in the correlation
# check further down
sage_path = 'results/bike_sage.pkl'
sage_final = sage.load(sage_path)
final_values = sage_final.values

In [None]:
# Marginal imputer built from the first 512 test rows, wrapped in a SHAP
# estimator configured with the mse loss
background = test[:512]
imputer = sage.MarginalImputer(model, background)
estimator = shap_sampling.SHAPEstimator(imputer, 'mse')

In [None]:
# Explain the first 16 test rows with a convergence threshold and record how
# many permutations each run reports
progress_msg = 'Done with {} ({} permutations)'
permutations_list = []
for idx in range(16):
    # Slice (rather than index) so a leading axis of length 1 is kept
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=128, thresh=0.01, verbose=False, bar=True)
    permutations_list.append(result.n_permutations)
    print(progress_msg.format(idx, permutations_list[-1]))

In [None]:
# Summarise how many permutations the threshold-based runs needed
mean_permutations = np.mean(permutations_list)
median_permutations = np.median(permutations_list)

# Fixed permutation budget: the mean rounded down to a multiple of 32 (the
# batch size used when this budget is consumed). np.mean returns a float, so
# cast to a plain int because this is a count, and keep at least one batch so
# a mean below 32 cannot round the budget down to zero.
num_permutations = max(32, int(mean_permutations // 32) * 32)

In [None]:
# Explain the first 512 test rows with a fixed permutation budget
# (convergence detection off) and track how well the running mean of the
# collected values correlates with final_values
values_list = []
for idx in range(512):
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=32, n_permutations=num_permutations,
        detect_convergence=False, verbose=False, bar=False)
    values_list.append(result.values)

    # Pearson correlation between the mean so far and the reference values
    running_mean = np.mean(np.array(values_list), axis=0)
    corr = np.corrcoef(running_mean, final_values)[0, 1]
    print(idx, corr)

In [None]:
# Gather the collected SHAP values plus bookkeeping numbers and pickle them
# NOTE(review): 'evals' is derived from median_permutations here, while the
# values were computed with num_permutations and the credit section stores
# num_permutations * len(feature_names) — confirm which is intended
results_dict = {}
results_dict['values'] = np.array(values_list)
results_dict['evals'] = median_permutations * len(feature_names)
# presumably the number of rows handed to the imputer — TODO confirm
results_dict['inner_samples'] = 512
with open('results/bike shap convergence.pkl', 'wb') as out_file:
    pickle.dump(results_dict, out_file)

# Credit

In [None]:
import sage
import pickle
import shap_sampling
import numpy as np
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the credit dataset shipped with sage
df = sage.datasets.credit()

# All columns but the last are model inputs; the columns named below are the
# categorical ones, kept as positional indices (for CatBoost model)
# NOTE(review): 'Credit Amount' is listed as categorical although the name
# suggests a numeric quantity — confirm this matches how the model was trained
feature_names = df.columns[:-1].tolist()
categorical_columns = [
    'Checking Status',
    'Credit History',
    'Purpose',
    'Credit Amount',
    'Savings Account/Bonds',
    'Employment Since',
    'Personal Status',
    'Debtors/Guarantors',
    'Property Type',
    'Other Installment Plans',
    'Housing Ownership',
    'Job',
    'Telephone',
    'Foreign Worker',
]
categorical_inds = list(map(feature_names.index, categorical_columns))

In [None]:
# Carve out a test set and a validation set, each 10% of the full data
data = df.values
n_holdout = int(0.1 * len(data))
train, test = train_test_split(data, test_size=n_holdout, random_state=0)
train, val = train_test_split(train, test_size=n_holdout, random_state=0)

# The last column holds the integer label; everything before it is a feature
Y_train, Y_val, Y_test = (
    part[:, -1].copy().astype(int) for part in (train, val, test))
train, val, test = (part[:, :-1].copy() for part in (train, val, test))

In [None]:
# Unpickle the model stored under trained_models/
with open('trained_models/credit model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Saved SAGE result; its values serve as the reference in the correlation
# check further down
sage_path = 'results/credit_sage.pkl'
sage_final = sage.load(sage_path)
final_values = sage_final.values

In [None]:
# Marginal imputer built from the first 512 training rows (the bank and bike
# sections draw these rows from the test split instead), wrapped in a SHAP
# estimator configured with the cross entropy loss
background = train[:512]
imputer = sage.MarginalImputer(model, background)
estimator = shap_sampling.SHAPEstimator(imputer, 'cross entropy')

In [None]:
# Explain the first 16 test rows with a convergence threshold and record how
# many permutations each run reports
progress_msg = 'Done with {} ({} permutations)'
permutations_list = []
for idx in range(16):
    # Slice (rather than index) so a leading axis of length 1 is kept
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=16, thresh=0.01, verbose=False, bar=True)
    permutations_list.append(result.n_permutations)
    print(progress_msg.format(idx, permutations_list[-1]))

In [None]:
# Summarise how many permutations the threshold-based runs needed
mean_permutations = np.mean(permutations_list)
median_permutations = np.median(permutations_list)

# Fixed permutation budget: the mean rounded down to a multiple of 32 (the
# batch size used when this budget is consumed). np.mean returns a float, so
# cast to a plain int because this is a count, and keep at least one batch so
# a mean below 32 cannot round the budget down to zero.
num_permutations = max(32, int(mean_permutations // 32) * 32)

In [None]:
# Explain the first 100 test rows with a fixed permutation budget
# (convergence detection off) and track how well the running mean of the
# collected values correlates with final_values
values_list = []
for idx in range(100):
    sample, label = test[idx:idx + 1], Y_test[idx:idx + 1]
    result = estimator(
        sample, label, batch_size=32, n_permutations=num_permutations,
        detect_convergence=False, verbose=False, bar=False)
    values_list.append(result.values)

    # Pearson correlation between the mean so far and the reference values
    running_mean = np.mean(np.array(values_list), axis=0)
    corr = np.corrcoef(running_mean, final_values)[0, 1]
    print(idx, corr)

In [None]:
# Gather the collected SHAP values plus bookkeeping numbers and pickle them
# NOTE(review): 'evals' is derived from num_permutations here, whereas the
# bank and bike sections store median_permutations * len(feature_names) —
# confirm which is intended
results_dict = {}
results_dict['values'] = np.array(values_list)
results_dict['evals'] = num_permutations * len(feature_names)
# presumably the number of rows handed to the imputer — TODO confirm
results_dict['inner_samples'] = 512
with open('results/credit shap convergence.pkl', 'wb') as out_file:
    pickle.dump(results_dict, out_file)