# Load data

In [1]:
import sage
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Fetch the bank dataset that ships with the sage package
df = sage.datasets.bank()

# Encode the yes/no columns as floats: 1.0 where the entry is 'yes', else 0.0
binary_cols = ['Default', 'Housing', 'Loan']
df[binary_cols] = df[binary_cols].eq('yes').astype(float)
    
# Convert the ordered categorical columns (education, month) to integer codes.
# The encoded column is assigned back to the frame explicitly: calling
# `df[col].replace(..., inplace=True)` acts on a temporary Series, which newer
# pandas flags as chained assignment and, with Copy-on-Write enabled, no
# longer writes through to `df`.
ordinal_codes = {
    'Education': {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3},
    'Month': {'jan': 0, 'feb': 1, 'mar': 2, 'apr': 3, 'may': 4, 'jun': 5,
              'jul': 6, 'aug': 7, 'sep': 8, 'oct': 9, 'nov': 10, 'dec': 11},
}
for ordinal_col, codes in ordinal_codes.items():
    encoded = df[ordinal_col].map(codes)
    # `map` yields NaN for any label missing from `codes`; fail loudly here
    # rather than letting NaNs flow silently into the training data.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), ordinal_col].unique().tolist()
        raise ValueError(
            'Unexpected {} values: {}'.format(ordinal_col, unexpected))
    df[ordinal_col] = encoded

# One-hot encode the nominal columns (marital, contact, prev outcome, job).
# Every category becomes a float 0/1 column named '<column>-<category>', and
# the source column is removed once its indicators have been added.
for nominal_col in ('Marital', 'Contact', 'Prev Outcome', 'Job'):
    for value in np.unique(df[nominal_col].values):
        indicator = (df[nominal_col] == value).astype(float)
        df['{}-{}'.format(nominal_col, value)] = indicator
    df.drop(columns=nominal_col, inplace=True)

# Separate the 'Success' target column from the predictor columns
values = df.values.astype(float)
X_cols = np.asarray(df.columns) != 'Success'  # boolean mask over columns
X = values[:, X_cols]
Y = values[:, ~X_cols]  # column selection keeps Y two-dimensional

# Recover the predictor names and group columns by parent feature.
# The parent is the text before the first '-', so every one-hot indicator
# column lands in the same group as its siblings.
feature_names = np.asarray(df.columns)[X_cols]
prefixes = np.array([name.partition('-')[0] for name in feature_names])
group_names = list(np.unique(prefixes))
groups = [np.flatnonzero(prefixes == prefix) for prefix in group_names]

# Hold out 10% of the rows for testing, then 10% of what remains for
# validation; both splits use the same fixed seed.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=123)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval, test_size=0.1, random_state=123)

In [3]:
# Standardize the continuous columns using statistics from the training split
feature_names = list(feature_names)
num_features = len(feature_names)
continuous_cols = ['Age', 'Balance', 'Day', 'Duration', 'Campaign',
                   'Month', 'Prev Days', 'Prev Contacts']
continuous_inds = list(map(feature_names.index, continuous_cols))

# Fit on the training rows only, then apply that same scaling to the
# validation and test rows so no held-out statistics are used.
ss = StandardScaler()
X_train[:, continuous_inds] = ss.fit_transform(X_train[:, continuous_inds])
for held_out in (X_val, X_test):
    held_out[:, continuous_inds] = ss.transform(held_out[:, continuous_inds])

# Train model

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, BatchSampler
from madgrad import MADGRAD
from copy import deepcopy

In [5]:
# Copy each numpy split into a float32 torch tensor
X_train_pt, X_val_pt, X_test_pt = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, X_val, X_test))
Y_train_pt, Y_val_pt, Y_test_pt = (
    torch.tensor(split, dtype=torch.float32)
    for split in (Y_train, Y_val, Y_test))

In [134]:
# Set up model
# Prefer the GPU index this notebook was written for, but fall back to the
# first GPU, and then to the CPU, so the cell also runs on machines that do
# not have that many CUDA devices.
preferred_gpu = 6
if torch.cuda.is_available() and torch.cuda.device_count() > preferred_gpu:
    device = torch.device('cuda', preferred_gpu)
elif torch.cuda.is_available():
    device = torch.device('cuda', 0)
else:
    device = torch.device('cpu')

# Fully connected network: two hidden layers of 128 ReLU units and a single
# output unit.
model = nn.Sequential(
    nn.Linear(num_features, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 1)).to(device)

# Training hyperparameters
batch_size = 32
max_epochs = 100
lr = 1e-4
lookback = 10
validation_batch_size = 1000
verbose = True

# Training loader: indices are sampled with replacement, and the number of
# draws per epoch is rounded up to a whole number of batches, so the
# `drop_last=True` below never actually discards anything.
train_set = TensorDataset(X_train_pt, Y_train_pt)
batches_per_epoch = int(np.ceil(len(X_train) / batch_size))
random_sampler = RandomSampler(
    train_set,
    replacement=True,
    num_samples=batches_per_epoch * batch_size)
batch_sampler = BatchSampler(
    random_sampler,
    batch_size=batch_size,
    drop_last=True)
train_loader = DataLoader(train_set, batch_sampler=batch_sampler)

# Validation loader: sequential pass in large batches
val_set = TensorDataset(X_val_pt, Y_val_pt)
val_loader = DataLoader(val_set, batch_size=validation_batch_size)

def validate(model, loader, loss_fn):
    """Return the sample-weighted mean of ``loss_fn`` over all batches.

    The model is switched to eval mode while the batches are processed and
    put back into its previous mode afterwards; no gradients are tracked.

    Args:
      model: module with at least one parameter; batches are moved to the
        device of its first parameter.
      loader: iterable yielding ``(x, y)`` tensor batches.
      loss_fn: callable ``loss_fn(pred, y)``. Each batch's loss is weighted
        by the batch size, so a mean-reduced loss yields the mean over all
        samples seen.

    Returns:
      A 0-dim tensor holding the running mean loss. If ``loader`` yields no
      samples the result is a zero tensor (rather than the Python int ``0``),
      so callers can always use ``.item()``.
    """
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    mean_loss = torch.zeros((), device=device)
    num_seen = 0
    try:
        with torch.no_grad():
            for x, y in loader:
                if len(x) == 0:
                    # An empty batch carries no information; skipping it
                    # also avoids a 0/0 update on the very first batch.
                    continue
                x = x.to(device)
                y = y.to(device)
                loss = loss_fn(model(x), y)
                num_seen += len(x)
                # Incremental weighted-mean update (out of place, so the
                # accumulator takes on the dtype of the loss).
                mean_loss = mean_loss + len(x) * (loss - mean_loss) / num_seen
    finally:
        # Leave the caller's train/eval mode as we found it.
        model.train(was_training)

    return mean_loss

# Setup for training
device = next(model.parameters()).device
optimizer = MADGRAD(model.parameters(), lr=lr)
loss_fn = nn.BCEWithLogitsLoss()
best_loss = np.inf
best_epoch = 0
best_model = None  # snapshot of the model at its lowest validation loss
loss_list = []     # validation loss after every epoch

for epoch in range(max_epochs):
    for x, y in train_loader:
        # Prepare data
        x = x.to(device)
        y = y.to(device)

        # Make predictions
        pred = model(x)
        loss = loss_fn(pred, y)

        # Optimizer step; gradients are cleared right after so the next
        # batch starts from zero.
        loss.backward()
        optimizer.step()
        model.zero_grad()

    # Print progress
    val_loss = validate(model, val_loader, loss_fn).item()
    loss_list.append(val_loss)
    if verbose:
        print('----- Epoch = {} -----'.format(epoch + 1))
        print('Val loss = {:.4f}'.format(val_loss))
        print('')

    # Check if best model
    if val_loss < best_loss:
        best_loss = val_loss
        best_model = deepcopy(model)
        best_epoch = epoch
        if verbose:
            print('New best epoch, loss = {:.4f}'.format(val_loss))
            print('')
    elif epoch - best_epoch == lookback:
        # Early stopping: `lookback` epochs have passed since the best one.
        if verbose:
            print('Stopping early')
        break

# Clean up: restore the best snapshot into `model`.
# `best_model` is still None when no epoch ran or when the validation loss
# was never below infinity (e.g. NaN), so only restore if a snapshot exists.
# `load_state_dict` copies parameters and buffers into `model` instead of
# re-binding `.data` to tensors owned by the snapshot.
if best_model is not None:
    model.load_state_dict(best_model.state_dict())

----- Epoch = 1 -----
Val loss = 0.2396

New best epoch, loss = 0.2396

----- Epoch = 2 -----
Val loss = 0.2324

New best epoch, loss = 0.2324

----- Epoch = 3 -----
Val loss = 0.2311

New best epoch, loss = 0.2311

----- Epoch = 4 -----
Val loss = 0.2321

----- Epoch = 5 -----
Val loss = 0.2290

New best epoch, loss = 0.2290

----- Epoch = 6 -----
Val loss = 0.2246

New best epoch, loss = 0.2246

----- Epoch = 7 -----
Val loss = 0.2239

New best epoch, loss = 0.2239

----- Epoch = 8 -----
Val loss = 0.2226

New best epoch, loss = 0.2226

----- Epoch = 9 -----
Val loss = 0.2285

----- Epoch = 10 -----
Val loss = 0.2216

New best epoch, loss = 0.2216

----- Epoch = 11 -----
Val loss = 0.2222

----- Epoch = 12 -----
Val loss = 0.2226

----- Epoch = 13 -----
Val loss = 0.2200

New best epoch, loss = 0.2200

----- Epoch = 14 -----
Val loss = 0.2145

New best epoch, loss = 0.2145

----- Epoch = 15 -----
Val loss = 0.2154

----- Epoch = 16 -----
Val loss = 0.2167

----- Epoch = 17 -----
Val 

In [135]:
# Save model: switch to eval mode and move to the CPU, then pickle the
# whole module object to disk.
torch.save(model.eval().cpu(), '../models/bank_model.pt')

# Train surrogate

In [7]:
import torch
import torch.nn as nn
from fastshap_torch.utils import MaskLayer1d
from fastshap_torch import Surrogate, SoftCrossEntropyLoss
import matplotlib.pyplot as plt

In [15]:
# Create data
# Soft labels for the surrogate: the trained model's sigmoid output p is laid
# out per row as [1 - p, p].
# Inputs are sent to the device the model currently lives on rather than the
# global `device`: the save cell above moves the model to the CPU, so using a
# CUDA `device` here would raise a device-mismatch error when the notebook is
# run top to bottom. `no_grad` avoids building an autograd graph over the
# full training set.
model_device = next(model.parameters()).device
with torch.no_grad():
    p_train = model(X_train_pt.to(model_device)).sigmoid().cpu().numpy()
    p_val = model(X_val_pt.to(model_device)).sigmoid().cpu().numpy()
Y_train_surrogate = np.concatenate([1 - p_train, p_train], axis=1)
Y_val_surrogate = np.concatenate([1 - p_val, p_val], axis=1)

In [19]:
# Baseline loss: soft cross-entropy obtained by predicting the mean training
# label distribution `p` for every sample.
p = Y_train_surrogate.mean(axis=0)
weighted_log_p = (Y_train_surrogate * np.log(p)).sum(axis=1)
soft_ce = -weighted_log_p.mean()
print('Loss given no information = {:.4f}'.format(soft_ce))

Loss given no information = 0.3657


In [24]:
# Set up device
# Prefer the GPU index this notebook was written for, but fall back to the
# first GPU, and then to the CPU, so the cell also runs on machines that do
# not have that many CUDA devices.
preferred_gpu = 6
if torch.cuda.is_available() and torch.cuda.device_count() > preferred_gpu:
    device = torch.device('cuda', preferred_gpu)
elif torch.cuda.is_available():
    device = torch.device('cuda', 0)
else:
    device = torch.device('cpu')

# Create model
# The first Linear layer takes 2 * num_features inputs because the mask layer
# is configured with append=True.
# NOTE(review): presumably the mask is concatenated to the masked input —
# confirm against fastshap_torch's MaskLayer1d.
surrogate = nn.Sequential(
    MaskLayer1d(value=0, append=True),
    nn.Linear(2 * num_features, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 2)).to(device)

# Set up surrogate object
surr = Surrogate(surrogate, num_features, groups)

In [25]:
# Train the surrogate in three passes with progressively larger batch sizes.
# NOTE(review): presumably each pass continues from the weights the previous
# pass left in `surr` — confirm against fastshap_torch's Surrogate.train.
train_data = (X_train, Y_train_surrogate)
val_data = (X_val, Y_val_surrogate)
for batch_size in (32, 512, 8192):
    surr.train(
        train_data,
        val_data,
        batch_size=batch_size,
        max_epochs=100,
        loss_fn=SoftCrossEntropyLoss(),
        validation_samples=50,
        validation_batch_size=10000,
        validation_seed=0,
        verbose=True)

    # Report the lowest loss recorded in the surrogate's loss history
    print('Best loss = {:.4f}'.format(min(surr.loss_list)))

----- Epoch = 1 -----
Val loss = 0.2920

New best epoch, loss = 0.2920

----- Epoch = 2 -----
Val loss = 0.2870

New best epoch, loss = 0.2870

----- Epoch = 3 -----
Val loss = 0.2839

New best epoch, loss = 0.2839

----- Epoch = 4 -----
Val loss = 0.2831

New best epoch, loss = 0.2831

----- Epoch = 5 -----
Val loss = 0.2831

----- Epoch = 6 -----
Val loss = 0.2805

New best epoch, loss = 0.2805

----- Epoch = 7 -----
Val loss = 0.2802

New best epoch, loss = 0.2802

----- Epoch = 8 -----
Val loss = 0.2807

----- Epoch = 9 -----
Val loss = 0.2787

New best epoch, loss = 0.2787

----- Epoch = 10 -----
Val loss = 0.2798

----- Epoch = 11 -----
Val loss = 0.2787

----- Epoch = 12 -----
Val loss = 0.2784

New best epoch, loss = 0.2784

----- Epoch = 13 -----
Val loss = 0.2785

----- Epoch = 14 -----
Val loss = 0.2780

New best epoch, loss = 0.2780

----- Epoch = 15 -----
Val loss = 0.2781

----- Epoch = 16 -----
Val loss = 0.2784

----- Epoch = 17 -----
Val loss = 0.2780

New best epoch, 

In [26]:
# Save surrogate: move to the CPU and switch to eval mode, then pickle the
# whole module object to disk.
torch.save(surrogate.cpu().eval(), '../models/bank_surrogate.pt')