# Load data

In [1]:
import shap
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Fetch the adult census features/labels, hold out 20% as the test split,
# then carve a further 20% of the remaining rows off as a validation split.
features, labels = shap.datasets.adult()
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=7)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=0)

In [3]:
# Neural network pre-processing
# One-hot encode the labels. The encoder is fitted on the training labels
# only; the validation and test labels are transformed with that same fitted
# encoder so all three splits share one category-to-column mapping (and no
# statistics are learned from held-out data).
# NOTE(review): newer scikit-learn releases rename `sparse=` to
# `sparse_output=` -- switch the keyword if the environment is upgraded.
enc = OneHotEncoder(sparse=False)
Y_train_oh = enc.fit_transform(np.expand_dims(Y_train, -1))
Y_val_oh = enc.transform(np.expand_dims(Y_val, -1))
Y_test_oh = enc.transform(np.expand_dims(Y_test, -1))

# Standardize the features with statistics computed on the training split
# only, then apply the same transform to every split.
ss = StandardScaler()
ss.fit(X_train)
X_train_std = ss.transform(X_train)
X_val_std = ss.transform(X_val)
X_test_std = ss.transform(X_test)

# Train model

In [4]:
import pickle
import lightgbm as lgb

In [5]:
# LightGBM hyperparameters: gradient-boosted trees with a binary objective,
# evaluated with binary log-loss.
params = dict(
    max_bin=512,
    learning_rate=0.05,
    boosting_type="gbdt",
    objective="binary",
    metric="binary_logloss",
    num_leaves=10,
    verbose=-1,
    min_data=100,
    boost_from_average=True,
)

# Wrap the raw (unstandardized) splits as LightGBM datasets. Despite its
# name, `d_test` holds the validation split (X_val / Y_val); it is the set
# monitored for early stopping below.
d_train = lgb.Dataset(X_train, label=Y_train)
d_test = lgb.Dataset(X_val, label=Y_val)

# Boost for at most 10000 rounds, stopping early once the validation score
# has not improved for 50 rounds.
model = lgb.train(
    params,
    d_train,
    num_boost_round=10000,
    valid_sets=[d_test],
    early_stopping_rounds=50,
    verbose_eval=1000,
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[656]	valid_0's binary_logloss: 0.281561


In [6]:
# Persist the trained LightGBM model to disk as a pickle.
with open('../models/census_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Train surrogate

In [7]:
import torch
import torch.nn as nn
from fastshap_torch.utils import MaskLayer1d
from fastshap_torch import Surrogate, SoftCrossEntropyLoss
import matplotlib.pyplot as plt

In [8]:
# Build the surrogate's training targets from the LightGBM model's outputs.
# Each prediction p is expanded into a two-column row [1 - p, p].
num_features = X_train.shape[1]

p_train = model.predict(X_train)
Y_train_surrogate = np.vstack([1 - p_train, p_train]).T

p_val = model.predict(X_val)
Y_val_surrogate = np.vstack([1 - p_val, p_val]).T

In [9]:
# Baseline loss: the soft cross-entropy obtained when every sample is
# predicted with the column-wise mean of the training targets, i.e. a
# prediction that ignores the features entirely.
p = Y_train_surrogate.mean(axis=0)
soft_ce = -(Y_train_surrogate * np.log(p)).sum(axis=1).mean()
print('Loss given no information = {:.4f}'.format(soft_ce))

Loss given no information = 0.5500


In [10]:
# Set up device
# Prefer GPU index 3 (the original hard-coded choice), but fall back to the
# first GPU, or to the CPU, so this cell also runs on machines that have
# fewer than four CUDA devices or none at all.
preferred_gpu = 3
if torch.cuda.is_available():
    gpu_index = preferred_gpu if torch.cuda.device_count() > preferred_gpu else 0
    device = torch.device('cuda', gpu_index)
else:
    device = torch.device('cpu')

# Create model
# A mask layer followed by a three-layer MLP with two outputs. The first
# linear layer takes 2 * num_features inputs -- presumably the masked
# features with the mask appended (append=True); confirm against MaskLayer1d.
surrogate = nn.Sequential(
    MaskLayer1d(value=0, append=True),
    nn.Linear(2 * num_features, 128),
    nn.ELU(inplace=True),
    nn.Linear(128, 128),
    nn.ELU(inplace=True),
    nn.Linear(128, 2)).to(device)

# Set up surrogate object
surr = Surrogate(surrogate, num_features)

In [11]:
# Train the surrogate in three successive stages with increasing batch size
# (32 -> 512 -> 8192), printing the lowest value in `surr.loss_list` after
# each stage. Inputs are the standardized features; targets are the
# two-column outputs derived from the LightGBM model.
train_data = (X_train_std, Y_train_surrogate)
val_data = (X_val_std, Y_val_surrogate)
train_kwargs = dict(
    max_epochs=100,
    validation_samples=10,
    validation_batch_size=10000,
    verbose=False,
)

for batch_size in (32, 512, 8192):
    surr.train(train_data,
               val_data,
               batch_size=batch_size,
               loss_fn=SoftCrossEntropyLoss(),
               **train_kwargs)

    best_loss = min(surr.loss_list)
    print('Best loss = {:.4f}'.format(best_loss))

Best loss = 0.3933
Best loss = 0.3931
Best loss = 0.3924


In [12]:
# Move the surrogate network to the CPU and put it in eval mode, then
# serialize the whole module object to disk.
surrogate.cpu().eval()
torch.save(surrogate, '../models/census_surrogate.pt')