# Load data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the raw CSV into a frame (default integer index, no index column)
filename = '../data/bankruptcy.csv'
df = pd.read_csv(filename, sep=',', index_col=False)

# Remove the constant-valued column
df = df.drop(columns=[' Net Income Flag'])

# Remove the outlier rows (encoding errors): any row whose
# ' Revenue per person' exceeds 1
outlier_index = df.index[df[' Revenue per person'] > 1]
df = df.drop(index=outlier_index)

# First column is the target, every remaining column is a feature
values = df.to_numpy()
Y = values[:, 0]
X = values[:, 1:]
feature_names = df.columns[1:].tolist()

# Hold out 20% for testing, then carve 20% of the remainder off for
# validation. Both splits use a fixed seed so they are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=0
)

In [3]:
# Neural network pre-processing

# One-hot encode the labels. The `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the new
# spelling first and fall back to the old one on older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit the encoder on the training labels only; validation and test labels
# are transformed with that same fit so all three arrays share one
# category-to-column mapping (previously the validation call refit the
# encoder on the validation labels).
Y_train_oh = enc.fit_transform(np.expand_dims(Y_train, -1))
Y_val_oh = enc.transform(np.expand_dims(Y_val, -1))
Y_test_oh = enc.transform(np.expand_dims(Y_test, -1))

# Standardize features using statistics estimated on the training split
# only, then apply the same scaling to validation and test.
ss = StandardScaler()
ss.fit(X_train)
X_train_std = ss.transform(X_train)
X_val_std = ss.transform(X_val)
X_test_std = ss.transform(X_test)

# Train model

In [4]:
import pickle
import xgboost as xgb

In [5]:
# Wrap the raw (unstandardized) train/validation splits for XGBoost
dtrain = xgb.DMatrix(X_train, label=Y_train)
dval = xgb.DMatrix(X_val, label=Y_val)

# Booster configuration: depth-6 trees, logistic objective, 4 threads
param = dict(max_depth=6, objective='binary:logistic', nthread=4)

# Datasets evaluated after each boosting round, and the number of rounds
evallist = [(dtrain, 'train'), (dval, 'val')]
num_round = 25

# Fit the booster without printing per-round evaluation results
model = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=evallist,
    verbose_eval=False,
)



In [6]:
# Serialize the trained XGBoost model to disk with pickle
with open('../models/bankruptcy_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))

# Train surrogate

In [7]:
import torch
import torch.nn as nn
from fastshap_torch.utils import MaskLayer1d
from fastshap_torch import Surrogate, SoftCrossEntropyLoss
import matplotlib.pyplot as plt

In [8]:
# Number of input features, taken from the training matrix
num_features = X_train.shape[1]

# Soft targets for the surrogate: the XGBoost model's predictions p on each
# split, arranged as two-column rows [1 - p, p].
train_pred = model.predict(dtrain)
Y_train_surrogate = np.vstack((1 - train_pred, train_pred)).T

val_pred = model.predict(dval)
Y_val_surrogate = np.vstack((1 - val_pred, val_pred)).T

In [9]:
# Baseline loss: soft cross-entropy obtained by predicting the mean
# training target distribution for every sample (i.e. using no features).
p = np.mean(Y_train_surrogate, axis=0)
weighted_log_p = (np.log(p) * Y_train_surrogate).sum(axis=1)
soft_ce = -weighted_log_p.mean()
print('Loss given no information = {:.4f}'.format(soft_ce))

Loss given no information = 0.1453


In [14]:
# Set up device
# Prefer the GPU this notebook was originally run on, but do not crash on
# machines that lack it: fall back to the first GPU, or to the CPU when
# CUDA is unavailable.
preferred_gpu = 6
if torch.cuda.is_available():
    gpu_index = preferred_gpu if preferred_gpu < torch.cuda.device_count() else 0
    device = torch.device('cuda', gpu_index)
else:
    device = torch.device('cpu')

# Create model
# The first linear layer takes 2 * num_features inputs, which is consistent
# with MaskLayer1d(append=True) concatenating the mask to the masked input
# (NOTE(review): confirm against the MaskLayer1d implementation).
# The final layer emits 2 outputs, matching the two-column soft targets.
surrogate = nn.Sequential(
    MaskLayer1d(value=0, append=True),
    nn.Linear(2 * num_features, 128),
    nn.ELU(inplace=True),
    nn.Linear(128, 128),
    nn.ELU(inplace=True),
    nn.Linear(128, 2)).to(device)

# Set up surrogate object
surr = Surrogate(surrogate, num_features)

In [15]:
# Train the surrogate in three consecutive stages on the same model object,
# increasing the batch size each time. Inputs are the standardized features;
# targets are the XGBoost soft predictions.
train_set = (X_train_std, Y_train_surrogate)
val_set = (X_val_std, Y_val_surrogate)

for batch_size in (32, 512, 8192):
    surr.train(
        train_set,
        val_set,
        batch_size=batch_size,
        max_epochs=100,
        loss_fn=SoftCrossEntropyLoss(),
        validation_samples=50,
        validation_batch_size=10000,
        validation_seed=0,
        verbose=False,
    )

    # Report the smallest value currently held in surr.loss_list.
    # NOTE(review): whether loss_list is reset per train() call is not
    # visible here -- confirm against the Surrogate implementation.
    best_loss = min(surr.loss_list)
    print('Best loss = {:.4f}'.format(best_loss))

Best loss = 0.0716
Best loss = 0.0716
Best loss = 0.0716


In [16]:
# Move the surrogate to the CPU, switch it to eval mode, and save the whole
# module object (both calls return the module itself, so they can be chained).
torch.save(surrogate.cpu().eval(), '../models/bankruptcy_surrogate.pt')