In [212]:
import pandas as pd
import sklearn
import numpy as np
from scipy import optimize
from scipy.optimize import brute
from scipy.optimize import minimize
from scipy.optimize import minimize_scalar
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('SBAnational.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RevLineCr,LowDoc,ChgOffDate,DisbursementDate,DisbursementGross,BalanceGross,MIS_Status,ChgOffPrinGr,GrAppv,SBA_Appv
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,47711,FIFTH THIRD BANK,OH,451120,28-Feb-97,1997,...,N,Y,,28-Feb-99,"$60,000.00",$0.00,P I F,$0.00,"$60,000.00","$48,000.00"
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,46526,1ST SOURCE BANK,IN,722410,28-Feb-97,1997,...,N,Y,,31-May-97,"$40,000.00",$0.00,P I F,$0.00,"$40,000.00","$32,000.00"
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,47401,GRANT COUNTY STATE BANK,IN,621210,28-Feb-97,1997,...,N,N,,31-Dec-97,"$287,000.00",$0.00,P I F,$0.00,"$287,000.00","$215,250.00"
3,1000044001,"BIG BUCKS PAWN & JEWELRY, LLC",BROKEN ARROW,OK,74012,1ST NATL BK & TR CO OF BROKEN,OK,0,28-Feb-97,1997,...,N,Y,,30-Jun-97,"$35,000.00",$0.00,P I F,$0.00,"$35,000.00","$28,000.00"
4,1000054004,"ANASTASIA CONFECTIONS, INC.",ORLANDO,FL,32801,FLORIDA BUS. DEVEL CORP,FL,0,28-Feb-97,1997,...,N,N,,14-May-97,"$229,000.00",$0.00,P I F,$0.00,"$229,000.00","$229,000.00"


In [4]:
bad_columns = ['LoanNr_ChkDgt', 'NAICS', 'ApprovalDate', 'DisbursementDate', 'Name', 'FranchiseCode']
target = 'MIS_Status'
categorical = ['City', 'State', 'Zip', 'Bank', 'BankState', 'NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc']
ordinal = ['ApprovalFY', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob']
money_columns = ['DisbursementGross', 'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv']
date_col = ['ChgOffDate']

In [5]:
# drop bad columns
df = df.dropna()
df = df.drop(columns=bad_columns)

In [6]:
# convert date to ordinal days
import datetime as dt
for c in date_col:
    df[c] = pd.to_datetime(df[c]).map(dt.datetime.toordinal)

In [7]:
# encode labels
le = preprocessing.LabelEncoder()
df[target] = le.fit_transform(df[target].astype(str))
class_names = le.classes_
print(class_names)

['CHGOFF' 'P I F']


In [8]:
# process ordinal features
for col in ordinal:
    df[col] = pd.to_numeric(df['ApprovalFY'].replace('1976A','1976', regex=False).replace('\d+\-\w+\-\d+|,','', regex=True))
    df[col] = df[col].astype('int32')
    

In [9]:
# process money columns
for c in money_columns:
    df[c] = df[c].replace('\$|,','', regex=True).replace('\(','-', regex=True).replace('\)','', regex=True)
    df[c] = pd.to_numeric(df[c])

In [10]:
# process categorical features
cat_idxs = [df.columns.get_loc(c) for c in categorical if c in df]
print(cat_idxs)
categorical_names = {}
for c in categorical:
    le = preprocessing.LabelEncoder()
    df[c] = le.fit_transform(df[c])
    categorical_names[c] = le.classes_

[0, 1, 2, 3, 4, 8, 11, 12, 13]


In [11]:
Y = df[target]
X = df.drop(columns=[target])

# Split the data into train and test data:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [149]:
rf = RandomForestClassifier(max_depth=19, random_state=0, n_estimators=10)
rf.fit(X_train, Y_train)
sklearn.metrics.accuracy_score(Y_test, rf.predict(X_test))
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
out = rf.predict(X_test) 

print(confusion_matrix(Y_test,out))
print(classification_report(Y_test,out))
print(accuracy_score(Y_test, out))

[[31169    41]
 [  944    21]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     31210
           1       0.34      0.02      0.04       965

    accuracy                           0.97     32175
   macro avg       0.65      0.51      0.51     32175
weighted avg       0.95      0.97      0.96     32175

0.9693861693861694


Counterfactual loss function = $L(x,x^\prime,y^\prime,\lambda)=\lambda\cdot(\hat{f}(x^\prime)-y^\prime)^2+d(x,x^\prime)$

In [26]:
num_f=X_train.shape[1]
x = X_train.values[0].reshape(-1,20)
y = rf.predict(x)

In [14]:
from scipy.spatial import distance
def d(a,b):
    return distance.cityblock(a, b)

Sample a cf instance from the data distribution delimited by the (min, max) per feature boundaries

In [46]:
def cf_loss(xp, l, x, y):
    xp = xp.reshape(-1,20)
    y_c = rf.predict(xp)
    diff = y_c-abs(1-y)
    distance = d(xp, x)
    loss = l * (diff)**2 + distance
    return loss[0]

In [220]:
import random
import numpy
from scipy.spatial import distance

num_f=X_train.shape[1]

x = X_train.values[0].reshape(-1,num_f)
y = rf.predict(x)
t = 1-y
eps = 0.01

def d(a,b):
    return distance.cityblock(a, b)

def cf_loss(xp):
    xp = xp.reshape(-1,num_f)
    y_c = rf.predict(xp)
    diff = abs(y_c-t)
    distance = d(xp, x)
    ft = l * (diff)**2
    loss = ft + distance
    return loss[0]

b = []
for c in range(num_f):
    b.append((X_train.values[:,c].min(), X_train.values[:,c].max())) 

for m in numpy.arange(-4, 10):
    l = 10.0**m
    i = 0
    print('using lambda='+str(l))
    idx = random.randint(0, len(X_train.values)-1)
    x_sample = X_train.values[idx].reshape(-1, num_f)
    while abs(rf.predict(x_sample.reshape(-1,num_f)) - t) > 0.01 and i < 10:
        if rf.predict(x_sample.reshape(-1,num_f)) > 0.5:
          print('success for '+str(x_sample))
        cf_min = optimize.fmin(cf_loss, [x_sample])
        x_sample = cf_min
        cur_loss = cf_loss(x_sample)
        l += 10.0**(m-2)
        i += 1

    if rf.predict(x_sample.reshape(-1,num_f)) == t:
        print("original input:"+str(x))
        print("counterf input:"+str(x_sample))
        print("counterf output:"+str(rf.predict(x_sample.reshape(-1,num_f))))

using lambda=0.0001
Optimization terminated successfully.
         Current function value: 11236.317031
         Iterations: 1696
         Function evaluations: 2439
Optimization terminated successfully.
         Current function value: 169.987586
         Iterations: 3008
         Function evaluations: 3995
Optimization terminated successfully.
         Current function value: 26.278540
         Iterations: 1736
         Function evaluations: 2440
Optimization terminated successfully.
         Current function value: 3.261424
         Iterations: 2788
         Function evaluations: 3712
Optimization terminated successfully.
         Current function value: 2.295628
         Iterations: 1235
         Function evaluations: 1796
Optimization terminated successfully.
         Current function value: 0.395265
         Iterations: 1936
         Function evaluations: 2678
Optimization terminated successfully.
         Current function value: 0.090509
         Iterations: 1213
         Functi

Sample cf instances directly from the training data

In [227]:
import random
import numpy
from scipy.spatial import distance

num_f=X_train.shape[1]

x = X_train.values[0].reshape(-1,num_f)
y = rf.predict(x)
t = 1-y
eps = 0.01

def d(a,b):
    return distance.cityblock(a, b)

def cf_loss(idx):
    try:
        idx = idx[0]
    except:
        pass
    idx = int(idx)
    xp = X_train.values[idx].reshape(-1, num_f)
    y_c = rf.predict(xp)
    diff = y_c-t
    distance = d(xp, x)
    loss = l * (diff)**2 + distance
    return loss[0]

b = [(0, X_train.shape[0])]

for m in numpy.arange(-10, 10):
    l = 10.0**m
    i = 0
    print('using lambda='+str(l))
    idx_sample = random.randint(0, num_f-1)
    while abs(t-rf.predict(X_train.values[idx_sample].reshape(-1, num_f))) > eps and i < 10:
        cf_min = optimize.fmin(cf_loss, [idx_sample])
        idx_sample = int(cf_min)
        cur_loss = cf_loss(idx_sample)
        l += 10.0**(m-2)
        i += 1

    if rf.predict(x_sample.reshape(-1,num_f)) == t:
        print("original input:"+str(x))
        print("counterf input:"+str(x_sample))
        print("counterf output:"+str(rf.predict(x_sample.reshape(-1,num_f))))

using lambda=1e-10
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function evaluations: 44
Optimization terminated successfully.
         Current function value: 86790.000000
         Iterations: 15
         Function eva