In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from keras.metrics import AUC
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Function that finds the FDR at 3%
def findFDR(y_label, y_pred):
    data = pd.DataFrame({'Actual':y_label, 'PredictedProb':y_pred})
    topRows = int(round(len(data)*0.03))
    temp0 = data.sort_values('PredictedProb',ascending=False)
    temp1 = temp0.head(topRows)
    numbads = sum(temp0.loc[:, 'Actual'])
    needed1 = temp1.loc[:,'Actual']
    FDR1 = sum(needed1)/numbads * 100
    print('FDR at 3% rate: ', round(FDR1, 2))

In [3]:
data = pd.read_csv('application data.csv', parse_dates = ['date'])

In [4]:
# Sets OOT data
oot = data.loc[data['date'] > '2016-10-31']
traintest = data.loc[data['date'] <= '2016-10-31']
X = traintest.drop('date', axis = 1)
X = X.drop('fraud_label', axis = 1)
y = traintest['fraud_label']
ootX = oot.drop('date', axis = 1)
ootX = ootX.drop('fraud_label', axis = 1)
ootY = oot['fraud_label']

In [5]:
# Normalizes (z-scale) the paratio and rm values
mms = MinMaxScaler()

X = mms.fit_transform(X)
ootX = mms.fit_transform(ootX)

# Sampling Data

In [6]:
# Splits data into 75/25 train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [7]:
# Creates different undersampling ratios
rus = RandomUnderSampler(sampling_strategy = 1)
X_train_under50, y_train_under50 = rus.fit_sample(X_train, y_train)

rus = RandomUnderSampler(sampling_strategy = (2/3))
X_train_under40, y_train_under40 = rus.fit_sample(X_train, y_train)

rus = RandomUnderSampler(sampling_strategy = (3/7))
X_train_under30, y_train_under30 = rus.fit_sample(X_train, y_train)

In [8]:
# Creates different oversampling ratios
oversample = SMOTE(sampling_strategy = 1)
X_train_over50, y_train_over50 = oversample.fit_resample(X_train, y_train)

oversample = SMOTE(sampling_strategy = (2/3))
X_train_over40, y_train_over40 = oversample.fit_resample(X_train, y_train)

oversample = SMOTE(sampling_strategy = (3/7))
X_train_over30, y_train_over30 = oversample.fit_resample(X_train, y_train)

# Logistic Regression

In [14]:
model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
y_pred = model.predict_proba(X_train)
findFDR(y_train, y_pred[:,1])

FDR at 3% rate:  52.61


In [16]:
y_pred = model.predict_proba(X_test)
findFDR(y_test, y_pred[:,1])

FDR at 3% rate:  52.23


In [17]:
y_pred = model.predict_proba(ootX)
findFDR(ootY, y_pred[:,1])

FDR at 3% rate:  50.29


# Neural Networks

In [26]:
# Defines a simple Neural Network model function 
def runNN(X, y, nodes, optimizer, epochs):
    # first neural network with keras make predictions

    # define the keras model
    model = Sequential()
    model.add(Dense(nodes, input_dim=30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the keras model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[AUC()])

    # fit the keras model on the dataset
    model.fit(X, y, epochs=epochs, batch_size=10)

    y_pred = model.predict(X_train)
    print('Train')
    findFDR(y_train, y_pred.reshape(-1))

    y_pred = model.predict(X_test)
    print('Test')
    findFDR(y_test, y_pred.reshape(-1))

    y_pred = model.predict(ootX)
    print('OOT')
    findFDR(ootY, y_pred.reshape(-1))

In [None]:
data.columns

In [39]:
runNN(X_train_over30, y_train_over30, 10, 'sgd', 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train
FDR at 3% rate:  54.47
Test
FDR at 3% rate:  54.01
OOT
FDR at 3% rate:  52.05


In [40]:
runNN(X_train_over30, y_train_over30, 30, 'sgd', 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train
FDR at 3% rate:  55.25
Test
FDR at 3% rate:  54.53
OOT
FDR at 3% rate:  52.31


In [41]:
runNN(X_train_over30, y_train_over30, 30, 'sgd', 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train
FDR at 3% rate:  55.25
Test
FDR at 3% rate:  54.53
OOT
FDR at 3% rate:  52.26


In [42]:
runNN(X_train_over30, y_train_over30, 50, 'adam', 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train
FDR at 3% rate:  55.59
Test
FDR at 3% rate:  54.73
OOT
FDR at 3% rate:  53.19


In [43]:
runNN(X_train_over30, y_train_over30, 30, 'adam', 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train
FDR at 3% rate:  55.59
Test
FDR at 3% rate:  55.09
OOT
FDR at 3% rate:  52.72


In [44]:
runNN(X_train_over30, y_train_over30, 50, 'adam', 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train
FDR at 3% rate:  55.72
Test
FDR at 3% rate:  55.12
OOT
FDR at 3% rate:  53.02


In [52]:
# Last Run
runNN(X_train_over30, y_train_over30, 50, 'adam', 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train
FDR at 3% rate:  55.61
Test
FDR at 3% rate:  54.76
OOT
FDR at 3% rate:  53.23


# Random Forest

In [9]:
# Defines a simple Random Forest 
def RunForest(trees,depth, min_sample, criterion, X, y): 
    model = RandomForestClassifier(n_estimators = trees,
                                   min_samples_split = min_sample, 
                                   max_depth = depth,
                                   class_weight = {1:.9, 0:.1}
                                  ).fit(
        X, y)
    y_pred = model.predict_proba(X_train)
    print("Train")
    findFDR(y_train, y_pred[:,1])
    y_pred = model.predict_proba(X_test)
    print('Test')
    findFDR(y_test, y_pred[:,1])
    y_pred = model.predict_proba(ootX)
    print('OOT')
    findFDR(ootY, y_pred[:,1])

In [10]:
RunForest(100, 20, 300, 'gini', X_train, y_train)

Train
FDR at 3% rate:  55.54
Test
FDR at 3% rate:  55.59
OOT
FDR at 3% rate:  53.44


In [46]:
RunForest(150, 10, 200, 'entorpy', X_train, y_train)

Train
FDR at 3% rate:  55.51
Test
FDR at 3% rate:  54.76
OOT
FDR at 3% rate:  53.35


In [47]:
RunForest(150, 25, 300, 'entorpy', X_train, y_train)

Train
FDR at 3% rate:  55.73
Test
FDR at 3% rate:  55.06
OOT
FDR at 3% rate:  53.48


In [48]:
RunForest(150, 30, 400, 'entorpy', X_train, y_train)

Train
FDR at 3% rate:  55.67
Test
FDR at 3% rate:  55.12
OOT
FDR at 3% rate:  53.52


In [49]:
RunForest(100, 20, 400, 'gini', X_train, y_train)

Train
FDR at 3% rate:  55.71
Test
FDR at 3% rate:  55.12
OOT
FDR at 3% rate:  53.52


In [50]:
RunForest(50, 25, 300, 'gini', X_train, y_train)

Train
FDR at 3% rate:  55.65
Test
FDR at 3% rate:  55.06
OOT
FDR at 3% rate:  53.44


In [51]:
RunForest(50, 25, 300, 'gini', X_train, y_train)

Train
FDR at 3% rate:  55.69
Test
FDR at 3% rate:  54.99
OOT
FDR at 3% rate:  53.56


# Boosted Trees

In [None]:
# XGBoost

In [None]:
xgbs = XGBClassifier(n_jobs =  8, eta = 0.05, 
                     max_depth = 5, 
                     n_estimators = 600, subsample = 1,
                     eval_metric = 'auc')
model = xgbs.fit(X_train_under50, y_train_under50)

In [None]:
y_pred = model.predict_proba(X_train)
print('Train')
findFDR(y_train, y_pred[:,1])

In [None]:
y_pred = model.predict_proba(X_test)
print('Test')
findFDR(y_test, y_pred[:,1])

In [None]:
y_pred = model.predict_proba(ootX)
print('OOT')
findFDR(ootY, y_pred[:,1])