In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.metrics import AUC
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [6]:
def findFDR(y_label, y_pred):

    data = pd.DataFrame({'Actual':y_label, 'PredictedProb':y_pred})
    topRows = int(round(len(data)*0.03))
    temp0 = data.sort_values('PredictedProb',ascending=False)
    temp1 = temp0.head(topRows)
    numbads = sum(temp0.loc[:, 'Actual'])
    needed1 = temp1.loc[:,'Actual']
    FDR1 = sum(needed1)/numbads * 100
    print('FDR at 3% rate: ', round(FDR1, 2))

In [7]:
data = pd.read_csv('CreditCardFraudDataVFinal.csv', parse_dates = ['Date'])

In [8]:
oot = data.loc[data['Date'] > '2010-10-31']
traintest = data.loc[(data['Date'] <= '2010-10-31') & (data['Date'] >= '2010-01-14')]
X = traintest.drop(['Recnum', 'Date', 'Fraud', 'Cardnum', 'Merchnum'], axis = 1)
y = traintest['Fraud']
ootX = oot.drop(['Recnum', 'Date', 'Fraud', 'Cardnum', 'Merchnum'], axis = 1)
ootY = oot['Fraud']

In [None]:
# Normalizes (z-scale) the paratio and rm values
mms = MinMaxScaler()

X = mms.fit_transform(X)
ootX = mms.fit_transform(ootX)

# Sampling Data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver = 'liblinear').fit(X_train, y_train)



In [11]:
y_pred = model.predict_proba(X_train)
findFDR(y_train, y_pred[:,1])

FDR at 3% rate:  39.76


In [12]:
y_pred = model.predict_proba(X_test)
findFDR(y_test, y_pred[:,1])

FDR at 3% rate:  45.45


In [13]:
y_pred = model.predict_proba(ootX)
findFDR(ootY, y_pred[:,1])

FDR at 3% rate:  18.99


# Random Forest

In [14]:
def RunForest(trees,depth, min_sample, criterion, X, y): 
    model = RandomForestClassifier(n_estimators = trees,
                                   min_samples_split = min_sample, 
                                   max_depth = depth,
                                  ).fit(
        X, y)
    y_pred = model.predict_proba(X_train)
    print("Train")
    findFDR(y_train, y_pred[:,1])
    y_pred = model.predict_proba(X_test)
    print('Test')
    findFDR(y_test, y_pred[:,1])
    y_pred = model.predict_proba(ootX)
    print('OOT')
    findFDR(ootY, y_pred[:,1])

In [None]:
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    RunForest(100, 15, 200, 'gini', X_train, y_train)

Train
FDR at 3% rate:  96.5
Test
FDR at 3% rate:  84.94
OOT
FDR at 3% rate:  49.16
Train
FDR at 3% rate:  95.97
Test
FDR at 3% rate:  86.87
OOT
FDR at 3% rate:  50.28
Train
FDR at 3% rate:  96.11
Test
FDR at 3% rate:  84.51
OOT
FDR at 3% rate:  48.6
Train
FDR at 3% rate:  94.31
Test
FDR at 3% rate:  87.43
OOT
FDR at 3% rate:  47.49
Train
FDR at 3% rate:  94.7
Test
FDR at 3% rate:  88.89
OOT
FDR at 3% rate:  48.6


In [None]:
RunForest(150, 10, 200, 'entorpy', X_train, y_train)

In [None]:
RunForest(150, 25, 300, 'entorpy', X_train, y_train)

In [None]:
RunForest(150, 30, 400, 'entorpy', X_train, y_train)

In [None]:
RunForest(100, 20, 400, 'gini', X_train, y_train)

In [None]:
RunForest(50, 25, 300, 'gini', X_train, y_train)

In [None]:
RunForest(50, 25, 300, 'gini', X_train, y_train)

# Boosted Trees

In [None]:
# XGBoost

In [None]:
xgbs = XGBClassifier(n_jobs =  8, eta = 0.02, 
                     max_depth = 3, 
                     n_estimators = 400, subsample = 1,
                     eval_metric = 'auc')
model = xgbs.fit(X_train, y_train)

In [None]:
y_pred = model.predict_proba(X_train)
print('Train')
findFDR(y_train, y_pred[:,1])

In [None]:
y_pred = model.predict_proba(X_test)
print('Test')
findFDR(y_test, y_pred[:,1])

In [None]:
y_pred = model.predict_proba(ootX)
print('OOT')
findFDR(ootY, y_pred[:,1])