In [31]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
import scipy
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
%matplotlib inline
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams.update({'font.size': 22})

# Background 

This is a credit card fraud dataset. The data has already been transformed into Principal Components. The datasets contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where there are 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

The goal of this assignment is to design an algorithm that can correctly identify the fraudulent cases. 
The dataset can be found here: https://www.kaggle.com/mlg-ulb/creditcardfraud

In [2]:
fraud_data = pd.read_csv('creditcard.csv')

In [3]:
# Look at what the shape of the data looks like 
fraud_data.shape

(284807, 31)

In [4]:
# Look at the standard distribution of the data.
fraud_data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
fraud_data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

- The dataset doesn't possess any Nan since PCA was performed so it's ready to use. 
- Dataset is imbalanced since only .172% of transactions logged are fraudulent.
- In order to remedy the Imbalanced Class data, I will downsample the majority to improve the validity of the model.

In [3]:
# Use downsampling of the minority class to address the class imbalance. 
from sklearn.utils import resample

# Separate the majority and minority Class.
fd_majority = fraud_data[fraud_data.Class==0]
fd_minority = fraud_data[fraud_data.Class==1]


# Downsample majority class 
fd_maj_downsampled = resample(fd_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(fd_minority))    # to match minority class
                                
# Combine minority class with downsampled majority class
fd_downsampled = pd.concat([fd_maj_downsampled, fd_minority])


# Model 1: Naive Bayes

In [4]:
# Pepare to train/test split the resampled data
from sklearn.cross_validation import train_test_split

X_resampled = fd_downsampled.loc[:, fd_downsampled.columns != 'Class']
Y_resampled = fd_downsampled.loc[:, fd_downsampled.columns == 'Class']

# Used the test/train/split method with 30% holdout.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, Y_resampled, test_size = 0.3) 

In [8]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.# Our d 
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(X_train, y_train)

# Classify, storing the result in a new variable.
y_pred_train = bnb.predict(X_train)

# Display our results.
print(str(100*bnb.fit(X_train, y_train).score(X_test, y_test)) + '%') 

92.9054054054054%


In [9]:
# Lets check for overfitting now.
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, X_train, y_train, cv=10)


array([0.91428571, 0.88571429, 0.85714286, 0.92753623, 0.91304348,
       0.82352941, 0.89705882, 0.91176471, 0.92647059, 0.89705882])

In [13]:
# Checking the test set now.
cross_val_score(bnb, X_test, y_test, cv=10)

array([0.90322581, 0.86666667, 0.8       , 0.96666667, 1.        ,
       0.89655172, 0.86206897, 0.96551724, 0.96551724, 0.86206897])

In [21]:
# Results are consistent but lets check one last thing
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_pred_train)

array([[347,   4],
       [ 59, 278]])

# Analysis

Naive Bayes is the first model that I used because it's simple to use and is computationally inexpensive due to the level of math involved. However, NB has prerequesites before it could be used, independence and class balance. Both of these weren't an issue since PCA causes these new features to be independent and have low multicollinearity. I used downsampling in order to avoid the massive class imbalance. I thought that I had solved it since my model had an accuracy of 89.5%. I had used cross validation to make sure the variables weren't overfitting my model which worked. However, the model's accuracy was still suspicious so I used the confusion matrix to see if my model was suffering from any false positives or false negatives. My model had 4 false positives (.58%) which means that it identified a charge as fraudulent when it wasn't and failed to identify 59 charges (8.57%) that were fraudulent.  



# Model 2: Random Forest Classifier 

In [30]:
# Prepare to use the random forest and GridSearchcV
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

rfc = ensemble.RandomForestClassifier()

#Imbalanced data
X = fraud_data.drop('Class', 1)
Y = fraud_data['Class']

# Create rfc parameters for GS-CV
param_grid = {'n_estimators': [10, 25, 50, 75, 100, 200, 500, 1000],
             'max_features': [2, 4, 6, 8],
             'max_depth': [2, 4, 6, 8]}

# Set up GS-CV
grid = GridSearchCV(rfc, param_grid, cv=5, verbose=3, n_jobs=-1)

grid.fit(X, Y)

# Show the best parameter and best score for unfiltered
print('Best Parameter for data:', grid.best_params_)
print('\nBest Score for data:', grid.best_score_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
[CV] max_depth=2, max_features=2, n_estimators=10 ....................
[CV] max_depth=2, max_features=2, n_estimators=10 ....................
[CV] max_depth=2, max_features=2, n_estimators=10 ....................
[CV] max_depth=2, max_features=2, n_estimators=10 ....................
[CV] max_depth=2, max_features=2, n_estimators=10 ....................
[CV] max_depth=2, max_features=2, n_estimators=25 ....................
[CV] max_depth=2, max_features=2, n_estimators=25 ....................
[CV] max_depth=2, max_features=2, n_estimators=25 ....................
[CV]  max_depth=2, max_features=2, n_estimators=10, score=0.9987711105649381, total=   2.5s
[CV] max_depth=2, max_features=2, n_estimators=25 ....................
[CV]  max_depth=2, max_features=2, n_estimators=10, score=0.9989642217618764, total=   2.5s
[CV] max_depth=2, max_features=2, n_estimators=25 ....................
[CV]  max_depth=2, max_features=2, n_estima

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   29.9s


[CV]  max_depth=2, max_features=2, n_estimators=75, score=0.9989642217618764, total=  20.2s
[CV] max_depth=2, max_features=2, n_estimators=100 ...................
[CV]  max_depth=2, max_features=2, n_estimators=75, score=0.9985428626604168, total=  20.3s
[CV] max_depth=2, max_features=2, n_estimators=200 ...................
[CV]  max_depth=2, max_features=2, n_estimators=75, score=0.9987886448622741, total=  21.5s
[CV] max_depth=2, max_features=2, n_estimators=200 ...................
[CV]  max_depth=2, max_features=2, n_estimators=75, score=0.9982795245869981, total=  21.5s
[CV] max_depth=2, max_features=2, n_estimators=200 ...................
[CV]  max_depth=2, max_features=2, n_estimators=100, score=0.9985779993679997, total=  27.8s
[CV] max_depth=2, max_features=2, n_estimators=200 ...................
[CV]  max_depth=2, max_features=2, n_estimators=100, score=0.9988764439450862, total=  27.1s
[CV] max_depth=2, max_features=2, n_estimators=200 ...................
[CV]  max_depth=2, m

[CV] max_depth=2, max_features=4, n_estimators=500 ...................
[CV]  max_depth=2, max_features=4, n_estimators=200, score=0.9989642035778866, total= 1.4min
[CV] max_depth=2, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=2, max_features=4, n_estimators=200, score=0.9991924299081828, total= 1.4min
[CV] max_depth=2, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=2, max_features=4, n_estimators=200, score=0.9989115359632029, total= 1.4min
[CV] max_depth=2, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=2, max_features=4, n_estimators=500, score=0.9989817773252344, total= 3.4min
[CV] max_depth=2, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=2, max_features=4, n_estimators=500, score=0.9992451107756047, total= 3.3min
[CV] max_depth=2, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=2, max_features=4, n_estimators=500, score=0.9989642035778866, total= 3.3min
[CV] max_depth=2

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 21.7min


[CV]  max_depth=2, max_features=6, n_estimators=500, score=0.9988764242200804, total= 4.9min
[CV] max_depth=2, max_features=8, n_estimators=10 ....................
[CV]  max_depth=2, max_features=6, n_estimators=500, score=0.9992626533944278, total= 4.9min
[CV] max_depth=2, max_features=8, n_estimators=10 ....................
[CV]  max_depth=2, max_features=6, n_estimators=500, score=0.9989817594494479, total= 4.9min
[CV] max_depth=2, max_features=8, n_estimators=10 ....................
[CV]  max_depth=2, max_features=8, n_estimators=10, score=0.9989817773252344, total=   7.7s
[CV] max_depth=2, max_features=8, n_estimators=10 ....................
[CV]  max_depth=2, max_features=8, n_estimators=10, score=0.9995084442259752, total=   7.6s
[CV] max_depth=2, max_features=8, n_estimators=10 ....................
[CV]  max_depth=2, max_features=8, n_estimators=10, score=0.9988588683485191, total=   7.4s
[CV] max_depth=2, max_features=8, n_estimators=25 ....................
[CV]  max_depth=2, 

[CV] max_depth=4, max_features=2, n_estimators=50 ....................
[CV]  max_depth=4, max_features=2, n_estimators=25, score=0.9992275416513052, total=  10.4s
[CV] max_depth=4, max_features=2, n_estimators=50 ....................
[CV]  max_depth=4, max_features=2, n_estimators=25, score=0.9990344270641316, total=  10.4s
[CV] max_depth=4, max_features=2, n_estimators=50 ....................
[CV]  max_depth=4, max_features=2, n_estimators=50, score=0.9989817773252344, total=  20.6s
[CV] max_depth=4, max_features=2, n_estimators=50 ....................
[CV]  max_depth=4, max_features=2, n_estimators=50, score=0.9991924440855307, total=  20.5s
[CV] max_depth=4, max_features=2, n_estimators=50 ....................
[CV]  max_depth=4, max_features=2, n_estimators=50, score=0.9989290918347641, total=  20.8s
[CV] max_depth=4, max_features=2, n_estimators=75 ....................
[CV]  max_depth=4, max_features=2, n_estimators=50, score=0.999280209265989, total=  20.8s
[CV] max_depth=4, max_f

[CV]  max_depth=4, max_features=4, n_estimators=75, score=0.999420656238479, total=  56.8s
[CV] max_depth=4, max_features=4, n_estimators=100 ...................
[CV]  max_depth=4, max_features=4, n_estimators=75, score=0.9990695388072541, total=  57.0s
[CV] max_depth=4, max_features=4, n_estimators=100 ...................
[CV]  max_depth=4, max_features=2, n_estimators=1000, score=0.9989993328885924, total= 7.2min
[CV] max_depth=4, max_features=4, n_estimators=100 ...................
[CV]  max_depth=4, max_features=2, n_estimators=1000, score=0.9992099996488887, total= 7.2min
[CV] max_depth=4, max_features=4, n_estimators=100 ...................
[CV]  max_depth=4, max_features=4, n_estimators=100, score=0.9990695551420246, total= 1.3min
[CV] max_depth=4, max_features=4, n_estimators=200 ...................
[CV]  max_depth=4, max_features=4, n_estimators=100, score=0.999420666409185, total= 1.4min
[CV] max_depth=4, max_features=4, n_estimators=200 ...................
[CV]  max_depth=4,

[CV] max_depth=4, max_features=6, n_estimators=500 ...................
[CV]  max_depth=4, max_features=6, n_estimators=200, score=0.9995084442259752, total= 3.6min
[CV] max_depth=4, max_features=6, n_estimators=500 ...................
[CV]  max_depth=4, max_features=6, n_estimators=200, score=0.9991222064219378, total= 3.6min
[CV] max_depth=4, max_features=6, n_estimators=500 ...................
[CV]  max_depth=4, max_features=4, n_estimators=1000, score=0.9994733238531627, total=12.5min
[CV] max_depth=4, max_features=6, n_estimators=500 ...................
[CV]  max_depth=4, max_features=4, n_estimators=1000, score=0.9991046505503766, total=12.5min
[CV] max_depth=4, max_features=6, n_estimators=1000 ..................
[CV]  max_depth=4, max_features=6, n_estimators=200, score=0.9995084355962852, total= 3.6min
[CV] max_depth=4, max_features=6, n_estimators=1000 ..................
[CV]  max_depth=4, max_features=6, n_estimators=200, score=0.9991748740366215, total= 3.6min
[CV] max_depth

[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 81.0min


[CV]  max_depth=4, max_features=6, n_estimators=500, score=0.9991222064219378, total= 9.0min
[CV] max_depth=4, max_features=8, n_estimators=10 ....................
[CV]  max_depth=4, max_features=6, n_estimators=500, score=0.9995084355962852, total= 8.9min
[CV] max_depth=4, max_features=8, n_estimators=10 ....................
[CV]  max_depth=4, max_features=8, n_estimators=10, score=0.9989817773252344, total=  14.8s
[CV] max_depth=4, max_features=8, n_estimators=10 ....................
[CV]  max_depth=4, max_features=6, n_estimators=500, score=0.9991748740366215, total= 9.0min
[CV] max_depth=4, max_features=8, n_estimators=10 ....................
[CV]  max_depth=4, max_features=8, n_estimators=10, score=0.9995435553526912, total=  15.1s
[CV] max_depth=4, max_features=8, n_estimators=10 ....................
[CV]  max_depth=4, max_features=8, n_estimators=10, score=0.9991397622934991, total=  14.9s
[CV] max_depth=4, max_features=8, n_estimators=25 ....................
[CV]  max_depth=4, 

[CV] max_depth=6, max_features=2, n_estimators=50 ....................
[CV]  max_depth=6, max_features=2, n_estimators=25, score=0.9995084355962852, total=  15.1s
[CV] max_depth=6, max_features=2, n_estimators=50 ....................
[CV]  max_depth=6, max_features=2, n_estimators=25, score=0.9991222064219378, total=  14.8s
[CV] max_depth=6, max_features=2, n_estimators=50 ....................
[CV]  max_depth=6, max_features=2, n_estimators=50, score=0.9990344440153085, total=  30.0s
[CV] max_depth=6, max_features=2, n_estimators=50 ....................
[CV]  max_depth=6, max_features=2, n_estimators=50, score=0.9993679997191109, total=  29.5s
[CV] max_depth=6, max_features=2, n_estimators=50 ....................
[CV]  max_depth=6, max_features=2, n_estimators=50, score=0.9988939800916417, total=  29.6s
[CV] max_depth=6, max_features=2, n_estimators=75 ....................
[CV]  max_depth=6, max_features=2, n_estimators=50, score=0.9992450975228665, total=  29.2s
[CV] max_depth=6, max_

[CV]  max_depth=6, max_features=4, n_estimators=75, score=0.23519188230750324, total= 1.4min
[CV] max_depth=6, max_features=4, n_estimators=100 ...................
[CV]  max_depth=6, max_features=4, n_estimators=75, score=0.9995962220427653, total= 1.4min
[CV] max_depth=6, max_features=4, n_estimators=100 ...................
[CV]  max_depth=6, max_features=4, n_estimators=75, score=0.9991397622934991, total= 1.4min
[CV] max_depth=6, max_features=4, n_estimators=100 ...................
[CV]  max_depth=6, max_features=4, n_estimators=75, score=0.9995786590825302, total= 1.4min
[CV] max_depth=6, max_features=4, n_estimators=100 ...................
[CV]  max_depth=6, max_features=4, n_estimators=75, score=0.999280209265989, total= 1.4min
[CV] max_depth=6, max_features=4, n_estimators=200 ...................
[CV]  max_depth=6, max_features=4, n_estimators=100, score=0.9649239844106597, total= 2.0min
[CV] max_depth=6, max_features=4, n_estimators=200 ...................
[CV]  max_depth=6, ma

[CV] max_depth=6, max_features=6, n_estimators=500 ...................
[CV]  max_depth=6, max_features=6, n_estimators=200, score=0.9995786664794073, total= 5.3min
[CV] max_depth=6, max_features=6, n_estimators=500 ...................
[CV]  max_depth=6, max_features=6, n_estimators=200, score=0.9991573181650603, total= 5.3min
[CV] max_depth=6, max_features=6, n_estimators=500 ...................
[CV]  max_depth=6, max_features=6, n_estimators=200, score=0.9996313266972139, total= 5.3min
[CV] max_depth=6, max_features=6, n_estimators=500 ...................
[CV]  max_depth=6, max_features=4, n_estimators=1000, score=0.9996137708256526, total=18.1min
[CV] max_depth=6, max_features=6, n_estimators=1000 ..................
[CV]  max_depth=6, max_features=4, n_estimators=1000, score=0.9992450975228665, total=18.2min
[CV] max_depth=6, max_features=6, n_estimators=1000 ..................
[CV]  max_depth=6, max_features=6, n_estimators=200, score=0.9993679886237953, total= 5.3min
[CV] max_depth

[CV] max_depth=8, max_features=2, n_estimators=10 ....................
[CV]  max_depth=8, max_features=2, n_estimators=10, score=0.9982268881008391, total=   7.9s
[CV] max_depth=8, max_features=2, n_estimators=10 ....................
[CV]  max_depth=8, max_features=2, n_estimators=10, score=0.9995435553526912, total= 6.4min
[CV] max_depth=8, max_features=2, n_estimators=10 ....................
[CV]  max_depth=8, max_features=2, n_estimators=10, score=0.9990168711925703, total= 6.4min
[CV] max_depth=8, max_features=2, n_estimators=10 ....................
[CV]  max_depth=8, max_features=2, n_estimators=10, score=0.9993855444953564, total=   8.9s
[CV] max_depth=8, max_features=2, n_estimators=25 ....................
[CV]  max_depth=8, max_features=2, n_estimators=10, score=0.9991748740366215, total=   8.4s
[CV] max_depth=8, max_features=2, n_estimators=25 ....................
[CV]  max_depth=6, max_features=8, n_estimators=500, score=0.9993855444953564, total=27.4min
[CV] max_depth=8, max

[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 229.3min


[CV]  max_depth=8, max_features=2, n_estimators=100, score=0.9994557775359011, total= 1.4min
[CV] max_depth=8, max_features=2, n_estimators=100 ...................
[CV]  max_depth=8, max_features=2, n_estimators=100, score=0.9990870946788153, total= 1.4min
[CV] max_depth=8, max_features=2, n_estimators=200 ...................
[CV]  max_depth=8, max_features=2, n_estimators=100, score=0.999490879724724, total= 1.4min
[CV] max_depth=8, max_features=2, n_estimators=200 ...................
[CV]  max_depth=8, max_features=2, n_estimators=100, score=0.9991748740366215, total= 1.4min
[CV] max_depth=8, max_features=2, n_estimators=200 ...................
[CV]  max_depth=8, max_features=2, n_estimators=200, score=0.9990695551420246, total= 2.7min
[CV] max_depth=8, max_features=2, n_estimators=200 ...................
[CV]  max_depth=8, max_features=2, n_estimators=200, score=0.9994557775359011, total= 2.7min
[CV] max_depth=8, max_features=2, n_estimators=200 ...................
[CV]  max_depth=8

[CV]  max_depth=8, max_features=4, n_estimators=200, score=0.9995435553526912, total= 4.9min
[CV] max_depth=8, max_features=4, n_estimators=500 ...................
[CV]  max_depth=8, max_features=4, n_estimators=200, score=0.9991573181650603, total= 4.9min
[CV] max_depth=8, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=8, max_features=4, n_estimators=200, score=0.9996664384403364, total= 4.9min
[CV] max_depth=8, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=8, max_features=4, n_estimators=200, score=0.9993855444953564, total= 4.9min
[CV] max_depth=8, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=8, max_features=4, n_estimators=500, score=0.02763245672553632, total=12.2min
[CV] max_depth=8, max_features=4, n_estimators=1000 ..................
[CV]  max_depth=8, max_features=4, n_estimators=500, score=0.9995611109160493, total=12.3min
[CV] max_depth=8, max_features=4, n_estimators=1000 ..................
[CV]  max_depth

[CV]  max_depth=8, max_features=8, n_estimators=10, score=0.9994733330992591, total=  28.5s
[CV] max_depth=8, max_features=8, n_estimators=10 ....................
[CV]  max_depth=8, max_features=8, n_estimators=10, score=0.9991924299081828, total=  28.8s
[CV] max_depth=8, max_features=8, n_estimators=25 ....................
[CV]  max_depth=8, max_features=8, n_estimators=10, score=0.9996488825687752, total=  28.6s
[CV] max_depth=8, max_features=8, n_estimators=25 ....................
[CV]  max_depth=8, max_features=8, n_estimators=10, score=0.9994382121100402, total=  28.5s
[CV] max_depth=8, max_features=8, n_estimators=25 ....................
[CV]  max_depth=8, max_features=8, n_estimators=25, score=0.005828447034865349, total= 1.2min
[CV] max_depth=8, max_features=8, n_estimators=25 ....................
[CV]  max_depth=8, max_features=8, n_estimators=25, score=0.9995962220427653, total= 1.2min
[CV] max_depth=8, max_features=8, n_estimators=25 ....................
[CV]  max_depth=8, m

[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed: 401.5min finished


Best Parameter for data: {'max_depth': 4, 'max_features': 8, 'n_estimators': 500}

Best Score for data: 0.9993293704157552


In [32]:
# Show the best parameter and best score for unfiltered since it took so long/
print('Best Parameter for data:', grid.best_params_)
print('Best Score for data:', grid.best_score_)

Best Parameter for data: {'max_depth': 4, 'max_features': 8, 'n_estimators': 500}
Best Score for data: 0.9993293704157552


In [17]:
# # Cross validate score 
# #Imbalanced data
# X = fraud_data.drop('Class', 1)
# Y = fraud_data['Class']

# # Use the best parameters from previous GS-CV
# # Took way too long so I don't want to do it again.
# rfc = ensemble.RandomForestClassifier(n_estimators=500, 
#                                          max_features=8, max_depth=4)

# # use cross val score
# cross_val_score(rfc, X, Y, cv=5)


In [15]:
# Cross validate sample set data with best parameters
X_down = fd_downsampled.drop('Class', 1)
Y_down = fd_downsampled['Class']

# Use the best parameters from previous GS-CV
# Took way too long so I don't want to do it again.
new_rfc = ensemble.RandomForestClassifier(n_estimators=500, 
                                         max_features=8, max_depth=4)

# use cross val score
cross_val_score(new_rfc, X_down, Y_down, cv=5)

array([0.92424242, 0.93434343, 0.91326531, 0.93877551, 0.91326531])

# Analyis

The random forest model for predicting fraud was extremely accurate. I could see that even my test set did pretty well using the best parameters of the raw data. The advantages of using a random forest model is that there are a lot of parameters to choose and tune which can increase the accuracy but at the same time it's the downfall of this model. It took me around 7 hours to have the results returned to me. Although the accuracy of the model is high, the fact that it's a black box model (meaning that I don't know how the model achieved it's accuracy) and it's computational time makes this model subpar.  

# Model 3: Logistic Regression

- Although there are more than 284,000 rows, I am not going to use a ridge regularization which remedies that since I'm going to use my test model to run it through the logistic regression. 
- Otherwise I would have used the ridge regularization to keep the coefficients of the logistic regression from overfitting. 
- The training model only has 984 rows so it doesn't need to penalize the coefficients.

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Use GS-CV to see which alpha level is best.
parameters = {
             'C':[0.00001,0.0001,0.001,0.01,0.1,1,10,100,10000,100000,100000000]}

lr_grid = GridSearchCV(lr, parameters, cv=5, verbose=1)

#Fit the logistic regression 
lr_grid.fit(X_down, Y_down)

#return best parameters and best score
print('Best parameters:')
print(lr_grid.best_params_)
print('Best Score:')
print(lr_grid.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Best parameters:
{'C': 0.01}
Best Score:
0.9136178861788617


[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    0.5s finished


In [19]:
# Implement parameter 
lr = LogisticRegression(C=.01)

# Cross Validate the lr model. 
cross_val_score(lr, X_down, Y_down, cv=5)

array([0.95959596, 0.93434343, 0.91326531, 0.91836735, 0.84183673])

# Logistic Regression Write Up

The logistic regression ran way faster than the random forest, but that's also because this time the GridSearch had way less parameters to run through. The accuracy of this model is 92.37% which is less accurate than the random forest model, but more accurate than the BNB model. 

The reason I used a logistic regression model instead of a linear regression model is because the data I'm trying to predict is categorical rather than continuous. 

# Model 4:  Support Vector Classifier

In [26]:
from sklearn.svm import SVC

svm = SVC()

# new parameters for this model
svc_params = [{'C': [.00001,.0001,.001,.01,.1,1,
                    10,100,1000,10000],
              'gamma': [.00001,.0001,.001,.01,.1,1,10,25,100]}]

# setting up the grid
svc_grid = GridSearchCV(svm, svc_params, cv=5, verbose=1, n_jobs=-1)

#Fit the grid
svc_grid.fit(X_down, Y_down)

#return best parameters and best score
print('Best parameters:')
print(svc_grid.best_params_)
print('Best Score:')
print(svc_grid.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    3.6s


Best parameters:
{'C': 1e-05, 'gamma': 1e-05}
Best Score:
0.5010162601626016


[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    4.7s finished


In [28]:
# Implement the parameters parameter 
svm = SVC(C=1e-05, gamma=1e-05)

# Cross Validate the svc model. 
cross_val_score(svm, X_down, Y_down, cv=5)

array([0.49494949, 0.5       , 0.51020408, 0.5       , 0.5       ])

# Analysis

I am surprised by how bad the SVC performed since it was only able to get it half right. SVM should have tremendous accuracy due to the kernel smoothing and I think the default rbf parameter was the correct kernel to use. Once again, the job finished super quickly, but that might have something to do with there being less parameters to run through. This is the worst model so far. 

# Model 5: Gradient Boosting Model

In [29]:
# Set up Gradient boosting parameters.
gb_params = [{'n_estimators': [100,200,500,800,1000],
             'max_depth': [2,4,6]
             }]

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier()

# Use the grid
gb_grid = GridSearchCV(clf, gb_params, cv=5, verbose=1, n_jobs=-1)

# Fit the grid
gb_grid.fit(X_down, Y_down)

# Return best parameters and best score
print('Best parameters:')
print(gb_grid.best_params_)
print('Best Score:')
print(gb_grid.best_score_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   12.3s finished


Best parameters:
{'max_depth': 2, 'n_estimators': 100}
Best Score:
0.8658536585365854


In [30]:
# Implement the parameters parameter 
clf = ensemble.GradientBoostingClassifier(n_estimators=100, 
                                         max_depth=2)

# Cross Validate the gradient boosting model. 
cross_val_score(clf, X_down, Y_down, cv=5)

array([0.6969697 , 0.92929293, 0.91836735, 0.92346939, 0.8622449 ])

# Analysis

The reason I used the gradient boosing model was because it was also computationally quick. The gradient boosting model works by adjusting previous outputs based on what was learned by the old model. It learns iteratively which is why it's accuracy increases but it is also susceptible to overfitting. That's the reason I used GS_CV here too besides the fact that it can help me find the optimal parameters, it also finds it while avoiding overfitting my model. 


# Model 6: KNN

In [34]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the model
knn = KNeighborsClassifier()

# Set parameters for KNN
knn_params = [{'n_neighbors': [2,5,7,10,12,15,20]}]

# Search for the best paramters. 
knn_grid = GridSearchCV(knn, knn_params, cv=5, verbose=1, n_jobs=-1)

# Fit the grid and obtain results
knn_grid.fit(X_down, Y_down)

# Return best parameters and best score
print('Best parameters:')
print(knn_grid.best_params_)
print('Best Score:')
print(knn_grid.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best parameters:
{'n_neighbors': 2}
Best Score:
0.4532520325203252


[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    0.2s finished


In [35]:
# Implement the parameters parameter 
knn = KNeighborsClassifier(n_neighbors=2, weights='distance')

# Cross Validate the knn model. 
cross_val_score(knn, X_down, Y_down, cv=5)

array([0.37878788, 0.38888889, 0.34183673, 0.5       , 0.35714286])

# Analysis 

The k neighbors model didn't perform as well as I thought it would. I think the main issue must be that I didn't normalize the distance of the data which is probably why the best parameter became 2 neighbors.

# Model 7: Decision Tree


In [37]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
dtc = DecisionTreeClassifier()

# Set parameters for dtc
dtc_params = [{'max_features': [2, 4, 6, 8],
             'max_depth': [2, 4, 6, 8]}]

# Search for the best paramters. 
dtc_grid = GridSearchCV(dtc, dtc_params, cv=5, verbose=1, n_jobs=-1)

# Fit the grid and obtain results
dtc_grid.fit(X_down, Y_down)

# Return best parameters and best score
print('Best parameters:')
print(dtc_grid.best_params_)
print('Best Score:')
print(dtc_grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters:
{'max_depth': 4, 'max_features': 8}
Best Score:
0.9115853658536586


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.2s finished


In [39]:
# Implement the parameters parameter 
dtc = DecisionTreeClassifier(max_features=8, max_depth=4)

# Cross Validate the decision tree model. 
cross_val_score(dtc, X_down, Y_down, cv=5)

array([0.55050505, 0.9040404 , 0.90816327, 0.87755102, 0.90306122])

# Analysis

The decision tree is a simpler version of the random forest since it's only one tree compared to the forest. It didn't have the opportunity to vote amongst a set of varied options so its score may be a bit lower. The model's accuracy is due to the low amount of entropy which menas that it was given enough options to correctly classify which cases were fraudulent. A drawback to this could be that I'm allowing the model to overfit but the grid search serves as a safeguard against it. 

# Conclusion

There was an incredible class imbalance initially which is why I downsized the sample so that my data could be normalized. I didn't have to intentionally look for independence and low multicollinearity due to the fact that I'm looking at principal components. 

The best model was the random forest model, but it took the longest by far. My GB model beat my logistic regression model, which beat my BNB model leaving SVC in last place. Besides the random forest model, all of the models were extremely lightweight and quick. The biggest surprise for me is that my support vector model was incredibly inaccurate, failing to predict the fraudulent cases correctly. 