We now need to decide how much outlier data to exclude from our dataset, since removing outliers reduces the number of training samples available. I will remove outliers only for the three most correlated features. These features, V14, V4, and V12, all have a significant number of non-fraud outliers that intersect with the quartiles of the fraud box plots. While the correlation coefficients for the next two most correlated features (V11 and V10) are also quite high, they do not have as many outliers that intersect with the opposite class's box plots. The V16 feature has some outliers that intersect, but given its lower correlation coefficient, I will remove outliers only from the top three most correlated features to keep the number of removed samples to a minimum.

We will use the interquartile range method to remove outlier data from our analysis. 

In [None]:
# imports
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
# Custom scorer used by the grid searches: precision computed on the positive
# class (pos_label=1), i.e. TP / (TP + FP). Maximising it penalises false
# positives among the samples predicted as label 1.
# NOTE(review): despite the variable name, this does not measure true
# negatives; the name is kept because other cells reference it.
precision_true_neg = make_scorer(precision_score, pos_label=1)

In [None]:
# Decision tree: grid search over tree depth, split criterion and leaf budget,
# scored with the positive-class precision scorer under cv=5.

decision_trees = DecisionTreeClassifier()

dt_hyperparams = dict(
    max_depth=[None, 2, 8],
    criterion=["gini", "entropy"],
    max_leaf_nodes=[None, 5, 10],
)

clf = GridSearchCV(
    decision_trees,
    dt_hyperparams,
    scoring=precision_true_neg,
    n_jobs=-1,  # use every available core
    cv=5,
    verbose=2,
)

# fit() hands back the fitted search object, so dt_model and clf are the same.
dt_model = clf.fit(xtrain, ytrain)

print("Best parameters found from grid search: ", clf.best_params_)
print("Validation fitting with best precision on labels = 1:", clf.best_score_)

In [None]:
# k-nearest neighbours: grid search over neighbourhood size and vote weighting.
min_k, max_k = 2, 20

# Candidate k values start at min_k and advance in steps of 3; np.arange
# excludes the stop value, so max_k itself is never tried.
n_array = np.arange(min_k, max_k, 3)

knn = KNeighborsClassifier()
knn_hyperparams = dict(
    n_neighbors=n_array,
    weights=['uniform', 'distance'],
)

clf = GridSearchCV(
    knn,
    knn_hyperparams,
    scoring=precision_true_neg,
    n_jobs=-1,
    cv=5,
    verbose=2,
)

knn_model = clf.fit(xtrain, ytrain)

print("Best parameters found from grid search: ", clf.best_params_)
print("Validation fitting with best precision on labels = 1:", clf.best_score_)

In [None]:
# Logistic regression: grid search over penalty type and inverse
# regularisation strength C, scored with positive-class precision under cv=5.
#
# The solver is set explicitly to "liblinear" because the default solver
# ("lbfgs") only supports the l2 penalty: with the default, every 'l1'
# candidate in the grid fails to fit and is scored as NaN, so half of the
# grid is silently never evaluated. liblinear handles both 'l1' and 'l2'.
logistic = LogisticRegression(solver="liblinear")

logistic_hyperparams = {
    "penalty": ['l1', 'l2'],
    # Smaller C means stronger regularisation.
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

clf = GridSearchCV(
    estimator=logistic,
    param_grid=logistic_hyperparams,
    scoring=precision_true_neg,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# fit() returns the fitted search object itself.
logistic_model = clf.fit(xtrain, ytrain)

print("Best parameters found from grid search: ", clf.best_params_)
print("Validation fitting with best precision on labels = 1:", clf.best_score_)

In [None]:
# Support vector classifier: grid search over regularisation strength C and
# kernel family, scored with positive-class precision under cv=5.
svm = SVC()

svm_hyperparams = dict(
    C=[0.5, 0.7, 0.9, 1],
    kernel=['rbf', 'poly', 'sigmoid', 'linear'],
)

clf = GridSearchCV(
    svm,
    svm_hyperparams,
    scoring=precision_true_neg,
    n_jobs=-1,
    cv=5,
    verbose=2,
)

svm_model = clf.fit(xtrain, ytrain)

print("Best parameters found from grid search: ", clf.best_params_)
print("Validation fitting with best precision on labels = 1:", clf.best_score_)