In [23]:
import numpy as np
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
from src.helpers import get_data, gridsearch_with_output, score_classifer, test_classifer

import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix
import time

In [2]:
df_analysis = get_data()

In [3]:
y = df_analysis.pop('fraud')
X = df_analysis.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, shuffle=True, random_state=1)


In [31]:
#Random Forest Grid Search Tuning
random_forest_grid = {'max_depth': [2, 4, None],
                      'max_features': ['sqrt', None],
                      'oob_score': [True, False],
                      'n_estimators': [20, 30],
                      'class_weight': ['balanced', None],
                      'random_state': [1]
                     }
rf_best_params, rf_best_model, rf_best_score = gridsearch_with_output(RandomForestClassifier(), 
                                                                      random_forest_grid, 
                                                                      'recall',
                                                                      X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   34.0s finished



Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
max_depth            | 2        | [2, 4, None]
max_features         | sqrt     | ['sqrt', None]
oob_score            | True     | [True, False]
n_estimators         | 20       | [20, 30]
class_weight         | balanced | ['balanced', None]
random_state         | 1        | [1]


In [32]:
rf_best_score

0.9252526439482962

In [6]:
#  Gradient Boost Grid Search
gb_grid = {'max_depth': [4, None],
           'max_features': ['sqrt', None],
           'min_samples_split': [2, 4, None],
           'min_samples_leaf': [1, None],
           'n_estimators': [40, 50],
           'learning_rate': [.5, .6],
           'random_state': [1]
                     }

gb_best_params, gb_best_model, gb_best_score = gridsearch_with_output(GradientBoostingClassifier(), 
                                                           gb_grid, 
                                                           'f1',
                                                           X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  2.9min finished



Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
max_depth            | None     | [4, None]
max_features         | sqrt     | ['sqrt', None]
min_samples_split    | 4        | [2, 4, None]
min_samples_leaf     | 1        | [1, None]
n_estimators         | 50       | [40, 50]
learning_rate        | 0.5      | [0.5, 0.6]
random_state         | 1        | [1]


In [7]:
gb_best_score

0.8730176989500246

In [8]:
xgb_grid = {'max_depth': [4, None],
            'min_child_weight': [.1, None],
           'learning_rate': [.4, .5],
           'random_state': [1]
                     }

xgb_best_params, xgb_best_model, xgb_best_score = gridsearch_with_output(XGBClassifier(), 
                                                           xgb_grid, 
                                                           'f1',
                                                           X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   49.4s finished



Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
max_depth            | None     | [4, None]
min_child_weight     | 0.1      | [0.1, None]
learning_rate        | 0.4      | [0.4, 0.5]
random_state         | 1        | [1]


In [9]:
xgb_best_score

0.8844152152975102

In [10]:
xgb = XGBClassifier(n_jobs=-1, random_state=1)


In [33]:
# instantiate classifiers
rf = rf_best_model
xgb = xgb_best_model
gbc = gb_best_model

# instantiate voting classifier after other estimators have already been fit
estimators = [['rf',rf], ['xgb', xgb], ['gbc', gbc]]
voting = VotingClassifier(estimators, voting = 'soft')
voting.fit(X_train, y_train)





VotingClassifier(estimators=[['rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=2,
                                                     max_features='sqrt',
                                                     n_estimators=20,
                                                     oob_score=True,
                                                     random_state=1)],
                             ['xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            gpu_id=-1, importance_type='gain',
                                            interaction_constraints='',
                                            learning_...
           

In [34]:
y_predict = voting.predict(X_test)

In [35]:
rf = rf_best_model
y_predict = rf.predict(X_test)
print("Random Forest")
print(f"precision: {precision_score(y_test, y_predict)}")
print(f"recall: {recall_score(y_test, y_predict)}")
print(f"f1: {f1_score(y_test, y_predict)}")
print(f"confusion matrix: \n {confusion_matrix(y_test, y_predict)} \n ")

xgb = xgb_best_model
y_predict = xgb.predict(X_test)
print("XGBoost")
print(f"precision: {precision_score(y_test, y_predict)}")
print(f"recall: {recall_score(y_test, y_predict)}")
print(f"f1: {f1_score(y_test, y_predict)}")
print(f"confusion matrix: \n {confusion_matrix(y_test, y_predict)} \n ")

gb = gb_best_model
y_predict = gb.predict(X_test)
print("Gradient Boost")
print(f"precision: {precision_score(y_test, y_predict)}")
print(f"recall: {recall_score(y_test, y_predict)}")
print(f"f1: {f1_score(y_test, y_predict)}")
print(f"confusion matrix: \n {confusion_matrix(y_test, y_predict)} \n ")


y_predict = voting.predict(X_test)
print("Voting")
print(f"precision: {precision_score(y_test, y_predict)}")
print(f"recall: {recall_score(y_test, y_predict)}")
print(f"f1: {f1_score(y_test, y_predict)}")
print(f"confusion matrix: \n {confusion_matrix(y_test, y_predict)} \n ")


Random Forest
precision: 0.3511326860841424
recall: 0.9393939393939394
f1: 0.5111896348645465
confusion matrix: 
 [[2189  401]
 [  14  217]] 
 
XGBoost
precision: 0.9414634146341463
recall: 0.8354978354978355
f1: 0.8853211009174311
confusion matrix: 
 [[2578   12]
 [  38  193]] 
 
Gradient Boost
precision: 0.9631578947368421
recall: 0.7922077922077922
f1: 0.8693586698337292
confusion matrix: 
 [[2583    7]
 [  48  183]] 
 
Voting
precision: 0.9593908629441624
recall: 0.8181818181818182
f1: 0.883177570093458
confusion matrix: 
 [[2582    8]
 [  42  189]] 
 


In [37]:
import pickle

In [39]:
filename = 'voting_model.sav'
pickle.dump(voting, open(filename, 'wb'))