In [None]:
!pip install catboost
!pip install xgboost

In [21]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, f1_score, recall_score
import warnings
warnings.filterwarnings("ignore")

### Read in the data

In [2]:
cc = pd.read_csv(r'/Users/joshuacurie/my_files/Machine Learning Comps/credit_card_fraud/creditcard.csv')
class_names = {0:'Not Fraud', 1:'Fraud'}
print(cc.Class.value_counts().rename(index = class_names))

Not Fraud    284315
Fraud           492
Name: Class, dtype: int64


In [3]:
cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Define some functions to run the model and print resulting stats

In [30]:
def run_model(model, X_train, X_test, y_train, y_test, silent=False, verbosity=True):
    if silent:
        model.fit(X_train, y_train, silent=True)
    elif not verbosity:
        model.fit(X_train, y_train, verbosity=False)
    else:
        model.fit(X_train, y_train)
    pred = model.predict(X_test)
    c_matrix = confusion_matrix(y_test, pred)
    return c_matrix, pred

In [55]:
def PrintStats(cmat, y_test, pred):
    # separate out the confusion matrix components
    tpos = cmat[0][0]
    fneg = cmat[1][1]
    fpos = cmat[0][1]
    tneg = cmat[1][0]
    # calculate F!, Recall scores
    f1Score = round(f1_score(y_test, pred), 2)
    recallScore = round(recall_score(y_test, pred), 2)
    # calculate and display metrics
    print(cmat)
    print( 'Accuracy: '+ str(np.round(100*float(tpos+fneg)/float(tpos+fneg + fpos + tneg),2))+'%')
    print(f'Percent of False Positives: {round(fpos/len(y_test)*100, 2)}%')
    print(f'Percent of False Negatives: {round(tneg/len(y_test)*100, 2)}%')
    print( 'Cohen Kappa: '+ str(np.round(cohen_kappa_score(y_test, pred),3)))
    print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = recallScore))
    print("F1 Score for Model : {f1_score}".format(f1_score = f1Score))

### Get a full and undersampled size train/test set

In [39]:
feature_names = cc.iloc[:, 1:30].columns
target = cc.iloc[:1, 30:].columns
data_features = cc[feature_names]
data_target = cc[target]
data_sets = train_test_split(data_features, 
                             data_target, 
                             train_size=.7, 
                             test_size=.3, 
                             random_state=1)
X_train, X_test, y_train, y_test = data_sets

In [40]:
fraud_idx = cc[cc.Class==1].index
normal_idx = cc[cc.Class==0].index
num_fraud = len(cc[cc.Class == 1])
under_sample_idx = np.random.choice(normal_idx, num_fraud, replace=False)
cc_under_sample = cc.iloc[np.concatenate([fraud_idx,under_sample_idx]), :]
X_under_sample = cc_under_sample.iloc[:, 1:30]
y_under_sample = cc_under_sample.Class
X_us_train, X_us_test, y_us_train, y_us_test = train_test_split(X_under_sample, y_under_sample, train_size=.7, test_size=.3)

### Define the models

In [26]:
lr = LogisticRegression()
rf = RandomForestClassifier()
xgboost = xgb.XGBClassifier()
cb = CatBoostClassifier()

### Run models with full sampled data sets

In [22]:
lr_c_matrix, lr_pred = run_model(lr, X_train, X_test, y_train, y_test)
PrintStats(lr_c_matrix, y_test, lr_pred)

[[85293    15]
 [   57    78]]
Accuracy: 99.92%
Cohen Kappa: 0.684
Sensitivity/Recall for Model : 0.58
F1 Score for Model : 0.68


In [23]:
rf_cmat, rf_pred = run_model(rf, X_train, X_test, y_train, y_test)
PrintStats(rf_cmat, y_test, rf_pred)

[[85301     7]
 [   36    99]]
Accuracy: 99.95%
Cohen Kappa: 0.821
Sensitivity/Recall for Model : 0.73
F1 Score for Model : 0.82


In [31]:
xgb_cmat, xgb_pred = run_model(xgboost, X_train, X_test, y_train, y_test, verbosity=True)
PrintStats(xgb_cmat, y_test, xgb_pred)

[[85297    11]
 [   34   101]]
Accuracy: 99.95%
Cohen Kappa: 0.818
Sensitivity/Recall for Model : 0.75
F1 Score for Model : 0.82


In [34]:
cb_cmat, cb_pred = run_model(cb, X_train, X_test, y_train, y_test, silent=True)
PrintStats(cb_cmat, y_test, cb_pred)

[[85303     5]
 [   31   104]]
Accuracy: 99.96%
Cohen Kappa: 0.852
Sensitivity/Recall for Model : 0.77
F1 Score for Model : 0.85


### Run models with full undersampled data sets

In [35]:
lr_us_cmat, lr_us_pred = run_model(lr, X_us_train, X_test, y_us_train, y_test)
PrintStats(lr_us_cmat, y_test, lr_us_pred)

[[82613  2695]
 [   10   125]]
Accuracy: 96.83%
Cohen Kappa: 0.082
Sensitivity/Recall for Model : 0.93
F1 Score for Model : 0.08


In [36]:
rf_us_cmat, rf_us_pred = run_model(rf, X_us_train, X_test, y_us_train, y_test)
PrintStats(rf_us_cmat, y_test, rf_us_pred)

[[84009  1299]
 [   10   125]]
Accuracy: 98.47%
Cohen Kappa: 0.158
Sensitivity/Recall for Model : 0.93
F1 Score for Model : 0.16


In [41]:
xgb_us_cmat, xgb_us_pred = run_model(xgboost, X_us_train, X_test, y_us_train, y_test, verbosity=True)
PrintStats(xgb_us_cmat, y_test, xgb_us_pred)

[[81762  3546]
 [    2   133]]
Accuracy: 95.85%
Cohen Kappa: 0.067
Sensitivity/Recall for Model : 0.99
F1 Score for Model : 0.07


In [42]:
cb_us_cmat, cb_us_pred = run_model(cb, X_us_train, X_test, y_us_train, y_test, silent=True)
PrintStats(cb_us_cmat, y_test, cb_us_pred)

[[82821  2487]
 [    2   133]]
Accuracy: 97.09%
Cohen Kappa: 0.094
Sensitivity/Recall for Model : 0.99
F1 Score for Model : 0.1


### Print out summary statistics for all of the models

In [57]:
print('------------------------------------------------------------------------------')
print('logistic regression')
PrintStats(lr_c_matrix, y_test, lr_pred)
print('------------------------------------------------------------------------------')
print('random forrests')
PrintStats(rf_cmat, y_test, rf_pred)
print('------------------------------------------------------------------------------')
print('xgboost')
PrintStats(xgb_cmat, y_test, xgb_pred)
print('------------------------------------------------------------------------------')
print('catboost')
PrintStats(cb_cmat, y_test, cb_pred)
print('------------------------------------------------------------------------------')
print('logistic regression undersampled')
PrintStats(lr_us_cmat, y_test, lr_us_pred)
print('------------------------------------------------------------------------------')
print('random forrests under sampled')
PrintStats(rf_us_cmat, y_test, rf_us_pred)
print('------------------------------------------------------------------------------')
print('xgboost undersampled')
PrintStats(xgb_us_cmat, y_test, xgb_us_pred)
print('------------------------------------------------------------------------------')
print('catboost undersampled')
PrintStats(cb_us_cmat, y_test, cb_us_pred)
print('------------------------------------------------------------------------------')


------------------------------------------------------------------------------
logistic regression
[[85293    15]
 [   57    78]]
Accuracy: 99.92%
Percent of False Positives: 0.02%
Percent of False Negatives: 0.07%
Cohen Kappa: 0.684
Sensitivity/Recall for Model : 0.58
F1 Score for Model : 0.68
------------------------------------------------------------------------------
random forrests
[[85301     7]
 [   36    99]]
Accuracy: 99.95%
Percent of False Positives: 0.01%
Percent of False Negatives: 0.04%
Cohen Kappa: 0.821
Sensitivity/Recall for Model : 0.73
F1 Score for Model : 0.82
------------------------------------------------------------------------------
xgboost
[[85297    11]
 [   34   101]]
Accuracy: 99.95%
Percent of False Positives: 0.01%
Percent of False Negatives: 0.04%
Cohen Kappa: 0.818
Sensitivity/Recall for Model : 0.75
F1 Score for Model : 0.82
------------------------------------------------------------------------------
catboost
[[85303     5]
 [   31   104]]
Accuracy: