In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_curve, plot_roc_curve, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb

In [2]:
# Read the transactions table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')
# Disabled by the author: renaming the label column.
# df = df.rename({'Class': 'fraud'}, axis=1)
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Features: every column except the label. `drop` returns a new frame, `df` is untouched.
X = df.drop(columns=['Class'])
# Target: kept as a single-column DataFrame (later cells flatten it with `.values.ravel()`).
y = df.loc[:, ['Class']]

In [90]:
# Hold out 33% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [5]:
# Switch read by the model cells below: when True, each cell wraps its
# estimator in GridSearchCV and prints `best_params_`; when False, each cell
# fits a single estimator with fixed hyper-parameters.
gridsearch: bool = False

# Logistic Regression

In [6]:
# Build either a fixed logistic-regression model or a grid search over it,
# depending on the `gridsearch` flag.
if not gridsearch:
    # Fixed configuration (author's side note next to max_iter: 100).
    lr = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        max_iter=50,
        random_state=42,
    )
else:
    # Candidate settings; 'elasticnet' / 'none' penalties and 'l1_ratio'
    # were left disabled by the author.
    parameters_lr = {
        'penalty': ('l1', 'l2'),
        'max_iter': (10, 50, 100, 200),
        'solver': ['saga', 'liblinear'],
    }
    lr = GridSearchCV(
        estimator=LogisticRegression(random_state=42),
        param_grid=parameters_lr,
        scoring='f1',
        cv=2,
        n_jobs=-1,
        verbose=2,
    )

# `y_train` is a one-column DataFrame; flatten it to the 1-D array the estimator expects.
lr.fit(X_train, y_train.values.ravel())

# `best_params_` only exists on the GridSearchCV wrapper.
if gridsearch:
    print(lr.best_params_)

# Evaluate on the held-out split.
y_pred_lr = lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.87      0.63      0.73       149

    accuracy                           1.00     93987
   macro avg       0.93      0.82      0.87     93987
weighted avg       1.00      1.00      1.00     93987



In [71]:
# Class-probability estimates for the held-out rows (one column per class).
y_pred_prob_lr = lr.predict_proba(X_test)

# Keep the second probability column (index 1) in a DataFrame named 'pred'.
# NOTE(review): assumes index 1 corresponds to label 1 in `lr.classes_` — confirm.
y_pred_lr_treshold = pd.DataFrame(data=y_pred_prob_lr[:,1], columns=['pred'])

# Scan 101 evenly spaced cut-offs in [0, 1] and keep the one with the highest F1.
# NOTE(review): the cut-off is selected on X_test / y_test, the same split the
# report below is computed on, so the printed scores are optimistic. Consider
# choosing the threshold on a separate validation split.
max_score = 0
threshold = 0
# Start from the predictions at the initial cut-off so the final report can
# always be printed, even if no cut-off reaches an F1 above 0.
y_pred_lr_treshold_final = (y_pred_lr_treshold['pred'] > threshold).astype(int)
for i in np.linspace(0,1,101):
    # Vectorised "probability strictly above the cut-off -> 1, else 0".
    y_pred_lr_treshold_temp = (y_pred_lr_treshold['pred'] > i).astype(int)
    # Score once per cut-off and reuse the value.
    f1_current = f1_score(y_test, y_pred_lr_treshold_temp)
    # Strict '>' keeps the lowest cut-off among ties.
    if f1_current > max_score:
        threshold = i
        max_score = f1_current
        y_pred_lr_treshold_final = y_pred_lr_treshold_temp

print(threshold)
print(classification_report(y_test, y_pred_lr_treshold_final))

0.11
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.80      0.79      0.79       149

    accuracy                           1.00     93987
   macro avg       0.90      0.89      0.90     93987
weighted avg       1.00      1.00      1.00     93987



# Random Forest

In [124]:
# Plain random forest: fixed configuration by default, grid search on request.
if not gridsearch:
    rf = RandomForestClassifier(
        n_estimators=5,
        max_depth=50,
        random_state=42,
    )
else:
    parameters_rf = {
        'n_estimators': (3, 5, 20, 100),
        'max_depth': (10, 50, 200),
    }
    rf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=parameters_rf,
        scoring='f1',
        cv=2,
        n_jobs=-1,
        verbose=2,
    )

# Flatten the one-column label frame to a 1-D array before fitting.
rf.fit(X_train, y_train.values.ravel())

# Only the GridSearchCV wrapper exposes `best_params_`.
if gridsearch:
    print(rf.best_params_)

# Score the held-out split.
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.89      0.81      0.85       149

    accuracy                           1.00     93987
   macro avg       0.94      0.91      0.92     93987
weighted avg       1.00      1.00      1.00     93987



# Weighted Random Forest

In [125]:
# Random forest with class_weight='balanced': fixed configuration by default,
# grid search on request.
if not gridsearch:
    # Author's side notes: n_estimators 200, max_depth 50 were also considered.
    wrf = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=100,
        max_depth=100,
        random_state=42,
    )
else:
    parameters_wrf = {
        'n_estimators': (100, 200, 400),
        'max_depth': (50, 100),
    }
    wrf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
        param_grid=parameters_wrf,
        scoring='f1',
        cv=2,
        n_jobs=-1,
        verbose=2,
    )

# Flatten the one-column label frame to a 1-D array before fitting.
wrf.fit(X_train, y_train.values.ravel())

# Only the GridSearchCV wrapper exposes `best_params_`.
if gridsearch:
    print(wrf.best_params_)

# Score the held-out split.
y_pred_wrf = wrf.predict(X_test)
print(classification_report(y_test, y_pred_wrf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.96      0.79      0.87       149

    accuracy                           1.00     93987
   macro avg       0.98      0.90      0.93     93987
weighted avg       1.00      1.00      1.00     93987



# Balanced Random Forest

In [126]:
# Balanced random forest (imbalanced-learn): fixed configuration by default,
# grid search on request.
#
# One sampling strategy is shared by both branches so that a grid search tunes
# the same kind of model that the fixed branch fits. Previously the search
# branch used 'not minority' while the fitted model used 'not majority'.
# NOTE(review): the value of the fitted branch is kept so the default run is
# unchanged. Per the imbalanced-learn docs, 'not majority' resamples every
# class except the majority one; confirm this is the intended balancing, since
# 'not minority' is the setting that under-samples the majority class.
sampling_strategy_brf = 'not majority'

if gridsearch:
    parameters_brf = {'n_estimators': (5, 50, 100), # 5, 20, 100
                      'max_depth': (1, 2, 3, 5), # 10, 50, 200
                      }
    brf = GridSearchCV(
        BalancedRandomForestClassifier(random_state=42, sampling_strategy=sampling_strategy_brf),
        parameters_brf,
        n_jobs=-1,
        scoring='f1',
        verbose=2,
        cv=2
        )
else:
    brf = BalancedRandomForestClassifier(random_state=42,
                                         sampling_strategy=sampling_strategy_brf,
                                         n_estimators=100, # 100
                                         criterion='entropy',
                                         max_depth=50, # 50
                                         )

# Flatten the one-column label frame to a 1-D array before fitting.
brf.fit(X_train, y_train.values.ravel())
# Only the GridSearchCV wrapper exposes `best_params_`.
if gridsearch:
    print(brf.best_params_)

# Score the held-out split.
y_pred_brf = brf.predict(X_test)
print(classification_report(y_test, y_pred_brf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.95      0.83      0.88       149

    accuracy                           1.00     93987
   macro avg       0.98      0.91      0.94     93987
weighted avg       1.00      1.00      1.00     93987



# Gradient Boosting

In [127]:
# Gradient boosting: fixed configuration by default, grid search on request.
#
# The `loss` argument is intentionally omitted. The original passed
# loss='deviance', a name that newer scikit-learn releases deprecate and then
# reject in favour of 'log_loss'. Both names denote the estimator's default
# loss, so relying on the default keeps the same objective on old and new
# scikit-learn versions.
if gridsearch:
    parameters_gb = {'n_estimators': (2,3), # , 20, 100
                     'max_depth': (50, 100), # 10, 50, 200
                     'learning_rate': (0.1, 0.2), # 0.1, 0.5, 1
                     }
    gb = GridSearchCV(
        GradientBoostingClassifier(random_state=42),
        parameters_gb,
        n_jobs=-1,
        scoring='f1',
        verbose=2,
        cv=2
        )
else:
    gb = GradientBoostingClassifier(random_state=42,
                                    n_estimators=5, # 2
                                    max_depth=50, # 50
                                    learning_rate=0.1) # 0.1

# Flatten the one-column label frame to a 1-D array before fitting.
gb.fit(X_train, y_train.values.ravel())
# Only the GridSearchCV wrapper exposes `best_params_`.
if gridsearch:
    print(gb.best_params_)

# Score the held-out split.
y_pred_gb = gb.predict(X_test)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.71      0.81      0.76       149

    accuracy                           1.00     93987
   macro avg       0.85      0.91      0.88     93987
weighted avg       1.00      1.00      1.00     93987



# XGBoost

In [135]:
# XGBoost classifier: fixed configuration by default, grid search on request.
if not gridsearch:
    xgbc = xgb.sklearn.XGBClassifier(
        objective="binary:logistic",
        n_estimators=100,
        max_depth=50,
        learning_rate=0.5,
        random_state=42,
    )
else:
    parameters_xgb = {
        'n_estimators': (5, 20, 100),
        'max_depth': (10, 50, 200),
        'learning_rate': (0.1, 0.5, 1),
    }
    xgbc = GridSearchCV(
        estimator=xgb.sklearn.XGBClassifier(objective="binary:logistic", random_state=42),
        param_grid=parameters_xgb,
        scoring='f1',
        cv=2,
        n_jobs=-1,
        verbose=2,
    )

# Flatten the one-column label frame to a 1-D array before fitting.
xgbc.fit(X_train, y_train.values.ravel())

# Only the GridSearchCV wrapper exposes `best_params_`.
if gridsearch:
    print(xgbc.best_params_)

# Score the held-out split.
y_pred_xgb = xgbc.predict(X_test)
print(classification_report(y_test, y_pred_xgb))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.96      0.83      0.89       149

    accuracy                           1.00     93987
   macro avg       0.98      0.92      0.95     93987
weighted avg       1.00      1.00      1.00     93987

