In [143]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# 1. Exploratory Data Analysis: Skipping...

In [144]:
df = pd.read_csv('2_fraud_ex.csv')

In [145]:
df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'}, inplace=True)

In [146]:
df['type'] = df['type'].astype('category')
df['isFraud'] = df['isFraud'].astype('bool')
df['isFlaggedFraud'] = df['isFlaggedFraud'].astype('bool')

# 3. Feature Engineering: Using portions from the previous assignment.

In [147]:
df_augmented = df.copy()

In [148]:
df_augmented['isMerchandise'] = df_augmented['nameDest'].str.contains('M')

In [149]:
df_augmented['numTransactionsForDest'] = df_augmented.groupby('nameDest')['nameDest'].transform('count')
df_augmented['numTransactionsForOrig'] = df_augmented.groupby('nameOrig')['nameOrig'].transform('count')

In [150]:
df_augmented['dayOfWeek'] = df_augmented['step'] // 24 % 7
df_augmented['hourOfDay'] = df_augmented['step'] % 24

In [151]:
df_augmented['isConsecutiveTransferCashOut'] = (
    (
            ((df_augmented['type'] == "TRANSFER") & (df_augmented['type'].shift(-1) == "CASH_OUT") & (
                    df_augmented['amount'] == df_augmented.shift(-1)['amount'])) |
            ((df_augmented['type'] == "CASH_OUT") & (df_augmented['type'].shift(1) == "TRANSFER") & (
                    df_augmented['amount'] == df_augmented.shift(1)['amount']))
    )
)

# 4. Feature Selection: Using portions from the previous assignment.

## 4.0. List of features

In [152]:
df_selected = df_augmented.copy()

## 4.1. Evaluating each features

In [153]:
df_selected.drop(columns=['step'], inplace=True)

In [154]:
df_selected['type'] = df_selected['type'].cat.codes

In [155]:
df_selected.drop(columns=['nameOrig', 'nameDest'], inplace=True)

In [156]:
df_selected.drop(columns=['oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'], inplace=True)

In [157]:
# df_selected.drop(columns=['numTransactionsForDest', 'numTransactionsForOrig'], inplace=True)

In [158]:
# df_selected.drop(columns=['isMerchandise'], inplace=True)

In [159]:
df_selected['dayOfWeekSin'] = np.sin(2 * np.pi * df_selected['dayOfWeek'] / 7)
df_selected['dayOfWeekCos'] = np.cos(2 * np.pi * df_selected['dayOfWeek'] / 7)
df_selected['hourOfDaySin'] = np.sin(2 * np.pi * df_selected['hourOfDay'] / 24)
df_selected['hourOfDayCos'] = np.cos(2 * np.pi * df_selected['hourOfDay'] / 24)
df_selected.drop(columns=['dayOfWeek', 'hourOfDay'], inplace=True)

# 5. Splitting the Dataset

In [160]:
from sklearn.model_selection import train_test_split

X = df_selected.drop(columns=['isFraud'])
y = df_selected['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# 6. Feature Scaling

In [161]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaling_columns = ['amount', 'dayOfWeekSin', 'dayOfWeekCos', 'hourOfDaySin', 'hourOfDayCos']

X_train[scaling_columns] = scaler.fit_transform(X_train[scaling_columns])
X_test[scaling_columns] = scaler.transform(X_test[scaling_columns])

# 7. Handling Imbalanced Data: Oversampling

In [162]:
y_train.value_counts()

False    4765805
True        6160
Name: isFraud, dtype: int64

In [163]:
from imblearn.over_sampling import SMOTENC

sm = SMOTENC(categorical_features=[0], sampling_strategy='minority', random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
y_train_res.value_counts()

False    4765805
True     4765805
Name: isFraud, dtype: int64

# 8. Model: Decision Tree

In [164]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier

feature_columns = ['type', 'amount',
                   'dayOfWeekSin', 'dayOfWeekCos', 'hourOfDaySin', 'hourOfDayCos',
                   'isConsecutiveTransferCashOut']

def run_tree(tree):
    tree.fit(X_train_res[feature_columns], y_train_res)
    y_pred_test = tree.predict(X_test[feature_columns])

    print(classification_report(y_test, y_pred_test))
    print('Train Accuracy: ', tree.score(X_train[feature_columns], y_train))
    print('Test Accuracy: ', tree.score(X_test[feature_columns], y_test))

## 8.1. Decision Tree without pruning

In [165]:
clf = DecisionTreeClassifier()
run_tree(clf)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.94      1.00      0.97      2053

    accuracy                           1.00   1590655
   macro avg       0.97      1.00      0.98   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  1.0
Test Accuracy:  0.9999082139118791


## 8.2. Decision Tree with pruning (max_depth)

In [166]:
clf_maxdepth_3 = DecisionTreeClassifier(max_depth=3)
run_tree(clf_maxdepth_3)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       1.00      1.00      1.00      2053

    accuracy                           1.00   1590655
   macro avg       1.00      1.00      1.00   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.999983025860416
Test Accuracy:  0.9999899412506169


In [167]:
clf_maxdepth_5 = DecisionTreeClassifier(max_depth=5)
run_tree(clf_maxdepth_5) # Sweet spot!

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       1.00      1.00      1.00      2053

    accuracy                           1.00   1590655
   macro avg       1.00      1.00      1.00   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999911985942898
Test Accuracy:  0.9999930846097991


In [168]:
clf_maxdepth_10 = DecisionTreeClassifier(max_depth=10)
run_tree(clf_maxdepth_10)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.98      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999777869284456
Test Accuracy:  0.9999729671110329


In [169]:
clf_maxdepth_20 = DecisionTreeClassifier(max_depth=20)
run_tree(clf_maxdepth_20)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.97      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999985330990483
Test Accuracy:  0.9999622796898133


## 8.3. Decision Tree with pruning (min_samples_leaf)

In [170]:
clf_min_samples_leaf_10 = DecisionTreeClassifier(min_samples_leaf=3)
run_tree(clf_min_samples_leaf_10)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.97      1.00      0.98      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999738053401481
Test Accuracy:  0.9999566216432854


In [171]:
clf_min_samples_leaf_50 = DecisionTreeClassifier(min_samples_leaf=50)
run_tree(clf_min_samples_leaf_50) # Sweet spot!

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.98      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999794633866761
Test Accuracy:  0.9999717097673599


In [172]:
clf_min_samples_leaf_100 = DecisionTreeClassifier(min_samples_leaf=100)
run_tree(clf_min_samples_leaf_100)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.98      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999685664081778
Test Accuracy:  0.9999710810955236


In [173]:
clf_min_samples_leaf_500 = DecisionTreeClassifier(min_samples_leaf=500)
run_tree(clf_min_samples_leaf_500)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.96      0.99      0.98      2053

    accuracy                           1.00   1590655
   macro avg       0.98      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999474011230175
Test Accuracy:  0.9999459342220658


## 8.4. Decision Tree with pruning (min_samples_split)

In [174]:
clf_min_samples_split_10 = DecisionTreeClassifier(min_samples_split=10)
run_tree(clf_min_samples_split_10)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.94      1.00      0.97      2053

    accuracy                           1.00   1590655
   macro avg       0.97      1.00      0.98   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.99998763612055
Test Accuracy:  0.9999182726612622


In [175]:
clf_min_samples_split_50 = DecisionTreeClassifier(min_samples_split=50)
run_tree(clf_min_samples_split_50)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.97      1.00      0.98      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999916177088474
Test Accuracy:  0.999959136330631


In [176]:
clf_min_samples_split_100 = DecisionTreeClassifier(min_samples_split=100)
run_tree(clf_min_samples_split_100)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.98      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      0.99   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999947610680296
Test Accuracy:  0.9999647943771591


## 8.5. Decision Tree with pruning (max_leaf_nodes)

In [177]:
clf_max_leaf_nodes_10 = DecisionTreeClassifier(max_leaf_nodes=10)
run_tree(clf_max_leaf_nodes_10)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       1.00      1.00      1.00      2053

    accuracy                           1.00   1590655
   macro avg       1.00      1.00      1.00   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999914081515686
Test Accuracy:  0.9999930846097991


In [178]:
clf_max_leaf_nodes_50 = DecisionTreeClassifier(max_leaf_nodes=50)
run_tree(clf_max_leaf_nodes_50)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.99      1.00      1.00      2053

    accuracy                           1.00   1590655
   macro avg       1.00      1.00      1.00   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999888934642228
Test Accuracy:  0.9999880552351076


In [179]:
clf_max_leaf_nodes_100 = DecisionTreeClassifier(max_leaf_nodes=100)
run_tree(clf_max_leaf_nodes_100)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00   1588602
        True       0.99      1.00      0.99      2053

    accuracy                           1.00   1590655
   macro avg       0.99      1.00      1.00   1590655
weighted avg       1.00      1.00      1.00   1590655
Train Accuracy:  0.9999895221360593
Test Accuracy:  0.9999811398449067


## 8.6. Decision Tree with pruning (max_features) ... 생략