In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score, auc
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest

# Read the credit-card transactions CSV into `pf` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook.
DATA_PATH = "/DS/Fraud-Detection-System/data/creditcard.csv"
pf = pd.read_csv(DATA_PATH)
pf.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/hayden/Downloads/Projects/fraud_detection/data/creditcard.csv'

In [None]:
# Structural overview of the raw frame.
# A notebook cell only renders its LAST expression, so intermediate results
# must be shown explicitly; otherwise the describe() table is computed and
# silently discarded. `display` is the builtin injected by the IPython kernel.
pf.info()                     # prints its report directly
display(pf.describe())        # summary statistics
display(pf.isnull().sum())    # missing-value count per column

In [None]:
# Check data unbalance
# Bar chart of how many rows carry each value of the 'Class' label.

counts = pf['Class'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('Data Unbalance (0 = Not fraud, 1 = Fraud)')
# Replace the raw 0/1 tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Fraud (0)', 'Fraud (1)'])
ax.set_ylabel('Transaction count')
plt.show()

In [None]:
# Feature and Target
# `y` is the 'Class' label column; `X` is every remaining column.

y = pf['Class']
X = pf.drop(columns='Class')
print(X.shape, y.shape, sep='\n')

In [None]:
# Train-test split
# 80/20 split, stratified on the label, with a fixed seed for repeatability.

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
# Logistic Regression
# Baseline: class-weighted logistic regression fitted on the untouched training split.

log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    n_jobs=None,
).fit(X_train, y_train)

# Column 1 of predict_proba is the score used as the positive-class probability
# (label 1 = fraud, per the plot title earlier in the notebook).
y_proba = log_reg.predict_proba(X_test)[:, 1]
y_pred = log_reg.predict(X_test)

# PR-AUC here is the trapezoidal area under the precision-recall curve.
prec_log, rec_log, _ = precision_recall_curve(y_test, y_proba)
pr_auc_log = auc(rec_log, prec_log)

print("Logistic Regression", classification_report(y_test, y_pred, digits=4), sep="\n")
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print('PR-AUC:', pr_auc_log)

In [None]:
# #PR Curve Logistic Regression

# prec, rec, _ = precision_recall_curve(y_test, y_proba)
# plt.figure(figsize=(5,4))
# plt.plot(rec, prec, label="LogReg")
# plt.xlabel("Recall")
# plt.ylabel("Precision")
# plt.title("Precision–Recall Curve")
# plt.legend()
# plt.show()

In [None]:
# Confusion Matrix
# Heat-map of the logistic-regression confusion matrix (`y_pred`), with the
# raw count written in the centre of every cell.

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(4, 4))
ax.imshow(cm, interpolation='nearest', cmap="Oranges")
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# imshow puts the row index on the y axis and the column index on the x axis,
# hence text(col, row, ...).
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(col, row, str(cm[row, col]), ha='center', va='center')

fig.tight_layout()
plt.show()

In [None]:
# SMOTE
# Resample the TRAINING split only; X_test / y_test are not passed to SMOTE.

sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Report the shape and the number of positive (Class == 1) rows before and after.
for caption, feats, target in (
    ('Before SMOTE:', X_train, y_train),
    ('After SMOTE:', X_train_sm, y_train_sm),
):
    print(caption, feats.shape, target.sum())

In [None]:
# SMOTE + Logistic Regression
# Same evaluation as the baseline, but fitted on the SMOTE-resampled training data.

logsm = LogisticRegression(max_iter=2000, solver='lbfgs').fit(X_train_sm, y_train_sm)

# Evaluate on the original (non-resampled) test split.
y_proba_logsm = logsm.predict_proba(X_test)[:, 1]
y_pred_logsm = logsm.predict(X_test)

prec_logsm, rec_logsm, _ = precision_recall_curve(y_test, y_proba_logsm)
pr_auc_logsm = auc(rec_logsm, prec_logsm)

print(
    'Logistic Regression with SMOTE Oversampling',
    classification_report(y_test, y_pred_logsm, digits=4),
    sep='\n',
)
print('ROC-AUC:', roc_auc_score(y_test, y_proba_logsm))
print('PR-AUC:', pr_auc_logsm)

In [None]:
# SMOTE + Random Forest
# Random forest fitted on the SMOTE-resampled training data, scored on the
# original test split.

rfsm = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train_sm, y_train_sm)

y_proba_rfsm = rfsm.predict_proba(X_test)[:, 1]
y_pred_rfsm = rfsm.predict(X_test)

prec_rfsm, rec_rfsm, _ = precision_recall_curve(y_test, y_proba_rfsm)
pr_auc_rfsm = auc(rec_rfsm, prec_rfsm)

print(
    'Random Forest with SMOTE Oversampling',
    classification_report(y_test, y_pred_rfsm, digits=4),
    sep='\n',
)
print('ROC-AUC:', roc_auc_score(y_test, y_proba_rfsm))
print('PR-AUC:', pr_auc_rfsm)

In [None]:
# Random Forest
# Class-weighted random forest fitted on the original (non-resampled) training split.

rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

# Trapezoidal area under the precision-recall curve is reported as PR-AUC below.
prec_rf, rec_rf, _ = precision_recall_curve(y_test, y_prob_rf)

print("Random Forest")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, digits=4))

print('ROC-AUC:', roc_auc_score(y_test, y_prob_rf))
print('PR-AUC:', auc(rec_rf, prec_rf))

In [None]:
# Isolation Forest
# Unsupervised anomaly detector: fitted on the training features only (no labels).
# NOTE(review): contamination=0.0017 presumably mirrors the fraud rate in the
# data — confirm against pf['Class'].mean().

iso = IsolationForest(contamination=0.0017, random_state=42, n_jobs=-1)

iso.fit(X_train)

# predict() returns -1 for outliers and +1 for inliers; map that onto the
# notebook's convention of 1 = fraud, 0 = non-fraud.
y_pred_iso = np.where(iso.predict(X_test) == -1, 1, 0)

# Continuous anomaly score for the ranking metrics. score_samples() is LOWER
# for more abnormal points, so negate it to get "higher = more fraud-like".
y_score_iso = -iso.score_samples(X_test)

# The PR curve must be built from the continuous score; hard 0/1 predictions
# give a degenerate curve with a single operating point.
prec_iso, rec_iso, _ = precision_recall_curve(y_test, y_score_iso)

print('Isolation Forest')
print(confusion_matrix(y_test, y_pred_iso))
print(classification_report(y_test, y_pred_iso, digits=4))

# Use this model's own score. The previous version passed `y_proba`, the
# logistic-regression probabilities, and so reported the wrong model's ROC-AUC.
print('ROC-AUC:', roc_auc_score(y_test, y_score_iso))
print('PR-AUC:', auc(rec_iso, prec_iso))

In [None]:
# Table Comparison
# The table is derived from the live predictions instead of hand-copied
# numbers, so it cannot drift out of sync with the cells above on a re-run.
# (The previously hard-coded Isolation Forest ROC-AUC was a copy of the
# logistic-regression value.)


def _fraud_metrics(y_true, y_hat, y_score):
    """Compute the comparison metrics for the positive (fraud = 1) class.

    Parameters
    ----------
    y_true : true 0/1 labels.
    y_hat : hard 0/1 predictions.
    y_score : continuous score where higher means "more likely fraud".

    Returns
    -------
    dict mapping metric name to a fraction in [0, 1]. Ratios whose
    denominator is zero are reported as 0.0 rather than raising.
    """
    # labels=[0, 1] pins the matrix layout so ravel() is always tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0.0
    curve_prec, curve_rec, _ = precision_recall_curve(y_true, y_score)
    return {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc_score(y_true, y_score),
        'PR-AUC': auc(curve_rec, curve_prec),
    }


# (hard predictions, continuous score) per model, in display order.
model_outputs = {
    'Logistic Regression': (y_pred, y_proba),
    'Logistic Regression + SMOTE': (y_pred_logsm, y_proba_logsm),
    'Random Forest': (y_pred_rf, y_prob_rf),
    'Random Forest + SMOTE': (y_pred_rfsm, y_proba_rfsm),
    # score_samples() is lower for anomalies, so negate it for "higher = fraud".
    'Isolation Forest': (y_pred_iso, -iso.score_samples(X_test)),
}

metric_names = ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']

# Same dict-of-lists layout as before; values are percentages with 2 decimals.
comparisons = {"Model": list(model_outputs)}
comparisons.update({metric: [] for metric in metric_names})
for y_hat, y_score in model_outputs.values():
    scores = _fraud_metrics(y_test, y_hat, y_score)
    for metric in metric_names:
        comparisons[metric].append(round(100 * float(scores[metric]), 2))

df_comp = pd.DataFrame(comparisons)

print(df_comp)

#

In [None]:
# Bar Chart Comparison
# One bar chart per headline metric, one bar per model.

df_comp = pd.DataFrame(comparisons)

# Coerce every metric column to a numeric dtype before plotting.
metric_columns = ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']
df_comp[metric_columns] = df_comp[metric_columns].apply(pd.to_numeric)

# Precision, Recall and F1 Score share the same figure layout.
for metric in ('Precision', 'Recall', 'F1-Score'):
    plt.figure(figsize=(10, 6))
    plt.bar(df_comp['Model'], df_comp[metric], color='skyblue')
    plt.ylabel(metric)
    plt.title(f"{metric} Comparison")
    plt.xticks(rotation=20)
    plt.show()

In [None]:
# # Conclusion

# - Firstly, the data is highly imbalanced, which makes PR-AUC and recall more meaningful evaluation metrics than the others, including accuracy.
# - Logistic regression achieved a high recall rate, but the downside is its low precision. If the priority is not missing any fraud, it might be a
#   good model, but we have to keep in mind that a high flag rate means the customer (the user who is being protected by the system) might stop treating flags as important after a couple of false alarms.
# - Logistic regression with SMOTE did improve precision over plain logistic regression, but the precision is still not good enough for users to rely on.
# - Random forest achieved high precision with good recall. It also returned a high PR-AUC score. This indicates that random forest is a better model than logistic regression.
# - Random forest with SMOTE improved recall over the original random forest. It also returned slightly higher ROC-AUC and PR-AUC scores, meaning random forest with SMOTE is a better fit as the prediction model than random forest without SMOTE.
# - Isolation forest significantly underperformed compared to the other models.