In [1]:
# 1. Import Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

import pickle

In [2]:
# 2. Load Dataset
#
# Read the transaction log CSV into a DataFrame. The path is relative, so it
# resolves against the notebook's current working directory.
dataset_path = "../dataset/PS_20174392719_1491204439457_log.csv"
data = pd.read_csv(dataset_path)

# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# 3. Drop Unnecessary Columns
#
# Remove the two account-name columns and the isFlaggedFraud column so they
# are not part of the feature matrix built later from `data`.
# NOTE: `data` is rebound here, so re-running this cell without reloading
# the CSV raises a KeyError (the columns are already gone).
columns_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
data = data.drop(columns=columns_to_drop)

In [4]:
# 4. Encode Categorical Column
#
# Replace the string transaction `type` with integer codes. The fitted
# encoder is kept in `label_encoder` because it is pickled in a later cell.
# NOTE: this overwrites data['type'] in place; re-running the cell without
# reloading the data would refit the encoder on the integer codes instead of
# the original strings.
label_encoder = LabelEncoder()
type_codes = label_encoder.fit_transform(data['type'])
data['type'] = type_codes

In [5]:
# 5. Feature Engineering
#
# Add two derived balance-change columns to `data`.

# Origin account: old balance minus new balance.
data['balance_diff_org'] = data['oldbalanceOrg'].sub(data['newbalanceOrig'])

# Destination account: new balance minus old balance.
data['balance_diff_dest'] = data['newbalanceDest'].sub(data['oldbalanceDest'])

In [6]:
# 6. Split Data
#
# Separate the label column from the features, then hold out 20% of the rows
# for testing.
target_column = 'isFraud'
y = data[target_column]
X = data.drop(columns=[target_column])

# stratify=y keeps the isFraud class proportions the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# 7. Train model
#
# Fit a 100-tree random forest on the training split.
# - class_weight="balanced" reweights the classes inversely to their
#   frequency in y_train.
# - random_state=42 makes the fitted forest reproducible.
# - n_jobs=-1 builds the trees in parallel on all available cores. With a
#   fixed random_state the fitted trees do not depend on n_jobs, so this only
#   shortens training time.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
)

rf.fit(X_train, y_train)

# Put n_jobs back to its default so the estimator that is evaluated and
# pickled in later cells carries the same parameters as a forest trained
# without parallelism. set_params returns the estimator, so it is still
# displayed as the cell output.
rf.set_params(n_jobs=None)

In [8]:
# 8. Evaluate
#
# Score the fitted forest on the held-out test split.

# Hard class predictions, plus the predicted probability taken from column 1
# of predict_proba (used for ROC-AUC).
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("Random Forest Results")

# Metrics computed from the hard predictions, printed in a fixed order.
label_metrics = [
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
]
for label, metric in label_metrics:
    print(label, metric(y_test, y_pred))

# ROC-AUC is computed from the probabilities rather than the hard labels.
print("ROC-AUC  :", roc_auc_score(y_test, y_prob))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Random Forest Results
Accuracy : 0.999741458707262
Precision: 0.9666193181818182
Recall   : 0.8283627510651248
F1 Score : 0.892166502785972
ROC-AUC  : 0.9972074514004172

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.97      0.83      0.89      1643

    accuracy                           1.00   1272524
   macro avg       0.98      0.91      0.95   1272524
weighted avg       1.00      1.00      1.00   1272524



In [9]:
# 9. Save the model
#
# Serialize the fitted random forest to model.pkl in the current working
# directory. Only unpickle this file from a trusted source: loading a pickle
# can execute arbitrary code.
with open("model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

print("Model Saved Successfully!")

Model Saved Successfully!


In [10]:
# 10. Save the Label Encoder
#
# Serialize the encoder fitted on the `type` column to label_encoder.pkl in
# the current working directory, alongside the model file.
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Encoder Saved Successfully!")

Encoder Saved Successfully!
