In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,  confusion_matrix, roc_auc_score

In [30]:
def clean_data(df):
    df = df.copy()
    df.drop_duplicates(inplace=True)
    df.fillna(0, inplace=True)
    df = df.drop(['nameOrig', 'nameDest'], axis=1)
    return df


In [32]:
path = 'Fraud.csv'
df = pd.read_csv(path)
df = clean_data(df)

In [33]:
label_encoders = {}
categorical_cols = ['type']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [34]:
X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df['isFraud']


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [36]:
model = LGBMClassifier(is_unbalance=True, random_state=42)
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 3564, number of negative: 4209999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 4213563, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000846 -> initscore=-7.074334
[LightGBM] [Info] Start training from score -7.074334


In [37]:
joblib.dump(model, 'fraud_detection_model.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')

['label_encoders.joblib']

In [38]:
preds = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
print("ROC AUC Score:", roc_auc_score(y_test, preds))


Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.89      0.94   1804287
         1.0       0.01      0.91      0.01      1527

    accuracy                           0.89   1805814
   macro avg       0.50      0.90      0.48   1805814
weighted avg       1.00      0.89      0.94   1805814

Confusion Matrix:
 [[1602686  201601]
 [    143    1384]]
ROC AUC Score: 0.8973089417294304


In [39]:
importances = pd.Series(model.feature_importances_, index=X.columns)
print("Top features predicting fraud:\n", importances.sort_values(ascending=False).head(10))


Top features predicting fraud:
 oldbalanceOrg     751
step              724
oldbalanceDest    659
newbalanceDest    440
amount            344
newbalanceOrig     69
type               13
dtype: int32


In [40]:
def predict_fraud(txn_df):
    model = joblib.load('fraud_detection_model.joblib')
    encoders = joblib.load('label_encoders.joblib')

    for col in txn_df.columns:
        if col in encoders:
            txn_df[col] = encoders[col].transform(txn_df[col].astype(str))

    return model.predict(txn_df)[0]
