In [2]:
import pandas as pd
import numpy as np
import joblib
from joblib import load
from joblib import dump
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [3]:
# Load raw data
path = '../data/paysim_synthetic.csv'
raw_data = pd.read_csv(path)

In [4]:
# Feature engineering function
def engineer_features(data):
    df = data.copy()

    # Extract account types
    df['origAccountType'] = data['nameOrig'].str[0]
    df['destAccountType'] = data['nameDest'].str[0]

    # Flag for Customer-to-Customer transactions
    df['C_to_C'] = ((df['origAccountType'] == 'C') & (df['destAccountType'] == 'C')).astype(int)

    # Log-transform amount to reduce skewness
    df['logAmount'] = np.log1p(data['amount'])

    # Balance features
    df['oldbalanceDiff'] = data['oldbalanceOrg'] - data['oldbalanceDest']
    df['newbalanceDiff'] = data['newbalanceOrig'] - data['newbalanceDest']
    df['balanceChangeOrig'] = data['newbalanceOrig'] - data['oldbalanceOrg']
    df['balanceChangeDest'] = data['newbalanceDest'] - data['oldbalanceDest']

    # Check if transaction zeroes out the origin account
    df['isOriginZeroed'] = (data['oldbalanceOrg'] > 0) & (data['newbalanceOrig'] == 0)

    # Ratio of transaction amount to old balance
    # To avoid division by zero, we'll add a small epsilon
    epsilon = 1e-10
    df['amountToOldBalanceRatio'] = data['amount'] / (data['oldbalanceOrg'] + epsilon)

    # Is the transaction amount suspiciously close to the total balance?
    df['isAmountCloseToBalance'] = (
        abs(data['amount'] - data['oldbalanceOrg']) / (data['oldbalanceOrg'] + epsilon) < 0.05
    ) & (data['oldbalanceOrg'] > 0)

    # Error/oddity detection using np.isclose for balance check
    df['errorBalanceOrig'] = ~np.isclose(data['newbalanceOrig'] + data['amount'], data['oldbalanceOrg'])
    df['errorBalanceDest'] = ~np.isclose(data['oldbalanceDest'] + data['amount'], data['newbalanceDest'])

    # One-hot encode categorical features
    df = pd.get_dummies(df, columns=['type', 'origAccountType', 'destAccountType'], drop_first=True)

    return df

In [5]:
# Clean and select features
def preprocess_data(data):
    data = engineer_features(data)
    data = data.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud', 'step', 'balanceChangeDest'], errors='ignore')
    return data

In [6]:
# Apply to full dataset
df_processed = preprocess_data(raw_data)

In [7]:
# Separate target and features
X = df_processed.drop("isFraud", axis=1)
y = df_processed["isFraud"]

In [8]:
# Split (for evaluation only — not part of pipeline)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [10]:
# Load best model
best_model = load("../models/best_RandomForestClassifier.joblib")

In [11]:
# Wrap preprocessing and model into pipeline
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

preprocessing_pipeline.fit(X_train)

full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', best_model)
])

In [12]:
# Fit full pipeline
full_pipeline.fit(X_train, y_train)

In [13]:
# Evaluate pipeline
y_pred_pipeline = full_pipeline.predict(X_test)
y_prob_pipeline = full_pipeline.predict_proba(X_test)[:, 1]

pipeline_accuracy = accuracy_score(y_test, y_pred_pipeline)
pipeline_f1 = f1_score(y_test, y_pred_pipeline)
pipeline_auc = roc_auc_score(y_test, y_prob_pipeline)

print("\nPipeline Performance:")
print(f"Accuracy: {pipeline_accuracy:.4f}")
print(f"F1 Score: {pipeline_f1:.4f}")
print(f"ROC AUC: {pipeline_auc:.4f}")


📊 Pipeline Performance:
Accuracy: 1.0000
F1 Score: 0.9985
ROC AUC: 0.9988


In [17]:
# Save final pipeline
joblib.dump(full_pipeline, "../models/fraud_detection_model.pkl")
print("Final pipeline saved to '../models/fraud_detection_model.pkl'")

Final pipeline saved to 'fraud_detection_model.pkl'


In [16]:
# Save feature order used during training for future inference
feature_order = X_train.columns.tolist()
joblib.dump(feature_order, "../models/feature_order.pkl")
print("Feature order saved to '../models/feature_order.pkl'")

Feature order saved to 'feature_order.pkl'


In [24]:
# Create a feature transformation function for new data
def preprocess_data(df, feature_order=feature_order):
    # Apply the same feature engineering as before
    processed_df = engineer_features(df)

    # Ensure all expected columns are present
    missing_cols = [col for col in feature_order if col not in processed_df.columns]
    for col in missing_cols:
        processed_df[col] = 0

    # Reorder to match training set
    processed_df = processed_df[feature_order]

    return processed_df

# Create a function to make predictions on new data
def predict_fraud(transaction_data, model=None):
    """
    Make fraud predictions on new transaction data.

    Parameters:
    -----------
    transaction_data : pandas DataFrame
        New transaction data with the same format as the original dataset

    Returns:
    --------
    DataFrame with original data plus fraud predictions and probabilities
    """
    # Preprocess the data
    processed_data = preprocess_data(transaction_data)

    # Load the model
    if model is None:
        model = joblib.load('../models/fraud_detection_model.pkl')

    # Make predictions
    fraud_proba = model.predict_proba(processed_data)[:, 1]
    fraud_pred = model.predict(processed_data)

    # Add predictions to the original data
    result = transaction_data.copy()
    result['fraud_probability'] = fraud_proba
    result['fraud_prediction'] = fraud_pred

    return result

In [27]:
# Test the prediction function on a sample of test data
model = joblib.load('../models/fraud_detection_model.pkl')
sample_test = raw_data.iloc[:5].copy()

sample_predictions = predict_fraud(sample_test, model=model)

sample_predictions[['step', 'type', 'amount', 'nameOrig', 'nameDest', 'fraud_probability', 'fraud_prediction']]

Unnamed: 0,step,type,amount,nameOrig,nameDest,fraud_probability,fraud_prediction
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0.0,0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0.0,0
2,1,TRANSFER,181.0,C1305486145,C553264065,1.0,1
3,1,CASH_OUT,181.0,C840083671,C38997010,0.87498,1
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0.0,0
