# Credit Card Fraud Detection Model

## Model training

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGBC
import multiprocessing as mp
from sklearn.ensemble import VotingClassifier
from utils import clean_data, KFold, predict, feature_importance

Load and process train data

Data preprocessing (feature engineering and selection) and data cleaning are implemented in `utils.py`.

In [3]:
# Load train data from Kaggle dataset [Credit Card Transactions Fraud Detection Dataset]
# Dataset created by KARTIK SHENOY and available under CC0
# The path is relative to the directory the notebook is run from.
df_train = pd.read_csv('data/fraudTrain.csv')

# Clean train data
# clean_data is imported from utils.py; its two return values are used by the
# following cells as the feature matrix (X) and the labels (y) passed to fit().
X, y = clean_data(df_train)

Train Random Forest models with different class weights and evaluate each one with K-fold cross-validation

In [4]:
## Random Forest
# Candidate class weights. None leaves the classes unweighted; each dict
# up-weights label 1 (the minority class in the reports) relative to label 0.
rfc_class_weights = [None, {0: 1, 1: 50}, {0: 1, 1: 75}, {0: 1, 1: 100}]

# One classifier per entry of rfc_class_weights, kept in the same order so a
# position in this list identifies the class weight that was used.
rf_classifiers = []

# Hyperparameters shared by every candidate; only class_weight varies per model.
rf_base_params = {
    'n_estimators': 50,
    'max_depth': 20,
    'random_state': 42,  # fixed seed so the forests are reproducible
    'n_jobs': -1,        # all available cores, same setting as the XGBoost models
}

for cw in rfc_class_weights:
    rf_params = {**rf_base_params, 'class_weight': cw}

    rf_classifier = RFC(**rf_params)
    rf_classifiers.append(rf_classifier)

    # KFold here is the cross-validation helper imported from utils.py
    # (not sklearn.model_selection.KFold); it is given the model, the class
    # weight used, and the training data.
    KFold(rf_classifier, cw, X, y)

Class weights: None
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1289169
           1       0.90      0.68      0.77      7506

    accuracy                           1.00   1296675
   macro avg       0.95      0.84      0.89   1296675
weighted avg       1.00      1.00      1.00   1296675

Class weights: {0: 1, 1: 50}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1289169
           1       0.78      0.75      0.77      7506

    accuracy                           1.00   1296675
   macro avg       0.89      0.88      0.88   1296675
weighted avg       1.00      1.00      1.00   1296675

Class weights: {0: 1, 1: 75}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1289169
           1       0.75      0.77      0.76      7506

    accuracy                           1.00   1296675
   macro avg       0.87      0.88      0.88   1296

Repeat the same procedure for XGBoost, varying `scale_pos_weight` instead of `class_weight`

In [None]:
## XGBoost
# Values tried for scale_pos_weight; None keeps the library default.
xgbc_class_weights = [None, 10, 20, 30]

# Classifiers are collected in the same order as xgbc_class_weights.
xgb_classifiers = []

for cw in xgbc_class_weights:
    # Only scale_pos_weight changes between candidates; everything else is fixed.
    xgb_params = dict(
        max_depth=20,
        n_estimators=50,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=cw,
        random_state=42,
        n_jobs=-1,
    )

    xgb_classifier = XGBC(**xgb_params)
    xgb_classifiers.append(xgb_classifier)

    # Cross-validate this candidate with the KFold helper from utils.py
    KFold(xgb_classifier, cw, X, y)

All models look good and will be used for deployment; they are stored in the lists `rf_classifiers` and `xgb_classifiers`.

## Deployment on test data

Load and process test data

In [None]:
# Load test data from Kaggle dataset [Credit Card Transactions Fraud Detection Dataset]
# Dataset created by KARTIK SHENOY and available under CC0
# The path is relative to the directory the notebook is run from.
df_test = pd.read_csv('data/fraudTest.csv')

# Clean test data
# Same clean_data helper (utils.py) that was applied to the training data, so
# the test set goes through the same preprocessing call.
X_test, y_test = clean_data(df_test)

Make predictions with Random Forest models and XGBoost models respectively.

Savings are calculated as *total detected fraud amount* minus *total cost of false detections*.

Random Forest models:

In [None]:
# Predict with Random Forest and keep the model with the highest savings.
# NOTE(review): the "best" model is chosen using test-set results, so the
# reported savings of the winner are optimistic; a separate validation split
# would give an unbiased estimate.
rfc_best = None
savings_best = None
for idx, rfc in enumerate(rf_classifiers):

    # Fit the model on the full training set
    rfc.fit(X, y)

    # Predict on the test set
    y_pred = rfc.predict(X_test)

    # predict() comes from utils.py; its return value is compared as the
    # model's total savings (presumably it also prints the evaluation).
    total_savings = predict(y_pred, rfc_class_weights, rfc, X_test, y_test, idx)

    # Compare against None explicitly: scikit-learn ensembles define __len__,
    # so the truthiness of an estimator is not a reliable "not set yet" test.
    if rfc_best is None or total_savings > savings_best:
        rfc_best = rfc
        savings_best = total_savings

Feature importance of best Random Forest model:

In [None]:
# Feature importance of the selected Random Forest; feature_importance is a
# helper from utils.py that receives the fitted model and the training features.
feature_importance(rfc_best, X)

XGBoost models:

In [None]:
# Predict with XGBoost and keep the model with the highest savings.
# NOTE(review): the "best" model is chosen using test-set results, so the
# reported savings of the winner are optimistic; a separate validation split
# would give an unbiased estimate.
xgbc_best = None
savings_best = None
for idx, xgbc in enumerate(xgb_classifiers):

    # Fit the model on the full training set
    xgbc.fit(X, y)

    # Predict on the test set
    y_pred = xgbc.predict(X_test)

    # predict() comes from utils.py; its return value is compared as the
    # model's total savings (presumably it also prints the evaluation).
    total_savings = predict(y_pred, xgbc_class_weights, xgbc, X_test, y_test, idx)

    # Explicit None check instead of relying on the truthiness of an estimator.
    if xgbc_best is None or total_savings > savings_best:
        xgbc_best = xgbc
        savings_best = total_savings

Feature importance of best XGBoost model:

In [None]:
# Feature importance of the selected XGBoost model; feature_importance is a
# helper from utils.py that receives the fitted model and the training features.
feature_importance(xgbc_best, X)

Combine both models into an ensemble learning model and make predictions

In [None]:
# Soft-voting ensemble built from the selected Random Forest and XGBoost models
ensemble_members = [('random_forest', rfc_best), ('xgboost', xgbc_best)]
voting_classifier = VotingClassifier(
    estimators=ensemble_members,
    weights=[1, 1],  # both members contribute equally
    voting='soft',
)

# Fit the ensemble on the entire training dataset
voting_classifier.fit(X, y)

# Predict on the test set and evaluate with the same predict() helper used for
# the individual models (no class-weight list or index applies here).
ensemble_preds = voting_classifier.predict(X_test)
predict(ensemble_preds, None, voting_classifier, X_test, y_test, None)

Best model is achieved.