In [1]:
# Standard-library and third-party imports used by the notebook.
import os
import sys
import pandas as pd
# Extend the module search path with ../src before the project import on the
# next line (presumably that is where Classical_ML_Fruaddata lives — confirm).
sys.path.append('../src')
from Classical_ML_Fruaddata import FraudModelingPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the balanced fraud dataset, draw a reproducible sample of it, and train
# the baseline classifiers. `pd` and `FraudModelingPipeline` come from the
# import cell at the top of the notebook, so pandas is not re-imported here.

# Tunables for this cell, named so they are easy to find and adjust.
FRAUD_CSV_PATH = "../Data/balanced_fraud_df.csv"
DATETIME_COLUMNS = ['signup_time', 'purchase_time']
SAMPLE_FRACTION = 0.2  # share of rows handed to the pipeline; adjust as needed
RANDOM_STATE = 42      # fixed seed so the sample is the same on every run

# Step 1: Define data types for columns (excluding datetime columns)
data_types = {
    'user_id': 'int64',
    'purchase_value': 'float64',
    'device_id': 'object',
    'source': 'object',
    'browser': 'object',
    'sex': 'category',
    'age': 'int64',
    'ip_address': 'object',
    'class': 'int64',
    'ip_numeric': 'int64',
    'country': 'category'
}

# Step 2: Load the fraud dataset with specified data types
Fraud_df = pd.read_csv(
    FRAUD_CSV_PATH,
    dtype=data_types,
    parse_dates=DATETIME_COLUMNS  # Parse these columns as dates
)

# read_csv leaves a column as plain strings when some of its values cannot be
# parsed as dates, so force the conversion; unparseable values become NaT.
for col in DATETIME_COLUMNS:
    Fraud_df[col] = pd.to_datetime(Fraud_df[col], errors='coerce')

# Coercion is silent, so surface any missing/unparseable timestamps instead of
# letting NaT values flow unnoticed into the models.
missing_timestamps = Fraud_df[DATETIME_COLUMNS].isna().sum()
if missing_timestamps.any():
    print("Warning: missing or unparseable timestamps per column:")
    print(missing_timestamps[missing_timestamps > 0])

# Step 3: Check data types of the loaded DataFrame
print(Fraud_df.dtypes)

# Step 4: Optionally sample the dataset if it's too large
Fraud_sample = Fraud_df.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)

# Step 5: Initialize the fraud modeling pipeline
Fraud_pipeline = FraudModelingPipeline(Fraud_sample, target_column='class', dataset_name="Fraud")

# Step 6: Train the models
Fraud_pipeline.train_models()

user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value           float64
device_id                 object
source                    object
browser                   object
sex                     category
age                        int64
ip_address                object
class                      int64
ip_numeric                 int64
country                 category
dtype: object
--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.49      0.49      0.49       544
           1       0.53      0.52      0.53       589

    accuracy                           0.51      1133
   macro avg       0.51      0.51      0.51      1133
weighted avg       0.51      0.51      0.51      1133





--- Decision Tree ---
              precision    recall  f1-score   support

           0       0.55      0.54      0.54       544
           1       0.58      0.59      0.58       589

    accuracy                           0.56      1133
   macro avg       0.56      0.56      0.56      1133
weighted avg       0.56      0.56      0.56      1133





--- Random Forest ---
              precision    recall  f1-score   support

           0       0.56      0.58      0.57       544
           1       0.60      0.58      0.59       589

    accuracy                           0.58      1133
   macro avg       0.58      0.58      0.58      1133
weighted avg       0.58      0.58      0.58      1133





--- Gradient Boosting ---
              precision    recall  f1-score   support

           0       0.53      0.51      0.52       544
           1       0.56      0.58      0.57       589

    accuracy                           0.55      1133
   macro avg       0.55      0.55      0.55      1133
weighted avg       0.55      0.55      0.55      1133





In [3]:
# Tune the models of the pipeline that was built and trained in the previous
# cell. Per the recorded output below, this runs a search of 50 candidates x
# 5 folds (250 fits) each for Random Forest and Gradient Boosting, then prints
# the best parameters and a classification report for each tuned model.
Fraud_pipeline.hyperparameter_tuning()

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Random Forest Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': True}
--- Random Forest (Tuned) ---
              precision    recall  f1-score   support

           0       0.57      0.56      0.56       544
           1       0.60      0.60      0.60       589

    accuracy                           0.58      1133
   macro avg       0.58      0.58      0.58      1133
weighted avg       0.58      0.58      0.58      1133





Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Gradient Boosting Parameters: {'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 7, 'learning_rate': 0.05}
--- Gradient Boosting (Tuned) ---
              precision    recall  f1-score   support

           0       0.55      0.57      0.56       544
           1       0.59      0.58      0.58       589

    accuracy                           0.57      1133
   macro avg       0.57      0.57      0.57      1133
weighted avg       0.57      0.57      0.57      1133



