In [1]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# `src.*` imports below can be resolved.
# NOTE(review): this relies on the kernel's working directory being exactly
# one level below the project root — confirm for other launch locations.
PROJECT_ROOT = os.path.abspath(os.pardir)
if PROJECT_ROOT not in sys.path:  # skip if already present (e.g. on re-run)
    sys.path.append(PROJECT_ROOT)

import pandas as pd
from src.data_split import split_data
from src.baseline_logistic import train_logistic
from src.random_forest import train_random_forest
from src.metrics import evaluate_model
from src.cross_validation import stratified_cv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# --- Data loading and train/test split -------------------------------------

# Label column passed to split_data.
TARGET = "class"

# Columns stripped from the feature frames before any model sees them.
# NOTE(review): the name says these are non-numeric; the dtypes are not
# visible from this cell — confirm against the processed CSV.
NON_NUMERIC_COLS = ["signup_time", "purchase_time", "device_id"]

# Read the processed dataset (path is relative to the working directory).
df = pd.read_csv("../data/processed/fraud_processed.csv")

# Split first, then remove the listed columns from both feature frames.
X_train, X_test, y_train, y_test = split_data(df, TARGET)
X_train, X_test = (
    X_train.drop(columns=NON_NUMERIC_COLS),
    X_test.drop(columns=NON_NUMERIC_COLS),
)



# --- Hold-out evaluation -----------------------------------------------------
# Each model is fitted on the training split and then scored on the test
# split; whatever evaluate_model returns is printed as-is.

# Baseline: logistic regression.
# NOTE(review): in the recorded output of this cell the logistic AUC_PR
# (~0.097) is close to the positive rate of the test split (~0.094, from the
# confusion matrix) — worth checking feature scaling / convergence inside
# train_logistic.
log_model = train_logistic(X_train, y_train)
log_results = evaluate_model(log_model, X_test, y_test)
print("Logistic Regression Results:", log_results, sep="\n")

# Random forest, evaluated the same way.
rf_model = train_random_forest(X_train, y_train)
rf_results = evaluate_model(rf_model, X_test, y_test)
print("\nRandom Forest Results:", rf_results, sep="\n")

# --- Cross-validation on the training split ---------------------------------
# Fresh, unfitted estimators are handed to stratified_cv together with the
# training features and labels.
# NOTE(review): the hyperparameters below are hard-coded in this cell;
# presumably they should mirror what train_logistic / train_random_forest
# use — verify against src.

log_cv = stratified_cv(
    LogisticRegression(class_weight="balanced", max_iter=1000),
    X_train,
    y_train,
)

rf_cv = stratified_cv(
    RandomForestClassifier(
        random_state=42,  # fixed seed for the forest
        class_weight="balanced",
        max_depth=12,
        n_estimators=200,
    ),
    X_train,
    y_train,
)

# Report both result objects under a single heading.
print(
    "\nCross-Validation Results:",
    f"Logistic: {log_cv}",
    f"Random Forest: {rf_cv}",
    sep="\n",
)


Logistic Regression Results:
{'AUC_PR': 0.09701511588125071, 'F1': 0.1534314103057845, 'Confusion_Matrix': array([[18236,  9157],
       [ 1834,   996]])}

Random Forest Results:
{'AUC_PR': 0.6350872632009363, 'F1': 0.5789281808239154, 'Confusion_Matrix': array([[26325,  1068],
       [ 1242,  1588]])}

Cross-Validation Results:
Logistic: {'F1_mean': np.float64(0.14714130819727558), 'F1_std': np.float64(0.012338047384237676), 'AUC_PR_mean': np.float64(0.09402119037523553), 'AUC_PR_std': np.float64(0.001466100679856417)}
Random Forest: {'F1_mean': np.float64(0.5422076055412205), 'F1_std': np.float64(0.01814816115461241), 'AUC_PR_mean': np.float64(0.628356110312205), 'AUC_PR_std': np.float64(0.010764825692522849)}
