# Model Building and Training

In this notebook, we train a baseline Logistic Regression model and an ensemble Random Forest model to detect fraud. We handle class imbalance using `class_weight='balanced'` and evaluate performance using AUC-PR and F1-Score.

In [None]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Add src to path so the project-local imports that follow can be resolved.
# The relative '..' is turned into an absolute path against the kernel's
# current working directory, so this presumably expects the notebook to be run
# from its own folder (a sibling of `src`) — TODO confirm.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_loader import load_data
from modeling import train_baseline_model, train_ensemble_model, stratified_cross_validation, save_model
from evaluation import evaluate_model, compare_models

In [None]:
# Load Processed Data
df = load_data('../data/processed/Fraud_Data_Processed.csv')

# Fail fast with an actionable message. Quietly skipping the split when the
# load fails would leave X / y / X_train / X_test / y_train / y_test undefined,
# and every later cell would then die with an unrelated NameError.
if df is None:
    raise RuntimeError(
        "load_data returned None for "
        "'../data/processed/Fraud_Data_Processed.csv'; make sure the processed "
        "dataset exists before running the rest of this notebook."
    )

# Define features and target.
# Features are every column except the target (`class`), the user identifier,
# and the raw timestamp / device columns, which should not be used as features.
# errors='ignore' tolerates any of these columns being absent from the file.
X = df.drop(columns=['class', 'user_id', 'signup_time', 'purchase_time', 'device_id'], errors='ignore')
y = df['class']

# Stratified Train-Test Split: hold out 20% of the rows, keep the class ratio
# the same in both parts (stratify=y), and fix the seed so the split is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 1. Baseline Model: Logistic Regression

In [None]:
# Fit the baseline on the training split only.
baseline_model = train_baseline_model(X_train, y_train)

# Score it on the held-out test split; the label is passed through to
# evaluate_model and is reused as the key in the comparison step further down.
baseline_label = "Baseline (LR)"
baseline_results = evaluate_model(
    baseline_model,
    X_test,
    y_test,
    baseline_label,
)

## 2. Ensemble Model: Random Forest

In [None]:
# Fit the ensemble on the same training split used for the baseline.
ensemble_model = train_ensemble_model(X_train, y_train)

# Score it on the same held-out test split so both result sets are comparable.
ensemble_label = "Ensemble (RF)"
ensemble_results = evaluate_model(
    ensemble_model,
    X_test,
    y_test,
    ensemble_label,
)

## 3. Stratified Cross-Validation

In [None]:
print("Running Cross-Validation for Random Forest...")

# Cross-validate an unfitted copy rather than `ensemble_model` itself.
# `ensemble_model` is the fitted object that gets persisted at the end of the
# notebook; if the CV helper calls .fit() on the estimator it is handed, the
# saved model would silently become one trained on a CV fold instead of on
# X_train. clone() copies the hyper-parameters only, and safe=False makes it
# fall back to a deep copy if the object is not a scikit-learn estimator.
# NOTE(review): whether stratified_cross_validation refits its argument in
# place is not visible from this notebook — the copy is safe either way.
# NOTE(review): CV runs on the full X / y, which includes the hold-out test
# rows — confirm this is intended.
cv_estimator = clone(ensemble_model, safe=False)
cv_results = stratified_cross_validation(cv_estimator, X, y)

print(f"CV F1: {cv_results['f1_mean']:.4f} +/- {cv_results['f1_std']:.4f}")
print(f"CV AUC-PR: {cv_results['auc_pr_mean']:.4f} +/- {cv_results['auc_pr_std']:.4f}")

## 4. Model Comparison

In [None]:
# Collect the hold-out results of both models under their display labels,
# baseline first, then hand the mapping to compare_models.
comparison = dict([
    ('Baseline (LR)', baseline_results),
    ('Ensemble (RF)', ensemble_results),
])
compare_models(comparison)

## 5. Save Best Model

In [None]:
# Business justification: the Random Forest is persisted as the production
# model on the expectation that it captures more complex patterns than the
# Logistic Regression baseline.
# NOTE(review): this choice is hard-coded and not derived from the comparison
# above — confirm the metrics actually favour the ensemble before shipping.
model_dir = '../models'
model_path = '../models/best_model.joblib'

# Create the output folder if needed; exist_ok=True makes re-runs harmless.
os.makedirs(model_dir, exist_ok=True)
save_model(ensemble_model, model_path)