In [3]:
import sys, os
sys.path.append(os.path.abspath(".."))  # go one level up to project root

from src.preprocessing import FraudDataProcessor
from src.models import FraudDetectionModels
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# End-to-end training run: load the data, derive features, carve out a
# validation set, resample the training portion, fit every model, evaluate
# on the held-out test set, then persist the models and the processor.

# Locations are relative to the notebook's working directory.
RAW_DATA_CSV = '../data/creditcard.csv'
PROCESSOR_PICKLE = '../models/preprocessor.pkl'

# --- Setup ---------------------------------------------------------------
processor = FraudDataProcessor()
fraud_models = FraudDetectionModels()

# --- Data ----------------------------------------------------------------
# load_and_split hands back the train/test partitions in one call.
X_train, X_test, y_train, y_test = processor.load_and_split(RAW_DATA_CSV)

# --- Features ------------------------------------------------------------
# Train is transformed first, then test, through the same processor method.
X_train, X_test = (
    processor.engineer_features(X_train),
    processor.engineer_features(X_test),
)

# --- Validation hold-out ---------------------------------------------------
# 20% of the training partition, stratified on the label, fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y_train)
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, **split_options
)

# --- Resampling ------------------------------------------------------------
# Only the training split is resampled; validation and test are left as-is.
X_res, y_res = processor.apply_smote(X_train_split, y_train_split)

# --- Training --------------------------------------------------------------
# The boosted models also receive the validation pair.
validation_pair = (X_val, y_val)
fraud_models.train_xgboost(X_res, y_res, *validation_pair)
fraud_models.train_lightgbm(X_res, y_res, *validation_pair)
fraud_models.train_random_forest(X_res, y_res)

# The two remaining models are fitted on the un-resampled rows labelled 0
# only (presumably the non-fraud class -- confirm against the dataset).
label_zero_mask = y_train_split == 0
fraud_models.train_isolation_forest(X_train_split[label_zero_mask])
fraud_models.train_autoencoder(X_train_split[label_zero_mask], X_val)

# --- Evaluation ------------------------------------------------------------
# Every entry registered in fraud_models.models is scored on the test set.
for model_name, fitted_model in fraud_models.models.items():
    fraud_models.evaluate_model(fitted_model, X_test, y_test, model_name)

# --- Ensemble --------------------------------------------------------------
print("\n🔹 Ensemble Results")
ensemble_pred = fraud_models.ensemble_predict(X_test)
ensemble_report = classification_report(y_test, ensemble_pred)
print(ensemble_report)

# --- Persistence -----------------------------------------------------------
fraud_models.save_models()
processor.save_processor(PROCESSOR_PICKLE)
print("✅ All models and processor saved!")


✅ After SMOTE - Fraud cases: 79608
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 79608, number of negative: 159216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8417
[LightGBM] [Info] Number of data points in the train set: 238824, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.992506
Early stopping, best iteration is:
[20]	valid_0's auc: 0.993709
Training Random Forest...
Training Isolation Forest...
Training Autoencoder...

Evaluating xgboost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.82      0.82      0.82       148

    accurac

  saving_api.save_model(
