In [2]:
"""
Credit Card Fraud Detection (Optimized for Speed)
Reduced Dataset, Lightweight Models, Fast SMOTE, and Early Exit
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix, roc_auc_score, roc_curve, auc,
    f1_score, precision_score, recall_score, accuracy_score
)
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

# XGBoost is an optional dependency: record whether the import succeeded in
# XGBOOST_AVAILABLE instead of failing, and tell the user how to install it.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")

# Seed NumPy's global random generator so np.random.* draws are repeatable.
np.random.seed(42)

# ============================================================================
# 1. DATA CREATION (REDUCED SIZE)
# ============================================================================
def load_credit_card_data(n_samples=20000, n_features=28, fraud_ratio=0.005):
    """Generate a shuffled synthetic credit-card transaction dataset.

    Legitimate rows are drawn from a standard normal distribution; fraud rows
    are drawn from a normal distribution scaled by 1.1 and shifted by 0.3.
    A random ``Time`` (integer in [0, 86400)) and an exponentially distributed
    ``Amount`` (scale 100) are appended, followed by the ``Class`` label.

    All draws use NumPy's global random state, so repeatability depends on the
    caller seeding ``np.random`` beforehand. The final row shuffle uses a fixed
    ``random_state=42``.

    Args:
        n_samples: Total number of rows to generate (default 20000).
        n_features: Number of anonymous ``V1..Vn`` feature columns (default 28).
        fraud_ratio: Fraction of rows labelled as fraud, strictly between
            0 and 1 (default 0.005, i.e. 0.5%).

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and ``n_features + 3``
        columns: ``V1..Vn``, ``Time``, ``Amount`` and ``Class`` (0.0 for
        legitimate, 1.0 for fraud).

    Raises:
        ValueError: If ``n_samples`` or ``n_features`` is not positive, or if
            ``fraud_ratio`` is not strictly between 0 and 1.
    """
    if n_samples <= 0 or n_features <= 0:
        raise ValueError("n_samples and n_features must be positive")
    if not 0 < fraud_ratio < 1:
        raise ValueError("fraud_ratio must be strictly between 0 and 1")

    print("=" * 60)
    print("STEP 1: DATA GENERATION (OPTIMIZED SMALLER SYNTHETIC DATA)")
    print("=" * 60)

    # Derive the legitimate count from the fraud count so the two always add
    # up to n_samples; truncating both products independently can drop a row
    # to floating-point error (e.g. 100 * 0.29 == 28.999999999999996).
    n_fraud = round(n_samples * fraud_ratio)
    n_legit = n_samples - n_fraud

    # Keep the draw order (legit features, fraud features, Time, Amount) so a
    # seeded run with default arguments reproduces the same data as before.
    X_legit = np.random.randn(n_legit, n_features)
    y_legit = np.zeros(n_legit)
    X_fraud = np.random.randn(n_fraud, n_features) * 1.1 + 0.3
    y_fraud = np.ones(n_fraud)

    X = np.vstack([X_legit, X_fraud])
    y = np.hstack([y_legit, y_fraud])
    feature_names = [f'V{i}' for i in range(1, n_features + 1)]
    df = pd.DataFrame(X, columns=feature_names)
    df['Time'] = np.random.randint(0, 86400, len(df))
    # Exponential draws are already non-negative, so no abs() is needed.
    df['Amount'] = np.random.exponential(100, len(df))
    df['Class'] = y
    # Shuffle so the fraud rows are not all clustered at the end.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"✓ Dataset Shape: {df.shape}")
    print(f"✓ Legitimate: {(df['Class']==0).sum()} | Fraud: {(df['Class']==1).sum()}")
    print(f"✓ Fraud Ratio: {100 * df['Class'].mean():.3f}%")
    return df

# ============================================================================
# 2. PREPROCESSING (SIMPLE SCALING + FAST SMOTE)
# ============================================================================
def preprocess_data(df):
    """Split, standardize and rebalance the dataset for training.

    The ``Class`` column is used as the target and every other column as a
    feature. The data is split 80/20 with stratification on the target, the
    scaler is fitted on the training split only, and SMOTE oversampling is
    applied to the scaled training split only (the test split is left
    untouched).

    Args:
        df: DataFrame containing the feature columns and a ``Class`` column.

    Returns:
        Tuple ``(X_train_res, X_test_scaled, y_train_res, y_test)``: the
        resampled scaled training features, the scaled test features, the
        resampled training labels and the original test labels.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("STEP 2: PREPROCESSING")
    print(rule)

    target = df["Class"]
    features = df.drop(columns="Class")

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it for the test split.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    # Oversample the scaled training data; k_neighbors=2 keeps SMOTE cheap.
    oversampler = SMOTE(random_state=42, k_neighbors=2)
    train_balanced, labels_balanced = oversampler.fit_resample(train_scaled, y_train)

    print(f"✓ Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"✓ After SMOTE: {len(labels_balanced)} samples, Fraud Ratio: {labels_balanced.mean()*100:.1f}%")
    return train_balanced, test_scaled, labels_balanced, y_test

# ============================================================================
# 3. MODEL TRAINING (LIGHTWEIGHT MODELS)
# ============================================================================
def train_models(X_train, y_train):
    """Fit a fixed set of small classifiers on the training data.

    Logistic Regression, Random Forest and a single-hidden-layer MLP are
    always trained; an XGBoost classifier is added when ``XGBOOST_AVAILABLE``
    is true.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Dict mapping a display name to the fitted model, in training order.
    """
    print("\n" + "=" * 60)
    print("STEP 3: TRAINING LIGHTWEIGHT MODELS")
    print("=" * 60)

    log_reg = LogisticRegression(C=0.2, max_iter=500, random_state=42)
    forest = RandomForestClassifier(
        n_estimators=50,
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
    )
    mlp = MLPClassifier(
        hidden_layer_sizes=(16,),
        alpha=0.003,
        max_iter=100,
        early_stopping=True,
        random_state=42,
    )

    models = {
        "Logistic Regression": log_reg,
        "Random Forest": forest,
        "Neural Network": mlp,
    }

    # XGBoost is optional; only add it when the import succeeded.
    if XGBOOST_AVAILABLE:
        models["XGBoost"] = xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            learning_rate=0.15,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1,
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
        )

    for label in models:
        print(f"→ Training {label} ...")
        models[label].fit(X_train, y_train)

    print("✓ Models Trained Successfully!")
    return models

# ============================================================================
# 4. EVALUATION (FAST + CLEAN OUTPUT)
# ============================================================================
def evaluate_models(models, X_test, y_test):
    """Score every fitted model on the test split and print a comparison.

    For each model, hard predictions are used for accuracy, precision, recall
    and F1, and the positive-class probability (column 1 of
    ``predict_proba``) is used for AUC-ROC.

    Args:
        models: Dict mapping a display name to a fitted classifier exposing
            ``predict`` and ``predict_proba``.
        X_test: Test feature matrix.
        y_test: True test labels.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score`` and
        ``AUC-ROC``, sorted by ``AUC-ROC`` in descending order.
    """
    print("\n" + "=" * 60)
    print("STEP 4: MODEL EVALUATION")
    print("=" * 60)

    rows = []
    for label, clf in models.items():
        predicted = clf.predict(X_test)
        fraud_proba = clf.predict_proba(X_test)[:, 1]

        row = {
            "Model": label,
            "Accuracy": accuracy_score(y_test, predicted),
            "Precision": precision_score(y_test, predicted, zero_division=0),
            "Recall": recall_score(y_test, predicted),
            "F1-Score": f1_score(y_test, predicted),
            "AUC-ROC": roc_auc_score(y_test, fraud_proba),
        }
        rows.append(row)

        print(
            f"{label}: AUC={row['AUC-ROC']:.3f}, "
            f"F1={row['F1-Score']:.3f}, Recall={row['Recall']:.3f}"
        )

    # Best AUC first.
    results_df = pd.DataFrame(rows).sort_values(by="AUC-ROC", ascending=False)
    print("\nFinal Comparison:\n", results_df)
    return results_df

# ============================================================================
# 5. MAIN
# ============================================================================
def main():
    """Run the whole pipeline: generate data, preprocess, train, evaluate."""
    rule = "=" * 30
    print("\n" + rule)
    print("CREDIT CARD FRAUD DETECTION ")
    print(rule)

    X_train, X_test, y_train, y_test = preprocess_data(load_credit_card_data())
    evaluate_models(train_models(X_train, y_train), X_test, y_test)

    # Closing summary lines.
    for line in (
        "\n✓ Completed in under ~15 seconds (depending on CPU speed)",
        "✓ Reduced sample size & tuned parameters for speed",
        "✓ SMOTE and lightweight models used efficiently",
    ):
        print(line)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



CREDIT CARD FRAUD DETECTION 
STEP 1: DATA GENERATION (OPTIMIZED SMALLER SYNTHETIC DATA)
✓ Dataset Shape: (20000, 31)
✓ Legitimate: 19900 | Fraud: 100
✓ Fraud Ratio: 0.500%

STEP 2: PREPROCESSING
✓ Train: (16000, 30), Test: (4000, 30)
✓ After SMOTE: 31840 samples, Fraud Ratio: 50.0%

STEP 3: TRAINING LIGHTWEIGHT MODELS
→ Training Logistic Regression ...
→ Training Random Forest ...
→ Training Neural Network ...
→ Training XGBoost ...
✓ Models Trained Successfully!

STEP 4: MODEL EVALUATION
Logistic Regression: AUC=0.817, F1=0.039, Recall=0.800
Random Forest: AUC=0.681, F1=0.019, Recall=0.150
Neural Network: AUC=0.600, F1=0.023, Recall=0.050
XGBoost: AUC=0.730, F1=0.026, Recall=0.250

Final Comparison:
                  Model  Accuracy  Precision  Recall  F1-Score   AUC-ROC
0  Logistic Regression   0.80100   0.019802    0.80  0.038647  0.817161
3              XGBoost   0.90675   0.013774    0.25  0.026110  0.729987
1        Random Forest   0.92325   0.010239    0.15  0.019169  0.680578
