In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, confusion_matrix, 
                           roc_auc_score, average_precision_score, f1_score)
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("FRAUD DETECTION MODELING - SIMPLE VERSION")
print("=" * 80)

# Create synthetic numeric data.
# Seed NumPy's global RNG so the data below is reproducible. Every draw
# consumes the same shared stream, so reordering any of the calls below
# changes all values generated after it.
np.random.seed(42)

# E-commerce data: five numeric features plus a binary label.
n_fraud = 15000
X_fraud = pd.DataFrame({
    'feature1': np.random.randn(n_fraud),
    'feature2': np.random.exponential(1, n_fraud),
    'feature3': np.random.randn(n_fraud),
    'feature4': np.random.randint(0, 2, n_fraud),  # binary flag: 0 or 1
    'feature5': np.random.randn(n_fraud),
})
# NOTE: the labels are sampled independently of X_fraud (about 1.5% positives),
# so the features carry no information about the label and any metric computed
# from them is expected to sit near chance level.
y_fraud = np.random.choice([0, 1], n_fraud, p=[0.985, 0.015])

# Credit card data: five standard-normal columns plus an exponential 'Amount'.
n_credit = 100000
X_credit = pd.DataFrame({
    'V1': np.random.randn(n_credit),
    'V2': np.random.randn(n_credit),
    'V3': np.random.randn(n_credit),
    'V4': np.random.randn(n_credit),
    'V5': np.random.randn(n_credit),
    'Amount': np.random.exponential(100, n_credit),
})
# As above, labels are independent of the features (about 0.17% positives).
y_credit = np.random.choice([0, 1], n_credit, p=[0.9983, 0.0017])

# The status marker was mis-encoded (UTF-8 bytes decoded as cp1252); restored
# to the intended U+2705 check mark.
print("✅ Synthetic data created")
print(f"E-commerce: {X_fraud.shape}, Fraud rate: {y_fraud.mean():.3%}")
print(f"Credit Card: {X_credit.shape}, Fraud rate: {y_credit.mean():.5%}")

# Train-test split
# Both datasets use the same hold-out fraction (20%) and seed; each split is
# stratified on its own label array.
split_kwargs = dict(test_size=0.2, random_state=42)

(X_fraud_train, X_fraud_test,
 y_fraud_train, y_fraud_test) = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, **split_kwargs
)

(X_credit_train, X_credit_test,
 y_credit_train, y_credit_test) = train_test_split(
    X_credit, y_credit, stratify=y_credit, **split_kwargs
)

# Train Logistic Regression
print("\n" + "=" * 80)
print("TRAINING LOGISTIC REGRESSION")
print("=" * 80)

# Fit and evaluate one class-weighted logistic regression per dataset.
for (X_train, X_test, y_train, y_test, name) in [
    (X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, "E-commerce"),
    (X_credit_train, X_credit_test, y_credit_train, y_credit_test, "Credit Card")
]:
    # The chart marker was mis-encoded (UTF-8 bytes decoded as cp1252);
    # restored to the intended U+1F4CA bar-chart emoji.
    print(f"\n📊 {name} Data:")

    # Scale features: the scaler is fitted on the training split only and then
    # applied unchanged to the test split, so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    lr = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    lr.fit(X_train_scaled, y_train)

    # Predict: hard labels for F1, positive-class probabilities (column 1)
    # for the ranking metrics.
    y_pred = lr.predict(X_test_scaled)
    y_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]

    # Metrics
    print(f"  ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(f"  Avg Precision: {average_precision_score(y_test, y_pred_proba):.4f}")
    # zero_division=0 states explicitly that an undefined F1 (no predicted or
    # no true positives) is reported as 0 instead of relying on the default
    # "warn" behaviour with warnings silenced.
    print(f"  F1-Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")

# Train Random Forest
print("\n" + "=" * 80)
print("TRAINING RANDOM FOREST")
print("=" * 80)

# Fit and evaluate one class-weighted random forest per dataset. The raw
# (unscaled) features are used here.
for (X_train, X_test, y_train, y_test, name) in [
    (X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, "E-commerce"),
    (X_credit_train, X_credit_test, y_credit_train, y_credit_test, "Credit Card")
]:
    # The chart marker was mis-encoded (UTF-8 bytes decoded as cp1252);
    # restored to the intended U+1F4CA bar-chart emoji.
    print(f"\n📊 {name} Data:")

    # Train model
    rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced_subsample')
    rf.fit(X_train, y_train)

    # Predict: hard labels for F1, positive-class probabilities (column 1)
    # for the ranking metrics.
    y_pred = rf.predict(X_test)
    y_pred_proba = rf.predict_proba(X_test)[:, 1]

    # Metrics
    print(f"  ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(f"  Avg Precision: {average_precision_score(y_test, y_pred_proba):.4f}")
    # zero_division=0 states explicitly that an undefined F1 (e.g. the model
    # predicts no positives at all) is reported as 0 instead of relying on the
    # default "warn" behaviour with warnings silenced.
    print(f"  F1-Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")

# Closing banner. The status marker was mis-encoded (UTF-8 bytes decoded as
# cp1252); restored to the intended U+2705 check mark.
print("\n" + "=" * 80)
print("✅ MODELING COMPLETE")
print("=" * 80)

FRAUD DETECTION MODELING - SIMPLE VERSION
✅ Synthetic data created
E-commerce: (15000, 5), Fraud rate: 1.380%
Credit Card: (100000, 6), Fraud rate: 0.16400%

TRAINING LOGISTIC REGRESSION

📊 E-commerce Data:
  ROC-AUC: 0.6013
  Avg Precision: 0.0256
  F1-Score: 0.0352

📊 Credit Card Data:
  ROC-AUC: 0.6257
  Avg Precision: 0.0028
  F1-Score: 0.0045

TRAINING RANDOM FOREST

📊 E-commerce Data:
  ROC-AUC: 0.5925
  Avg Precision: 0.0195
  F1-Score: 0.0000

📊 Credit Card Data:
  ROC-AUC: 0.4823
  Avg Precision: 0.0022
  F1-Score: 0.0000

✅ MODELING COMPLETE
