# ML Model Training
## Credit Card Fraud Detection

This notebook trains multiple ML models using the modules in the project's `src` folder.


In [2]:
# Import necessary libraries
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path('..')))
from src.data_loader import DataLoader
from src.models import FraudDetectionModels

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline


## 1. Load and Preprocess Data


In [3]:
# Initialize data loader
data_loader = DataLoader(data_dir='../data')

# Prefer the de-duplicated CSV written by the Data Cleaning notebook; when it is
# missing, fall back to the original CSV and drop duplicate rows here.
cleaned_csv_path = Path('../data/creditcard_cleaned.csv')
if cleaned_csv_path.exists():
    print(f"✓ Loading cleaned data from: {cleaned_csv_path.absolute()}")
    df_clean = pd.read_csv(cleaned_csv_path)
    print(f"  - Cleaned dataset: {len(df_clean):,} transactions (duplicates removed)")
else:
    print("⚠ WARNING: Cleaned data not found!")
    print("Loading original data and removing duplicates...")
    # NOTE(review): the original CSV may not have the same columns as the
    # cleaned one - confirm that preprocess_data handles both.
    df_original = data_loader.load_csv_data('creditcard.csv')
    df_clean = df_original.drop_duplicates(keep='first')
    # drop_duplicates(keep='first') removes exactly the rows duplicated() flags,
    # so the length difference is the duplicate count without a second scan.
    print(f"  - Removed {len(df_original) - len(df_clean):,} duplicates")

# Split into train/test and scale (the preprocess_data log reports both steps)
print("\n✓ Preprocessing cleaned dataset for ML training...")
X_train, X_test, y_train, y_test, feature_cols = data_loader.preprocess_data(
    df_clean,  # Using cleaned dataset
    target_col='Class',
    test_size=0.2,
    random_state=42
)

print("\n✓ Preprocessing complete:")
print(f"  Training set: {X_train.shape[0]:,} samples, {X_train.shape[1]} features")
print(f"  Test set: {X_test.shape[0]:,} samples, {X_test.shape[1]} features")
print(f"  Total features: {len(feature_cols)}")

# Tally the training labels once and reuse the result for both classes
class_counts = y_train.value_counts()
class_shares = y_train.value_counts(normalize=True)
print("\n  Training class distribution:")
print(f"    Normal (0): {class_counts[0]:,} ({class_shares[0]*100:.2f}%)")
print(f"    Fraud (1):  {class_counts[1]:,} ({class_shares[1]*100:.2f}%)")


✓ Loading cleaned data from: d:\h\Financial Fraud Detection-AI\transactions\notebooks\..\data\creditcard_cleaned.csv
  - Cleaned dataset: 283,726 transactions (duplicates removed)

✓ Preprocessing cleaned dataset for ML training...

Preprocessing data...
Features: 29
Feature columns: ['V1', 'V2', 'V3', 'V4', 'V5']... (showing first 5)

Train set: 226980 samples
  - Fraud: 378 (0.17%)
Test set: 56746 samples
  - Fraud: 95 (0.17%)

Scaling features...

✓ Preprocessing complete:
  Training set: 226,980 samples, 29 features
  Test set: 56,746 samples, 29 features
  Total features: 29

  Training class distribution:
    Normal (0): 226,602 (99.83%)
    Fraud (1):  378 (0.17%)


## 2. Initialize Models


In [4]:
# Create the trainer; it is constructed with '../models' as its models_dir
models = FraudDetectionModels(models_dir='../models')

# List the model families trained in the cells below
print("Models initialized. Available models:")
for model_name in ("Logistic Regression", "Decision Tree", "Random Forest", "XGBoost"):
    print(f"  - {model_name}")


Models initialized. Available models:
  - Logistic Regression
  - Decision Tree
  - Random Forest
  - XGBoost


## 3. Train Baseline Models


In [5]:
# Baseline 1: logistic regression, trained with use_smote=False and
# class_weight='balanced' to counter the class imbalance.
lr_model = models.logistic_regression(X_train, y_train, use_smote=False, class_weight='balanced')

# Persist the fitted model under the key 'logistic_regression'
models.save_model('logistic_regression', lr_model)



Training Logistic Regression...
Using class_weight=balanced
Model saved to ..\models\logistic_regression.pkl


In [6]:
# Baseline 2: decision tree, trained with use_smote=False and
# class_weight='balanced' to counter the class imbalance.
dt_model = models.decision_tree(X_train, y_train, use_smote=False, class_weight='balanced')

# Persist the fitted model under the key 'decision_tree'
models.save_model('decision_tree', dt_model)



Training Decision Tree...
Using class_weight=balanced
Model saved to ..\models\decision_tree.pkl


## 4. Train Advanced Models


In [7]:
# Random forest with balanced class weights; tune_hyperparameters=False is
# passed, so no tuning is requested here.
rf_model = models.random_forest(
    X_train,
    y_train,
    use_smote=False,
    class_weight='balanced',
    tune_hyperparameters=False,
)

# Persist the fitted model under the key 'random_forest'
models.save_model('random_forest', rf_model)



Training Random Forest...
Using class_weight=balanced
Model saved to ..\models\random_forest.pkl


In [8]:
# XGBoost without SMOTE or tuning; no class_weight is passed here (the training
# log reports a scale_pos_weight instead).
xgb_model = models.xgboost_model(X_train, y_train, use_smote=False, tune_hyperparameters=False)

# Persist the fitted model under the key 'xgboost'
models.save_model('xgboost', xgb_model)



Training XGBoost...
Using scale_pos_weight=599.48
Model saved to ..\models\xgboost.pkl


## 5. Save Scaler and Feature Columns


In [10]:
# Save the scaler and the feature column list next to the trained models for
# later use (Streamlit app). `joblib` is imported in the notebook's import cell.
scaler = data_loader.get_scaler()
models_dir = Path('../models')
# parents=True: also create missing intermediate directories instead of failing
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(scaler, models_dir / 'scaler.pkl')
joblib.dump(feature_cols, models_dir / 'feature_columns.pkl')

print("✓ Saved preprocessing artifacts for model inference:")
print("  - scaler.pkl (StandardScaler fitted on cleaned data)")
print("  - feature_columns.pkl (feature column names)")
print(f"  Location: {models_dir.absolute()}")
print("\n  These will be used by the Streamlit app for predictions")


✓ Saved preprocessing artifacts for model inference:
  - scaler.pkl (StandardScaler fitted on cleaned data)
  - feature_columns.pkl (feature column names)
  Location: d:\h\Financial Fraud Detection-AI\transactions\notebooks\..\models

  These will be used by the Streamlit app for predictions


## 6. Training Summary


In [11]:
# Human-readable recap of this notebook's run. Only the sample and feature
# counts are computed; every other line is static text.
print("="*70)
print("ML TRAINING SUMMARY")
print("="*70)
print("\n1. Data Pipeline:")
print("   ✓ Used cleaned dataset from 01b_DataCleaning.ipynb")
print("   ✓ Applied preprocessing from 02_Preprocessing_FeatureEngineering.ipynb")
print("   ✓ All models trained on cleaned, preprocessed data")
print("\n2. Training Data:")
print(f"   - Training samples: {X_train.shape[0]:,}")
print(f"   - Test samples: {X_test.shape[0]:,}")
print(f"   - Features: {len(feature_cols)}")
# Every training call in this notebook passes use_smote=False, so the summary
# must not claim that SMOTE was applied.
print("   - Class imbalance handled with class weights / scale_pos_weight (SMOTE not used)")
print("\n3. Models Trained:")
print("   ✓ Logistic Regression (with class weights)")
print("   ✓ Decision Tree (with class weights)")
print("   ✓ Random Forest (with class weights)")
print("   ✓ XGBoost (with scale_pos_weight)")
print("\n4. Models Saved:")
print("   - Location: ../models/")
print("   - All models ready for evaluation")
print("\n5. Next Steps:")
print("   - Run 04_ML_Evaluation.ipynb to evaluate all models")
print("   - Compare model performance metrics")
print("   - Select best model for production")
print("="*70)


ML TRAINING SUMMARY

1. Data Pipeline:
   ✓ Used cleaned dataset from 01b_DataCleaning.ipynb
   ✓ Applied preprocessing from 02_Preprocessing_FeatureEngineering.ipynb
   ✓ All models trained on cleaned, preprocessed data

2. Training Data:
   - Training samples: 226,980
   - Test samples: 56,746
   - Features: 29
   - Class imbalance handled with class weights/SMOTE

3. Models Trained:
   ✓ Logistic Regression (with class weights)
   ✓ Decision Tree (with class weights)
   ✓ Random Forest (with class weights)
   ✓ XGBoost (with scale_pos_weight)

4. Models Saved:
   - Location: ../models/
   - All models ready for evaluation

5. Next Steps:
   - Run 04_ML_Evaluation.ipynb to evaluate all models
   - Compare model performance metrics
   - Select best model for production
