# 🔧 Feature Engineering for Credit Card Fraud Detection

## MLZoomcamp Capstone Project - Feature Engineering

---

### 📋 Overview

This notebook implements comprehensive feature engineering based on insights from our EDA. We'll create new features, handle scaling, and prepare the data for our PyTorch models.

### 🎯 Objectives

1. **Create temporal features** from the Time column
2. **Engineer amount-based features** with various transformations
3. **Generate interaction features** between important variables
4. **Apply scaling strategies** appropriate for anomaly detection
5. **Prepare train/validation/test splits** in chronological (time-based) order to avoid temporal leakage; the fraud rate therefore differs slightly between splits
6. **Create PyTorch-ready datasets** for model training

---

## 1. Environment Setup

In [1]:
# Core libraries
import os
import warnings
import pickle
import json
from datetime import datetime
from pathlib import Path
warnings.filterwarnings('ignore')

# Data manipulation
import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import boxcox1p

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.preprocessing import (
    StandardScaler, RobustScaler, MinMaxScaler, 
    QuantileTransformer, PowerTransformer
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader

# Set random seeds
np.random.seed(42)
torch.manual_seed(42)

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Custom color palette
COLORS = {
    'normal': '#2E7D32',
    'fraud': '#C62828',
    'primary': '#1565C0',
    'secondary': '#FF6F00'
}

print("‚úÖ Environment setup complete")
print(f"üî¢ NumPy version: {np.__version__}")
print(f"üêº Pandas version: {pd.__version__}")
print(f"üî• PyTorch version: {torch.__version__}")
print(f"üíæ CUDA available: {torch.cuda.is_available()}")
print(f"üíæ MPS available: {torch.mps.is_available()}")

‚úÖ Environment setup complete
üî¢ NumPy version: 2.4.0
üêº Pandas version: 2.3.3
üî• PyTorch version: 2.9.1
üíæ CUDA available: False
üíæ MPS available: True


## 2. Load Data and Previous Insights

In [2]:
# Read the raw transactions (path is relative to the notebook's working directory).
data_path = '../data/creditcard.csv'
df = pd.read_csv(data_path)
print(f"‚úÖ Data loaded: {df.shape[0]:,} transactions √ó {df.shape[1]} features")

# EDA insights are optional: when the pickle is absent the notebook continues
# with eda_insights = None and later cells fall back to default feature lists.
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# was produced by the trusted EDA notebook.
try:
    insights_file = open('eda_insights.pkl', 'rb')
except FileNotFoundError:
    print("‚ö†Ô∏è EDA insights not found. Running without prior insights.")
    eda_insights = None
else:
    with insights_file:
        eda_insights = pickle.load(insights_file)
    print("‚úÖ EDA insights loaded successfully")
    print(f"   Top features identified: {eda_insights['top_features'][:5]}")

‚úÖ Data loaded: 284,807 transactions √ó 31 features
‚úÖ EDA insights loaded successfully
   Top features identified: ['V17', 'V14', 'V12', 'V10', 'V16']


In [3]:
# Engineer features on a separate copy so the raw frame `df` stays untouched.
df_fe = df.copy()

target = df_fe['Class']
print(f"\nüìä Class Distribution:")
print(target.value_counts())
print(f"Fraud percentage: {target.mean()*100:.3f}%")


üìä Class Distribution:
Class
0    284315
1       492
Name: count, dtype: int64
Fraud percentage: 0.173%


## 3. Temporal Feature Engineering

In [4]:
class TemporalFeatureEngineer:
    """Engineer time-based features from the ``Time`` column.

    ``Time`` is interpreted as seconds (it is divided by 3600 to obtain hours).
    ``fit`` records the mean, standard deviation and maximum of ``Time``;
    ``transform`` reads those fitted statistics, so it must run after ``fit``.
    """

    def __init__(self):
        # Filled by fit(): keys 'mean', 'std' and 'max' of the Time column.
        self.time_stats = {}

    @staticmethod
    def _is_usable_divisor(value):
        """Return True when ``value`` is finite and non-zero, i.e. safe to divide by."""
        return bool(np.isfinite(value)) and value != 0

    def fit(self, df):
        """Record mean, std and max of ``df['Time']`` and return ``self``."""
        time = df['Time']
        self.time_stats['mean'] = time.mean()
        self.time_stats['std'] = time.std()
        self.time_stats['max'] = time.max()
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the temporal feature columns appended.

        Added columns: ``Hour``, ``Day``, ``Hour_sin``, ``Hour_cos``,
        ``TimeOfDay`` (categorical), one ``TOD_*`` dummy per time-of-day
        category, ``Time_normalized``, ``Time_zscore``, ``Is_Weekend``,
        ``Is_Business_Hours`` and ``Is_Late_Night``.

        When the fitted max or std is zero or not finite (constant or
        single-row training data), the corresponding scaled column is set to
        0.0 instead of NaN/inf, mirroring the IQR guard used for
        ``Amount_robust`` in ``AmountFeatureEngineer``.
        """
        df_temp = df.copy()

        # Basic time transformations: fractional hour of day and whole-day index.
        df_temp['Hour'] = (df_temp['Time'] / 3600) % 24
        df_temp['Day'] = np.floor(df_temp['Time'] / (3600 * 24))

        # Cyclical encoding so that 23:59 and 00:00 end up close together.
        hour_angle = 2 * np.pi * df_temp['Hour'] / 24
        df_temp['Hour_sin'] = np.sin(hour_angle)
        df_temp['Hour_cos'] = np.cos(hour_angle)

        # Time-of-day categories; bins are right-closed and include hour 0.
        df_temp['TimeOfDay'] = pd.cut(
            df_temp['Hour'],
            bins=[0, 6, 12, 18, 24],
            labels=['Night', 'Morning', 'Afternoon', 'Evening'],
            include_lowest=True
        )

        # One-hot encode time of day.
        time_dummies = pd.get_dummies(df_temp['TimeOfDay'], prefix='TOD')
        df_temp = pd.concat([df_temp, time_dummies], axis=1)

        # Time relative to the fitted maximum (0-1 on the fitted data).
        max_time = self.time_stats['max']
        if self._is_usable_divisor(max_time):
            df_temp['Time_normalized'] = df_temp['Time'] / max_time
        else:
            df_temp['Time_normalized'] = 0.0

        # Time z-score against the fitted mean/std.
        std_time = self.time_stats['std']
        if self._is_usable_divisor(std_time):
            df_temp['Time_zscore'] = (df_temp['Time'] - self.time_stats['mean']) / std_time
        else:
            df_temp['Time_zscore'] = 0.0

        # Is weekend (assuming data starts on Monday).
        df_temp['Is_Weekend'] = (df_temp['Day'] % 7).isin([5, 6]).astype(int)

        # Business hours indicator (9 AM - 5 PM, both bounds inclusive).
        df_temp['Is_Business_Hours'] = df_temp['Hour'].between(9, 17).astype(int)

        # Late night indicator (12 AM - 6 AM, both bounds inclusive).
        df_temp['Is_Late_Night'] = df_temp['Hour'].between(0, 6).astype(int)

        return df_temp

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df).transform(df)

# Apply temporal feature engineering
temporal_engineer = TemporalFeatureEngineer()
df_fe = temporal_engineer.fit_transform(df_fe)

# Display new temporal features
temporal_features = ['Hour', 'Day', 'Hour_sin', 'Hour_cos', 'Time_normalized', 
                     'Time_zscore', 'Is_Weekend', 'Is_Business_Hours', 'Is_Late_Night']
print("‚úÖ Temporal features created:")
for feat in temporal_features:
    if feat in df_fe.columns:
        # Select the summary rows by label. Integer keys on a label-indexed
        # Series are treated as positions only through deprecated behaviour.
        summary = df_fe[feat].describe().loc[['mean', 'std', '50%', 'max']]
        print(f"   - {feat}: {summary.values}")

‚úÖ Temporal features created:
   - Hour: [14.53795075  5.84706125 15.01083333 23.99944444]
   - Day: [0.49163469 0.49993089 0.         1.        ]
   - Hour_sin: [-0.26678667  0.62813187 -0.4356239   1.        ]
   - Hour_cos: [-0.14073287  0.71726751 -0.31192039  1.        ]
   - Time_normalized: [0.54871672 0.27482838 0.49013843 1.        ]
   - Time_zscore: [ 0.          1.         -0.21314497  1.64205485]
   - Is_Weekend: [0. 0. 0. 0.]
   - Is_Business_Hours: [0.45492913 0.49796535 0.         1.        ]
   - Is_Late_Night: [0.08403937 0.27744733 0.         1.        ]


In [5]:
# Visualize temporal features
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Hour Distribution', 'Cyclical Hour Encoding',
                   'Business Hours vs Fraud', 'Weekend vs Fraud')
)

# Hour distribution. 'Hour' is a continuous value (fractional hours), so it is
# bucketed to whole hours first; grouping on the raw value would create one bar
# per distinct timestamp.
hour_bucket = np.floor(df_fe['Hour']).astype(int).rename('HourOfDay')
hour_counts = (
    df_fe.groupby([hour_bucket, 'Class']).size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)  # keep both classes even if one is absent
)
fig.add_trace(
    go.Bar(x=hour_counts.index, y=hour_counts[0], name='Normal', marker_color=COLORS['normal']),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=hour_counts.index, y=hour_counts[1], name='Fraud', marker_color=COLORS['fraud']),
    row=1, col=1
)

# Cyclical encoding
sample = df_fe.sample(1000)
fig.add_trace(
    go.Scatter(x=sample['Hour_cos'], y=sample['Hour_sin'], 
               mode='markers', marker=dict(color=sample['Hour'], colorscale='Viridis'),
               showlegend=False),
    row=1, col=2
)

# Business hours analysis. Reindexing to [0, 1] keeps the values aligned with
# the two x labels even when one category has no rows (its rate is then NaN).
biz_fraud = (
    df_fe.groupby(['Is_Business_Hours', 'Class']).size()
    .unstack(fill_value=0)
    .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
)
biz_fraud_pct = biz_fraud.div(biz_fraud.sum(axis=1), axis=0) * 100
fig.add_trace(
    go.Bar(x=['Non-Business', 'Business'], y=biz_fraud_pct[1], 
           marker_color=COLORS['fraud'], showlegend=False),
    row=2, col=1
)

# Weekend analysis (same alignment as above)
weekend_fraud = (
    df_fe.groupby(['Is_Weekend', 'Class']).size()
    .unstack(fill_value=0)
    .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
)
weekend_fraud_pct = weekend_fraud.div(weekend_fraud.sum(axis=1), axis=0) * 100
fig.add_trace(
    go.Bar(x=['Weekday', 'Weekend'], y=weekend_fraud_pct[1], 
           marker_color=COLORS['fraud'], showlegend=False),
    row=2, col=2
)

fig.update_layout(height=700, title_text="Temporal Feature Analysis", showlegend=True)
fig.show()

## 4. Amount Feature Engineering

In [6]:
class AmountFeatureEngineer:
    """Engineer amount-based features with various transformations.

    ``fit`` records summary statistics and quantiles of ``Amount``;
    ``transform`` uses them for scaling and thresholding, so it must run
    after ``fit``.
    """

    def __init__(self):
        # Filled by fit(): 'mean', 'median', 'std', 'min', 'max' of Amount.
        self.amount_stats = {}
        # Filled by fit(): 'q25', 'q50', 'q75', 'q90', 'q95', 'q99' of Amount.
        self.quantiles = {}

    @staticmethod
    def _is_usable_divisor(value):
        """Return True when ``value`` is finite and non-zero, i.e. safe to divide by."""
        return bool(np.isfinite(value)) and value != 0

    def fit(self, df):
        """Record statistics and quantiles of ``df['Amount']`` and return ``self``."""
        amount = df['Amount']
        self.amount_stats = {
            'mean': amount.mean(),
            'median': amount.median(),
            'std': amount.std(),
            'min': amount.min(),
            'max': amount.max()
        }

        # Quantiles used for robust scaling and the high-amount flags.
        self.quantiles = {
            'q25': amount.quantile(0.25),
            'q50': amount.quantile(0.50),
            'q75': amount.quantile(0.75),
            'q90': amount.quantile(0.90),
            'q95': amount.quantile(0.95),
            'q99': amount.quantile(0.99)
        }
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the amount feature columns appended.

        Scaled columns whose fitted divisor (std, max - min, IQR) is zero or
        not finite are set to 0 instead of NaN/inf.
        """
        df_amt = df.copy()
        amount = df_amt['Amount']

        # Log transformation (log1p handles 0 values).
        df_amt['Amount_log'] = np.log1p(amount)

        # Square root transformation.
        df_amt['Amount_sqrt'] = np.sqrt(amount)

        # Box-Cox transformation of 1 + Amount (lambda=0.15 based on common practice).
        df_amt['Amount_boxcox'] = boxcox1p(amount, 0.15)

        # Z-score normalization against the fitted mean/std.
        std = self.amount_stats['std']
        if self._is_usable_divisor(std):
            df_amt['Amount_zscore'] = (amount - self.amount_stats['mean']) / std
        else:
            df_amt['Amount_zscore'] = 0.0

        # Min-max scaling against the fitted range.
        span = self.amount_stats['max'] - self.amount_stats['min']
        if self._is_usable_divisor(span):
            df_amt['Amount_minmax'] = (amount - self.amount_stats['min']) / span
        else:
            df_amt['Amount_minmax'] = 0.0

        # Robust scaling (using median and IQR).
        iqr = self.quantiles['q75'] - self.quantiles['q25']
        df_amt['Amount_robust'] = (amount - self.quantiles['q50']) / iqr if iqr > 0 else 0

        # Deviation from median.
        df_amt['Amount_median_dev'] = abs(amount - self.amount_stats['median'])

        # Is high amount (above the fitted 90th percentile).
        df_amt['Is_High_Amount'] = (amount > self.quantiles['q90']).astype(int)

        # Is very high amount (above the fitted 99th percentile).
        df_amt['Is_Very_High_Amount'] = (amount > self.quantiles['q99']).astype(int)

        # Is zero amount.
        df_amt['Is_Zero_Amount'] = (amount == 0).astype(int)

        # Amount bins (categorical). include_lowest=True puts Amount == 0 into
        # the '0-1' bin; without it zero-amount rows get a NaN bin and all of
        # their AmtBin_* dummies are 0.
        df_amt['Amount_Bin'] = pd.cut(
            amount,
            bins=[0, 1, 10, 50, 100, 200, 500, float('inf')],
            labels=['0-1', '1-10', '10-50', '50-100', '100-200', '200-500', '500+'],
            include_lowest=True
        )

        # One-hot encode amount bins.
        amount_dummies = pd.get_dummies(df_amt['Amount_Bin'], prefix='AmtBin')
        df_amt = pd.concat([df_amt, amount_dummies], axis=1)

        # Percentile rank.
        # NOTE(review): ranks are computed within the frame being transformed,
        # not against the fitted data, so the value depends on the batch.
        df_amt['Amount_percentile'] = df_amt['Amount'].rank(pct=True)

        return df_amt

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df).transform(df)

# Fit the amount engineer on the working frame and append its features.
amount_engineer = AmountFeatureEngineer()
df_fe = amount_engineer.fit_transform(df_fe)

# Summarise a selection of the new columns: mean/std for float columns,
# value counts for the integer indicator columns.
amount_features = ['Amount_log', 'Amount_sqrt', 'Amount_boxcox', 'Amount_zscore', 
                  'Amount_robust', 'Is_High_Amount', 'Is_Zero_Amount']
print("‚úÖ Amount features created:")
for feat in amount_features:
    if feat not in df_fe.columns:
        continue
    column = df_fe[feat]
    if column.dtype in ['float64', 'float32']:
        print(f"   - {feat}: mean={column.mean():.3f}, std={column.std():.3f}")
    else:
        print(f"   - {feat}: {column.value_counts().to_dict()}")

‚úÖ Amount features created:
   - Amount_log: mean=3.152, std=1.657
   - Amount_sqrt: mean=6.683, std=6.609
   - Amount_boxcox: mean=4.369, std=2.811
   - Amount_zscore: mean=-0.000, std=1.000
   - Amount_robust: mean=0.927, std=3.495
   - Is_High_Amount: {0: 256349, 1: 28458}
   - Is_Zero_Amount: {0: 282982, 1: 1825}


In [7]:
# One histogram per amount transformation, laid out on a 2 x 3 grid.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=('Original Amount', 'Log Transform', 'Square Root',
                   'Box-Cox', 'Z-Score', 'Robust Scaling')
)

# Plot a random subset of normal (Class == 0) transactions only.
sample = df_fe.loc[df_fe['Class'] == 0].sample(5000)

# (column, row, col) triples; positions follow the subplot title order.
transform_columns = ['Amount', 'Amount_log', 'Amount_sqrt',
                     'Amount_boxcox', 'Amount_zscore', 'Amount_robust']
transforms = [
    (name, position // 3 + 1, position % 3 + 1)
    for position, name in enumerate(transform_columns)
]

for feat, row, col in transforms:
    histogram = go.Histogram(x=sample[feat], nbinsx=50, name=feat, showlegend=False)
    fig.add_trace(histogram, row=row, col=col)

fig.update_layout(height=600, title_text="Amount Feature Transformations (Normal Transactions)")
fig.show()

## 5. PCA Feature Engineering

In [8]:
class PCAFeatureEngineer:
    """Engineer features from the PCA components ``V1`` .. ``V28``.

    ``fit`` records per-component statistics and the 0.99 quantile of each
    component's absolute value; ``transform`` appends row-wise aggregates,
    an extreme-value count and per-component percentile / extreme flags for
    a short list of "top" components.

    Parameters
    ----------
    top_features : list of str, optional
        Components that receive ``*_percentile`` and ``*_is_extreme``
        columns. When omitted, the first five ``V*`` keys of a module-level
        ``eda_insights['discrimination_scores']`` are used if that global
        exists, otherwise ``DEFAULT_TOP_FEATURES``.
    """

    # Fallback components when neither top_features nor EDA insights are available.
    DEFAULT_TOP_FEATURES = ('V14', 'V4', 'V12', 'V10', 'V11')

    def __init__(self, top_features=None):
        self.pca_cols = [f'V{i}' for i in range(1, 29)]
        # Filled by fit(): {column: {'mean', 'std', 'min', 'max'}}.
        self.stats = {}
        self.top_features = list(top_features) if top_features is not None else None
        # Filled by fit(): {column: 0.99 quantile of |column|}.
        self.extreme_thresholds = {}

    def fit(self, df):
        """Record per-component statistics and magnitude thresholds; return ``self``."""
        for col in self.pca_cols:
            values = df[col]
            self.stats[col] = {
                'mean': values.mean(),
                'std': values.std(),
                'min': values.min(),
                'max': values.max()
            }
            self.extreme_thresholds[col] = values.abs().quantile(0.99)
        return self

    def _resolve_top_features(self):
        """Return the components that get percentile / extreme-flag columns."""
        explicit = getattr(self, 'top_features', None)
        if explicit is not None:
            return list(explicit)
        # Look the EDA insights up optionally so that an engineer used outside
        # the notebook (e.g. after unpickling) does not fail on a missing global.
        insights = globals().get('eda_insights')
        if insights and 'discrimination_scores' in insights:
            return [k for k in insights['discrimination_scores'].keys() if k.startswith('V')][:5]
        return list(self.DEFAULT_TOP_FEATURES)

    def transform(self, df):
        """Return a copy of ``df`` with the PCA-derived feature columns appended."""
        df_pca = df.copy()
        components = df_pca[self.pca_cols]

        # Statistical aggregations across PCA components (row-wise).
        df_pca['V_mean'] = components.mean(axis=1)
        df_pca['V_std'] = components.std(axis=1)
        df_pca['V_max'] = components.max(axis=1)
        df_pca['V_min'] = components.min(axis=1)
        df_pca['V_range'] = df_pca['V_max'] - df_pca['V_min']
        df_pca['V_median'] = components.median(axis=1)
        df_pca['V_skew'] = components.skew(axis=1)
        df_pca['V_kurtosis'] = components.kurtosis(axis=1)

        # Sum of absolute values (L1 norm).
        df_pca['V_l1_norm'] = components.abs().sum(axis=1)

        # Sum of squares (L2 norm squared) and its square root.
        df_pca['V_l2_norm_sq'] = (components ** 2).sum(axis=1)
        df_pca['V_l2_norm'] = np.sqrt(df_pca['V_l2_norm_sq'])

        # Count of components more than 3 fitted standard deviations from the fitted mean.
        extreme_count = 0
        for col in self.pca_cols:
            z_scores = abs((df_pca[col] - self.stats[col]['mean']) / self.stats[col]['std'])
            extreme_count += (z_scores > 3).astype(int)
        df_pca['V_extreme_count'] = extreme_count

        # Percentile and extreme-value flags for the top discriminative components.
        fitted_thresholds = getattr(self, 'extreme_thresholds', {})
        for feat in self._resolve_top_features():
            if feat in df_pca.columns:
                # NOTE(review): ranks are computed within the frame being
                # transformed, so this value depends on the batch.
                df_pca[f'{feat}_percentile'] = df_pca[feat].rank(pct=True)

                # Compare |x| with the 0.99 quantile of |x| learned in fit().
                # Using the signed quantile of the batch mis-flags components
                # whose values are mostly negative and changes with the batch.
                magnitude = df_pca[feat].abs()
                threshold = fitted_thresholds.get(feat)
                if threshold is None:
                    # No fitted threshold for this column; fall back to the batch.
                    threshold = magnitude.quantile(0.99)
                df_pca[f'{feat}_is_extreme'] = (magnitude > threshold).astype(int)

        return df_pca

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df).transform(df)

# Fit the PCA engineer on the working frame and append its features.
pca_engineer = PCAFeatureEngineer()
df_fe = pca_engineer.fit_transform(df_fe)

# Report mean and standard deviation of a selection of the new aggregates.
pca_new_features = ['V_mean', 'V_std', 'V_max', 'V_min', 'V_l1_norm', 'V_l2_norm', 'V_extreme_count']
print("‚úÖ PCA aggregate features created:")
for feat in pca_new_features:
    if feat not in df_fe.columns:
        continue
    column = df_fe[feat]
    print(f"   - {feat}: mean={column.mean():.3f}, std={column.std():.3f}")

‚úÖ PCA aggregate features created:
   - V_mean: mean=0.000, std=0.198
   - V_std: mean=0.906, std=0.526
   - V_max: mean=2.118, std=1.234
   - V_min: mean=-2.050, std=1.705
   - V_l1_norm: mean=18.355, std=9.687
   - V_l2_norm: mean=4.781, std=2.806
   - V_extreme_count: mean=0.279, std=1.117


## 6. Interaction Features

In [9]:
class InteractionFeatureEngineer:
    """Build pairwise interaction columns from already-engineered features.

    The input frame must already contain the temporal, amount and PCA
    aggregate columns read in ``transform``. The engineer is stateless
    apart from the list of top components.
    """

    def __init__(self, top_features=None):
        # Components used for the pairwise PCA interactions (first three only).
        self.top_features = top_features or ['V14', 'V4', 'V12', 'V10', 'V11']

    def transform(self, df):
        """Return a copy of ``df`` with the interaction columns appended."""
        df_int = df.copy()

        amount = df_int['Amount']
        log_amount = df_int['Amount_log']
        time_norm = df_int['Time_normalized']
        is_high = df_int['Is_High_Amount']
        is_late = df_int['Is_Late_Night']
        is_weekend = df_int['Is_Weekend']
        extreme_count = df_int['V_extreme_count']

        # Collected in insertion order, which becomes the output column order.
        derived = {
            # Amount x time
            'Amount_Time_interaction': amount * time_norm,
            'Amount_Hour_interaction': amount * df_int['Hour'],
            'LogAmount_Time_interaction': log_amount * time_norm,
            # Amount x time-category indicators
            'Amount_BusinessHours': amount * df_int['Is_Business_Hours'],
            'Amount_LateNight': amount * is_late,
            'Amount_Weekend': amount * is_weekend,
            # High amount during specific times
            'HighAmount_LateNight': is_high * is_late,
            'HighAmount_Weekend': is_high * is_weekend,
        }

        # Pairwise interactions between the first three top components.
        if len(self.top_features) >= 3:
            first, second, third = self.top_features[0], self.top_features[1], self.top_features[2]
            a, b, c = df_int[first], df_int[second], df_int[third]
            epsilon = 1e-5  # keeps the ratio finite when the denominator is 0
            derived[f'{first}_{second}_mult'] = a * b
            derived[f'{first}_{third}_mult'] = a * c
            derived[f'{second}_{third}_mult'] = b * c
            derived[f'{first}_{second}_diff'] = (a - b).abs()
            derived[f'{first}_{second}_ratio'] = a / (b.abs() + epsilon)

        # PCA aggregates x amount
        derived['V_mean_Amount'] = df_int['V_mean'] * log_amount
        derived['V_std_Amount'] = df_int['V_std'] * log_amount
        derived['V_extreme_HighAmount'] = extreme_count * is_high

        # Flag rows with at least one extreme component and a high amount.
        derived['Unusual_Pattern_HighAmount'] = ((extreme_count > 0) & (is_high == 1)).astype(int)

        for name, values in derived.items():
            df_int[name] = values

        return df_int

    def fit_transform(self, df):
        """Alias for ``transform``; nothing is fitted."""
        return self.transform(df)

# Use the EDA ranking for the top PCA components when it is available,
# otherwise fall back to a fixed default list.
top_v_features = ['V14', 'V4', 'V12', 'V10', 'V11']
if eda_insights and 'top_features' in eda_insights:
    top_v_features = [name for name in eda_insights['top_features'] if name.startswith('V')][:5]

interaction_engineer = InteractionFeatureEngineer(top_features=top_v_features)
df_fe = interaction_engineer.fit_transform(df_fe)

# Collect the columns whose name marks them as interaction features.
interaction_features = [
    col for col in df_fe.columns
    if 'interaction' in col.lower() or any(tag in col for tag in ('_mult', '_diff'))
]
print(f"‚úÖ Created {len(interaction_features)} interaction features")
print(f"   Sample features: {interaction_features[:5]}")

‚úÖ Created 7 interaction features
   Sample features: ['Amount_Time_interaction', 'Amount_Hour_interaction', 'LogAmount_Time_interaction', 'V17_V14_mult', 'V17_V12_mult']


## 7. Feature Selection and Importance

In [10]:
# Named groups of features, used only for reporting.
feature_groups = {
    'original_pca': [f'V{i}' for i in range(1, 29)],
    'original_basic': ['Time', 'Amount'],
    'temporal': ['Hour', 'Hour_sin', 'Hour_cos', 'Time_normalized', 'Time_zscore',
                 'Is_Weekend', 'Is_Business_Hours', 'Is_Late_Night'],
    'amount': ['Amount_log', 'Amount_sqrt', 'Amount_boxcox', 'Amount_zscore',
               'Amount_robust', 'Amount_median_dev', 'Is_High_Amount', 'Is_Zero_Amount'],
    'pca_aggregate': ['V_mean', 'V_std', 'V_max', 'V_min', 'V_range', 'V_l1_norm', 'V_l2_norm'],
    'interactions': [col for col in df_fe.columns if 'interaction' in col.lower() or '_mult' in col]
}

# Model inputs: float64/float32/int64 columns, excluding the target, the
# categorical helper columns, 'Day' and the one-hot dummy columns.
exclude_cols = ['Class', 'TimeOfDay', 'Amount_Bin', 'Day'] + \
               [col for col in df_fe.columns if col.startswith(('TOD_', 'AmtBin_'))]
numerical_features = [
    col for col, dtype in df_fe.dtypes.items()
    if col not in exclude_cols and dtype in ['float64', 'float32', 'int64']
]

print(f"\nüìä Feature Summary:")
print(f"Total features created: {len(df_fe.columns)}")
print(f"Numerical features: {len(numerical_features)}")
for group, features in feature_groups.items():
    n_available = sum(1 for f in features if f in df_fe.columns)
    print(f"  - {group}: {n_available} features")


üìä Feature Summary:
Total features created: 103
Numerical features: 88
  - original_pca: 28 features
  - original_basic: 2 features
  - temporal: 8 features
  - amount: 8 features
  - pca_aggregate: 7 features
  - interactions: 6 features


In [11]:
# Rank the numerical features by mutual information with the target.
# NOTE(review): scores are computed on every row of df_fe, i.e. before the
# chronological train/validation/test split made later in the notebook.
from sklearn.feature_selection import mutual_info_classif

# Missing values are replaced by 0 before scoring.
X_feat_select = df_fe[numerical_features].fillna(0)
y_feat_select = df_fe['Class']

mi_scores = mutual_info_classif(X_feat_select, y_feat_select, random_state=42)

# One row per feature, highest score first.
feature_importance = (
    pd.DataFrame({'feature': numerical_features, 'mi_score': mi_scores})
    .sort_values('mi_score', ascending=False)
)

print("\nüèÜ Top 20 Features by Mutual Information:")
print(feature_importance.head(20).to_string(index=False))

# Ranked feature lists (each holds at most the stated number of names).
top_50_features = feature_importance['feature'].head(50).tolist()
top_100_features = feature_importance['feature'].head(100).tolist()


üèÜ Top 20 Features by Mutual Information:
          feature  mi_score
Is_Business_Hours     0.040
     V14_V12_mult     0.008
     V17_V14_mult     0.008
              V17     0.008
              V14     0.008
     V17_V12_mult     0.008
              V12     0.008
              V10     0.008
              V11     0.007
           V_mean     0.006
              V16     0.006
        V_l2_norm     0.006
     V_l2_norm_sq     0.006
        V_l1_norm     0.006
  V_extreme_count     0.006
            V_std     0.006
               V4     0.005
            V_min     0.005
               V3     0.005
    V3_percentile     0.005


In [12]:
# Horizontal bar chart of the 20 highest mutual-information scores.
top_20 = feature_importance.head(20)

importance_bars = go.Bar(
    x=top_20['mi_score'],
    y=top_20['feature'],
    orientation='h',
    marker_color=COLORS['primary']
)

fig = go.Figure()
fig.add_trace(importance_bars)

layout_options = dict(
    title="Top 20 Features by Mutual Information Score",
    xaxis_title="Mutual Information Score",
    yaxis_title="Feature",
    height=600,
    template='plotly_white',
)
fig.update_layout(**layout_options)
fig.show()

## 8. Data Splitting Strategy

In [13]:
# Time-based splitter (leakage-safe)
from dataclasses import dataclass

@dataclass
class TimeBasedDataSplitter:
    """Split a frame into train/validation/test parts in chronological order.

    Rows are sorted by the time column and cut at the cumulative fractions
    given by ``train_size`` and ``train_size + val_size``; the remainder is
    the test part.
    """
    train_size: float = 0.70
    val_size: float = 0.15
    test_size: float = 0.15

    def split_df(self, df: pd.DataFrame, feature_cols: list, target_col: str = 'Class', time_col: str = 'Time'):
        """Return ``(X_train, X_val, X_test, y_train, y_val, y_test)`` as NumPy arrays.

        Missing feature values are replaced by 0 and the target is cast to int.
        Raises ``AssertionError`` when the three fractions do not sum to 1.
        """
        assert abs(self.train_size + self.val_size + self.test_size - 1.0) < 1e-6, "splits must sum to 1.0"

        ordered = df.sort_values(time_col).reset_index(drop=True)
        features = ordered[feature_cols].fillna(0).values
        labels = ordered[target_col].values.astype(int)

        # Row positions at which the ordered data is cut.
        n_rows = len(ordered)
        train_end = int(self.train_size * n_rows)
        val_end = int((self.train_size + self.val_size) * n_rows)
        windows = (slice(None, train_end), slice(train_end, val_end), slice(val_end, None))

        X_train, X_val, X_test = (features[window] for window in windows)
        y_train, y_val, y_test = (labels[window] for window in windows)
        return X_train, X_val, X_test, y_train, y_val, y_test

    def get_split_info(self, y_train, y_val, y_test):
        """Return per-split counts and fraud rate keyed by 'train', 'val' and 'test'."""
        def summarize(labels):
            labels = np.asarray(labels).astype(int)
            return {
                'total': int(len(labels)),
                'fraud': int(labels.sum()),
                'normal': int((labels == 0).sum()),
                'fraud_rate': float(labels.mean())
            }

        named_parts = (('train', y_train), ('val', y_val), ('test', y_test))
        return {name: summarize(part) for name, part in named_parts}

# Model inputs are the ranked feature names; df_fe must still contain 'Time'
# because the splitter sorts on it.
X_features = top_100_features

splitter = TimeBasedDataSplitter(train_size=0.70, val_size=0.15, test_size=0.15)
split_arrays = splitter.split_df(
    df_fe, feature_cols=X_features, target_col='Class', time_col='Time'
)
X_train, X_val, X_test, y_train, y_val, y_test = split_arrays

# Per-split sample counts and fraud rate.
split_info = splitter.get_split_info(y_train, y_val, y_test)

print("\nüìä Data Split Information (time-based):")
print("=" * 60)
for split_name, info in split_info.items():
    report_lines = [
        f"\n{split_name.upper()} SET:",
        f"  Total samples: {info['total']:,}",
        f"  Normal: {info['normal']:,}",
        f"  Fraud: {info['fraud']:,}",
        f"  Fraud rate: {info['fraud_rate']:.4%}",
    ]
    print("\n".join(report_lines))



üìä Data Split Information (time-based):

TRAIN SET:
  Total samples: 199,364
  Normal: 198,980
  Fraud: 384
  Fraud rate: 0.1926%

VAL SET:
  Total samples: 42,721
  Normal: 42,665
  Fraud: 56
  Fraud rate: 0.1311%

TEST SET:
  Total samples: 42,722
  Normal: 42,670
  Fraud: 52
  Fraud rate: 0.1217%


## 9. Feature Scaling

In [14]:
class FeatureScaler:
    """Thin wrapper that selects a scikit-learn scaler by name.

    Supported ``scaler_type`` values: 'standard', 'robust', 'minmax',
    'quantile' and 'power'. The wrapped scaler can optionally be fitted on
    the normal (label 0) rows only.
    """

    def __init__(self, scaler_type='robust'):
        self.scaler_type = scaler_type

        # Factories are lazy so only the requested scaler is constructed.
        factories = {
            'standard': lambda: StandardScaler(),
            'robust': lambda: RobustScaler(),
            'minmax': lambda: MinMaxScaler(),
            'quantile': lambda: QuantileTransformer(output_distribution='uniform', random_state=42),
            'power': lambda: PowerTransformer(method='yeo-johnson', standardize=True),
        }
        if scaler_type not in factories:
            raise ValueError(f"Unknown scaler type: {scaler_type}")
        self.scaler = factories[scaler_type]()

    def fit(self, X_train, train_on_normal_only=True, y_train=None):
        """Fit the wrapped scaler and return ``self``.

        When ``train_on_normal_only`` is true and ``y_train`` is given, only
        rows with ``y_train == 0`` are used; otherwise every row is used.
        """
        restrict_to_normal = train_on_normal_only and y_train is not None
        if not restrict_to_normal:
            self.scaler.fit(X_train)
            print(f"‚úÖ Scaler fitted on all {X_train.shape[0]:,} training samples")
            return self

        # For anomaly detection the scaling statistics come from normal rows only.
        normal_rows = X_train[y_train == 0]
        self.scaler.fit(normal_rows)
        print(f"‚úÖ Scaler fitted on {normal_rows.shape[0]:,} normal samples only")
        return self

    def transform(self, X):
        """Apply the fitted scaler to ``X``."""
        return self.scaler.transform(X)

    def fit_transform(self, X_train, train_on_normal_only=True, y_train=None):
        """Fit as in ``fit`` and return the transformed ``X_train`` (all rows)."""
        self.fit(X_train, train_on_normal_only, y_train)
        return self.transform(X_train)

# Fit a RobustScaler wrapper on the normal training rows, then apply the same
# fitted scaler to all three splits.
scaler = FeatureScaler(scaler_type='robust')
X_train_scaled = scaler.fit_transform(X_train, train_on_normal_only=True, y_train=y_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

print(f"\nüìä Scaled Data Shapes:")
for label, matrix in (('X_train', X_train_scaled), ('X_val', X_val_scaled), ('X_test', X_test_scaled)):
    print(f"  {label}: {matrix.shape}")

‚úÖ Scaler fitted on 198,980 normal samples only

üìä Scaled Data Shapes:
  X_train: (199364, 88)
  X_val: (42721, 88)
  X_test: (42722, 88)


## 10. PyTorch Dataset Creation

In [15]:
class FraudDataset(Dataset):
    """PyTorch ``Dataset`` over a feature matrix and optional labels.

    Features and labels are stored as ``torch.FloatTensor``. Indexing
    returns ``(x, y)`` when labels were given and ``x`` alone otherwise.
    """

    def __init__(self, X, y=None, transform=None):
        self.X = torch.FloatTensor(X)
        self.y = None if y is None else torch.FloatTensor(y)
        # Optional callable applied to each feature row on access.
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.transform:
            features = self.transform(features)

        if self.y is None:
            return features
        return features, self.y[idx]

    def get_normal_only(self):
        """Return a dataset holding only rows labelled 0 (for autoencoder training).

        An unlabelled dataset is returned unchanged (the same object).
        """
        if self.y is None:
            return self

        keep = self.y == 0
        return FraudDataset(self.X[keep], self.y[keep], self.transform)

# Wrap each scaled split in a FraudDataset.
train_dataset, val_dataset, test_dataset = (
    FraudDataset(features, labels)
    for features, labels in (
        (X_train_scaled, y_train),
        (X_val_scaled, y_val),
        (X_test_scaled, y_test),
    )
)

# Normal-only view of the training data for autoencoder training.
train_dataset_normal = train_dataset.get_normal_only()

print("\nüî• PyTorch Datasets Created:")
print(f"  Train dataset (all): {len(train_dataset)} samples")
print(f"  Train dataset (normal only): {len(train_dataset_normal)} samples")
print(f"  Validation dataset: {len(val_dataset)} samples")
print(f"  Test dataset: {len(test_dataset)} samples")

# Training loaders shuffle; validation and test loaders keep row order.
batch_size = 256
loader_options = dict(batch_size=batch_size)

train_loader_normal = DataLoader(train_dataset_normal, shuffle=True, **loader_options)
train_loader_all = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"\nüì¶ Data Loaders Created (batch_size={batch_size}):")
print(f"  Train loader (normal): {len(train_loader_normal)} batches")
print(f"  Validation loader: {len(val_loader)} batches")
print(f"  Test loader: {len(test_loader)} batches")


üî• PyTorch Datasets Created:
  Train dataset (all): 199364 samples
  Train dataset (normal only): 198980 samples
  Validation dataset: 42721 samples
  Test dataset: 42722 samples

üì¶ Data Loaders Created (batch_size=256):
  Train loader (normal): 778 batches
  Validation loader: 167 batches
  Test loader: 167 batches


## 11. Save Preprocessed Data and Artifacts

In [16]:
# Create directory for artifacts (relative to the notebook's working directory)
artifacts_dir = Path('artifacts')
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Save preprocessed data
np.save(artifacts_dir / 'X_train_scaled.npy', X_train_scaled)
np.save(artifacts_dir / 'X_val_scaled.npy', X_val_scaled)
np.save(artifacts_dir / 'X_test_scaled.npy', X_test_scaled)
np.save(artifacts_dir / 'y_train.npy', y_train)
np.save(artifacts_dir / 'y_val.npy', y_val)
np.save(artifacts_dir / 'y_test.npy', y_test)

# Save feature engineering objects so the same fitted transformations can be
# re-applied later. The pickle stores instances of classes defined in this
# notebook, so those class definitions must be importable when loading it.
engineering_objects = {
    'temporal_engineer': temporal_engineer,
    'amount_engineer': amount_engineer,
    'pca_engineer': pca_engineer,
    'interaction_engineer': interaction_engineer,
    'scaler': scaler,
    'feature_columns': top_100_features,
    'feature_importance': feature_importance.to_dict(),
    'split_info': split_info
}

with open(artifacts_dir / 'feature_engineering_artifacts.pkl', 'wb') as f:
    pickle.dump(engineering_objects, f)

# Save configuration. Split fractions and scaler type are read from the
# objects that were actually used, so the file cannot drift from the run
# (the previous hard-coded 0.2 / 0.2 did not match the 0.15 / 0.15 split).
config = {
    'n_features': len(top_100_features),
    'batch_size': batch_size,
    'scaler_type': scaler.scaler_type,
    'train_size': splitter.train_size,
    'test_size': splitter.test_size,
    'val_size': splitter.val_size,
    'top_features': top_100_features[:20],
    'timestamp': datetime.now().isoformat()
}

with open(artifacts_dir / 'config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("\nüíæ Artifacts Saved:")
print(f"  Location: {artifacts_dir.absolute()}")
print(f"  - Preprocessed data (*.npy files)")
print(f"  - Feature engineering objects (*.pkl)")
print(f"  - Configuration (config.json)")


üíæ Artifacts Saved:
  Location: /Users/inigo_ocariz/src/ml-zoomcamp-2025/ml-zoomcamp-2025/fraud-detection/notebooks/artifacts
  - Preprocessed data (*.npy files)
  - Feature engineering objects (*.pkl)
  - Configuration (config.json)


## 12. Feature Engineering Summary

In [17]:
# Create summary visualization
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Feature Categories', 'Top 10 Features', 
                   'Class Distribution in Splits', 'Feature Correlation Matrix'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'heatmap'}]]
)

# Feature categories: number of columns from each group present in df_fe
categories = []
counts = []
for group, features in feature_groups.items():
    available = [f for f in features if f in df_fe.columns]
    if len(available) > 0:
        categories.append(group.replace('_', ' ').title())
        counts.append(len(available))

fig.add_trace(
    go.Bar(x=categories, y=counts, marker_color=COLORS['primary'], showlegend=False),
    row=1, col=1
)

# Top features
top_10 = feature_importance.head(10)
fig.add_trace(
    go.Bar(y=top_10['feature'], x=top_10['mi_score'], 
           orientation='h', marker_color=COLORS['secondary'], showlegend=False),
    row=1, col=2
)

# Class distribution
splits = ['Train', 'Val', 'Test']
fraud_counts = [split_info['train']['fraud'], split_info['val']['fraud'], split_info['test']['fraud']]
normal_counts = [split_info['train']['normal'], split_info['val']['normal'], split_info['test']['normal']]

fig.add_trace(
    go.Bar(name='Normal', x=splits, y=normal_counts, marker_color=COLORS['normal']),
    row=2, col=1
)
fig.add_trace(
    go.Bar(name='Fraud', x=splits, y=fraud_counts, marker_color=COLORS['fraud']),
    row=2, col=1
)

# Feature correlation matrix (first 15 model features). The columns of
# X_train_scaled follow the order of top_100_features, so the same slice of
# names labels the heatmap axes instead of bare column positions.
n_corr = min(15, X_train_scaled.shape[1])
sample_features = top_100_features[:n_corr]
corr_matrix = pd.DataFrame(X_train_scaled[:, :n_corr], columns=sample_features).corr()

fig.add_trace(
    go.Heatmap(z=corr_matrix.values, x=sample_features, y=sample_features,
               colorscale='RdBu', zmid=0, showscale=True),
    row=2, col=2
)

fig.update_layout(height=800, title_text="Feature Engineering Summary", showlegend=True)
fig.show()

print("\n‚úÖ Feature Engineering Complete!")
print(f"\nüìä Final Summary:")
print(f"  - Total features engineered: {len(df_fe.columns)}")
print(f"  - Features selected for modeling: {len(top_100_features)}")
print(f"  - Training samples (normal only): {len(train_dataset_normal):,}")
print(f"  - Validation samples: {len(val_dataset):,}")
print(f"  - Test samples: {len(test_dataset):,}")
print(f"\nüöÄ Ready for model training!")


‚úÖ Feature Engineering Complete!

üìä Final Summary:
  - Total features engineered: 103
  - Features selected for modeling: 88
  - Training samples (normal only): 198,980
  - Validation samples: 42,721
  - Test samples: 42,722

üöÄ Ready for model training!
