# Feature Engineering for Fraud Detection

This notebook focuses on extracting meaningful features from the parsed transaction logs to prepare data for anomaly detection models.

In [None]:
import json
import os
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's `src` packages importable before the local imports below.
sys.path.append('../src')

from features.feature_engineering import FeatureExtractor
from utils.config import Config
from utils.visualization import VisualizationUtils

# Set up visualization: start from matplotlib's default style, use seaborn's
# "husl" palette for plot colors, and render figures inline in the notebook
# output (the `%matplotlib` line is an IPython magic, not plain Python).
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

## Load Configuration and Data

In [None]:
# Project configuration and plotting helper
config = Config()
viz = VisualizationUtils()

# Read the parsed transactions from disk. When the file is missing, `df` is
# set to None so that the cells guarded by `if df is not None` are skipped.
data_path = '../data/parsed_transactions.csv'
if not os.path.exists(data_path):
    print("Parsed transaction data not found. Please run the data exploration notebook first.")
    df = None
else:
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} parsed transactions")
    print(f"Columns: {list(df.columns)}")

## Data Overview

In [None]:
if df is not None:
    # Shape and in-memory footprint (deep=True asks pandas to include the
    # memory held by object-dtype values, not just the column buffers)
    print("Dataset Info:")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Column dtypes as inferred by read_csv
    print("\nData Types:")
    print(df.dtypes)

    # Per-column null counts; only columns that actually contain nulls are listed
    print("\nMissing Values:")
    missing = df.isnull().sum()
    has_missing = missing.sum() > 0
    print(missing[missing > 0] if has_missing else "No missing values found")

    # First rows, rendered with the notebook's rich display
    print("\nSample Data:")
    display(df.head())

## Initialize Feature Extractor

In [None]:
# Build the extractor from the loaded configuration
feature_extractor = FeatureExtractor(config)

# List every feature category; categories with more than 10 features are
# abbreviated to their first five names plus a count of the remainder.
print("Available feature categories:")
categories = feature_extractor.get_feature_categories()
for category, features in categories.items():
    print(f"  {category}: {len(features)} features")
    if len(features) > 10:
        print(f"    {', '.join(features[:5])}... and {len(features)-5} more")
    else:
        print(f"    {', '.join(features)}")

## Extract Basic Features

In [None]:
if df is not None:
    print("Extracting basic features...")

    # Basic features as computed by FeatureExtractor.extract_basic_features
    basic_features = feature_extractor.extract_basic_features(df)
    print(f"Extracted {len(basic_features.columns)} basic features")

    # Preview rows first, then summary statistics. Each view is built only
    # after its heading has been printed, so the output order is unchanged.
    for heading, build_view in (
        ("\nSample Basic Features:", basic_features.head),
        ("\nBasic Feature Statistics:", basic_features.describe),
    ):
        print(heading)
        display(build_view())

## Extract User Behavioral Features

In [None]:
if df is not None:
    print("Extracting user behavioral features...")

    # User behavioral features as computed by
    # FeatureExtractor.extract_user_behavioral_features
    user_features = feature_extractor.extract_user_behavioral_features(df)
    print(f"Extracted {len(user_features.columns)} user behavioral features")

    # Preview rows first, then summary statistics. Each view is built only
    # after its heading has been printed, so the output order is unchanged.
    for heading, build_view in (
        ("\nSample User Behavioral Features:", user_features.head),
        ("\nUser Behavioral Feature Statistics:", user_features.describe),
    ):
        print(heading)
        display(build_view())

## Extract Temporal Features

In [None]:
if df is not None:
    print("Extracting temporal features...")

    # Temporal features as computed by FeatureExtractor.extract_temporal_features
    temporal_features = feature_extractor.extract_temporal_features(df)
    print(f"Extracted {len(temporal_features.columns)} temporal features")

    # Preview rows first, then summary statistics. Each view is built only
    # after its heading has been printed, so the output order is unchanged.
    for heading, build_view in (
        ("\nSample Temporal Features:", temporal_features.head),
        ("\nTemporal Feature Statistics:", temporal_features.describe),
    ):
        print(heading)
        display(build_view())

## Extract Contextual Features

In [None]:
if df is not None:
    print("Extracting contextual features...")

    # Contextual features as computed by
    # FeatureExtractor.extract_contextual_features
    contextual_features = feature_extractor.extract_contextual_features(df)
    print(f"Extracted {len(contextual_features.columns)} contextual features")

    # Preview rows first, then summary statistics. Each view is built only
    # after its heading has been printed, so the output order is unchanged.
    for heading, build_view in (
        ("\nSample Contextual Features:", contextual_features.head),
        ("\nContextual Feature Statistics:", contextual_features.describe),
    ):
        print(heading)
        display(build_view())

## Combine All Features

In [None]:
# Build the combined feature table used by every later cell. Skipped entirely
# when the parsed transactions could not be loaded (df is None).
if df is not None:
    print("Combining all features...")
    
    # Extract all features at once. The call returns a pair that is unpacked
    # into `features_df` (the combined feature table) and `metadata`; both
    # names are read by the quality, validation and save cells further down.
    features_df, metadata = feature_extractor.extract_all_features(df)
    
    print(f"Total features extracted: {len(features_df.columns)}")
    print(f"Feature metadata categories: {list(metadata.keys())}")
    
    # Display feature metadata summary.
    # NOTE(review): assumes `metadata` maps category name -> list of feature
    # names, based on how it is iterated here — confirm against FeatureExtractor.
    print("\nFeature Summary by Category:")
    for category, feature_list in metadata.items():
        print(f"  {category}: {len(feature_list)} features")
    
    # Display combined features sample (first rows only)
    print("\nSample Combined Features:")
    display(features_df.head())

## Feature Quality Analysis

In [None]:
# Quality checks on the combined feature table: missing values, constant
# columns, numeric value ranges and highly correlated feature pairs.
# Runs only when data was loaded and `features_df` has been created.
if df is not None and 'features_df' in locals():
    print("Analyzing feature quality...")

    # Missing values: per-feature null counts, largest first
    feature_missing = features_df.isnull().sum()
    if feature_missing.sum() > 0:
        print("\nFeatures with missing values:")
        print(feature_missing[feature_missing > 0].sort_values(ascending=False))
    else:
        print("\nNo missing values in extracted features")

    # Constant features: columns with at most one distinct non-null value.
    # DataFrame.nunique() evaluates every column positionally, so this also
    # works when column labels are duplicated (a per-label lookup would return
    # a DataFrame there and break the comparison). `constant_features` remains
    # a plain list of column labels.
    is_constant = (features_df.nunique() <= 1).to_numpy(dtype=bool)
    constant_features = features_df.columns[is_constant].tolist()

    if constant_features:
        print(f"\nConstant features (consider removing): {constant_features}")
    else:
        print("\nNo constant features found")

    # Feature value ranges across all numeric columns
    print("\nFeature Value Ranges:")
    numeric_features = features_df.select_dtypes(include=[np.number])
    print(f"Numeric features: {len(numeric_features.columns)}")
    print(f"Min values range: [{numeric_features.min().min():.6f}, {numeric_features.min().max():.6f}]")
    print(f"Max values range: [{numeric_features.max().min():.6f}, {numeric_features.max().max():.6f}]")

    # Highly correlated pairs: scan the strict upper triangle of the absolute
    # correlation matrix so each unordered pair is considered exactly once.
    # The mask is built with NumPy instead of a Python double loop of scalar
    # `.iloc` lookups, which is slow for wide feature tables.
    print("\nHighly correlated feature pairs (|r| > 0.8):")
    corr_matrix = numeric_features.corr().abs()
    corr_values = corr_matrix.to_numpy()
    # NaN correlations (e.g. from constant columns) compare as False and are
    # therefore skipped, exactly like a scalar `nan > 0.8` test.
    with np.errstate(invalid='ignore'):
        above_threshold = corr_values > 0.8
    # np.nonzero walks the mask in row-major order: (0,1), (0,2), ..., (1,2), ...
    row_idx, col_idx = np.nonzero(np.triu(above_threshold, k=1))
    high_corr_pairs = [
        (corr_matrix.columns[i], corr_matrix.columns[j], corr_values[i, j])
        for i, j in zip(row_idx, col_idx)
    ]

    if high_corr_pairs:
        # Strongest correlations first; only the top 10 are listed
        ranked_pairs = sorted(high_corr_pairs, key=lambda pair: pair[2], reverse=True)
        for feat1, feat2, corr in ranked_pairs[:10]:
            print(f"  {feat1} <-> {feat2}: {corr:.3f}")
        # Make the truncation explicit instead of silently dropping pairs
        if len(ranked_pairs) > 10:
            print(f"  ... and {len(ranked_pairs) - 10} more pairs not shown")
    else:
        print("  No highly correlated pairs found")

## Feature Visualizations

In [None]:
if df is not None and 'features_df' in locals():
    print("Creating feature visualizations...")

    # Candidate features to plot; only those present in features_df are used
    viz_features = [
        'hour',
        'day_of_week',
        'amount_log',
        'user_transaction_count',
        'user_avg_amount',
        'time_since_last_transaction',
        'location_frequency',
        'device_frequency',
        'transaction_type_frequency',
    ]
    available_viz_features = [name for name in viz_features if name in features_df.columns]

    if not available_viz_features:
        print("No visualization features available")
    else:
        # One histogram per available feature on a fixed 3x3 grid
        fig, axes = plt.subplots(3, 3, figsize=(15, 12))
        axes = axes.ravel()

        for ax, feature in zip(axes, available_viz_features[:9]):
            features_df[feature].hist(bins=30, ax=ax, alpha=0.7)
            ax.set_title(f'{feature}')
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')

        # Panels beyond the number of available features stay empty: hide them
        for ax in axes[len(available_viz_features):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.suptitle('Feature Distributions', y=1.02)
        plt.show()

        # Pairwise correlation heatmap for the same subset (needs >= 2 features)
        if len(available_viz_features) > 1:
            plt.figure(figsize=(10, 8))
            corr_subset = features_df[available_viz_features].corr()
            sns.heatmap(corr_subset, annot=True, cmap='coolwarm', center=0,
                        square=True, fmt='.2f')
            plt.title('Feature Correlation Matrix (Subset)')
            plt.tight_layout()
            plt.show()

## Feature Validation

In [None]:
if df is not None and 'features_df' in locals():
    print("Validating extracted features...")

    # Delegate the checks to the extractor; the result is walked below as
    # category -> check name -> result record.
    validation_results = feature_extractor.validate_features(features_df, metadata)

    print("\nValidation Results:")
    for category, results in validation_results.items():
        print(f"\n{category.upper()}:")
        for check, result in results.items():
            passed = result['passed']
            if passed:
                status = "✓"
            else:
                status = "✗"
            print(f"  {status} {check}: {result['message']}")

            # Failed checks may carry extra details worth surfacing
            if passed:
                continue
            if 'details' in result:
                print(f"    Details: {result['details']}")

## Save Engineered Features

In [None]:
# Persist the engineered features, their metadata and a run summary under
# ../data. Runs only when data was loaded and `features_df` has been created.
# `json` and `datetime` come from the notebook's import cell.
if df is not None and 'features_df' in locals():
    # Save features to CSV (the row index is not written)
    output_path = '../data/engineered_features.csv'
    features_df.to_csv(output_path, index=False)
    print(f"Saved {len(features_df)} rows and {len(features_df.columns)} features to {output_path}")

    # Save feature metadata
    metadata_path = '../data/feature_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"Saved feature metadata to {metadata_path}")

    # Count constant features (at most one distinct non-null value) directly
    # from features_df. Relying on a `constant_features` variable left behind
    # by the optional quality-analysis cell would silently record 0 whenever
    # that cell was skipped.
    constant_feature_count = int((features_df.nunique() <= 1).sum())

    # Create feature summary (plain ints/strings only, so it is JSON-serializable)
    feature_summary = {
        'total_features': len(features_df.columns),
        'total_samples': len(features_df),
        'feature_categories': {cat: len(feats) for cat, feats in metadata.items()},
        'numeric_features': len(features_df.select_dtypes(include=[np.number]).columns),
        'categorical_features': len(features_df.select_dtypes(exclude=[np.number]).columns),
        'missing_values': int(features_df.isnull().sum().sum()),
        'constant_features': constant_feature_count,
        'creation_time': datetime.now().isoformat()
    }

    summary_path = '../data/feature_engineering_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(feature_summary, f, indent=2)
    print(f"Saved feature engineering summary to {summary_path}")

    # Echo the summary; the timestamp is omitted from the printed output
    print("\nFeature Engineering Summary:")
    for key, value in feature_summary.items():
        if key != 'creation_time':
            print(f"  {key}: {value}")

## Conclusion

In [None]:
# Report completion only when the combined feature table actually exists.
# Announcing "Complete!" unconditionally would be misleading when the data
# could not be loaded and every guarded cell above was skipped.
if 'features_df' in locals():
    print("Feature Engineering Complete!")
    print("\nNext Steps:")
    print("1. Run model training notebook to train anomaly detection models")
    print("2. Evaluate model performance using the extracted features")
    print("3. Fine-tune feature selection based on model performance")
    print("4. Deploy the best performing model for fraud detection")

    print(f"\nReady for model training with {len(features_df.columns)} features and {len(features_df)} samples")
else:
    print("Feature engineering did not run: no features were extracted.")
    print("Make sure the parsed transaction data is available (run the data exploration notebook first), then re-run this notebook.")