In [4]:
"""
Task 3 - Feature Engineering for Credit Risk Model
Complete self-contained script with error handling
"""

import pandas as pd
import numpy as np
import os
import json
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

print("=" * 70)
print("TASK 3 - FEATURE ENGINEERING")
print("Credit Risk Model using Alternative Data")
print("=" * 70)

def create_sample_data(n_transactions=10000, n_customers=200):
    """Create synthetic transaction data for testing"""
    print("Creating sample transaction data for testing...")
    
    np.random.seed(42)
    
    # Generate customer IDs
    customer_ids = [f'CUST{str(i).zfill(5)}' for i in range(n_customers)]
    
    # Generate dates
    start_date = pd.Timestamp('2023-01-01')
    end_date = pd.Timestamp('2024-01-01')
    dates = pd.date_range(start=start_date, end=end_date, periods=n_transactions)
    
    # Create dataframe
    data = {
        'TransactionId': [f'TX{str(i).zfill(7)}' for i in range(n_transactions)],
        'CustomerId': np.random.choice(customer_ids, n_transactions, p=np.random.dirichlet(np.ones(n_customers))),
        'TransactionStartTime': np.random.choice(dates, n_transactions),
        'Amount': np.random.exponential(5000, n_transactions) * np.random.choice([1, -1], n_transactions, p=[0.95, 0.05]),
        'Value': np.abs(np.random.exponential(5000, n_transactions)),
        'ProductCategory': np.random.choice(['Communications', 'Groceries', 'Entertainment', 'Transport', 'Electronics'], 
                                          n_transactions, p=[0.35, 0.25, 0.15, 0.15, 0.10]),
        'ChannelId': np.random.choice(['Android', 'Web', 'iOS', 'Pay Later'], 
                                     n_transactions, p=[0.45, 0.30, 0.15, 0.10]),
        'FraudResult': np.random.binomial(1, 0.01, n_transactions),
        'CurrencyCode': ['UGX'] * n_transactions,
        'CountryCode': ['UG'] * n_transactions
    }
    
    df = pd.DataFrame(data)
    print(f"Created sample data with {n_transactions} transactions for {n_customers} customers")
    return df

# -----------------------------
# 1. Load Data
# -----------------------------
print("\n" + "=" * 70)
print("1. LOADING DATA")
print("=" * 70)

df = None
data_path = "data/raw/data.csv"

# Try multiple data loading approaches
try:
    # Try to load from specified path
    if os.path.exists(data_path):
        print(f"Loading data from {data_path}...")
        # Load first 50k rows for testing if file is large
        df = pd.read_csv(data_path, nrows=50000)
        print(f"âœ“ Successfully loaded {len(df):,} rows from {data_path}")
    else:
        print(f"File not found at {data_path}")
        # Try alternative path
        alt_path = "../data/raw/data.csv"
        if os.path.exists(alt_path):
            print(f"Trying alternative path: {alt_path}")
            df = pd.read_csv(alt_path, nrows=50000)
            print(f"âœ“ Successfully loaded {len(df):,} rows from {alt_path}")
        else:
            print("No data file found. Creating sample data...")
            df = create_sample_data()
            
except Exception as e:
    print(f"Error loading data: {e}")
    print("Creating sample data instead...")
    df = create_sample_data()

print(f"\nData loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {', '.join(df.columns.tolist())}")

# -----------------------------
# 2. Basic Data Cleaning
# -----------------------------
print("\n" + "=" * 70)
print("2. DATA CLEANING")
print("=" * 70)

# Make a copy
df_clean = df.copy()

# Convert datetime
if 'TransactionStartTime' in df_clean.columns:
    df_clean['TransactionStartTime'] = pd.to_datetime(df_clean['TransactionStartTime'], errors='coerce')
    print(f"âœ“ Converted TransactionStartTime to datetime")

# Ensure numeric columns
numeric_cols = ['Amount', 'Value', 'BatchId'] if 'BatchId' in df_clean.columns else ['Amount', 'Value']
for col in numeric_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        print(f"âœ“ Converted {col} to numeric")

# Check for missing values
print(f"\nMissing values summary:")
missing = df_clean.isnull().sum()
for col in missing[missing > 0].index:
    print(f"  {col}: {missing[col]:,} missing ({missing[col]/len(df_clean)*100:.1f}%)")

# -----------------------------
# 3. Create RFMS Features
# -----------------------------
print("\n" + "=" * 70)
print("3. CREATING RFMS FEATURES")
print("=" * 70)

# Extract time features from transaction timestamp
if 'TransactionStartTime' in df_clean.columns:
    print("Extracting time-based features...")
    df_clean['transaction_hour'] = df_clean['TransactionStartTime'].dt.hour
    df_clean['transaction_day'] = df_clean['TransactionStartTime'].dt.day
    df_clean['transaction_month'] = df_clean['TransactionStartTime'].dt.month
    df_clean['transaction_year'] = df_clean['TransactionStartTime'].dt.year
    df_clean['transaction_dayofweek'] = df_clean['TransactionStartTime'].dt.dayofweek
    df_clean['transaction_weekend'] = df_clean['TransactionStartTime'].dt.dayofweek.isin([5, 6]).astype(int)
    print("âœ“ Extracted hour, day, month, year, dayofweek, weekend indicators")

# Define snapshot date for recency calculation
snapshot_date = df_clean['TransactionStartTime'].max() if 'TransactionStartTime' in df_clean.columns else pd.Timestamp.now()
print(f"Snapshot date for recency: {snapshot_date}")

# -----------------------------
# 4. Customer-Level Aggregation
# -----------------------------
print("\n" + "=" * 70)
print("4. CUSTOMER-LEVEL AGGREGATION")
print("=" * 70)

print("Grouping transactions by customer...")

# Define aggregation functions
agg_functions = {
    'Amount': ['sum', 'mean', 'std', 'min', 'max', 'count'],
    'Value': ['mean', 'std']
}

# Add datetime aggregations if available
if 'TransactionStartTime' in df_clean.columns:
    agg_functions['TransactionStartTime'] = ['min', 'max']

# Add categorical aggregations
categorical_cols = ['ProductCategory', 'ChannelId', 'CurrencyCode', 'CountryCode']
for col in categorical_cols:
    if col in df_clean.columns:
        agg_functions[col] = lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'

# Perform aggregation
customer_features = df_clean.groupby('CustomerId').agg(agg_functions)

# Flatten column names
customer_features.columns = ['_'.join(col).strip() for col in customer_features.columns.values]

# Rename key columns for clarity
column_rename = {
    'Amount_sum': 'total_transaction_amount',
    'Amount_mean': 'avg_transaction_amount',
    'Amount_std': 'std_transaction_amount',
    'Amount_min': 'min_transaction_amount',
    'Amount_max': 'max_transaction_amount',
    'Amount_count': 'transaction_count',
    'Value_mean': 'avg_transaction_value',
    'Value_std': 'std_transaction_value'
}

if 'TransactionStartTime_min' in customer_features.columns:
    column_rename['TransactionStartTime_min'] = 'first_transaction_date'
    column_rename['TransactionStartTime_max'] = 'last_transaction_date'

customer_features = customer_features.rename(columns=column_rename)

# Reset index
customer_features = customer_features.reset_index()

print(f"âœ“ Created features for {len(customer_features):,} unique customers")

# -----------------------------
# 5. Calculate Derived RFMS Features
# -----------------------------
print("\n" + "=" * 70)
print("5. DERIVED FEATURES CALCULATION")
print("=" * 70)

# Calculate recency (days since last transaction)
if 'last_transaction_date' in customer_features.columns:
    print("Calculating recency...")
    customer_features['recency_days'] = (snapshot_date - customer_features['last_transaction_date']).dt.days
    print("âœ“ Calculated recency_days")

# Calculate customer tenure
if all(col in customer_features.columns for col in ['last_transaction_date', 'first_transaction_date']):
    print("Calculating customer tenure...")
    customer_features['customer_tenure_days'] = (
        customer_features['last_transaction_date'] - customer_features['first_transaction_date']
    ).dt.days
    print("âœ“ Calculated customer_tenure_days")

# Calculate frequency (transactions per day)
if all(col in customer_features.columns for col in ['transaction_count', 'customer_tenure_days']):
    print("Calculating frequency...")
    customer_features['frequency_per_day'] = (
        customer_features['transaction_count'] / 
        np.maximum(customer_features['customer_tenure_days'], 1)
    )
    print("âœ“ Calculated frequency_per_day")

# Calculate monetary metrics
print("Calculating monetary metrics...")
if 'total_transaction_amount' in customer_features.columns and 'transaction_count' in customer_features.columns:
    customer_features['avg_transaction_size'] = (
        customer_features['total_transaction_amount'] / customer_features['transaction_count']
    )

# Handle NaN values in standard deviation (customers with only 1 transaction)
if 'std_transaction_amount' in customer_features.columns:
    customer_features['std_transaction_amount'] = customer_features['std_transaction_amount'].fillna(0)

# Calculate transaction amount range
if all(col in customer_features.columns for col in ['max_transaction_amount', 'min_transaction_amount']):
    customer_features['transaction_range'] = (
        customer_features['max_transaction_amount'] - customer_features['min_transaction_amount']
    )

print(f"âœ“ Created {len(customer_features.columns) - 1} features per customer")

# -----------------------------
# 6. Feature Selection and Preparation
# -----------------------------
print("\n" + "=" * 70)
print("6. FEATURE PREPARATION")
print("=" * 70)

# Define core RFMS features
core_numerical_features = [
    'total_transaction_amount',
    'avg_transaction_amount',
    'std_transaction_amount',
    'transaction_count',
    'recency_days',
    'customer_tenure_days',
    'frequency_per_day'
]

# Add optional features if they exist
optional_features = ['avg_transaction_size', 'transaction_range', 'min_transaction_amount', 'max_transaction_amount']
for feat in optional_features:
    if feat in customer_features.columns:
        core_numerical_features.append(feat)

# Select only features that exist
numerical_features = [f for f in core_numerical_features if f in customer_features.columns]

# Define categorical features
categorical_features = []
for col in ['ProductCategory', 'ChannelId', 'CurrencyCode', 'CountryCode']:
    if f'{col}_<lambda>' in customer_features.columns:
        categorical_features.append(f'{col}_<lambda>')
        # Rename for clarity
        new_name = f'most_common_{col.lower()}'
        customer_features = customer_features.rename(columns={f'{col}_<lambda>': new_name})
        categorical_features[-1] = new_name

print(f"Selected {len(numerical_features)} numerical features:")
for feat in numerical_features:
    print(f"  - {feat}")

print(f"\nSelected {len(categorical_features)} categorical features:")
for feat in categorical_features:
    print(f"  - {feat}")

# -----------------------------
# 7. Process Numerical Features
# -----------------------------
print("\n" + "=" * 70)
print("7. NUMERICAL FEATURE PROCESSING")
print("=" * 70)

print("Processing numerical features...")

# Separate numerical features
X_num = customer_features[numerical_features].copy()

# Handle missing values
print("  Imputing missing values...")
imputer = SimpleImputer(strategy='median')
X_num_imputed = imputer.fit_transform(X_num)

# Scale features
print("  Scaling features...")
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num_imputed)

print(f"âœ“ Processed {X_num_scaled.shape[1]} numerical features")

# -----------------------------
# 8. Process Categorical Features
# -----------------------------
print("\n" + "=" * 70)
print("8. CATEGORICAL FEATURE PROCESSING")
print("=" * 70)

if categorical_features:
    print(f"Processing {len(categorical_features)} categorical features...")
    
    # Extract categorical data
    X_cat = customer_features[categorical_features].copy()
    
    # One-hot encoding
    print("  Applying one-hot encoding...")
    X_cat_encoded = pd.get_dummies(X_cat, drop_first=True, prefix_sep='_')
    
    # Combine with numerical features
    X_processed = np.hstack([X_num_scaled, X_cat_encoded])
    feature_names = numerical_features + X_cat_encoded.columns.tolist()
    
    print(f"âœ“ Created {X_cat_encoded.shape[1]} encoded categorical features")
else:
    print("No categorical features to process")
    X_processed = X_num_scaled
    feature_names = numerical_features

print(f"\nâœ“ Final feature matrix: {X_processed.shape[0]} customers Ã— {X_processed.shape[1]} features")

# -----------------------------
# 9. Save Results
# -----------------------------
print("\n" + "=" * 70)
print("9. SAVING RESULTS")
print("=" * 70)

# Create directories if they don't exist
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/raw', exist_ok=True)

# Save customer features
customer_features_path = 'data/processed/customer_features.csv'
customer_features.to_csv(customer_features_path, index=False)
print(f"âœ“ Saved customer features to {customer_features_path}")

# Save processed feature matrix
processed_path = 'data/processed/X_processed.csv'
processed_df = pd.DataFrame(X_processed, columns=feature_names)
processed_df.to_csv(processed_path, index=False)
print(f"âœ“ Saved processed feature matrix to {processed_path}")

# Save feature names separately
feature_names_path = 'data/processed/feature_names.txt'
with open(feature_names_path, 'w') as f:
    for name in feature_names:
        f.write(name + '\n')
print(f"âœ“ Saved feature names to {feature_names_path}")

# Save metadata
metadata = {
    'task': 'Task 3 - Feature Engineering',
    'timestamp': pd.Timestamp.now().isoformat(),
    'data_source': 'Xente eCommerce Transactions',
    'n_customers': len(customer_features),
    'n_transactions_original': len(df),
    'n_features_total': X_processed.shape[1],
    'n_numerical_features': len(numerical_features),
    'n_categorical_features': len(categorical_features),
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'rfms_features_created': ['Recency', 'Frequency', 'Monetary', 'Standard Deviation'],
    'processing_steps': [
        'Datetime feature extraction',
        'Customer-level aggregation',
        'RFMS calculation',
        'Missing value imputation',
        'Feature scaling',
        'One-hot encoding'
    ]
}

metadata_path = 'data/processed/feature_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"âœ“ Saved metadata to {metadata_path}")

# -----------------------------
# 10. Summary Report
# -----------------------------
print("\n" + "=" * 70)
print("FEATURE ENGINEERING COMPLETE - SUMMARY")
print("=" * 70)

print(f"\nðŸ“Š DATA OVERVIEW")
print(f"   Original transactions: {len(df):,}")
print(f"   Unique customers: {len(customer_features):,}")
print(f"   Date range: {df_clean['TransactionStartTime'].min().date() if 'TransactionStartTime' in df_clean.columns else 'N/A'} to {df_clean['TransactionStartTime'].max().date() if 'TransactionStartTime' in df_clean.columns else 'N/A'}")

print(f"\nðŸŽ¯ RFMS FEATURES CREATED")
print(f"   Recency (R): Days since last transaction")
print(f"   Frequency (F): Transactions per day")
print(f"   Monetary (M): Total and average transaction amount")
print(f"   Standard Deviation (S): Transaction amount variability")

print(f"\nðŸ”¢ FEATURE STATISTICS")
print(f"   Total features: {X_processed.shape[1]}")
print(f"   Numerical features: {len(numerical_features)}")
print(f"   Categorical features (encoded): {X_processed.shape[1] - len(numerical_features)}")

print(f"\nðŸ’¾ FILES SAVED")
print(f"   1. data/processed/customer_features.csv - Raw customer-level features")
print(f"   2. data/processed/X_processed.csv - Processed feature matrix for modeling")
print(f"   3. data/processed/feature_names.txt - List of all feature names")
print(f"   4. data/processed/feature_metadata.json - Processing metadata")

print(f"\nðŸ“ˆ SAMPLE OF PROCESSED FEATURES (first 5 customers):")
print(processed_df.head().to_string())

print(f"\n" + "=" * 70)
print("NEXT STEP: Task 4 - Proxy Target Variable Engineering")
print("=" * 70)
print("\nReady for RFM clustering to create proxy risk labels!")
print("Use customer_features.csv as input for K-Means clustering.")

TASK 3 - FEATURE ENGINEERING
Credit Risk Model using Alternative Data

1. LOADING DATA
File not found at data/raw/data.csv
Trying alternative path: ../data/raw/data.csv
âœ“ Successfully loaded 50,000 rows from ../data/raw/data.csv

Data loaded successfully!
Shape: (50000, 16)
Columns: TransactionId, BatchId, AccountId, SubscriptionId, CustomerId, CurrencyCode, CountryCode, ProviderId, ProductId, ProductCategory, ChannelId, Amount, Value, TransactionStartTime, PricingStrategy, FraudResult

2. DATA CLEANING
âœ“ Converted TransactionStartTime to datetime
âœ“ Converted Amount to numeric
âœ“ Converted Value to numeric
âœ“ Converted BatchId to numeric

Missing values summary:
  BatchId: 50,000 missing (100.0%)

3. CREATING RFMS FEATURES
Extracting time-based features...
âœ“ Extracted hour, day, month, year, dayofweek, weekend indicators
Snapshot date for recency: 2019-01-03 19:41:16+00:00

4. CUSTOMER-LEVEL AGGREGATION
Grouping transactions by customer...
âœ“ Created features for 2,293 uniqu