# Feature Engineering: Customer Behavior Profiles

This notebook aggregates transaction-level data into customer-level behavior profiles for clustering and customer segmentation analysis.

## Objective
Transform individual transactions into meaningful customer behavior features that capture:
- Spending patterns
- Account activity levels
- Login security patterns
- Financial stability indicators

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import sys
from pathlib import Path

# Add src directory to path for imports.
# The membership check keeps this cell idempotent: re-running it does not keep
# prepending duplicate copies of the same directory to sys.path.
src_dir = str(Path.cwd().parent / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import warnings
# NOTE: this silences every warning category for the rest of the session,
# so deprecation notices from the libraries used below will not be shown.
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")

## Step 1: Load and Prepare Transaction Data

Load the raw transaction data and apply preprocessing transformations.

In [None]:
from data_preprocessing import preprocess_pipeline

# Location of the transaction CSV, relative to the notebook's working directory
data_path = Path('../data/bank_transactions_data_2.csv')

print("Loading and preprocessing transaction data...\n")

# Options handed to the project's preprocessing helper as keyword arguments
pipeline_options = {
    'filepath': data_path,
    'datetime_cols': ['TransactionDate', 'PreviousTransactionDate'],
    'missing_strategy': 'drop',
}
df_transactions = preprocess_pipeline(**pipeline_options)

# Headline numbers: size, distinct accounts and covered time span
n_customers = df_transactions['AccountID'].nunique()
transaction_dates = df_transactions['TransactionDate']
print(f"\nTransaction data shape: {df_transactions.shape}")
print(f"Number of unique customers: {n_customers}")
print(f"Date range: {transaction_dates.min()} to {transaction_dates.max()}")

# Preview the columns that feed the customer-level aggregation
preview_columns = [
    'AccountID',
    'TransactionAmount',
    'TransactionDuration',
    'LoginAttempts',
    'AccountBalance',
]
print("\nSample of transaction data:")
print(df_transactions[preview_columns].head(10))

## Step 2: Aggregate Transactions to Customer Level

Transform transaction-level data into customer profiles by grouping by AccountID.

### Behavioral Features Explained

| Feature | Formula | Why It Matters |
|---------|---------|---|
| **total_transaction_amount** | Sum of all transaction amounts | Captures total spending volume; high values indicate active/wealthy customers |
| **average_transaction_amount** | Mean of transaction amounts | Reflects typical transaction size; indicates spending patterns and risk profile |
| **transaction_frequency** | Count of transactions | Measures customer engagement; frequent activity indicates active account usage |
| **average_account_balance** | Mean account balance after transactions | Indicates financial stability and available funds; low balance signals risk |
| **average_login_attempts** | Mean login attempts per transaction | Security indicator; multiple attempts suggest account access issues |
| **std_transaction_amount** | Standard deviation of amounts | Measures spending consistency; high std indicates volatile behavior |
| **min_account_balance** | Minimum balance observed | Risk indicator; very low balance suggests potential overdraft issues |
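| **max_account_balance** | Maximum balance observed | Wealth indicator; tracks customer's financial capacity |
| **average_transaction_duration** | Mean of transaction durations | Reflects how long a customer's transactions typically take |
| **debit_ratio** | Share of transactions whose type is `Debit` | Shows how much of a customer's activity is debit versus other transaction types |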

In [None]:
print("="*80)
print("AGGREGATING TRANSACTIONS TO CUSTOMER PROFILES")
print("="*80)

# Aggregate transactions to customer level (one output row per AccountID).
# Named aggregation assigns the final column names directly, so there is no
# MultiIndex to flatten and no rename keyed on pandas' auto-generated
# '<lambda>' label (a rename that would silently do nothing if that label
# ever differed). Keyword order below fixes the resulting column order.
customer_features = (
    df_transactions
    .groupby('AccountID')
    .agg(
        total_transaction_amount=('TransactionAmount', 'sum'),
        average_transaction_amount=('TransactionAmount', 'mean'),
        std_transaction_amount=('TransactionAmount', 'std'),
        transaction_frequency=('TransactionAmount', 'count'),
        average_account_balance=('AccountBalance', 'mean'),
        min_account_balance=('AccountBalance', 'min'),
        max_account_balance=('AccountBalance', 'max'),
        average_login_attempts=('LoginAttempts', 'mean'),
        average_transaction_duration=('TransactionDuration', 'mean'),
        customer_age=('CustomerAge', 'first'),
        customer_occupation=('CustomerOccupation', 'first'),
        # Debit ratio: fraction of the customer's rows whose type equals 'Debit'
        debit_ratio=('TransactionType', lambda x: (x == 'Debit').sum() / len(x)),
    )
    .reset_index()
)

# Handle any NaN values in std (occurs when customer has only 1 transaction)
customer_features['std_transaction_amount'] = customer_features['std_transaction_amount'].fillna(0)

print(f"\n✓ Created customer profiles for {len(customer_features)} unique customers")
print("✓ Each row represents one customer (AccountID)")
print(f"\nFeatures created: {len(customer_features.columns)}")
print(f"\nDataframe shape: {customer_features.shape}")

print("\n" + "="*80)
print("Sample Customer Profiles (first 10 customers):")
print("="*80)
print(customer_features.head(10))

In [None]:
banner = "=" * 80
print("\n" + banner)
print("BEHAVIORAL FEATURE STATISTICS")
print(banner)

# Numeric per-customer columns summarised here; the list order determines the
# column order of the summary table below.
behavioral_features = [
    # transaction amounts and counts
    'total_transaction_amount',
    'average_transaction_amount',
    'std_transaction_amount',
    'transaction_frequency',
    # account balances
    'average_account_balance',
    'min_account_balance',
    'max_account_balance',
    # login attempts, transaction duration and transaction-type mix
    'average_login_attempts',
    'average_transaction_duration',
    'debit_ratio',
]

# describe() gives count, mean, std, min, quartiles and max per column
behavioral_subset = customer_features[behavioral_features]
feature_stats = behavioral_subset.describe()
print(feature_stats)

## Step 3: Feature Analysis and Correlations

Analyze the relationships between behavioral features to understand customer segments.

In [None]:
# Pairwise correlations between the behavioral columns
behavioral_subset = customer_features[behavioral_features]
correlation_matrix = behavioral_subset.corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Draw the matrix as an annotated heatmap on an explicit Axes
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix: Customer Behavioral Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Call out a few specific pairs: (label, row feature, column feature)
insight_pairs = [
    ("Total vs Average transaction", 'total_transaction_amount', 'average_transaction_amount'),
    ("Frequency vs Total spending", 'total_transaction_amount', 'transaction_frequency'),
    ("Account balance measures", 'average_account_balance', 'max_account_balance'),
]
print("\nKey Insights:")
for label, row_feature, col_feature in insight_pairs:
    print(f"  - {label}: {correlation_matrix.loc[row_feature, col_feature]:.3f}")

## Step 4: Normalize Features Using Standard Scaling

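Standardize the behavioral features to have mean=0 and std=1 for clustering algorithms. Metadata columns such as `AccountID`, `customer_age`, and `customer_occupation` are left unscaled.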

In [None]:
banner = "=" * 80
divider = "-" * 80

print(banner)
print("FEATURE NORMALIZATION")
print(banner)

# Fit the scaler on the behavioral columns only and keep the transformed array
scaler = StandardScaler()
X_scaled = scaler.fit_transform(customer_features[behavioral_features])

# Work on a copy so the unscaled profiles stay available for comparison;
# only the behavioral columns are overwritten with their scaled values
customer_features_scaled = customer_features.copy()
customer_features_scaled[behavioral_features] = X_scaled

print(f"\n✓ Scaled {len(behavioral_features)} behavioral features using StandardScaler")
print("✓ Each feature now has mean≈0 and std≈1")

# Summary statistics before and after scaling
summary_before = customer_features[behavioral_features].describe().round(2)
summary_after = customer_features_scaled[behavioral_features].describe().round(2)

print("\nBefore Scaling (original):")
print(summary_before)

print("\n" + divider)
print("After Scaling (normalized):")
print(summary_after)

print("\n" + divider)
print("Sample: Original vs Scaled (first 5 customers)")
print(divider)

# Side-by-side view of two features for the first five rows
raw_head = customer_features.head(5)
scaled_head = customer_features_scaled.head(5)
comparison_df = pd.DataFrame({
    'Customer': raw_head['AccountID'].values,
    'Total_Amt_Original': raw_head['total_transaction_amount'].values,
    'Total_Amt_Scaled': scaled_head['total_transaction_amount'].values,
    'Avg_Attempts_Original': raw_head['average_login_attempts'].values,
    'Avg_Attempts_Scaled': scaled_head['average_login_attempts'].values,
})
print(comparison_df)

## Step 5: Final Dataset Summary

Prepare the final customer profile dataset ready for clustering and segmentation.

In [None]:
banner = "=" * 80

print(banner)
print("FINAL CUSTOMER PROFILE DATASET")
print(banner)

# Size of the final table and how its columns split into the two groups
n_customers = len(customer_features_scaled)
n_columns = len(customer_features_scaled.columns)
n_behavioral = len(behavioral_features)

print("\nDataset Dimensions:")
print(f"  Customers (rows): {n_customers}")
print(f"  Total Features (cols): {n_columns}")
print(f"  Behavioral Features (for clustering): {n_behavioral}")
print(f"  Metadata Features: {n_columns - n_behavioral}")

# Every column that is not in the behavioral list is reported as metadata
metadata_columns = [col for col in customer_features_scaled.columns if col not in behavioral_features]
print("\nColumns in Final Dataset:")
print(f"  Behavioral: {behavioral_features}")
print(f"  Metadata: {metadata_columns}")

print("\nData Types:")
print(customer_features_scaled.dtypes)

# Basic integrity checks, reported as True/False
no_missing_values = customer_features_scaled.isnull().sum().sum() == 0
no_duplicate_rows = customer_features_scaled.duplicated().sum() == 0
print(f"\nNo missing values: {no_missing_values}")
print(f"No duplicate rows: {no_duplicate_rows}")

print("\n" + banner)
print("READY FOR CLUSTERING")
print(banner)
print(f"\n✓ Transformed {len(df_transactions)} transactions into {n_customers} customer profiles")
print("✓ Features are normalized and ready for machine learning")
print("✓ Customer identifiers (AccountID) preserved for tracking")
print("✓ No sensitive or personally identifiable information exposed")

# Display final dataset sample
sample_columns = ['AccountID'] + behavioral_features[:5]
print("\nFinal Customer Profile (first 5 customers, scaled features):")
print(customer_features_scaled[sample_columns].head())