# Feature Analysis for Pre-Delinquency Intervention Engine

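This notebook loads the engineered features and the default labels, joins them on `customer_id`, and then performs exploratory data analysis (distributions, feature correlations, and feature-vs-default boxplots) and feature importance analysis (mutual information, chi-square, and Random Forest importance) for pre-delinquency risk prediction. All plots are saved under `ml/reports`.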

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Plot styling shared by every figure produced in this notebook.
plt.style.use('seaborn-v0_8-muted')
sns.set_context('notebook')

# All locations are resolved relative to the current working directory.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT.joinpath('ml', 'data', 'processed')
LABELS_PATH = PROJECT_ROOT.joinpath('ml', 'data', 'labels.csv')
REPORTS_DIR = PROJECT_ROOT.joinpath('ml', 'reports')
# Create the reports folder up front so the figures saved later have somewhere to go.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Read the engineered features and the labels from CSV.
features_path = DATA_DIR.joinpath('features.csv')
print(f'Loading features from: {features_path}')
features = pd.read_csv(features_path)
labels = pd.read_csv(LABELS_PATH)

# Inner join: only customers present in both files are kept.
data = pd.merge(features, labels, on='customer_id', how='inner')
print(f'Data shape: {data.shape}')
data.head()

In [None]:
# Summary statistics (one row per described column), followed by the
# relative frequency of each value of the target column.
display(data.describe().transpose())
(
    data['default']
    .value_counts(normalize=True)
    .rename('default_rate')
)

In [None]:
# Distribution plots for each numerical feature.
#
# num_features is reused by the correlation, scaling and importance cells,
# all of which need numeric input. Columns are therefore filtered by dtype
# (numeric or boolean) instead of assuming that everything other than the id
# and the target is numeric; a stray string/categorical/datetime column would
# otherwise break those later cells.
_non_feature_cols = {'customer_id', 'default'}
_candidate_cols = [col for col in data.columns if col not in _non_feature_cols]
_dtypes = data.dtypes

num_features = [
    col for col in _candidate_cols
    if pd.api.types.is_numeric_dtype(_dtypes[col])
]

# Make the exclusion visible rather than dropping columns silently.
_skipped_cols = [col for col in _candidate_cols if col not in num_features]
if _skipped_cols:
    print(f'Skipping non-numeric columns: {_skipped_cols}')

for col in num_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    out_path = REPORTS_DIR / f'dist_{col}.png'
    fig.savefig(out_path, dpi=120)
    # Close each figure once saved so open figures do not accumulate.
    plt.close(fig)

len(num_features)

In [None]:
# Pairwise correlation matrix of the features, rendered as a heatmap,
# saved to the reports folder and shown as the cell output.
corr = data.loc[:, num_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, ax=ax, center=0, cmap='coolwarm')
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()

out_path = REPORTS_DIR.joinpath('correlation_heatmap.png')
fig.savefig(out_path, dpi=140)
plt.close(fig)
corr

In [None]:
# One boxplot per feature, grouped by the value of the target column;
# every figure is written to the reports folder and then closed.
for feature in num_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=data, x='default', y=feature, ax=ax)
    ax.set_title(f'{feature} by Default Status')
    fig.tight_layout()
    out_path = REPORTS_DIR / f'box_{feature}_vs_default.png'
    fig.savefig(out_path, dpi=120)
    plt.close(fig)

len(num_features)

In [None]:
# Build the inputs used by the feature-importance cells.
y = data['default'].astype(int)
X = data[num_features].copy()

# Median-impute any remaining NaNs (should be rare after our pipeline).
X = X.fillna(value=X.median(numeric_only=True))

# chi2 needs non-negative inputs, so rescale every feature to [0, 1].
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

# Stratified 70/30 train/validation split for the Random Forest.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# The Random Forest is fitted on its own copies of the training split.
X_train_rf, y_train_rf = X_train.copy(), y_train.copy()

In [None]:
# Mutual information between each scaled feature and the target,
# ranked from highest to lowest.
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
mi_series = pd.Series(
    data=mi_scores,
    index=num_features,
    name='mutual_information',
)
mi_series = mi_series.sort_values(ascending=False)
mi_series.head(10)

In [None]:
# Chi-square statistic of each scaled feature against the target,
# ranked from highest to lowest (the p-values are kept in chi2_p).
chi2_scores, chi2_p = chi2(X_scaled, y)
chi2_series = pd.Series(
    data=chi2_scores,
    index=num_features,
    name='chi2',
)
chi2_series = chi2_series.sort_values(ascending=False)
chi2_series.head(10)

In [None]:
# Fit a depth-limited Random Forest on the training split and rank the
# features by the fitted model's feature_importances_.
rf = RandomForestClassifier(
    n_estimators=300, max_depth=6, random_state=42, n_jobs=-1
).fit(X_train_rf, y_train_rf)

rf_importances = pd.Series(
    data=rf.feature_importances_,
    index=num_features,
    name='rf_importance',
)
rf_importances = rf_importances.sort_values(ascending=False)
rf_importances.head(10)

In [None]:
# Put the three rankings side by side (aligned on feature name) and order
# the table by Random Forest importance, highest first.
importance_df = (
    pd.concat([mi_series, chi2_series, rf_importances], axis=1)
    .sort_values(by='rf_importance', ascending=False)
)
importance_df.head(15)

In [None]:
# Plot the top-N most important features by Random Forest importance.
# The chart title and the output file name are derived from top_n, so they
# stay correct if top_n is changed (with top_n = 10 they are unchanged).
top_n = 10
# importance_df is expected to be sorted by rf_importance (descending) already.
top_rf = importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_rf['rf_importance'], y=top_rf.index, ax=ax, orient='h')
ax.set_title(f'Top {top_n} Features by Random Forest Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
out_path = REPORTS_DIR / f'top{top_n}_rf_importance.png'
fig.savefig(out_path, dpi=140)
# Close the figure once saved; the table below is the cell's displayed output.
plt.close(fig)

top_rf

In [None]:
# Correlation of every feature with the target column, ordered by
# absolute value so strong negative relationships rank alongside positive ones.
corr_with_default = (
    data[[*num_features, 'default']]
    .corr()
    .loc[:, 'default']
    .drop('default')
    .sort_values(key=pd.Series.abs, ascending=False)
)
corr_with_default.head(15)

In [None]:
# Horizontal bar chart of the first ten entries of corr_with_default,
# saved to the reports folder; the plotted values are the cell output.
top_corr = corr_with_default.iloc[:10]

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_corr.to_numpy(), y=top_corr.index, orient='h', ax=ax)
ax.set_title('Top 10 Feature Correlations with Default')
ax.set_xlabel('Correlation with default')
ax.set_ylabel('Feature')
fig.tight_layout()

out_path = REPORTS_DIR.joinpath('top10_corr_with_default.png')
fig.savefig(out_path, dpi=140)
plt.close(fig)

top_corr