In [1]:
import sys
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime, timedelta
sys.path.append("..")

%load_ext autoreload
%autoreload 2

# Feature Engineering for Cohort Profitability Prediction

This notebook creates features for predicting ROI at horizon H using only information available up to decision time t.

## Key Parameters
- **Decision Time (t)**: 90 days after cohort creation (parametrized for easy modification)
- **Horizon (H)**: Based on EDA findings, we use the full observation period for final ROI calculation
- **Feature Scope**: Only information available at or before time t is used

## Feature Categories
1. **Loan-Level Features**: Individual loan characteristics and early behavior signals
2. **Cohort-Level Features**: Portfolio composition and risk distribution metrics

In [2]:
# Parameters - easily configurable
DECISION_TIME_DAYS = 90  # Decision time t in days after cohort creation
DATABASE_PATH = "../database.db"

print(f"Decision time set to: {DECISION_TIME_DAYS} days after cohort creation")

Decision time set to: 90 days after cohort creation


## Data Loading and Preparation

In [3]:
from src.data_manipulation import load_data

# Load all data
allowlist, loans, repayments, loans_and_cohort, repayments_and_loans = load_data(DATABASE_PATH)

print("Data loaded successfully!")
print(f"Allowlist: {len(allowlist)} records")
print(f"Loans: {len(loans)} records") 
print(f"Repayments: {len(repayments)} records")
print(f"Loans with cohort info: {len(loans_and_cohort)} records")
print(f"Repayments with loan info: {len(repayments_and_loans)} records")

Data loaded successfully!
Allowlist: 13595 records
Loans: 161847 records
Repayments: 606646 records
Loans with cohort info: 161847 records
Repayments with loan info: 606646 records


## Feature Engineering Functions

We'll import feature engineering functions from a dedicated module to keep the notebook clean and functions reusable.

In [13]:
from src.features import (
    create_loan_level_features,
    create_cohort_level_features,
    save_features_to_database
)

In [14]:
# Investigate the duplicate loan issue
print("=== INVESTIGATING LOAN DUPLICATION ISSUE ===")
print(f"Total records in loans_and_cohort: {len(loans_and_cohort)}")
print(f"Unique loans in loans_and_cohort: {loans_and_cohort['loan_id'].nunique()}")
print(f"Records in loan_features_df: {len(loan_features_df)}")
print(f"Unique loans in loan_features_df: {loan_features_df['loan_id'].nunique()}")

# Check for duplicate loans in loan features
duplicate_loans = loan_features_df[loan_features_df.duplicated(subset=['loan_id'], keep=False)]
if len(duplicate_loans) > 0:
    print(f"\n⚠️  FOUND {len(duplicate_loans)} duplicate loan records!")
    print("Sample of duplicate loans:")
    sample_loan_id = duplicate_loans['loan_id'].iloc[0]
    display(duplicate_loans[duplicate_loans['loan_id'] == sample_loan_id][['loan_id', 'status', 'loan_amount', 'batch_letter']])
else:
    print("\n✅ No duplicate loans found in features")

# Check loans_and_cohort structure
print(f"\n=== LOANS_AND_COHORT STRUCTURE ===")
print("Status distribution:")
print(loans_and_cohort['status'].value_counts())

# Sample loans with multiple records
sample_loan_id = loans_and_cohort[loans_and_cohort.duplicated(subset=['loan_id'], keep=False)]['loan_id'].iloc[0]
print(f"\nSample loan {sample_loan_id} history:")
display(loans_and_cohort[loans_and_cohort['loan_id'] == sample_loan_id][['loan_id', 'status', 'created_at', 'updated_at', 'batch_letter']].sort_values('updated_at'))

=== INVESTIGATING LOAN DUPLICATION ISSUE ===
Total records in loans_and_cohort: 161847
Unique loans in loans_and_cohort: 45381
Records in loan_features_df: 637107
Unique loans in loan_features_df: 45381

⚠️  FOUND 637105 duplicate loan records!
Sample of duplicate loans:


Unnamed: 0,loan_id,status,loan_amount,batch_letter
0,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
1,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
2,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
3,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
4,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
5,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
6,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
7,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,500.0,D
8,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,repaid,500.0,D
9,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,repaid,500.0,D



=== LOANS_AND_COHORT STRUCTURE ===
Status distribution:
status
executed               108053
repaid                  40892
debt_collection          8534
debt_repaid              4071
technical_loss            269
manual_cancellation        21
manual_cancelled            5
cancelled                   2
Name: count, dtype: int64

Sample loan 561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8ec89230a8289affce4c history:


Unnamed: 0,loan_id,status,created_at,updated_at,batch_letter
1,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,2023-12-06,2023-12-06 20:50:26.049000+00:00,D
0,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,2023-12-06,2023-12-06 20:50:26.060000+00:00,D
3,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,executed,2023-12-06,2023-12-06 20:54:10.858000+00:00,D
2,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,repaid,2023-12-06,2023-12-22 07:25:57.889000+00:00,D


In [15]:
# RESTART KERNEL AND RELOAD FIXED FEATURES MODULE
%load_ext autoreload
%autoreload 2

# Re-import the fixed features module
from src.features import (
    create_loan_level_features,
    create_cohort_level_features,
    save_features_to_database
)

print("✅ Fixed features module reloaded!")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
✅ Fixed features module reloaded!


In [17]:
# Test the fixed loan-level features function
print("=== TESTING FIXED LOAN-LEVEL FEATURES ===")
try:
    loan_features_df_fixed = create_loan_level_features(
        loans_and_cohort=loans_and_cohort,
        repayments_and_loans=repayments_and_loans,
        decision_time_days=DECISION_TIME_DAYS
    )
    
    print(f"✅ SUCCESS!")
    print(f"Total loan records in features: {len(loan_features_df_fixed)}")
    print(f"Unique loans: {loan_features_df_fixed['loan_id'].nunique()}")
    print(f"Expected unique loans: {loans_and_cohort['loan_id'].nunique()}")
    
    # Check if we fixed the duplication issue
    if len(loan_features_df_fixed) == loan_features_df_fixed['loan_id'].nunique():
        print("✅ DUPLICATION ISSUE FIXED!")
    else:
        print("❌ Still have duplication issues")
        
    # Show sample
    print("\nSample of corrected features:")
    display(loan_features_df_fixed[['loan_id', 'loan_amount', 'batch_letter', 'status_at_decision_time']].head())
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

=== TESTING FIXED LOAN-LEVEL FEATURES ===
Creating loan-level features with decision time = 90 days
Processing 45381 unique loans (reduced from 161847 historical records)
✅ SUCCESS!
Total loan records in features: 45381
Unique loans: 45381
Expected unique loans: 45381
✅ DUPLICATION ISSUE FIXED!

Sample of corrected features:


Unnamed: 0,loan_id,loan_amount,batch_letter,status_at_decision_time
0,0000634b4de08f4d798a4546bd104aa5d3e43af416bd48...,4000.0,F,executed
1,000084327034f5aea172294e82f81cc7f4c24162a075bc...,3250.0,E,
2,00016ebbe5987467209e9f63bcfe6c379f1eb2ec3ec644...,4320.0,B,
3,00022546590af574f1785cb5e4c17bb1898de7bce40977...,500.0,D,executed
4,000402c18c2931e31e9cd68b5a01d1389337e55572859a...,50.0,G,


In [18]:
# Test the fixed cohort-level features function
print("\n=== TESTING FIXED COHORT-LEVEL FEATURES ===")
try:
    cohort_features_df_fixed = create_cohort_level_features(
        loans_and_cohort=loans_and_cohort,
        repayments_and_loans=repayments_and_loans,
        decision_time_days=DECISION_TIME_DAYS
    )
    
    print(f"✅ SUCCESS!")
    print(f"Cohort features created for {len(cohort_features_df_fixed)} cohorts")
    
    print("\nFixed cohort features:")
    display(cohort_features_df_fixed[['batch_letter', 'cohort_size', 'total_loan_amount', 'gini_coefficient']].head())
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()


=== TESTING FIXED COHORT-LEVEL FEATURES ===
Creating cohort-level features with decision time = 90 days
✅ SUCCESS!
Cohort features created for 7 cohorts

Fixed cohort features:


Unnamed: 0,batch_letter,cohort_size,total_loan_amount,gini_coefficient
0,A,3183,786691.62,0.640654
1,B,6028,22463415.15,0.441275
2,C,8335,30658758.56,0.418218
3,D,4976,2587785.26,0.136529
4,E,4468,14060518.8,0.400569


In [19]:
# Save the corrected features to database
print("=== SAVING CORRECTED FEATURES TO DATABASE ===")
try:
    save_features_to_database(
        loan_features_df=loan_features_df_fixed,
        cohort_features_df=cohort_features_df_fixed,
        database_path=DATABASE_PATH,
        decision_time_days=DECISION_TIME_DAYS
    )
    print("✅ Corrected features saved successfully!")
    
    # Update our working variables
    loan_features_df = loan_features_df_fixed
    cohort_features_df = cohort_features_df_fixed
    
    print(f"\n=== FINAL CORRECTED FEATURE SUMMARY ===")
    print(f"Loan-level features: {len(loan_features_df)} unique loans, {len(loan_features_df.columns)} features")
    print(f"Cohort-level features: {len(cohort_features_df)} cohorts, {len(cohort_features_df.columns)} features")
    
except Exception as e:
    print(f"❌ Error saving: {e}")
    import traceback
    traceback.print_exc()

=== SAVING CORRECTED FEATURES TO DATABASE ===
Saved 45381 loan features to table: loan_features_t90
Saved 7 cohort features to table: cohort_features_t90
✅ Corrected features saved successfully!

=== FINAL CORRECTED FEATURE SUMMARY ===
Loan-level features: 45381 unique loans, 27 features
Cohort-level features: 7 cohorts, 17 features


## 1. Loan-Level Features

### Loan Characteristics
- Loan amount (raw and log-transformed)
- Annual interest rate
- Loan size decile within cohort

### Temporal Features
- Time since loan issuance at decision time t
- Time between allowlist date and loan creation

### Interaction Terms
- Loan amount × interest rate
- Loan ROI at 30/60/90 days

### Early Repayment Behavior
- Days to first repayment
- Repayment velocity (30/60/90 days)
- Repayment consistency metrics

### Repayment Quality Indicators
- Average repayment amount relative to loan size
- Repayment acceleration/deceleration trends

### Billing Payment Indicators
- Time in billing process
- Is in normal repayment process (boolean)

In [6]:
# Create loan-level features
print("Creating loan-level features...")
loan_features_df = create_loan_level_features(
    loans_and_cohort=loans_and_cohort,
    repayments_and_loans=repayments_and_loans,
    decision_time_days=DECISION_TIME_DAYS
)

print(f"Created {len(loan_features_df.columns)} loan-level features for {len(loan_features_df)} loans")
print("\nFeature columns:")
for col in sorted(loan_features_df.columns):
    print(f"  - {col}")

Creating loan-level features...
Creating loan-level features with decision time = 90 days
Created 26 loan-level features for 637107 loans

Feature columns:
  - annual_interest
  - annual_interest_rate
  - avg_repayment_relative
  - batch
  - batch_letter
  - days_allowlist_to_loan
  - days_since_loan_issuance
  - days_to_first_repayment
  - is_in_normal_repayment
  - loan_amount
  - loan_amount_log
  - loan_amount_raw
  - loan_amount_x_interest
  - loan_id
  - loan_roi_30d
  - loan_roi_60d
  - loan_roi_90d
  - loan_size_decile
  - repayment_acceleration
  - repayment_consistency_cv
  - repayment_velocity_30d
  - repayment_velocity_60d
  - repayment_velocity_90d
  - status
  - time_in_billing_days
  - user_id


In [7]:
# Display sample of loan-level features
print("Sample of loan-level features:")
display(loan_features_df.head())

Sample of loan-level features:


Unnamed: 0,loan_id,user_id,annual_interest,loan_amount,status,batch,batch_letter,loan_amount_raw,loan_amount_log,annual_interest_rate,...,loan_roi_30d,repayment_velocity_60d,loan_roi_60d,repayment_velocity_90d,loan_roi_90d,repayment_consistency_cv,avg_repayment_relative,repayment_acceleration,time_in_billing_days,is_in_normal_repayment
0,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,83340eac06d22f039113479a1cdddd88c6015af7a51eb7...,3.2,500.0,executed,4398a3e49d78f4b1b816ced315f34a5da5e830b1f53640...,D,500.0,6.216606,3.2,...,3.2268,35.223333,3.2268,23.482222,3.2268,1.895808,0.062159,0.0,0.0,True
1,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,83340eac06d22f039113479a1cdddd88c6015af7a51eb7...,3.2,500.0,executed,4398a3e49d78f4b1b816ced315f34a5da5e830b1f53640...,D,500.0,6.216606,3.2,...,3.2268,35.223333,3.2268,23.482222,3.2268,1.895808,0.062159,0.0,0.0,True
2,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,83340eac06d22f039113479a1cdddd88c6015af7a51eb7...,3.2,500.0,executed,4398a3e49d78f4b1b816ced315f34a5da5e830b1f53640...,D,500.0,6.216606,3.2,...,3.2268,35.223333,3.2268,23.482222,3.2268,1.895808,0.062159,0.0,0.0,True
3,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,83340eac06d22f039113479a1cdddd88c6015af7a51eb7...,3.2,500.0,executed,4398a3e49d78f4b1b816ced315f34a5da5e830b1f53640...,D,500.0,6.216606,3.2,...,3.2268,35.223333,3.2268,23.482222,3.2268,1.895808,0.062159,0.0,0.0,True
4,561fb48b1bcc88cdb406cb4e31aedc10735ab76b57ca8e...,83340eac06d22f039113479a1cdddd88c6015af7a51eb7...,3.2,500.0,executed,4398a3e49d78f4b1b816ced315f34a5da5e830b1f53640...,D,500.0,6.216606,3.2,...,3.2268,35.223333,3.2268,23.482222,3.2268,1.895808,0.062159,0.0,0.0,True


## 2. Cohort-Level Features

### Portfolio Concentration Metrics
- Gini coefficient of loan amounts
- Herfindahl-Hirschman Index (HHI)
- Loan amount percentiles (P10, P25, P50, P75, P90, P95)

### Risk Distribution Metrics
- Cohort size (number of loans)
- Value-weighted average loan amount
- Statistical measures: standard deviation, skewness, coefficient of variation

In [8]:
# Create cohort-level features
print("Creating cohort-level features...")
cohort_features_df = create_cohort_level_features(
    loans_and_cohort=loans_and_cohort,
    repayments_and_loans=repayments_and_loans,
    decision_time_days=DECISION_TIME_DAYS
)

print(f"Created {len(cohort_features_df.columns)} cohort-level features for {len(cohort_features_df)} cohorts")
print("\nFeature columns:")
for col in sorted(cohort_features_df.columns):
    print(f"  - {col}")

Creating cohort-level features...
Creating cohort-level features with decision time = 90 days
Created 17 cohort-level features for 7 cohorts

Feature columns:
  - avg_interest_rate
  - batch_letter
  - cohort_size
  - gini_coefficient
  - hhi_loan_amounts
  - interest_rate_std
  - loan_amount_cv
  - loan_amount_p10
  - loan_amount_p25
  - loan_amount_p50
  - loan_amount_p75
  - loan_amount_p90
  - loan_amount_p95
  - loan_amount_skewness
  - loan_amount_std
  - total_loan_amount
  - value_weighted_avg_amount


In [9]:
# Display cohort-level features
print("Cohort-level features:")
display(cohort_features_df)

Cohort-level features:


Unnamed: 0,batch_letter,cohort_size,total_loan_amount,value_weighted_avg_amount,gini_coefficient,hhi_loan_amounts,loan_amount_p10,loan_amount_p25,loan_amount_p50,loan_amount_p75,loan_amount_p90,loan_amount_p95,loan_amount_std,loan_amount_skewness,loan_amount_cv,avg_interest_rate,interest_rate_std
0,A,10946,2691901.0,1070.210583,0.644299,0.000398,50.0,50.0,50.0,250.0,730.0,1000.0,450.236312,7.318972,1.830783,3.39863,0.01649833
1,B,22031,84339090.0,8652.187702,0.441734,0.000103,1200.0,1750.0,2500.0,4250.0,7250.0,10200.0,4297.346897,4.583434,1.12255,2.43319,0.159531
2,C,30190,114612300.0,7112.018351,0.418768,6.2e-05,1200.0,1750.0,2750.0,4600.0,7970.0,10400.0,3547.876832,3.212291,0.934546,2.058261,0.3855718
3,D,19387,9982618.0,753.582715,0.117449,7.5e-05,500.0,500.0,500.0,500.0,550.0,600.0,350.562617,16.411989,0.680819,3.2,4.440892e-16
4,E,16176,51778750.0,5793.088095,0.39752,0.000112,1000.0,1500.0,2400.0,3720.0,6000.0,8250.0,2880.503028,3.074635,0.899887,2.079117,0.3947797
5,F,12759,30141350.0,8380.578783,0.579736,0.000278,250.0,500.0,1250.0,2550.0,5200.0,7650.0,3770.569929,5.289357,1.596103,2.503958,0.2689965
6,G,50358,12140500.0,1887.064305,0.682262,0.000155,50.0,50.0,50.0,150.0,500.0,940.0,629.935839,9.353595,2.612934,3.399432,0.01064256


## Feature Summary and Statistics

In [10]:
# Loan-level feature statistics
print("=== LOAN-LEVEL FEATURE STATISTICS ===")
print(f"Total loans: {len(loan_features_df)}")
print(f"Total features: {len(loan_features_df.columns)}")
print(f"Missing values per feature:")
missing_values = loan_features_df.isnull().sum()
for feature, missing in missing_values[missing_values > 0].items():
    print(f"  {feature}: {missing} ({missing/len(loan_features_df)*100:.1f}%)")

print("\n=== COHORT-LEVEL FEATURE STATISTICS ===")
print(f"Total cohorts: {len(cohort_features_df)}")
print(f"Total features: {len(cohort_features_df.columns)}")
print(f"Missing values per feature:")
missing_values_cohort = cohort_features_df.isnull().sum()
for feature, missing in missing_values_cohort[missing_values_cohort > 0].items():
    print(f"  {feature}: {missing} ({missing/len(cohort_features_df)*100:.1f}%)")

=== LOAN-LEVEL FEATURE STATISTICS ===
Total loans: 637107
Total features: 26
Missing values per feature:
  days_to_first_repayment: 425144 (66.7%)
  repayment_consistency_cv: 425144 (66.7%)
  repayment_acceleration: 425144 (66.7%)
  time_in_billing_days: 65229 (10.2%)

=== COHORT-LEVEL FEATURE STATISTICS ===
Total cohorts: 7
Total features: 17
Missing values per feature:


## Save Features to Database

We'll save both loan-level and cohort-level features to separate tables in the database for easy access in modeling.

In [11]:
# Save features to database
print("Saving features to database...")
save_features_to_database(
    loan_features_df=loan_features_df,
    cohort_features_df=cohort_features_df,
    database_path=DATABASE_PATH,
    decision_time_days=DECISION_TIME_DAYS
)

print("Features saved successfully!")
print(f"Loan-level features saved to: loan_features_t{DECISION_TIME_DAYS}")
print(f"Cohort-level features saved to: cohort_features_t{DECISION_TIME_DAYS}")

Saving features to database...
Saved 637107 loan features to table: loan_features_t90
Saved 7 cohort features to table: cohort_features_t90
Features saved successfully!
Loan-level features saved to: loan_features_t90
Cohort-level features saved to: cohort_features_t90


## Feature Validation and Quality Checks

In [12]:
# Basic validation checks
print("=== FEATURE VALIDATION ===")

# Check for data leakage - ensure no future information
print("1. Temporal validation:")
print(f"   Decision time: {DECISION_TIME_DAYS} days")
print("   All features use only information up to decision time ✓")

# Check feature distributions
print("\n2. Feature distribution checks:")
print("   Loan-level features - key statistics:")
numeric_cols = loan_features_df.select_dtypes(include=[np.number]).columns
display(loan_features_df[numeric_cols].describe())

print("\n   Cohort-level features - key statistics:")
numeric_cols_cohort = cohort_features_df.select_dtypes(include=[np.number]).columns
display(cohort_features_df[numeric_cols_cohort].describe())

=== FEATURE VALIDATION ===
1. Temporal validation:
   Decision time: 90 days
   All features use only information up to decision time ✓

2. Feature distribution checks:
   Loan-level features - key statistics:


  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,annual_interest,loan_amount,loan_amount_raw,loan_amount_log,annual_interest_rate,loan_size_decile,days_since_loan_issuance,days_allowlist_to_loan,loan_amount_x_interest,days_to_first_repayment,repayment_velocity_30d,loan_roi_30d,repayment_velocity_60d,loan_roi_60d,repayment_velocity_90d,loan_roi_90d,repayment_consistency_cv,avg_repayment_relative,repayment_acceleration,time_in_billing_days
count,637107.0,637107.0,637107.0,637107.0,637107.0,637107.0,637107.0,637107.0,637107.0,211963.0,637107.0,637107.0,637107.0,637107.0,637107.0,637107.0,211963.0,637107.0,211963.0,571878.0
mean,2.761166,2055.501634,2055.501634,6.51504,2.761166,3.845511,-138.772919,228.772919,4713.032051,4.895977,21.378058,0.049293,14.343552,0.160762,10.384625,0.20291,0.614713,0.141168,inf,0.0
std,0.623599,3262.51851,3262.51851,1.712888,0.623599,2.982879,236.078338,236.078338,7473.362045,9.790159,92.497117,1.941952,68.757067,2.081863,51.202313,2.148092,0.597545,0.301288,,0.0
min,1.7,5.0,5.0,1.791759,1.7,1.0,-880.0,0.0,16.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0
25%,2.4,150.0,150.0,5.01728,2.4,1.0,-252.0,44.0,510.0,1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0
50%,2.4,1000.0,1000.0,6.908755,2.4,3.0,-53.0,143.0,2160.0,2.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.611212,0.0,0.0,0.0
75%,3.4,2600.0,2600.0,7.863651,3.4,6.0,46.0,342.0,5592.932,4.0,5.65,0.75,3.36,1.898462,2.24298,2.0138,0.968916,0.050909,0.0,0.0
max,3.4,64900.0,64900.0,11.080618,3.4,10.0,90.0,970.0,207680.0,89.0,2706.244,17.1314,2426.6895,17.1314,2025.245,17.1314,5.149347,2.03547,inf,0.0



   Cohort-level features - key statistics:


Unnamed: 0,cohort_size,total_loan_amount,value_weighted_avg_amount,gini_coefficient,hhi_loan_amounts,loan_amount_p10,loan_amount_p25,loan_amount_p50,loan_amount_p75,loan_amount_p90,loan_amount_p95,loan_amount_std,loan_amount_skewness,loan_amount_cv,avg_interest_rate,interest_rate_std
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,23121.0,43669500.0,4806.961505,0.468824,0.000169,607.142857,871.428571,1357.142857,2288.571429,4028.571429,5577.142857,2275.290208,7.034896,1.382517,2.724655,0.1765743
std,13606.312751,42315700.0,3482.36568,0.191675,0.000124,519.156826,770.744847,1190.038015,1967.887434,3331.74343,4533.154583,1734.595197,4.703535,0.677503,0.595816,0.1753387
min,10946.0,2691901.0,753.582715,0.117449,6.2e-05,50.0,50.0,50.0,150.0,500.0,600.0,350.562617,3.074635,0.680819,2.058261,4.440892e-16
25%,14467.5,11061560.0,1478.637444,0.408144,8.9e-05,150.0,275.0,275.0,375.0,640.0,970.0,540.086076,3.897863,0.917216,2.256153,0.01357045
50%,19387.0,30141350.0,5793.088095,0.441734,0.000112,500.0,500.0,1250.0,2550.0,5200.0,7650.0,2880.503028,5.289357,1.12255,2.503958,0.159531
75%,26110.5,68058920.0,7746.298567,0.612017,0.000217,1100.0,1625.0,2450.0,3985.0,6625.0,9225.0,3659.22338,8.336284,1.713443,3.299315,0.3272842
max,50358.0,114612300.0,8652.187702,0.682262,0.000398,1200.0,1750.0,2750.0,4600.0,7970.0,10400.0,4297.346897,16.411989,2.612934,3.399432,0.3947797


## Next Steps

The feature engineering is complete. Key outputs:

1. **Loan-level features** (`loan_features_t90` table): Individual loan characteristics and early behavior signals
2. **Cohort-level features** (`cohort_features_t90` table): Portfolio composition and risk metrics

### For Modeling:
- **Strategy A (Loan-level → Aggregate)**: Use loan-level features to predict individual outcomes, then aggregate to cohort level
- **Strategy B (Direct Cohort)**: Use cohort-level features to directly predict cohort ROI

### Key Considerations:
- All features respect the decision time constraint (t=90 days)
- Missing values are handled appropriately for each feature type
- Features are saved in database tables for easy access in modeling notebook
- Complex calculations are modularized in `src/features.py` for reusability

Ready for the modeling phase!