In [1]:
DATA_DIR = "C:/Users/hansh/OneDrive/Desktop/OBITS Lab/MBS Simulate/data/Freddie data/"
import pandas as pd

# Load the processed data
df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")

# Check distribution of EVENT
# 0 = Censored, 1 = Prepay, 2 = Default
event_counts = df['EVENT'].value_counts()
print("Event Distribution:")
print(event_counts)

# Check correlation of Default with Credit Score and LTV
df['IS_DEFAULT'] = df['EVENT'].apply(lambda x: 1 if x == 2 else 0)
print("\nCorrelation with Default:")
print(df[['IS_DEFAULT', 'CREDIT_SCORE', 'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO']].corr()['IS_DEFAULT'])

  df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")


Event Distribution:
EVENT
1    51354
0    27066
2     1580
Name: count, dtype: int64

Correlation with Default:
IS_DEFAULT                       1.000000
CREDIT_SCORE                    -0.041533
ORIGINAL_LTV                     0.039737
ORIGINAL_DEBT_TO_INCOME_RATIO    0.034488
Name: IS_DEFAULT, dtype: float64


In [2]:
import pandas as pd

# Load the processed data
df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")

# Check Event distribution
event_counts = df['EVENT'].value_counts()
print("Event Counts:")
print(event_counts)

# Check Default Rate
default_count = event_counts.get(2, 0) # Event 2 is Default
total_count = len(df)
print(f"\nTotal Loans: {total_count}")
print(f"Total Defaults: {default_count} ({default_count/total_count:.2%})")

# Check correlation of FICO/LTV with Default
df['IS_DEFAULT'] = (df['EVENT'] == 2).astype(int)
print("\nCorrelation with Default:")
print(df[['IS_DEFAULT', 'CREDIT_SCORE', 'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO']].corr()['IS_DEFAULT'])

  df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")


Event Counts:
EVENT
1    51354
0    27066
2     1580
Name: count, dtype: int64

Total Loans: 80000
Total Defaults: 1580 (1.98%)

Correlation with Default:
IS_DEFAULT                       1.000000
CREDIT_SCORE                    -0.041533
ORIGINAL_LTV                     0.039737
ORIGINAL_DEBT_TO_INCOME_RATIO    0.034488
Name: IS_DEFAULT, dtype: float64


In [8]:
import pandas as pd
import numpy as np

# 1. Load Data
df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")

# 2. Market Rates Dictionary (re-defined for clarity in this context)
MARKET_RATES = {
    # 2017
    201701: 4.15, 201702: 4.17, 201703: 4.20, 201704: 4.05, 201705: 4.01, 201706: 3.90,
    201707: 3.97, 201708: 3.88, 201709: 3.81, 201710: 3.90, 201711: 3.92, 201712: 3.95,
    # 2018
    201801: 4.03, 201802: 4.33, 201803: 4.44, 201804: 4.47, 201805: 4.59, 201806: 4.57,
    201807: 4.53, 201808: 4.55, 201809: 4.63, 201810: 4.83, 201811: 4.87, 201812: 4.64,
    # 2019
    201901: 4.46, 201902: 4.37, 201903: 4.27, 201904: 4.14, 201905: 4.07, 201906: 3.80,
    201907: 3.77, 201908: 3.62, 201909: 3.61, 201910: 3.69, 201911: 3.70, 201912: 3.72,
    # 2020
    202001: 3.62, 202002: 3.47, 202003: 3.45, 202004: 3.31, 202005: 3.23, 202006: 3.16,
    202007: 3.02, 202008: 2.94, 202009: 2.89, 202010: 2.83, 202011: 2.77, 202012: 2.68
}

# 3. Feature Engineering for Default
def engineer_default_features(row):
    # A. SATO (Spread At Origination)
    # Did the bank charge them a premium? (High SATO = Hidden Risk)
    try:
        orig_date = int(row['FIRST_PAYMENT_DATE'])
        # Handle simple quarter/month offsets if needed, but direct lookup is usually fine for first payment
        mkt_rate_orig = MARKET_RATES.get(orig_date, 4.0) 
        sato = row['ORIGINAL_INTEREST_RATE'] - mkt_rate_orig
    except:
        sato = 0.0

    # B. FICO Buckets (Non-linear risk)
    fico = row['CREDIT_SCORE']
    if fico >= 750: fico_bucket = 1 # Super Prime
    elif fico >= 700: fico_bucket = 2 # Prime
    elif fico >= 660: fico_bucket = 3 # Near Prime
    else: fico_bucket = 4 # Subprime/Risky

    # C. Equity Risk (High LTV)
    ltv = row['ORIGINAL_LTV']
    high_ltv_flag = 1 if ltv > 80 else 0
    
    return pd.Series([sato, fico_bucket, high_ltv_flag])

print("Engineering Default Features...")
df[['SATO', 'FICO_BUCKET', 'HIGH_LTV_FLAG']] = df.apply(engineer_default_features, axis=1)

# 4. Prepare Final Dataset
# Target: EVENT = 2 (Default). We treat Event 1 (Prepay) as Censored (0) for this specific model.
df['IS_DEFAULT'] = df['EVENT'].apply(lambda x: 1 if x == 2 else 0)

# Select Columns
cols = ['LOAN_SEQUENCE_NUMBER', 'DURATION', 'IS_DEFAULT', 
        'CREDIT_SCORE', 'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO', 
        'SATO', 'FICO_BUCKET', 'HIGH_LTV_FLAG']

df_default = df[cols].dropna()

# Save
output_file = DATA_DIR + "default_modeling_dataset.csv"
df_default.to_csv(output_file, index=False)
print(f"Default dataset saved to {output_file} ({len(df_default)} loans)")
print(df_default.head())
print("\nDefault Rate by FICO Bucket (1=Best, 4=Worst):")
print(df_default.groupby('FICO_BUCKET')['IS_DEFAULT'].mean())

  df = pd.read_csv(DATA_DIR + "processed_mortgage_data_for_modeling.csv")


Engineering Default Features...
Default dataset saved to C:/Users/hansh/OneDrive/Desktop/OBITS Lab/MBS Simulate/data/Freddie data/default_modeling_dataset.csv (80000 loans)
  LOAN_SEQUENCE_NUMBER  DURATION  IS_DEFAULT  CREDIT_SCORE  ORIGINAL_LTV  \
0         F17Q10141137        98           0           705            75   
1         F17Q10000777        51           0           722            95   
2         F17Q10240479        98           0           719            56   
3         F17Q10166939        60           0           745            78   
4         F17Q10287022        35           0           770            80   

   ORIGINAL_DEBT_TO_INCOME_RATIO   SATO  FICO_BUCKET  HIGH_LTV_FLAG  
0                             32  1.240          2.0            0.0  
1                             47  0.050          2.0            1.0  
2                             35 -0.010          2.0            0.0  
3                             34  0.200          2.0            0.0  
4                   

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Libraries for Survival Analysis
# pip install lifelines scikit-survival
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
import joblib
# --- CONFIGURATION ---
DATA_FILE = DATA_DIR + "default_modeling_dataset.csv"
TEST_SIZE = 0.25
RANDOM_SEED = 42

def main():
    print("1. Loading Default Data...")
    df = pd.read_csv(DATA_FILE)
    
    # Define Default-Specific Features
    features = ['SATO', 'FICO_BUCKET', 'HIGH_LTV_FLAG', 'CREDIT_SCORE', 
                'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO']
    
    target_duration = 'DURATION'
    target_event = 'IS_DEFAULT' # 1=Default, 0=Censored/Prepaid
    
    # Drop Missing
    df_clean = df[features + [target_duration, target_event]].dropna()
    print(f"   Modeling Universe: {len(df_clean)} loans")
    
    # Split Data
    train, test = train_test_split(df_clean, test_size=TEST_SIZE, random_state=RANDOM_SEED)
    print(f"   Training: {len(train)}, Test: {len(test)}")

    # ---------------------------------------------------------
    # MODEL A: COX PROPORTIONAL HAZARD
    # ---------------------------------------------------------
    print("\n2. Training Cox Model (Linear)...")
    cph_default = CoxPHFitter()
    try:
        cph_default.fit(train, duration_col=target_duration, event_col=target_event)
        
        # Evaluate
        cox_pred = cph_default.predict_partial_hazard(test)
        cox_c = concordance_index(test[target_duration], -cox_pred, test[target_event])
        print(f"   -> Cox C-Index: {cox_c:.4f}")
        
        print("\n   Key Risk Factors (Hazard Ratios):")
        print(cph_default.params_.sort_values(ascending=False).head(3))
        
    except Exception as e:
        print(f"   Cox Error: {e}")
        cox_c = 0

    # ---------------------------------------------------------
    # MODEL B: RANDOM SURVIVAL FOREST
    # ---------------------------------------------------------
    print("\n3. Training RSF (Non-Linear)...")
    X_train = train[features]
    y_train = Surv.from_dataframe(target_event, target_duration, train)
    
    X_test = test[features]
    y_test = Surv.from_dataframe(target_event, target_duration, test)
    
    rsf = RandomSurvivalForest(
        n_estimators=100, 
        min_samples_leaf=10, # Allow smaller leaves to catch rare defaults
        n_jobs=-1, 
        random_state=RANDOM_SEED
    )
    rsf.fit(X_train, y_train)
    
    rsf_c = rsf.score(X_test, y_test)
    print(f"   -> RSF C-Index: {rsf_c:.4f}")

    # ---------------------------------------------------------
    # COMPARISON
    # ---------------------------------------------------------
    print("\n" + "="*40)
    print("DEFAULT MODEL COMPARISON")
    print("="*40)
    print(f"Cox Model: {cox_c:.4f}")
    print(f"RSF Model: {rsf_c:.4f}")
    
    if rsf_c > cox_c:
        print("\nWinner: Random Survival Forest")
        print("Reason: Default risk often has sharp 'cliffs' (e.g., FICO < 660) that trees handle better.")
    else:
        print("\nWinner: Cox Model")



    # ... (After fitting cph and rsf) ...

    print("Saving models to disk...")

# 1. Save Cox Model
# Note: Lifelines objects pickle well, but ensure you use the same library version when loading.
    joblib.dump(cph_default, DATA_DIR + "cox_default_model.pkl")

# 2. Save Random Survival Forest
# Scikit-survival objects are scikit-learn compatible.
    joblib.dump(rsf, DATA_DIR + "rsf_default_model.pkl")

    print("Models saved successfully!")

if __name__ == "__main__":
    main()

1. Loading Default Data...
   Modeling Universe: 80000 loans
   Training: 60000, Test: 20000

2. Training Cox Model (Linear)...
   -> Cox C-Index: 0.7395

   Key Risk Factors (Hazard Ratios):
covariate
SATO             0.553880
HIGH_LTV_FLAG    0.392062
FICO_BUCKET      0.027650
Name: coef, dtype: float64

3. Training RSF (Non-Linear)...
   -> RSF C-Index: 0.7163

DEFAULT MODEL COMPARISON
Cox Model: 0.7395
RSF Model: 0.7163

Winner: Cox Model
Saving models to disk...
Models saved successfully!
