In [1]:
import pandas as pd
import numpy as np
import os
import glob

# --- CONFIGURATION ---
# Folder that holds the Freddie Mac extracts. The original workstation path is
# kept as the default; set the MBS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("MBS_DATA_DIR") or (
    "C:/Users/hansh/OneDrive/Desktop/OBITS Lab/MBS Simulate/data/Freddie data/"
)
# Paths in this notebook are built as `DATA_DIR + "<file name>"`, so the
# directory string must end with a path separator.
if not DATA_DIR.endswith(("/", "\\")):
    DATA_DIR += "/"

# Inputs: monthly performance history and loan-level origination sample.
PERF_FILE = DATA_DIR + "combined_performance_history_2017_2020.csv"
ORIG_FILE = DATA_DIR + "combined_sampled_mortgages_2017_2020.csv"
# Output: one row per loan with DURATION / EVENT appended.
OUTPUT_FILE = DATA_DIR + "mortgage_survival_dataset.csv"


In [2]:


# Column names applied to the pipe-delimited performance file, in file order
# (33 fields). Written one per line so the list is easy to diff against the
# file layout; `.split()` turns the block into a plain list of strings.
TIME_COLS = """
    LOAN_SEQUENCE_NUMBER
    MONTHLY_REPORTING_PERIOD
    CURRENT_ACTUAL_UPB
    CURRENT_LOAN_DELINQUENCY_STATUS
    LOAN_AGE
    REMAINING_MONTHS_TO_LEGAL_MATURITY
    REPURCHASE_FLAG
    MODIFICATION_FLAG
    ZERO_BALANCE_CODE
    ZERO_BALANCE_EFFECTIVE_DATE
    CURRENT_INTEREST_RATE
    CURRENT_DEFERRED_UPB
    DUE_DATE_OF_LAST_PAID_INSTALLMENT
    MI_RECOVERIES
    NET_SALES_PROCEEDS
    NON_MI_RECOVERIES
    EXPENSES
    LEGAL_COSTS
    MAINTENANCE_AND_PRESERVATION_COSTS
    TAXES_AND_INSURANCE
    MISC_EXPENSES
    ACTUAL_LOSS_CALCULATION
    MODIFICATION_COST
    STEP_MODIFICATION_FLAG
    DEFERRED_PAYMENT_PLAN
    ESTIMATED_LOAN_TO_VALUE
    ZERO_BALANCE_REMOVAL_UPB
    DELINQUENT_ACCRUED_INTEREST
    DELINQUENCY_DUE_TO_DISASTER
    BORROWER_ASSISTANCE_STATUS_CODE
    CURRENT_MONTH_MODIFICATION_COST
    INTEREST_BEARING_UPB
    SOURCE_QUARTER
""".split()

def get_event_status(group):
    """
    Classify one loan's performance history into a survival outcome.

    Parameters
    ----------
    group : pandas.DataFrame
        All performance rows of a single loan. Only the columns
        "LOAN_AGE", "ZERO_BALANCE_CODE" and
        "CURRENT_LOAN_DELINQUENCY_STATUS" are read. Must contain at least
        one row.

    Returns
    -------
    tuple
        (Duration, Event_Type) where Duration is a LOAN_AGE value and
        Event_Type is:
          0: Censored (no terminal code, never 6+ months delinquent)
          1: Prepaid  (zero balance code 1 on the last row)
          2: Default  (zero balance code 3, 6 or 9 on the last row, or
                       delinquency status >= 6 anywhere in the history)

    Notes
    -----
    The zero balance code of the last row is checked before the delinquency
    history, so a loan that reached 6+ months delinquent and later closed
    with code 1 is reported as prepaid at its final age.
    """
    # 1. Chronological order. A stable sort keeps the input order of rows that
    #    share a LOAN_AGE, so the choice of "last row" is deterministic.
    group = group.sort_values("LOAN_AGE", kind="stable")
    last_row = group.iloc[-1]

    # 2. Terminal state from the zero balance code. The column may hold
    #    strings ('01'), floats (1.0) or missing values; anything that cannot
    #    be parsed is treated as "no terminal code".
    zbc = last_row["ZERO_BALANCE_CODE"]
    try:
        zbc_float = float(zbc)
    except (ValueError, TypeError):
        zbc_float = 0.0

    # Prepay (code 1)
    if zbc_float == 1.0:
        return last_row["LOAN_AGE"], 1

    # Default (codes 3, 6, 9)
    if zbc_float in (3.0, 6.0, 9.0):
        return last_row["LOAN_AGE"], 2

    # 3. Technical default: status >= 6 at any point, even if the loan cures
    #    afterwards. Non-numeric statuses are coerced to NaN and counted as 0.
    delinq_vals = pd.to_numeric(
        group["CURRENT_LOAN_DELINQUENCY_STATUS"], errors='coerce'
    ).fillna(0)
    hit_d180 = (delinq_vals >= 6).to_numpy()

    if hit_d180.any():
        # Duration is the FIRST time the loan hit status 6. The lookup is
        # positional: a label-based `.loc` would return several rows instead
        # of one scalar whenever the frame's index labels are not unique.
        first_pos = int(hit_d180.argmax())
        duration = group["LOAN_AGE"].iloc[first_pos]
        return duration, 2

    # 4. Censored (still active at the last observed age)
    return last_row["LOAN_AGE"], 0

def main():
    """
    Build the loan-level survival dataset and write it to OUTPUT_FILE.

    Steps:
      1. Read the origination file (ORIG_FILE, pipe-delimited).
      2. Read the performance file (PERF_FILE, pipe-delimited), replacing its
         header row with TIME_COLS, and keep only loans that appear in the
         origination file.
      3. Reduce each loan's history to (DURATION, EVENT) with
         get_event_status.
      4. Inner-join those outcomes onto the origination rows and save as CSV.

    Returns nothing; progress is reported with print().
    """
    print("1. Loading Origination Data...")
    df_orig = pd.read_csv(ORIG_FILE, sep='|', low_memory=False)

    print("2. Processing Performance Data (Grouping by Loan)...")
    # The whole combined file is read at once; switch to chunked reading if it
    # does not fit in memory.
    df_perf = pd.read_csv(PERF_FILE, sep='|', names=TIME_COLS, header=0, low_memory=False)

    # Only process loans we have origination covariates for.
    target_ids = set(df_orig['LOAN_SEQUENCE_NUMBER'])
    df_perf = df_perf[df_perf['LOAN_SEQUENCE_NUMBER'].isin(target_ids)]

    print("   Calculating Events...")
    # Hand get_event_status only the columns it reads. Selecting columns keeps
    # the grouping key out of each group, which avoids the pandas deprecation
    # warning about apply() operating on grouping columns, and it works on
    # older pandas versions as well.
    event_inputs = ["LOAN_AGE", "ZERO_BALANCE_CODE", "CURRENT_LOAN_DELINQUENCY_STATUS"]
    events = df_perf.groupby("LOAN_SEQUENCE_NUMBER")[event_inputs].apply(get_event_status)

    # `events` is a Series of (duration, event) tuples indexed by loan id.
    df_events = pd.DataFrame(events.tolist(), index=events.index, columns=['DURATION', 'EVENT'])

    print("3. Merging Covariates...")
    # Default inner join: loans without any performance rows are dropped.
    final_df = df_orig.merge(df_events, left_on='LOAN_SEQUENCE_NUMBER', right_index=True)

    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Done! Saved {len(final_df)} loans to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

1. Loading Origination Data...
2. Processing Performance Data (Grouping by Loan)...
   Calculating Events...


  events = df_perf.groupby("LOAN_SEQUENCE_NUMBER").apply(get_event_status)


3. Merging Covariates...
Done! Saved 80000 loans to C:/Users/hansh/OneDrive/Desktop/OBITS Lab/MBS Simulate/data/Freddie data/mortgage_survival_dataset.csv


In [19]:
import pandas as pd
import numpy as np

# 1. Market rate lookup keyed by YYYYMM (e.g. 201701 -> 4.15).
# Each tuple holds the twelve monthly values of one year, January first; the
# comprehension expands them into the flat {YYYYMM: rate} mapping.
MARKET_RATES = {
    year * 100 + month: rate
    for year, monthly_rates in {
        2017: (4.15, 4.17, 4.20, 4.05, 4.01, 3.90, 3.97, 3.88, 3.81, 3.90, 3.92, 3.95),
        2018: (4.03, 4.33, 4.44, 4.47, 4.59, 4.57, 4.53, 4.55, 4.63, 4.83, 4.87, 4.64),
        2019: (4.46, 4.37, 4.27, 4.14, 4.07, 3.80, 3.77, 3.62, 3.61, 3.69, 3.70, 3.72),
        2020: (3.62, 3.47, 3.45, 3.31, 3.23, 3.16, 3.02, 2.94, 2.89, 2.83, 2.77, 2.68),
        2021: (2.74, 2.81, 3.08, 3.06, 2.96, 2.98, 2.87, 2.84, 2.90, 3.07, 3.07, 3.10),
        2022: (3.45, 3.76, 4.17, 4.98, 5.23, 5.52, 5.41, 5.22, 6.11, 6.90, 6.76, 6.35),
        2023: (6.25, 6.30, 6.54, 6.34, 6.43, 6.71, 6.84, 7.07, 7.20, 7.62, 7.44, 6.82),
        2024: (6.64, 6.78, 6.82, 6.99, 7.06, 6.92, 6.82, 6.50, 6.18, 6.43, 6.81, 6.72),
    }.items()
    for month, rate in enumerate(monthly_rates, start=1)
}

# 2. Load Data
# Read the loan-level survival dataset from DATA_DIR. low_memory=False makes
# pandas infer each column's dtype from the whole file instead of per chunk,
# matching the other read_csv calls in this notebook.
# NOTE(review): presumably this is what removes the warning this cell printed
# for the read_csv line — confirm on the next run.
df = pd.read_csv(DATA_DIR + "mortgage_survival_dataset.csv", low_memory=False)

# 3. Feature Engineering Function
def get_incentive_metrics(row, market_rates=None, default_rate=4.0, max_months=360):
    """
    Compute the refinancing-incentive features for one loan.

    For each month offset i in range(DURATION) the incentive is
    ``ORIGINAL_INTEREST_RATE - market rate``, where the market rate is looked
    up for the calendar month ``FIRST_PAYMENT_DATE + (i - 1) months`` (offset
    0 is the month before the first payment date).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'FIRST_PAYMENT_DATE' (YYYYMM), 'DURATION' (months) and
        'ORIGINAL_INTEREST_RATE'.
    market_rates : dict, optional
        {YYYYMM: rate} lookup. Defaults to the module-level MARKET_RATES.
    default_rate : float, optional
        Rate used for months missing from ``market_rates`` (default 4.0).
    max_months : int, optional
        Upper bound on the number of months walked (default 360), a guard
        against implausible DURATION values.

    Returns
    -------
    pandas.Series
        [current_incentive, cumulative_incentive]: the incentive of the last
        month walked, and the sum of all positive monthly incentives.
        [0.0, 0.0] is returned when the row's fields are missing or cannot be
        converted, or when DURATION is not positive.

    Notes
    -----
    NOTE(review): both outputs depend on DURATION, which the modelling cell
    also uses as the survival time — confirm this is intended and does not
    leak the target into the features.
    """
    if market_rates is None:
        market_rates = MARKET_RATES

    try:
        start_date = int(row['FIRST_PAYMENT_DATE'])
        duration = int(row['DURATION'])
        note_rate = row['ORIGINAL_INTEREST_RATE']

        start_year, start_month = divmod(start_date, 100)

        cumulative_incentive = 0.0
        current_incentive = 0.0

        # Limit the loop so bad data cannot make it run for a very long time.
        duration = min(duration, max_months)

        for i in range(duration):
            # Zero-based month count since January of start_year. The "- 2"
            # combines the 1-based month number with the one-month lead
            # described in the docstring.
            months_from_january = start_month + i - 2
            curr_year = start_year + months_from_january // 12
            curr_month = months_from_january % 12 + 1

            yyyymm = (curr_year * 100) + curr_month
            market_rate = market_rates.get(yyyymm, default_rate)

            incentive = note_rate - market_rate

            # Only months in which refinancing would have paid off accumulate.
            if incentive > 0:
                cumulative_incentive += incentive

            if i == duration - 1:
                current_incentive = incentive

        return pd.Series([current_incentive, cumulative_incentive])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Best effort for unusable rows (missing column, NaN/None/non-numeric
        # values). Anything else — including KeyboardInterrupt — propagates
        # instead of being silently turned into zeros.
        return pd.Series([0.0, 0.0])

# 4. Run the row-wise feature function; the two columns of its result become
#    RATE_INCENTIVE and BURNOUT_PROXY (in that order).
print("Processing data...")
metrics = df.apply(get_incentive_metrics, axis=1)
df[['RATE_INCENTIVE', 'BURNOUT_PROXY']] = metrics

# 5. Write the enriched frame next to the input data for the modelling cell.
output_filename = f"{DATA_DIR}processed_mortgage_data_for_modeling.csv"
df.to_csv(output_filename, index=False)

print(f"Processed data saved to {output_filename}")
# Quick look at the outcome and the two new features.
print(
    df.loc[
        :,
        ['LOAN_SEQUENCE_NUMBER', 'DURATION', 'EVENT', 'RATE_INCENTIVE', 'BURNOUT_PROXY'],
    ].head()
)

  df = pd.read_csv(DATA_DIR + "mortgage_survival_dataset.csv")


Processing data...
Processed data saved to C:/Users/hansh/OneDrive/Desktop/OBITS Lab/MBS Simulate/data/Freddie data/processed_mortgage_data_for_modeling.csv
  LOAN_SEQUENCE_NUMBER  DURATION  EVENT  RATE_INCENTIVE  BURNOUT_PROXY
0         F17Q10141137        98      0           1.250         100.25
1         F17Q10000777        51      1           1.190          26.28
2         F17Q10240479        98      0           0.000          26.70
3         F17Q10166939        60      1           0.490          37.70
4         F17Q10287022        35      1           0.905          11.40


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# You need to install these: pip install lifelines scikit-survival
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored, integrated_brier_score
from sksurv.util import Surv

# --- CONFIGURATION ---
# Input for the model comparison: the CSV written by the feature-engineering cell.
DATA_FILE = f"{DATA_DIR}processed_mortgage_data_for_modeling.csv"
# Fraction of loans held out for testing, and the seed shared by the split and the forest.
TEST_SIZE = 1 / 4
RANDOM_SEED = 42

def main():
    """
    Train a Cox PH model and a Random Survival Forest on prepayment and compare them.

    Reads DATA_FILE, builds a binary IS_PREPAID target (EVENT == 1; defaults
    and active loans are both treated as censored), drops rows with missing
    values, splits train/test with TEST_SIZE and RANDOM_SEED, fits both
    models, and prints the test C-index of each plus the forest's integrated
    Brier score.

    Returns
    -------
    tuple
        ``(cph, rsf)``: the CoxPHFitter and the fitted RandomSurvivalForest,
        so that later cells can persist or reuse them. If the Cox fit raised,
        the error is printed, its C-index is reported as 0 and ``cph`` is
        returned unfitted.

    Notes
    -----
    NOTE(review): RATE_INCENTIVE and BURNOUT_PROXY are computed from DURATION
    in the feature-engineering cell, and DURATION is the survival time here —
    confirm the reported C-indices are not inflated by that dependency.
    """
    print("1. Loading Processed Data...")
    # low_memory=False: infer dtypes from the whole file, as the other
    # read_csv calls in this notebook do.
    df = pd.read_csv(DATA_FILE, low_memory=False)

    # Define Predictors and Target
    features = ['RATE_INCENTIVE', 'BURNOUT_PROXY', 'CREDIT_SCORE', 'ORIGINAL_LTV',
                'ORIGINAL_DEBT_TO_INCOME_RATIO', 'ORIGINAL_INTEREST_RATE']

    target_duration = 'DURATION'
    target_event = 'EVENT' # 0=Censored, 1=Prepay, 2=Default

    # Prepayment risk only: defaults (2) are treated as censored (0).
    print("   Setting up Prepayment Target...")
    df['IS_PREPAID'] = (df[target_event] == 1).astype(int)

    # Drop rows with missing values to prevent model errors
    df_clean = df[features + [target_duration, 'IS_PREPAID']].dropna()
    print(f"   Modeling Universe: {len(df_clean)} loans")

    # 2. Split Data (Train / Test)
    print(f"\n2. Splitting Data ({int(TEST_SIZE*100)}% Hold-out)...")
    train, test = train_test_split(df_clean, test_size=TEST_SIZE, random_state=RANDOM_SEED)

    # ---------------------------------------------------------
    # MODEL A: COX PROPORTIONAL HAZARD
    # ---------------------------------------------------------
    print("\n3. Training Cox Proportional Hazard Model...")
    cph = CoxPHFitter()
    try:
        cph.fit(train, duration_col=target_duration, event_col='IS_PREPAID')

        # A higher partial hazard means earlier expected prepayment, so the
        # score is negated to line up with "longer duration ranks higher".
        cox_pred = cph.predict_partial_hazard(test)
        cox_c_index = concordance_index(test[target_duration], -cox_pred, test['IS_PREPAID'])
        print(f"   -> Cox Model C-Index (Test): {cox_c_index:.4f}")

        # Print Coefficients to verify Incentive/Burnout logic
        print("\n   Cox Coefficients:")
        print(cph.params_)

    except Exception as e:
        # Best effort: report the failure and continue with the forest.
        print(f"   Cox Error: {e}")
        cox_c_index = 0

    # ---------------------------------------------------------
    # MODEL B: RANDOM SURVIVAL FOREST (ML)
    # ---------------------------------------------------------
    print("\n4. Training Random Survival Forest (RSF)...")

    # Format data for Scikit-Survival
    X_train = train[features]
    X_test = test[features]

    # Structured target arrays (boolean event, time)
    y_train_surv = Surv.from_dataframe("IS_PREPAID", target_duration, train)
    y_test_surv = Surv.from_dataframe("IS_PREPAID", target_duration, test)

    rsf = RandomSurvivalForest(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=15,
        n_jobs=-1,
        random_state=RANDOM_SEED
    )
    rsf.fit(X_train, y_train_surv)

    # Evaluate C-Index
    rsf_c_index = rsf.score(X_test, y_test_surv)
    print(f"   -> RSF Model C-Index (Test): {rsf_c_index:.4f}")

    # Integrated Brier Score (IBS), evaluated on ten time points taken at the
    # 10%..90% quantiles of the prepayment durations observed in the test set.
    try:
        times = np.quantile(test[target_duration][test['IS_PREPAID']==1], np.linspace(0.1, 0.9, 10))
        surv_probs = rsf.predict_survival_function(X_test)

        # One row of survival probabilities per test loan. np.vstack is the
        # supported spelling of the deprecated np.row_stack alias.
        preds = np.vstack([fn(times) for fn in surv_probs])
        ibs_score = integrated_brier_score(y_train_surv, y_test_surv, preds, times)
        print(f"   -> RSF Integrated Brier Score: {ibs_score:.4f} (Lower is better)")
    except Exception as e:
        print(f"   (IBS Calculation skipped: {e})")

    # ---------------------------------------------------------
    # COMPARISON
    # ---------------------------------------------------------
    print("\n" + "="*40)
    print("FINAL MODEL COMPARISON")
    print("="*40)
    print(f"{'Metric':<20} | {'Cox Model':<10} | {'RSF (ML)':<10}")
    print("-" * 40)
    print(f"{'C-Index (Ranking)':<20} | {cox_c_index:.4f}     | {rsf_c_index:.4f}")
    print("-" * 40)

    if rsf_c_index > cox_c_index:
        print("\nWinner: Random Survival Forest")
        print("Reasoning: The Prepayment function is likely non-linear (S-Curve).")
    else:
        print("\nWinner: Cox Proportional Hazard")
        print("Reasoning: The relationship is linear and Incentive-driven.")

    return cph, rsf

if __name__ == "__main__":
    # Keep the models at notebook scope: the following cell pickles `cph` and
    # `rsf`, which would otherwise exist only inside main().
    cph, rsf = main()

1. Loading Processed Data...


  df = pd.read_csv(DATA_FILE)


   Setting up Prepayment Target...
   Modeling Universe: 80000 loans

2. Splitting Data (25% Hold-out)...

3. Training Cox Proportional Hazard Model...
   -> Cox Model C-Index (Test): 0.8015

   Cox Coefficients:
covariate
RATE_INCENTIVE                   0.151122
BURNOUT_PROXY                   -0.090219
CREDIT_SCORE                     0.000161
ORIGINAL_LTV                    -0.000931
ORIGINAL_DEBT_TO_INCOME_RATIO   -0.000444
ORIGINAL_INTEREST_RATE           2.304980
Name: coef, dtype: float64

4. Training Random Survival Forest (RSF)...
   -> RSF Model C-Index (Test): 0.9064


  preds = np.row_stack([fn(times) for fn in surv_probs])


   -> RSF Integrated Brier Score: 0.0513 (Lower is better)

FINAL MODEL COMPARISON
Metric               | Cox Model  | RSF (ML)  
----------------------------------------
C-Index (Ranking)    | 0.8015     | 0.9064
----------------------------------------

Winner: Random Survival Forest
Reasoning: The Prepayment function is likely non-linear (S-Curve).


In [21]:
import joblib

# Runs after both models have been fitted: expects `cph` and `rsf` at notebook scope.

print("Saving models to disk...")

# Cox model -> DATA_DIR. Reload with the same lifelines version that wrote it.
joblib.dump(value=cph, filename=DATA_DIR + "cox_prepayment_model.pkl")

# Random survival forest -> DATA_DIR (scikit-learn style estimator).
joblib.dump(value=rsf, filename=DATA_DIR + "rsf_prepayment_model.pkl")

print("Models saved successfully!")

Saving models to disk...
Models saved successfully!


In [None]:
# Selection of MBS Pool Prime Candidates
import pandas as pd
import numpy as np

# 1. Read the pipe-delimited origination sample
file_path = DATA_DIR + "combined_sampled_mortgages_2017_2020.csv"
df = pd.read_csv(file_path, sep='|', low_memory=False)

# 2. Cleaning: force the underwriting fields to numeric (unparseable -> NaN)
cols_to_clean = ['CREDIT_SCORE', 'ORIGINAL_UPB', 'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO', 'ORIGINAL_INTEREST_RATE']
for col in cols_to_clean:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Values outside the accepted range become NaN: credit score must lie in
# [300, 850] and DTI in [0, 100] (this also removes the 9999 / 999
# "missing" codes mentioned for this dataset).
df['CREDIT_SCORE'] = df['CREDIT_SCORE'].where(df['CREDIT_SCORE'].between(300, 850))
df['ORIGINAL_DEBT_TO_INCOME_RATIO'] = df['ORIGINAL_DEBT_TO_INCOME_RATIO'].where(
    df['ORIGINAL_DEBT_TO_INCOME_RATIO'].between(0, 100)
)

# Loans missing any field needed for underwriting are excluded
df_clean = df.dropna(subset=['CREDIT_SCORE', 'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO', 'ORIGINAL_UPB'])

# 3. "Prime MBS" eligibility screen (deliberately stricter than standard Agency):
#    credit score >= 720, LTV <= 80, DTI <= 43, principal residence,
#    single-family or PUD property.
prime_mask = (
    df_clean['CREDIT_SCORE'].ge(720)
    & df_clean['ORIGINAL_LTV'].le(80)
    & df_clean['ORIGINAL_DEBT_TO_INCOME_RATIO'].le(43)
    & df_clean['OCCUPANCY_STATUS'].eq('P')
    & df_clean['PROPERTY_TYPE'].isin(['SF', 'PU'])
)

eligible_pool = df_clean.loc[prime_mask]

print(f"Original Pool Size: {len(df)}")
print(f"Cleaned Pool Size: {len(df_clean)}")
print(f"Eligible 'Prime' Candidates: {len(eligible_pool)}")

# 4. Selection: draw the target number of loans from the eligible pool.
#    A plain random sample is used, which broadly preserves the state mix of
#    the eligible universe; no per-state cap is applied here.
target_size = 6000

if len(eligible_pool) >= target_size:
    # State shares of the eligible pool, kept for inspection.
    state_counts = eligible_pool['PROPERTY_STATE'].value_counts(normalize=True)

    # Seeded so the same pool is drawn on every run.
    selected_pool = eligible_pool.sample(n=target_size, random_state=42)
else:
    print("Warning: Not enough loans met the strict 'Prime' criteria. Relaxing criteria...")
    # No criteria are actually relaxed: the fallback keeps the eligible loans,
    # ordered by credit score (at most target_size of them).
    selected_pool = eligible_pool.sort_values('CREDIT_SCORE', ascending=False).head(target_size)

# 5. Analysis of the Selected Pool vs Original
def get_stats(d, name):
    """
    Summarise a loan pool as a named Series of portfolio statistics.

    Parameters
    ----------
    d : pandas.DataFrame
        Loans with columns ORIGINAL_UPB, CREDIT_SCORE, ORIGINAL_LTV,
        ORIGINAL_DEBT_TO_INCOME_RATIO, ORIGINAL_INTEREST_RATE and
        PROPERTY_STATE.
    name : str
        Name given to the returned Series (used as the column label when
        several summaries are concatenated).

    Returns
    -------
    pandas.Series
        Count, Total UPB ($M), UPB-weighted averages of credit score, LTV,
        DTI and note rate, and the most frequent state with its share in
        percent. A weighted average skips loans whose value or UPB is
        missing, and is NaN when no usable loan remains; the top-state
        entries are missing for an empty pool.
    """
    weights = d['ORIGINAL_UPB'].to_numpy(dtype=float)

    def _weighted_mean(column):
        # UPB-weighted mean of `column` over rows where both value and weight exist.
        values = d[column].to_numpy(dtype=float)
        usable = ~(np.isnan(values) | np.isnan(weights))
        if not usable.any() or weights[usable].sum() == 0:
            return np.nan
        return np.average(values[usable], weights=weights[usable])

    # Computed once; sorted by share, so the first entry is the largest state.
    state_share = d['PROPERTY_STATE'].value_counts(normalize=True)
    if len(state_share) > 0:
        top_state = state_share.index[0]
        top_state_pct = state_share.iloc[0] * 100
    else:
        top_state = None
        top_state_pct = np.nan

    return pd.Series({
        'Count': len(d),
        'Total UPB ($M)': d['ORIGINAL_UPB'].sum() / 1e6,
        'WA FICO': _weighted_mean('CREDIT_SCORE'),
        'WA LTV': _weighted_mean('ORIGINAL_LTV'),
        'WA DTI': _weighted_mean('ORIGINAL_DEBT_TO_INCOME_RATIO'),
        'WA Rate': _weighted_mean('ORIGINAL_INTEREST_RATE'),
        'Top State %': top_state_pct,
        'Top State': top_state
    }, name=name)

# Summaries for the cleaned universe and for the selected pool, side by side
stats_orig = get_stats(df_clean, "Original (Cleaned)")
stats_pool = get_stats(selected_pool, "Selected MBS Pool")

comparison = pd.concat([stats_orig, stats_pool], axis="columns")

# Geographic diversification check: share (in %) of the five largest states
top_states_pool = (
    selected_pool['PROPERTY_STATE']
    .value_counts(normalize=True)
    .head(5)
    * 100
)

print("\n--- Comparative Statistics ---")
print(comparison)
print("\n--- Geographic Concentration (Selected Pool) ---")
print(top_states_pool)

# 6. Persist the selected pool (pipe-delimited, written to the working directory)
selected_pool.to_csv("selected_6000_prime_mbs_pool.csv", sep='|', index=False)
print("\nSaved to 'selected_6000_prime_mbs_pool.csv'")

In [None]:
import pandas as pd
import joblib
import numpy as np

# --- CONFIGURATION ---
# Must be the file written by the pool-selection cell, which saves
# "selected_6000_prime_mbs_pool.csv" (pipe-delimited) in the working directory.
NEW_DATA_FILE = "selected_6000_prime_mbs_pool.csv"
MARKET_RATE_NOW = 6.72  # Current market rate (Dec 2024) for incentive calc
OUTPUT_SCORES = "scored_mbs_pool.csv"

# 1. Load Models
print("1. Loading Models...")
# The model-saving cell wrote both pickles under DATA_DIR, so load them from
# the same place. joblib.load unpickles arbitrary Python objects: only load
# files produced by this notebook.
cph = joblib.load(DATA_DIR + "cox_prepayment_model.pkl")
rsf = joblib.load(DATA_DIR + "rsf_prepayment_model.pkl")

# 2. Load New Data
print("2. Loading New Portfolio...")
df = pd.read_csv(NEW_DATA_FILE, sep='|', low_memory=False)

# 3. Feature Engineering (Generate Predictors)
print("3. Generating Predictors...")

# A. Rate Incentive ("spot" calculation): note rate minus today's market rate.
df['RATE_INCENTIVE'] = df['ORIGINAL_INTEREST_RATE'] - MARKET_RATE_NOW

# B. Burnout Proxy
# Set to 0 for every loan because the payment history needed to accumulate it
# is not part of this file.
# NOTE(review): the models were trained with a history-based BURNOUT_PROXY;
# scoring seasoned loans with 0 here may bias the scores — confirm.
df['BURNOUT_PROXY'] = 0.0

# Columns must match the training features exactly (names and order).
features = ['RATE_INCENTIVE', 'BURNOUT_PROXY', 'CREDIT_SCORE',
            'ORIGINAL_LTV', 'ORIGINAL_DEBT_TO_INCOME_RATIO', 'ORIGINAL_INTEREST_RATE']

# Missing predictor values are replaced by 0 so both models can score every row.
# NOTE(review): 0 is far outside the range of most of these features;
# imputing with training means would be safer if they become available.
df_score = df[features].fillna(0)

# 4. Scoring (Prediction)
print("4. Scoring Loans...")

# --- SCORE A: COX HAZARD (Relative Risk) ---
# predict_partial_hazard already returns the exponentiated score, i.e. the
# hazard multiplier relative to the model's baseline; 1.5 means a 50% higher
# prepayment hazard than the baseline.
df['COX_RISK_SCORE'] = cph.predict_partial_hazard(df_score)

# --- SCORE B: RSF SURVIVAL PROBABILITY (12-Month) ---
# Probability that the loan has NOT prepaid by month 12, read off each loan's
# predicted survival step function.
surv_funcs = rsf.predict_survival_function(df_score)

prob_survival_12m = []
for fn in surv_funcs:
    try:
        p = float(fn(12))
    except ValueError:
        # Month 12 lies outside the time range of the step function. Record
        # the value as missing: a fallback of 0.0 would read as "certain to
        # prepay within 12 months".
        p = np.nan
    prob_survival_12m.append(p)

df['RSF_SURV_PROB_12M'] = prob_survival_12m

# 5. Ranking & Strategy
# Rank 1 = highest Cox risk score (fastest expected prepayment). The RSF
# probability is reported alongside but is not part of the rank.
df['PREPAY_RANK'] = df['COX_RISK_SCORE'].rank(ascending=False)

# 6. Save
print(f"5. Saving Scores to {OUTPUT_SCORES}...")
output_cols = ['LOAN_SEQUENCE_NUMBER', 'ORIGINAL_INTEREST_RATE', 'RATE_INCENTIVE',
               'COX_RISK_SCORE', 'RSF_SURV_PROB_12M', 'PREPAY_RANK']
df[output_cols].to_csv(OUTPUT_SCORES, index=False)

print("\n--- TOP 5 FASTEST PREPAY CANDIDATES ---")
print(df[output_cols].sort_values(by='COX_RISK_SCORE', ascending=False).head())