In [None]:
# ======================================= 
# Loan Approval Prediction – Pitch Assets
# EDA, preprocessing, and baseline models
# =======================================

from google.colab import drive
drive.mount('/content/drive')

import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# File paths
TRAIN_PATH = "/content/drive/My Drive/Data Corsair/playground_series_s4e10/train.csv"
TEST_PATH  = "/content/drive/My Drive/Data Corsair/playground_series_s4e10/test.csv"

# Directory for plots
FIG_DIR = "/content/drive/My Drive/Data Corsair/loan_pitch_figures"
os.makedirs(FIG_DIR, exist_ok=True)

def save_fig(path):
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

# Load data
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

# Class balance
plt.figure(figsize=(6,4))
train["loan_status"].value_counts().sort_index().plot(kind="bar")
plt.title("Class Balance: loan_status")
plt.xlabel("loan_status")
plt.ylabel("Count")
save_fig(os.path.join(FIG_DIR, "class_balance_loan_status.png"))

# Missing values per column
plt.figure(figsize=(8,4))
train.isna().sum().sort_values(ascending=False).plot(kind="bar")
plt.title("Missing Values per Column")
plt.ylabel("Count")
save_fig(os.path.join(FIG_DIR, "missingness_per_column.png"))

# Numeric feature distributions (clipped to 1–99 percentiles to reduce extreme outliers)
numeric_cols = [c for c in train.select_dtypes(include=np.number).columns if c not in ["id","loan_status"]]
for col in numeric_cols:
    plt.figure(figsize=(6,4))
    vals = train[col].dropna()
    low, high = np.percentile(vals,[1,99])
    vals = vals.clip(low,high)
    plt.hist(vals, bins=30)
    plt.title(f"Distribution: {col}")
    save_fig(os.path.join(FIG_DIR, f"hist_{col}.png"))

# Categorical feature distributions
categorical_cols = ["person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file"]
for col in categorical_cols:
    if col in train.columns:
        plt.figure(figsize=(7,4))
        train[col].value_counts().plot(kind="bar")
        plt.title(f"Distribution: {col}")
        save_fig(os.path.join(FIG_DIR, f"bar_{col}.png"))

# Correlation heatmap for numeric features
corr = train[numeric_cols+["loan_status"]].corr()
plt.figure(figsize=(7,6))
plt.imshow(corr, cmap="coolwarm", interpolation="nearest")
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.index)), corr.index)
plt.colorbar()
plt.title("Correlation Heatmap (Numeric Features)")
save_fig(os.path.join(FIG_DIR, "correlation_heatmap_numeric.png"))

# Simple target–feature relationships
if "loan_percent_income" in train.columns:
    plt.figure(figsize=(6,4))
    train.boxplot(column="loan_percent_income", by="loan_status")
    plt.suptitle("")
    plt.title("loan_percent_income by loan_status")
    save_fig(os.path.join(FIG_DIR,"box_loan_percent_income_by_status.png"))

if "loan_grade" in train.columns:
    train.groupby("loan_grade")["loan_status"].mean().plot(kind="bar")
    plt.title("Approval Rate by Loan Grade")
    save_fig(os.path.join(FIG_DIR,"approval_rate_by_loan_grade.png"))

if "loan_intent" in train.columns:
    train.groupby("loan_intent")["loan_status"].mean().plot(kind="bar")
    plt.title("Approval Rate by Loan Intent")
    save_fig(os.path.join(FIG_DIR,"approval_rate_by_loan_intent.png"))

# Baseline preprocessing and logistic regression check
X = train.drop(columns=["loan_status"])
y = train["loan_status"]

num_features = [c for c in numeric_cols if c in X.columns]
cat_features = [c for c in categorical_cols if c in X.columns]

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

pipe = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=500))
])

X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)
pipe.fit(X_train,y_train)
probs = pipe.predict_proba(X_valid)[:,1]
auc = roc_auc_score(y_valid, probs)

summary = {
    "train_shape": train.shape,
    "test_shape": test.shape,
    "numeric_features": num_features,
    "categorical_features": cat_features,
    "baseline_auc": auc
}
with open(os.path.join(FIG_DIR,"readiness_summary.json"),"w") as f:
    json.dump(summary,f,indent=2)

print("Baseline AUC (LogReg):", auc)
print("Figures and summaries saved in", FIG_DIR)

# =======================================
# PROJECT CHECKPOINT 2 EXTENSION (V3)
# (Adds neural net, SVM, tuning, and ensembles)
# =======================================

print("\n=== Running Checkpoint 2 (v3) ===")

# Extra imports for Checkpoint 2
import time
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    roc_curve
)

# Additional models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not installed. Please run: pip install xgboost")
    XGBClassifier = None

# Directory for Checkpoint 2 plots and tables
FIG_DIR_CP2 = "/content/drive/My Drive/Data Corsair/loan_checkpoint2_figures"
os.makedirs(FIG_DIR_CP2, exist_ok=True)
print(f"Checkpoint 2 artifacts will be saved in: {FIG_DIR_CP2}")

# =======================================
# 1. Enhanced preprocessing with imputation
# =======================================

print("\n1. Building Enhanced Preprocessing Pipeline...")

# Numeric: impute median, then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: impute mode, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combined preprocessor used in all later models
preprocess_cp2 = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],
    remainder='passthrough'
)

# =======================================
# 2. Define the set of models to compare
# =======================================

print("\n2. Defining 9 models for comparison...")

# Class-weight scaling for XGBoost (based on CP1 y_train)
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    'k-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
    'Support Vector Machine (SVM)': SVC(
        probability=True, class_weight='balanced', random_state=42
    ),
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(
        class_weight='balanced', random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        class_weight='balanced', random_state=42, n_jobs=-1
    ),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Neural Network (MLP)': MLPClassifier(
        random_state=42,
        max_iter=500,
        early_stopping=True,
        hidden_layer_sizes=(100, 50)
    )
}

if XGBClassifier:
    models['XGBoost'] = XGBClassifier(
        scale_pos_weight=scale_pos_weight, random_state=42, n_jobs=-1, eval_metric='logloss'
    )
else:
    print("Skipping XGBoost model as it is not installed.")

print(f"Defined {len(models)} models.")

# =======================================
# 3. Baseline comparison for all models
# =======================================

print("\n3. Running baseline model comparison (this may take a minute)...")

baseline_results = []
roc_curves = {}

for name, model in models.items():
    start_time = time.time()

    # Full pipeline: shared preprocessing + model
    pipe = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', model)
    ])

    # Fit
    pipe.fit(X_train, y_train)

    # Predictions and probabilities
    preds = pipe.predict(X_valid)
    probs = pipe.predict_proba(X_valid)[:, 1]

    # Metrics
    auc = roc_auc_score(y_valid, probs)
    f1 = f1_score(y_valid, preds)
    precision = precision_score(y_valid, preds)
    recall = recall_score(y_valid, preds)
    accuracy = accuracy_score(y_valid, preds)

    # ROC curve data
    fpr, tpr, _ = roc_curve(y_valid, probs)
    roc_curves[name] = {'fpr': fpr, 'tpr': tpr, 'auc': auc}

    # Store metrics for table
    baseline_results.append({
        "Model": name,
        "AUC": auc,
        "F1": f1,
        "Precision": precision,
        "Recall": recall,
        "Accuracy": accuracy
    })

    end_time = time.time()
    print(f"  ... {name} complete. AUC: {auc:.4f} (Time: {end_time - start_time:.2f}s)")

# Table of baseline results
baseline_results_df = pd.DataFrame(baseline_results).sort_values(
    by="AUC", ascending=False
)

# Save baseline results
baseline_results_df.to_csv(
    os.path.join(FIG_DIR_CP2, "baseline_model_comparison.csv"), index=False
)

print("\n--- Baseline Model Comparison (Sorted by AUC) ---")
print(baseline_results_df.to_markdown(index=False, floatfmt=".4f"))
print("\nANALYSIS: XGBoost and RF are top performers.")
print("The default MLP is strong but likely needs SMOTE and tuning.")

# =======================================
# 4. Hyperparameter tuning and soft-voting ensemble
# =======================================

print("\n4. Proposing improvements: Hyperparameter tuning for top 2 models...")

# Cross-validation setup
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# We tune RF and XGB and then build an ensemble using them
tuned_estimators_for_ensemble = {}   # Estimator definitions for the ensemble
tuned_pipelines_for_eval = []        # Fitted pipelines for evaluation

# 4a. Random Forest tuning
if 'Random Forest' in models:
    print("\n--- Tuning Random Forest (this may take a few minutes) ---")
    pipe_rf = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1))
    ])

    param_grid_rf = {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [10, 20],
        'clf__min_samples_leaf': [2, 4]
    }

    grid_rf = GridSearchCV(
        pipe_rf, param_grid_rf, cv=cv, scoring='roc_auc', verbose=1, n_jobs=-1
    )
    grid_rf.fit(X_train, y_train)
    print(f"Best RF Params: {grid_rf.best_params_}")

    # Store fitted pipeline for evaluation
    tuned_pipelines_for_eval.append({
        "Model": "Tuned Random Forest",
        "estimator": grid_rf.best_estimator_
    })
    # Store classifier-only config for the ensemble
    rf_best_params_clean = {k.replace('clf__', ''): v for k, v in grid_rf.best_params_.items()}
    tuned_estimators_for_ensemble['rf'] = RandomForestClassifier(
        **rf_best_params_clean, class_weight='balanced', random_state=42, n_jobs=-1
    )

# 4b. XGBoost tuning
if 'XGBoost' in models:
    print("\n--- Tuning XGBoost (this may take a few minutes) ---")
    pipe_xgb = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', XGBClassifier(
            scale_pos_weight=scale_pos_weight, random_state=42, n_jobs=-1, eval_metric='logloss'
        ))
    ])

    param_grid_xgb = {
        'clf__n_estimators': [100, 200],
        'clf__learning_rate': [0.05, 0.1],
        'clf__max_depth': [3, 5]
    }

    grid_xgb = GridSearchCV(
        pipe_xgb, param_grid_xgb, cv=cv, scoring='roc_auc', verbose=1, n_jobs=-1
    )
    grid_xgb.fit(X_train, y_train)
    print(f"Best XGB Params: {grid_xgb.best_params_}")

    # Store fitted pipeline for evaluation
    tuned_pipelines_for_eval.append({
        "Model": "Tuned XGBoost",
        "estimator": grid_xgb.best_estimator_
    })
    # Store classifier-only config for the ensemble
    xgb_best_params_clean = {k.replace('clf__', ''): v for k, v in grid_xgb.best_params_.items()}
    tuned_estimators_for_ensemble['xgb'] = XGBClassifier(
        **xgb_best_params_clean, scale_pos_weight=scale_pos_weight,
        random_state=42, n_jobs=-1, eval_metric='logloss'
    )

# 4c. Soft voting ensemble using tuned models + logistic regression
print("\n4c. Proposing improvement: Soft Voting Ensemble...")

# Add baseline logistic regression to the ensemble members
tuned_estimators_for_ensemble['lr'] = LogisticRegression(
    max_iter=1000, class_weight='balanced', random_state=42
)

# Build ensemble only if tuned RF and XGB are available
if 'rf' in tuned_estimators_for_ensemble and 'xgb' in tuned_estimators_for_ensemble:
    voting_classifier = VotingClassifier(
        estimators=[
            ('lr', tuned_estimators_for_ensemble['lr']),
            ('rf', tuned_estimators_for_ensemble['rf']),
            ('xgb', tuned_estimators_for_ensemble['xgb'])
        ],
        voting='soft',
        weights=[1, 2, 2]
    )

    pipe_voting = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', voting_classifier)
    ])

    print("Fitting Voting Classifier (this may take a minute)...")
    pipe_voting.fit(X_train, y_train)

    tuned_pipelines_for_eval.append({
        "Model": "Voting Ensemble",
        "estimator": pipe_voting
    })
else:
    print("Skipping Voting Ensemble: RF or XGB models were not tuned.")

# =======================================
# 5. Evaluation of tuned models and ensemble
# =======================================

print("\n5. Evaluating tuned models and ensemble on validation set...")

final_metrics = []

# Evaluate tuned models and ensemble
for item in tuned_pipelines_for_eval:
    name = item['Model']
    model = item['estimator']

    preds = model.predict(X_valid)
    probs = model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(y_valid, probs)
    f1 = f1_score(y_valid, preds)
    precision = precision_score(y_valid, preds)
    recall = recall_score(y_valid, preds)
    accuracy = accuracy_score(y_valid, preds)

    fpr, tpr, _ = roc_curve(y_valid, probs)
    roc_curves[name] = {'fpr': fpr, 'tpr': tpr, 'auc': auc}

    final_metrics.append({
        "Model": name,
        "AUC": auc,
        "F1": f1,
        "Precision": precision,
        "Recall": recall,
        "Accuracy": accuracy
    })

# Add selected baseline models to the same table for comparison
for model_name in ['Logistic Regression', 'Random Forest', 'XGBoost', 'Neural Network (MLP)', 'Support Vector Machine (SVM)']:
    if model_name in baseline_results_df['Model'].values:
        final_metrics.append(
            baseline_results_df[baseline_results_df['Model'] == model_name].to_dict('records')[0]
        )

# Final comparison table
final_comparison_df = pd.DataFrame(final_metrics).sort_values(by="AUC", ascending=False)

# Save final comparison
final_comparison_df.to_csv(
    os.path.join(FIG_DIR_CP2, "tuned_model_comparison.csv"), index=False
)

print("\n--- Final Comparison: Tuned, Ensemble, and Baseline ---")
print(final_comparison_df.to_markdown(index=False, floatfmt=".4f"))
print("\nDELIVERABLE: Enhanced results show the Voting Ensemble provides")
print("the best and most robust performance.")

# Final ROC curve figure
print("\nGenerating final ROC curve plot...")
plt.figure(figsize=(12, 10))

models_to_plot = [
    'Logistic Regression',
    'Random Forest',
    'Tuned Random Forest',
    'XGBoost',
    'Tuned XGBoost',
    'Neural Network (MLP)',
    'Support Vector Machine (SVM)',
    'Voting Ensemble'
]

for name in models_to_plot:
    if name in roc_curves:
        data = roc_curves[name]
        plt.plot(
            data['fpr'],
            data['tpr'],
            label=f"{name} (AUC = {data['auc']:.4f})",
            linewidth=2
        )

plt.plot([0, 1], [0, 1], 'k--', label='Chance (AUC = 0.50)')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve Comparison: Tuned vs. Baseline Models', fontsize=16)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True)

# Save ROC plot to Checkpoint 2 directory
save_fig(os.path.join(FIG_DIR_CP2, "final_roc_curve_comparison.png"))
print(f"Final ROC plot saved to {FIG_DIR_CP2}")

# =======================================
# 6. Future work plan (for the project report)
# =======================================

print("\n" + "="*50)
print("6. Future Plans for Final Project")
print("="*50)
print("1. Advanced Feature Engineering: Create interaction features (e.g., 'loan_amnt' / 'person_income') and apply log transforms to skewed numeric features identified in EDA.")
print("2. Advanced Imbalance Handling: Compare the current `class_weight` method against a pipeline using SMOTE, especially for the Neural Network (MLP) which performed poorly without it.")
print("3. Deeper Tuning: Use RandomizedSearchCV on a wider grid for the individual components of the Voting Ensemble.")
print("4. Model Interpretability: Use SHAP values on the final Voting Ensemble (or its best component, XGBoost) to explain key drivers, moving beyond the simple coefficients of the baseline [cite: 343-385].")
print("5. Final Submission: Train the best, fully-tuned pipeline (the Voting Ensemble) on the *entire* training dataset ('train.csv') and generate predictions for the 'test.csv' file.")
print("\n=== Checkpoint 2 Script Complete ===\n")

Mounted at /content/drive


  vals = vals.clip(low,high)


Baseline AUC (LogReg): 0.9045863536869
Figures and summaries saved in /content/drive/My Drive/Data Corsair/loan_pitch_figures

=== Running Checkpoint 2 (v3) ===
Checkpoint 2 artifacts will be saved in: /content/drive/My Drive/Data Corsair/loan_checkpoint2_figures

1. Building Enhanced Preprocessing Pipeline...

2. Defining 9 models for comparison...
Defined 9 models.

3. Running baseline model comparison (this may take a minute)...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  ... Logistic Regression complete. AUC: 0.9062 (Time: 20.97s)
  ... k-Nearest Neighbors (k-NN) complete. AUC: 0.5503 (Time: 6.03s)
  ... Support Vector Machine (SVM) complete. AUC: 0.4969 (Time: 976.52s)
  ... Gaussian Naive Bayes complete. AUC: 0.8862 (Time: 0.28s)
  ... Decision Tree complete. AUC: 0.8258 (Time: 0.59s)
  ... Random Forest complete. AUC: 0.9321 (Time: 5.54s)
  ... AdaBoost complete. AUC: 0.9147 (Time: 3.41s)
  ... Neural Network (MLP) complete. AUC: 0.7643 (Time: 11.34s)
  ... XGBoost complete. AUC: 0.9488 (Time: 0.96s)

--- Baseline Model Comparison (Sorted by AUC) ---
| Model                        |    AUC |     F1 |   Precision |   Recall |   Accuracy |
|:-----------------------------|-------:|-------:|------------:|---------:|-----------:|
| XGBoost                      | 0.9488 | 0.7698 |      0.7342 |   0.8090 |     0.9311 |
| Random Forest                | 0.9321 | 0.8037 |      0.9349 |   0.7048 |     0.9510 |
| AdaBoost                     | 0.9147 | 0.6990

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



5. Evaluating tuned models and ensemble on validation set...

--- Final Comparison: Tuned, Ensemble, and Baseline ---
| Model                        |    AUC |     F1 |   Precision |   Recall |   Accuracy |
|:-----------------------------|-------:|-------:|------------:|---------:|-----------:|
| Tuned XGBoost                | 0.9545 | 0.7642 |      0.7055 |   0.8335 |     0.9268 |
| XGBoost                      | 0.9488 | 0.7698 |      0.7342 |   0.8090 |     0.9311 |
| Voting Ensemble              | 0.9479 | 0.7784 |      0.7644 |   0.7928 |     0.9357 |
| Tuned Random Forest          | 0.9350 | 0.7961 |      0.8613 |   0.7401 |     0.9460 |
| Random Forest                | 0.9321 | 0.8037 |      0.9349 |   0.7048 |     0.9510 |
| Logistic Regression          | 0.9062 | 0.6057 |      0.4753 |   0.8347 |     0.8453 |
| Neural Network (MLP)         | 0.7643 | 0.3466 |      0.4726 |   0.2737 |     0.8531 |
| Support Vector Machine (SVM) | 0.4969 | 0.2161 |      0.1389 |   0.4862 |     

In [None]:
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_recall_curve,
    auc,
    confusion_matrix
)

print("\n" + "="*50)
print("ADDITIONAL EVALUATION: Extra Metrics for Main Models")
print("="*50)

def evaluate_model(name, model, X_train, X_val, y_train, y_val, threshold=0.5):
    # Fit model (safe to re-fit; uses same train split)
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_val)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)

    roc_auc = roc_auc_score(y_val, y_prob)
    acc = accuracy_score(y_val, y_pred)
    bal_acc = balanced_accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    prec_curve, rec_curve, _ = precision_recall_curve(y_val, y_prob)
    pr_auc = auc(rec_curve, prec_curve)
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()

    print(f"\n=== {name} ===")
    print(f"ROC-AUC:            {roc_auc:.4f}")
    print(f"Accuracy:           {acc:.4f}")
    print(f"Balanced Accuracy:  {bal_acc:.4f}")
    print(f"Precision:          {prec:.4f}")
    print(f"Recall:             {rec:.4f}")
    print(f"F1:                 {f1:.4f}")
    print(f"PR-AUC:             {pr_auc:.4f}")
    print(f"Confusion matrix [tn, fp, fn, tp]: {tn}, {fp}, {fn}, {tp}")

    return {
        "roc_auc": roc_auc,
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "pr_auc": pr_auc,
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }

# Pipelines for main models using the enhanced preprocessing
lr_pipe = Pipeline(steps=[
    ('preprocess', preprocess_cp2),
    ('clf', LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ))
])


rf_pipe = None
xgb_pipe = None

if 'rf' in tuned_estimators_for_ensemble:
    rf_pipe = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', tuned_estimators_for_ensemble['rf'])
    ])

if 'xgb' in tuned_estimators_for_ensemble:
    xgb_pipe = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', tuned_estimators_for_ensemble['xgb'])
    ])

# Voting ensemble pipeline exists as `pipe_voting`
voting_pipe = pipe_voting

# Run evaluations
results_lr = evaluate_model("Logistic Regression (baseline-style)", lr_pipe,
                            X_train, X_valid, y_train, y_valid)

if rf_pipe is not None:
    results_rf = evaluate_model("Random Forest (tuned)", rf_pipe,
                                X_train, X_valid, y_train, y_valid)

if xgb_pipe is not None:
    results_xgb = evaluate_model("XGBoost (tuned)", xgb_pipe,
                                 X_train, X_valid, y_train, y_valid)

results_ens = evaluate_model("Voting Ensemble", voting_pipe,
                             X_train, X_valid, y_train, y_valid)



ADDITIONAL EVALUATION: Extra Metrics for Main Models


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression (baseline-style) ===
ROC-AUC:            0.9062
Accuracy:           0.8453
Balanced Accuracy:  0.8409
Precision:          0.4753
Recall:             0.8347
F1:                 0.6057
PR-AUC:             0.6994
Confusion matrix [tn, fp, fn, tp]: 8520, 1539, 276, 1394

=== Random Forest (tuned) ===
ROC-AUC:            0.9350
Accuracy:           0.9460
Balanced Accuracy:  0.8602
Precision:          0.8613
Recall:             0.7401
F1:                 0.7961
PR-AUC:             0.8458
Confusion matrix [tn, fp, fn, tp]: 9860, 199, 434, 1236

=== XGBoost (tuned) ===
ROC-AUC:            0.9545
Accuracy:           0.9268
Balanced Accuracy:  0.8879
Precision:          0.7055
Recall:             0.8335
F1:                 0.7642
PR-AUC:             0.8761
Confusion matrix [tn, fp, fn, tp]: 9478, 581, 278, 1392


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Voting Ensemble ===
ROC-AUC:            0.9479
Accuracy:           0.9357
Balanced Accuracy:  0.8761
Precision:          0.7644
Recall:             0.7928
F1:                 0.7784
PR-AUC:             0.8673
Confusion matrix [tn, fp, fn, tp]: 9651, 408, 346, 1324


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

print("\n" + "="*50)
print("ADDITIONAL EVALUATION: 5-fold Cross-Validated ROC-AUC")
print("="*50)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression CV
lr_cv = Pipeline(steps=[
    ('preprocess', preprocess_cp2),
    ('clf', LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ))
])
lr_scores = cross_val_score(lr_cv, X, y, cv=cv, scoring='roc_auc')
print(f"Logistic Regression 5-fold ROC-AUC: {lr_scores.mean():.4f} ± {lr_scores.std():.4f}")

# XGBoost CV 
if XGBClassifier is not None:
    xgb_cv = Pipeline(steps=[
        ('preprocess', preprocess_cp2),
        ('clf', XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="binary:logistic",
            scale_pos_weight=scale_pos_weight,
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1
        ))
    ])
    xgb_scores = cross_val_score(xgb_cv, X, y, cv=cv, scoring='roc_auc')
    print(f"XGBoost 5-fold ROC-AUC: {xgb_scores.mean():.4f} ± {xgb_scores.std():.4f}")
else:
    print("XGBoost not installed, skipping 5-fold ROC-AUC for XGBoost.")



ADDITIONAL EVALUATION: 5-fold Cross-Validated ROC-AUC


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression 5-fold ROC-AUC: 0.9034 ± 0.0049
XGBoost 5-fold ROC-AUC: 0.9530 ± 0.0040


In [None]:
# =======================================
# 7. Stacked Cost-Sensitive Loan Approval Model
# =======================================
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_recall_curve,
    auc,
    confusion_matrix
)

print("\n" + "="*50)
print("7. Novel Framework: Stacked Cost-Sensitive Loan Approval Model")
print("="*50)

# ---------------------------------------------------------
# 7.1 Stacking meta-learner (model-side novelty)
# ---------------------------------------------------------
# Base estimators: tuned RF/XGB when available, plus LR and MLP
base_estimators = []

# Base Logistic Regression
base_estimators.append(
    ('lr_base', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
)

# Base Random Forest (tuned from earlier GridSearch)
if 'rf' in tuned_estimators_for_ensemble:
    base_estimators.append(('rf_base', tuned_estimators_for_ensemble['rf']))

# Base XGBoost (tuned) if available
if XGBClassifier is not None and 'xgb' in tuned_estimators_for_ensemble:
    base_estimators.append(('xgb_base', tuned_estimators_for_ensemble['xgb']))

# Base MLP
mlp_base = MLPClassifier(
    random_state=42,
    max_iter=500,
    early_stopping=True,
    hidden_layer_sizes=(100, 50)
)
base_estimators.append(('mlp_base', mlp_base))

# Meta-learner: Logistic Regression on stacked probabilities + passthrough features
stack_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ),
    stack_method='predict_proba',
    passthrough=True,
    n_jobs=-1
)

stack_pipe = Pipeline(steps=[
    ('preprocess', preprocess_cp2),
    ('clf', stack_clf)
])

print("Fitting stacked model...")
stack_pipe.fit(X_train, y_train)

# ---------------------------------------------------------
# 7.2 Cost-sensitive decision rule (decision-side)
# ---------------------------------------------------------

# Assumptions:
#   y = 1 -> "good" (approve)
#   y = 0 -> "bad"  (reject)
# Simple profit model (arbitrary units):
#   TP: +1.0   (approve good)
#   FP: -5.0   (approve bad)
#   FN: -0.5   (reject good)
#   TN:  0.0   (reject bad)
def cost_sensitive_sweep(y_true, y_prob,
                         gain_tp=1.0,
                         loss_fp=-5.0,
                         loss_fn=-0.5,
                         gain_tn=0.0,
                         n_thresholds=99):
    thresholds = np.linspace(0.01, 0.99, n_thresholds)
    rows = []
    for thr in thresholds:
        y_pred = (y_prob >= thr).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        profit = tp * gain_tp + fp * loss_fp + fn * loss_fn + tn * gain_tn
        rows.append({
            "threshold": thr,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn,
            "profit": profit
        })
    df = pd.DataFrame(rows)
    best = df.loc[df['profit'].idxmax()].copy()
    return df, best

# Validation probabilities from stacked model
stack_val_prob = stack_pipe.predict_proba(X_valid)[:, 1]
cs_df, best_cs = cost_sensitive_sweep(y_valid, stack_val_prob)

best_thr = float(best_cs["threshold"])
print(f"\nBest cost-sensitive threshold: {best_thr:.3f}")
print(f"Max expected profit (relative units): {best_cs['profit']:.2f}")

# Evaluate stacked model at the selected threshold
stack_cs_pred = (stack_val_prob >= best_thr).astype(int)
tn, fp, fn, tp = confusion_matrix(y_valid, stack_cs_pred).ravel()

stack_cs_metrics = {
    "threshold": best_thr,
    "tp": int(tp),
    "fp": int(fp),
    "fn": int(fn),
    "tn": int(tn),
    "roc_auc": roc_auc_score(y_valid, stack_val_prob),
    "accuracy": accuracy_score(y_valid, stack_cs_pred),
    "balanced_accuracy": balanced_accuracy_score(y_valid, stack_cs_pred),
    "precision": precision_score(y_valid, stack_cs_pred),
    "recall": recall_score(y_valid, stack_cs_pred),
    "f1": f1_score(y_valid, stack_cs_pred)
}

print("\n=== Stacked Cost-Sensitive Model (Validation) ===")
for k, v in stack_cs_metrics.items():
    if isinstance(v, float):
        print(f"{k:20s}: {v:.4f}")
    else:
        print(f"{k:20s}: {v}")

# ---------------------------------------------------------
# 7.3 Threshold–profit curve for stacked model
# ---------------------------------------------------------
plt.figure(figsize=(7, 5))
plt.plot(cs_df["threshold"], cs_df["profit"])
plt.xlabel("Decision Threshold")
plt.ylabel("Expected Profit (relative units)")
plt.title("Stacked Model: Cost-Sensitive Threshold Sweep")
plt.grid(True)
save_fig(os.path.join(FIG_DIR_CP2, "stacked_cost_sensitive_profit_curve.png"))
print(f"Cost-sensitive profit curve saved to {FIG_DIR_CP2}")

# Save metrics for later use in the report
with open(os.path.join(FIG_DIR_CP2, "stacked_cost_sensitive_metrics.json"), "w") as f:
    json.dump(stack_cs_metrics, f, indent=2)



7. Novel Framework: Stacked Cost-Sensitive Loan Approval Model
Fitting stacked model...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Best cost-sensitive threshold: 0.970
Max expected profit (relative units): 584.00

=== Stacked Cost-Sensitive Model (Validation) ===
threshold           : 0.9700
tp                  : 1136
fp                  : 57
fn                  : 534
tn                  : 10002
roc_auc             : 0.9524
accuracy            : 0.9496
balanced_accuracy   : 0.8373
precision           : 0.9522
recall              : 0.6802
f1                  : 0.7936
Cost-sensitive profit curve saved to /content/drive/My Drive/Data Corsair/loan_checkpoint2_figures


In [None]:
# =======================================
# 8. Segment-specific expert ensemble
# =======================================
from sklearn.base import clone

print("\n" + "="*50)
print("8. Novel Framework: Segment-Specific Expert Ensemble")
print("="*50)

# ---------------------------------------------------------
# 8.1 Defining segmentation rule on raw feature space
# ---------------------------------------------------------
# We split applicants by loan_percent_income into two segments:
#   - low_ratio  : loan_percent_income <= median
#   - high_ratio : loan_percent_income > median
if "loan_percent_income" not in X_train.columns:
    raise ValueError(
        "loan_percent_income not found in X_train; "
        "cannot build segment-specific expert ensemble."
    )

income_ratio_median = X_train["loan_percent_income"].median()
print(f"Segment split on loan_percent_income median: {income_ratio_median:.4f}")

train_low_mask = X_train["loan_percent_income"] <= income_ratio_median
train_high_mask = ~train_low_mask

val_low_mask = X_valid["loan_percent_income"] <= income_ratio_median
val_high_mask = ~val_low_mask

print(f"Train segment sizes -> low_ratio: {train_low_mask.sum()}, "
      f"high_ratio: {train_high_mask.sum()}")
print(f"Valid segment sizes -> low_ratio: {val_low_mask.sum()}, "
      f"high_ratio: {val_high_mask.sum()}")

# ---------------------------------------------------------
# 8.2 Segment-specific expert models
# ---------------------------------------------------------

# Low-ratio expert: tuned RF if available, otherwise a reasonable RF
if 'rf' in tuned_estimators_for_ensemble:
    clf_low = clone(tuned_estimators_for_ensemble['rf'])
else:
    clf_low = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

# High-ratio expert: tuned XGB if available, else a strong XGB, else logistic regression
if XGBClassifier is not None and 'xgb' in tuned_estimators_for_ensemble:
    clf_high = clone(tuned_estimators_for_ensemble['xgb'])
elif XGBClassifier is not None:
    clf_high = XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    )
else:
    clf_high = LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    )

low_expert = Pipeline(steps=[
    ("preprocess", preprocess_cp2),
    ("clf", clf_low)
])

high_expert = Pipeline(steps=[
    ("preprocess", preprocess_cp2),
    ("clf", clf_high)
])

print("\nFitting low-risk expert on low_ratio segment...")
low_expert.fit(X_train[train_low_mask], y_train[train_low_mask])

print("Fitting high-risk expert on high_ratio segment...")
high_expert.fit(X_train[train_high_mask], y_train[train_high_mask])

# ---------------------------------------------------------
# 8.3 Helper to get segment-wise probabilities
# ---------------------------------------------------------
def segmented_proba(X_df):
    """
    Route each sample to its segment-specific expert and
    return a single probability vector plus the segment masks.
    """
    proba = np.zeros(len(X_df), dtype=float)
    low_mask = X_df["loan_percent_income"] <= income_ratio_median
    high_mask = ~low_mask

    if low_mask.any():
        proba[low_mask] = low_expert.predict_proba(X_df[low_mask])[:, 1]
    if high_mask.any():
        proba[high_mask] = high_expert.predict_proba(X_df[high_mask])[:, 1]

    return proba, low_mask, high_mask

val_proba_seg, val_low_mask, val_high_mask = segmented_proba(X_valid)

# ---------------------------------------------------------
# 8.4 Segment-specific cost-sensitive threshold search
# ---------------------------------------------------------
# Same business cost model as before:
#   TP: +1.0   (approve good)
#   FP: -5.0   (approve bad)
#   FN: -0.5   (reject good)
#   TN:  0.0   (reject bad)
def segmented_cost_sensitive_sweep(
    y_true,
    proba,
    low_mask,
    high_mask,
    gain_tp=1.0,
    loss_fp=-5.0,
    loss_fn=-0.5,
    gain_tn=0.0,
    thresholds=np.linspace(0.05, 0.95, 19)
):
    rows = []
    for thr_low in thresholds:
        for thr_high in thresholds:
            # Segment-specific thresholds:
            # low_ratio  -> thr_low
            # high_ratio -> thr_high
            thr_vec = np.where(low_mask, thr_low, thr_high)
            y_pred = (proba >= thr_vec).astype(int)
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
            profit = tp * gain_tp + fp * loss_fp + fn * loss_fn + tn * gain_tn
            rows.append({
                "thr_low": thr_low,
                "thr_high": thr_high,
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "tn": tn,
                "profit": profit
            })
    df = pd.DataFrame(rows)
    best = df.loc[df["profit"].idxmax()].copy()
    return df, best

seg_cs_df, seg_best = segmented_cost_sensitive_sweep(
    y_valid.values,
    val_proba_seg,
    val_low_mask.values,
    val_high_mask.values
)

print("\nBest segment-specific thresholds:")
print(f"  low_ratio segment threshold  : {seg_best['thr_low']:.3f}")
print(f"  high_ratio segment threshold : {seg_best['thr_high']:.3f}")
print(f"  Max expected profit (relative units): {seg_best['profit']:.2f}")

# Evaluate with the best thresholds
thr_low_best = float(seg_best["thr_low"])
thr_high_best = float(seg_best["thr_high"])

thr_vec_val = np.where(val_low_mask.values, thr_low_best, thr_high_best)
seg_pred_val = (val_proba_seg >= thr_vec_val).astype(int)

tn, fp, fn, tp = confusion_matrix(y_valid, seg_pred_val).ravel()
segmented_metrics = {
    "thr_low": thr_low_best,
    "thr_high": thr_high_best,
    "tp": int(tp),
    "fp": int(fp),
    "fn": int(fn),
    "tn": int(tn),
    "roc_auc": roc_auc_score(y_valid, val_proba_seg),
    "accuracy": accuracy_score(y_valid, seg_pred_val),
    "balanced_accuracy": balanced_accuracy_score(y_valid, seg_pred_val),
    "precision": precision_score(y_valid, seg_pred_val),
    "recall": recall_score(y_valid, seg_pred_val),
    "f1": f1_score(y_valid, seg_pred_val)
}

print("\n=== Segment-Specific Expert Ensemble (Validation) ===")
for k, v in segmented_metrics.items():
    if isinstance(v, float):
        print(f"{k:22s}: {v:.4f}")
    else:
        print(f"{k:22s}: {v}")


# ---------------------------------------------------------
# 8.5 Save metrics and profit surface
# ---------------------------------------------------------
with open(os.path.join(FIG_DIR_CP2, "segmented_expert_metrics.json"), "w") as f:
    json.dump(segmented_metrics, f, indent=2)

seg_cs_df.to_csv(
    os.path.join(FIG_DIR_CP2, "segmented_threshold_profit_grid.csv"),
    index=False
)

# Heatmap of profit over (thr_low, thr_high)
pivot = seg_cs_df.pivot(index="thr_low", columns="thr_high", values="profit")
plt.figure(figsize=(7, 6))
im = plt.imshow(pivot.values, origin="lower", aspect="auto")
plt.xticks(
    range(len(pivot.columns)),
    [f"{c:.2f}" for c in pivot.columns],
    rotation=90
)
plt.yticks(
    range(len(pivot.index)),
    [f"{r:.2f}" for r in pivot.index]
)
plt.xlabel("High-Risk Threshold (thr_high)")
plt.ylabel("Low-Risk Threshold (thr_low)")
plt.title("Segmented Expert Ensemble: Profit Surface")
plt.colorbar(im, label="Expected Profit")
save_fig(os.path.join(FIG_DIR_CP2, "segmented_expert_profit_heatmap.png"))
print(f"Segmented expert profit heatmap saved to {FIG_DIR_CP2}")



8. Novel Framework: Segment-Specific Expert Ensemble
Segment split on loan_percent_income median: 0.1400
Train segment sizes -> low_ratio: 24343, high_ratio: 22573
Valid segment sizes -> low_ratio: 5951, high_ratio: 5778

Fitting low-risk expert on low_ratio segment...
Fitting high-risk expert on high_ratio segment...

Best segment-specific thresholds:
  low_ratio segment threshold  : 0.700
  high_ratio segment threshold : 0.900
  Max expected profit (relative units): 553.00

=== Segment-Specific Expert Ensemble (Validation) ===
thr_low               : 0.7000
thr_high              : 0.9000
tp                    : 1112
fp                    : 56
fn                    : 558
tn                    : 10003
roc_auc               : 0.9354
accuracy              : 0.9477
balanced_accuracy     : 0.8302
precision             : 0.9521
recall                : 0.6659
f1                    : 0.7837
Segmented expert profit heatmap saved to /content/drive/My Drive/Data Corsair/loan_checkpoint2_figures