In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    average_precision_score, accuracy_score
)
import sklearn
from packaging import version
from sklearn.preprocessing import OneHotEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
np.set_printoptions(suppress=True)
plt.rcParams["figure.dpi"] = 120
RANDOM_STATE = 42

In [2]:
def make_ohe():
    if version.parse(sklearn.__version__) >= version.parse("1.2"):
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    else:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

def calibrate_prefit(prefit_estimator, X_valid, y_valid, method="isotonic"):
    if version.parse(sklearn.__version__) >= version.parse("1.4"):
        cal = CalibratedClassifierCV(estimator=prefit_estimator, method=method, cv="prefit")
    else:
        cal = CalibratedClassifierCV(base_estimator=prefit_estimator, method=method, cv="prefit")
    cal.fit(X_valid, y_valid)
    return cal


In [3]:

DATA = r"C:\Users\habib\OneDrive\المستندات\Graduation Project\GRAD-proj-DEPI\DS1\Cardiovascular Diseases Risk Prediction Dataset export 2025-10-15 21-12-56.csv"
df = pd.read_csv(DATA)

In [4]:
CAT = [c for c in [
    "General_Health","Checkup","Exercise","Skin_Cancer","Other_Cancer",
    "Depression","Diabetes","Arthritis","Sex","Age_Category","Smoking_History",
    "BMI_Category"
] if c in df.columns]

NUM = [c for c in [
    "Height_(cm)","Weight_(kg)","BMI","Alcohol_Consumption",
    "Fruit_Consumption","Green_Vegetables_Consumption","FriedPotato_Consumption"
] if c in df.columns]

TARGET = "Heart_Disease"
#Ensure target is numeric 0/1
if df[TARGET].dtype=="O":
    df[TARGET] = df[TARGET].map({"Yes":1,"No":0}).astype(int)

X = df[CAT + NUM].copy()
y = df[TARGET].copy()


In [5]:

#Data Splitting
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)


In [6]:

# Preprocess
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', make_ohe()) 
])

num_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
preprocess = ColumnTransformer(
transformers=[('cat', cat_transformer, CAT), ('num', num_transformer, NUM)],
remainder='drop',
verbose_feature_names_out=False)



In [7]:

def evaluate_model(pipe, name):
    """Train on full train set and evaluate on test set (binary)."""
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Binary ROC-AUC using positive-class probabilities when available
    try:
        y_proba = pipe.predict_proba(X_test)[:, 1]
        roc = roc_auc_score(y_test, y_proba)
    except Exception:
        roc = np.nan

    acc = accuracy_score(y_test, y_pred)

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=3))

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} — Confusion Matrix")
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.show()

    print("Accuracy:", f"{acc:.3f}")
    print("ROC-AUC:", "NA" if np.isnan(roc) else f"{roc:.3f}")
    return acc, roc

def cv_prob_metrics(pipe, X, y, name, n_splits=5):
    """Cross-validated ROC-AUC and PR-AUC on training data."""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    y_oof_proba = cross_val_predict(pipe, X, y, cv=cv, method='predict_proba')[:,1]
    
    roc = roc_auc_score(y, y_oof_proba)
    pr = average_precision_score(y, y_oof_proba)
    
    print(f"{name} — CV ROC-AUC: {roc:.3f}, CV PR-AUC: {pr:.3f}")
    return roc, pr, y_oof_proba

def calibrate_on_valid(pipe, X_valid, y_valid, method='isotonic'):
    """Calibrate predicted probabilities on validation set."""
    calib = CalibratedClassifierCV(pipe, method=method, cv='prefit')
    calib.fit(X_valid, y_valid)
    return calib

def pick_threshold_by_f1(probs, y_true=None):
    from sklearn.metrics import f1_score
    if y_true is None:
        y_true = y_valid
    thresholds = np.linspace(0.01, 0.99, 99)
    f1_scores = [f1_score(y_true, probs >= t) for t in thresholds]
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx]


def eval_on_test(calib, tau, name):
    """Evaluate calibrated model on test set with selected threshold."""
    y_proba = calib.predict_proba(X_test)[:,1]
    y_pred = (y_proba >= tau).astype(int)
    
    roc = roc_auc_score(y_test, y_proba)
    pr = average_precision_score(y_test, y_proba)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"\n{name} — Test Evaluation:")
    print(f"Threshold: {tau:.3f}")
    print(f"Accuracy: {acc:.3f}, ROC-AUC: {roc:.3f}, PR-AUC: {pr:.3f}")
    
    return {'accuracy': acc, 'roc_auc': roc, 'pr_auc': pr, 'threshold': tau}


`Interpretation:`

Precision: of all patients predicted to have class X, how many actually had it.

Recall: of all patients who truly had class X, how many did the model find.

F1-score: a balance between precision and recall.

Support: how many samples of that class exist in the test set.

Accuracy: overall correct predictions / total samples.

Macro average: average of all classes equally.

Weighted average: average weighted by class sizes.

In [8]:
# Model candidates (start with LogisticRegression; you can swap in XGBoost/LightGBM)
logistic_clf = LogisticRegression(
    max_iter=4000, class_weight='balanced', n_jobs=None)
lr_pipe = ImbPipeline(
    steps=[('prep', preprocess),('clf', logistic_clf)])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_tr_oof_proba = cross_val_predict(lr_pipe, X_train, y_train, cv=cv, method='predict_proba')[:,1]

print("CV ROC-AUC:", roc_auc_score(y_train, y_tr_oof_proba))
print("CV PR-AUC:", average_precision_score(y_train, y_tr_oof_proba))




CV ROC-AUC: 0.8337879132218997
CV PR-AUC: 0.3051473957296993


**Define model pipelines**

In [9]:
models = {
"LogisticRegression": Pipeline([('prep', preprocess),
('clf', LogisticRegression(max_iter=4000, class_weight='balanced'))]),
"NaiveBayes": Pipeline([('prep', preprocess),
('clf', GaussianNB())]),
"KNN": Pipeline([('prep', preprocess),
('clf', KNeighborsClassifier(n_neighbors=15))]),
"DecisionTree": Pipeline([('prep', preprocess),
('clf', DecisionTreeClassifier(max_depth=6, class_weight='balanced', random_state=RANDOM_STATE))]),
"RandomForest": Pipeline([('prep', preprocess),
('clf', RandomForestClassifier(n_estimators=400, max_depth=None, min_samples_leaf=2,
class_weight='balanced_subsample', n_jobs=-1, random_state=RANDOM_STATE))]),
"XGB": Pipeline([('prep', preprocess),
('clf', XGBClassifier(
n_estimators=600, max_depth=4, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
reg_lambda=1.0, reg_alpha=0.0, eval_metric='logloss', n_jobs=-1, random_state=RANDOM_STATE
))]),

}

**Train, calibrate, evaluate**

In [10]:
results = {}
best_name, best_pr = None, -np.inf
calibrated_models = {}

for name, pipe in models.items():
    print("\n====", name, "====")
    
    # Cross-validated metrics on training set
    roc, pr, _ = cv_prob_metrics(pipe, X_train, y_train, name)
    
    # Fit on full training set
    pipe.fit(X_train, y_train)
    
    # Calibrate on validation set
    calib = calibrate_on_valid(pipe, X_valid, y_valid, method='isotonic')
    
    # Pick best threshold on validation
    p_valid = calib.predict_proba(X_valid)[:,1]
    tau = pick_threshold_by_f1(p_valid, y_valid)
    
    # Evaluate on test set
    metrics = eval_on_test(calib, tau, name)
    results[name] = metrics
    calibrated_models[name] = (calib, tau)
    
    # Track best model by PR-AUC
    if metrics['pr_auc'] > best_pr:
        best_pr = metrics['pr_auc']
        best_name = name

print("\nBest model by PR-AUC:", best_name, results[best_name])


==== LogisticRegression ====
LogisticRegression — CV ROC-AUC: 0.834, CV PR-AUC: 0.305





LogisticRegression — Test Evaluation:
Threshold: 0.170
Accuracy: 0.852, ROC-AUC: 0.840, PR-AUC: 0.308

==== NaiveBayes ====
NaiveBayes — CV ROC-AUC: 0.800, CV PR-AUC: 0.259





NaiveBayes — Test Evaluation:
Threshold: 0.160
Accuracy: 0.830, ROC-AUC: 0.810, PR-AUC: 0.264

==== KNN ====
KNN — CV ROC-AUC: 0.761, CV PR-AUC: 0.215





KNN — Test Evaluation:
Threshold: 0.130
Accuracy: 0.830, ROC-AUC: 0.772, PR-AUC: 0.228

==== DecisionTree ====
DecisionTree — CV ROC-AUC: 0.782, CV PR-AUC: 0.244





DecisionTree — Test Evaluation:
Threshold: 0.150
Accuracy: 0.834, ROC-AUC: 0.791, PR-AUC: 0.254

==== RandomForest ====
RandomForest — CV ROC-AUC: 0.821, CV PR-AUC: 0.276





RandomForest — Test Evaluation:
Threshold: 0.160
Accuracy: 0.845, ROC-AUC: 0.829, PR-AUC: 0.292

==== XGB ====
XGB — CV ROC-AUC: 0.835, CV PR-AUC: 0.303





XGB — Test Evaluation:
Threshold: 0.180
Accuracy: 0.866, ROC-AUC: 0.842, PR-AUC: 0.312

Best model by PR-AUC: XGB {'accuracy': 0.8664551360918648, 'roc_auc': 0.8415844743469426, 'pr_auc': 0.31201809082741294, 'threshold': np.float64(0.18000000000000002)}


| Model               | Threshold | Accuracy | ROC-AUC | PR-AUC |
| ------------------- | --------- | -------- | ------- | ------ |
| Logistic Regression | 0.170     | 0.852    | 0.840   | 0.308  |
| Naive Bayes         | 0.160     | 0.830    | 0.810   | 0.264  |
| KNN                 | 0.130     | 0.830    | 0.772   | 0.228  |
| Decision Tree      | 0.150     | 0.834     | 0.791     | 0.254     | Easily interpretable, but prone to overfitting and limited generalization. |
| Random Forest     | 0.160     | 0.845     | 0.829     | 0.292     | Stronger ensemble; more stable and less overfitted than a single tree.     |
| XGBoost (XGB)       | 0.180     | **0.866** | **0.842** | **0.312** | Best performer overall; excellent tradeoff between accuracy and PR-AUC.  |

In [11]:
# LinearSVC (fast alternative to SVC-RBF)
svc_pipe = Pipeline([
    ('prep', preprocess),                     # Preprocessing: scaling & encoding
    ('clf', LinearSVC(max_iter=5000))        # Linear SVM
])
# LinearSVC + SMOTE (handle imbalance)
svc_smote_pipe = ImbPipeline([
    ('prep', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('clf', LinearSVC(max_iter=5000))
])

# Models dictionary
models = {
    'LinearSVC': svc_pipe,
    'LinearSVC + SMOTE': svc_smote_pipe
}

In [12]:
calibrated_models = {}
for name, pipe in models.items():
    print(f"\n==== {name} ====")
    
    # Fit on full training set
    pipe.fit(X_train, y_train)
    
    # Calibrate probabilities (LinearSVC does not provide predict_proba by default)
    calib = CalibratedClassifierCV(pipe, method='isotonic', cv='prefit')
    calib.fit(X_valid, y_valid)
    
    # Store calibrated model
    calibrated_models[name] = calib

def pick_threshold_by_f1(probs, y_true):
    from sklearn.metrics import f1_score
    thresholds = np.linspace(0.01, 0.99, 99)
    f1_scores = [f1_score(y_true, probs >= t) for t in thresholds]
    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx]

thresholds = {}
for name, calib in calibrated_models.items():
    p_valid = calib.predict_proba(X_valid)[:,1]
    tau = pick_threshold_by_f1(p_valid, y_valid)
    thresholds[name] = tau
    print(f"{name} — selected threshold: {tau:.3f}")


==== LinearSVC ====





==== LinearSVC + SMOTE ====




LinearSVC — selected threshold: 0.190
LinearSVC + SMOTE — selected threshold: 0.180


In [13]:
def eval_on_test(calib, tau, name):
    y_proba = calib.predict_proba(X_test)[:,1]
    y_pred = (y_proba >= tau).astype(int)
    
    from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    pr = average_precision_score(y_test, y_proba)
    
    print(f"\n{name} — Test Evaluation:")
    print(f"Threshold: {tau:.3f}")
    print(f"Accuracy: {acc:.3f}, ROC-AUC: {roc:.3f}, PR-AUC: {pr:.3f}")
    
    return {'accuracy': acc, 'roc_auc': roc, 'pr_auc': pr, 'threshold': tau}

results = {}
for name, calib in calibrated_models.items():
    tau = thresholds[name]
    results[name] = eval_on_test(calib, tau, name)

# =========================
# Best model by PR-AUC
# =========================
best_name = max(results, key=lambda k: results[k]['pr_auc'])
print("\nBest model by PR-AUC:", best_name, results[best_name])


LinearSVC — Test Evaluation:
Threshold: 0.190
Accuracy: 0.865, ROC-AUC: 0.838, PR-AUC: 0.309

LinearSVC + SMOTE — Test Evaluation:
Threshold: 0.180
Accuracy: 0.861, ROC-AUC: 0.838, PR-AUC: 0.304

Best model by PR-AUC: LinearSVC {'accuracy': 0.8645340931166224, 'roc_auc': 0.8376281920654465, 'pr_auc': 0.3089954469723619, 'threshold': np.float64(0.19)}


| Model             | Threshold | Accuracy | ROC-AUC | PR-AUC |
| ----------------- | --------- | -------- | ------- | ------ |
| LinearSVC         | 0.190     | 0.865    | 0.838   | 0.309  |
| LinearSVC + SMOTE | 0.180     | 0.861    | 0.838   | 0.304  |


| Model                   | Threshold | Accuracy  | ROC-AUC   | PR-AUC    | Key Insights                                                               |
| ----------------------- | --------- | --------- | --------- | --------- | -------------------------------------------------------------------------- |
| **Logistic Regression** | 0.170     | 0.852     | 0.840     | 0.308     | Stable baseline; well-calibrated probabilities and balanced performance.   |
| **Naive Bayes**         | 0.160     | 0.830     | 0.810     | 0.264     | Fast, interpretable model but weaker precision-recall on minority class.   |
| **KNN**                 | 0.130     | 0.830     | 0.772     | 0.228     | Simpler method; struggles with complex/nonlinear boundaries.               |
| **LinearSVC**           | 0.190     | 0.865     | 0.838     | 0.309     | High accuracy and PR-AUC; efficient for large datasets.                    |
| **LinearSVC + SMOTE**   | 0.180     | 0.861     | 0.838     | 0.304     | Oversampling slightly improves recall, minor PR-AUC drop.                  |
| **Decision Tree**       | 0.150     | 0.834     | 0.791     | 0.254     | Easily interpretable, but prone to overfitting and limited generalization. |
| **Random Forest**       | 0.160     | 0.845     | 0.829     | 0.292     | Stronger ensemble; more stable and less overfitted than a single tree.     |
| **XGBoost (XGB)**       | 0.180     | **0.866** | **0.842** | **0.312** | Best performer overall; excellent tradeoff between accuracy and PR-AUC.  |


**hyperparameter tuning for top candidates**

In [14]:
# from sklearn.model_selection import GridSearchCV

# top_to_tune = ['XGB', 'LinearSVC + SMOTE', 'LinearSVC', 'LogisticRegression']

# for name in top_to_tune:
#     if name not in models:
#         continue

#     print(f"\nGridSearch for {name}")
#     base = models[name]

#     if name == 'LinearSVC':
#         param_grid = {
#             'clf__C': [0.5, 1, 2, 4],
#             'clf__loss': ['hinge', 'squared_hinge'],
#             'clf__max_iter': [2000, 4000]
#         }

#     elif name == 'LinearSVC + SMOTE':
#         param_grid = {
#             'clf__C': [0.5, 1, 2],
#             'clf__loss': ['hinge', 'squared_hinge'],
#             'clf__max_iter': [2000, 4000]
#         }

#     elif name == 'LogisticRegression':
#         param_grid = {
#             'clf__C': [0.5, 1, 2, 4],
#             'clf__penalty': ['l2'],
#             'clf__solver': ['lbfgs'],
#             'clf__max_iter': [2000, 4000]
#         }

#     # GridSearchCV for PR-AUC (minority sensitivity)
#     gs = GridSearchCV(
#         estimator=base,
#         param_grid=param_grid,
#         scoring='average_precision',
#         cv=cv,
#         n_jobs=-1
#     )

#     # Fit on training data
#     gs.fit(X_train, y_train)

#     print(" Best params:", gs.best_params_)
#     print(" Best PR-AUC (CV):", round(gs.best_score_, 4))

#     # Refit and calibrate
#     best_pipe = gs.best_estimator_
#     calib = calibrate_on_valid(best_pipe, X_valid, y_valid, method='isotonic')

#     # Optimize threshold by F1 score
#     p_valid = calib.predict_proba(X_valid)[:, 1]
#     tau = pick_threshold_by_f1(p_valid, y_valid)

#     # Final test evaluation
#     tuned_metrics = eval_on_test(calib, tau, f"{name}_Tuned")
#     results[f"{name}_Tuned"] = tuned_metrics
#     calibrated_models[f"{name}_Tuned"] = (calib, tau)


**Grid search is very slow**
- GridSearchCV tests every combination in param_grid.

- If cv=5, and 4×2×2 combinations → that’s 16×5 = 80 model fits per model.

- Doing that for 3–4 models is hundreds of fits.

- LinearSVC is slow (non-probabilistic, iterative convergence).

- XGB by default uses CPU-based tree building, which is slower.

In [15]:
import joblib, time, os
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

# لا نستخدم FunctionTransformer ولا add_feats إطلاقاً
xgb_pipe = ImbPipeline([
    ("pre", preprocess),   # نفس الـ preprocess اللي عندك
    ("clf", XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

xgb_pipe.fit(X, y)
print("XGBoost pipeline retrained WITHOUT add_feats!")

os.makedirs("Models", exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")
joblib.dump(xgb_pipe, f"Models/stage1_xgb_{ts}.joblib")
joblib.dump(xgb_pipe, "Models/stage1_xgb_latest.joblib")
print("Saved:", f"Models/stage1_xgb_{ts}.joblib")
 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost pipeline retrained WITHOUT add_feats!
Saved: Models/stage1_xgb_20251129_231213.joblib


In [16]:

# --- Prediction helper ---
def stage1_predict_contract(model, df_rows, id_col=None):
    """
    Predicts class, probabilities, max probability, entropy, and margin.
    df_rows: dataframe with same features used for training.
    """
    P = model.predict_proba(df_rows)              # shape (n, classes)
    yhat = P.argmax(axis=1)
    max_prob = P.max(axis=1)
    
    # entropy
    with np.errstate(divide='ignore', invalid='ignore'):
        ent = -(P * np.log(P + 1e-12)).sum(axis=1)
    
    # margin: top - second
    sortedP = np.sort(P, axis=1)[:, ::-1]
    margin = sortedP[:,0] - sortedP[:,1]

    out = pd.DataFrame({
        "pred_class": yhat,
        **{f"p{i}": P[:,i] for i in range(P.shape[1])},
        "max_prob": max_prob,
        "entropy": ent,
        "margin": margin,
        "stage1_ts": time.strftime("%Y-%m-%d %H:%M:%S")
    }, index=df_rows.index)

    if id_col and id_col in df_rows.columns:
        out.insert(0, "patient_id", df_rows[id_col].values)
    else:
        out.insert(0, "patient_id", df_rows.index.astype(str))

    return out

# --- Example usage ---
stage1_contract_test = stage1_predict_contract(xgb_pipe, X_test)
stage1_contract_test.head(10)


Unnamed: 0,patient_id,pred_class,p0,p1,max_prob,entropy,margin,stage1_ts
18467,18467,0,0.967996,0.032004,0.967996,0.14164,0.935993,2025-11-29 23:12:13
190998,190998,0,0.961581,0.038419,0.961581,0.162886,0.923163,2025-11-29 23:12:13
201522,201522,0,0.778115,0.221885,0.778115,0.529284,0.556229,2025-11-29 23:12:13
302375,302375,0,0.98388,0.01612,0.98388,0.082528,0.96776,2025-11-29 23:12:13
84071,84071,0,0.980228,0.019772,0.980228,0.097149,0.960456,2025-11-29 23:12:13
296879,296879,0,0.994955,0.005045,0.994955,0.031717,0.98991,2025-11-29 23:12:13
107694,107694,0,0.713193,0.286807,0.713193,0.599268,0.426387,2025-11-29 23:12:13
115683,115683,0,0.989675,0.010325,0.989675,0.057489,0.97935,2025-11-29 23:12:13
140830,140830,0,0.853519,0.146481,0.853519,0.416556,0.707038,2025-11-29 23:12:13
285090,285090,0,0.873255,0.126745,0.873255,0.380151,0.746511,2025-11-29 23:12:13


In [17]:
import numpy as np
import pandas as pd
import time

def stage1_predict_simple(df, model, id_col=None):
    """
    Predict and summarize classification results in a minimal format.
    
    Args:
        df (pd.DataFrame): input features
        model: fitted model with predict_proba
        id_col (str, optional): column to use as patient ID
    
    Returns:
        pd.DataFrame with prediction summary
    """
    # Predict probabilities
    P = model.predict_proba(df)             # shape (n_samples, n_classes)
    
    # Predicted class
    pred_class = P.argmax(axis=1)
    
    # Margin as % for uncertainty
    sortedP = np.sort(P, axis=1)[:, ::-1]
    margin = (sortedP[:,0] - sortedP[:,1]) * 100
    uncertainty = pd.cut(margin, bins=[-0.01, 30, 60, 100], labels=["High", "Medium", "Low"])
    
    # Assemble output
    out = pd.DataFrame({
        "pred_class": pred_class,
        "margin_%": margin.round(1),
        "uncertainty": uncertainty,
        "prediction_ts": time.strftime("%Y-%m-%d %H:%M:%S")
    }, index=df.index)
    
    # Insert patient ID if provided
    if id_col and id_col in df.columns:
        out.insert(0, "patient_id", df[id_col].values)
    else:
        out.insert(0, "patient_id", df.index.astype(str))
    
    return out

# Example usage
stage1_summary = stage1_predict_simple(X_test,xgb_pipe )
stage1_summary.head(15)


Unnamed: 0,patient_id,pred_class,margin_%,uncertainty,prediction_ts
18467,18467,0,93.599998,Low,2025-11-29 23:12:14
190998,190998,0,92.300003,Low,2025-11-29 23:12:14
201522,201522,0,55.599998,Medium,2025-11-29 23:12:14
302375,302375,0,96.800003,Low,2025-11-29 23:12:14
84071,84071,0,96.0,Low,2025-11-29 23:12:14
296879,296879,0,99.0,Low,2025-11-29 23:12:14
107694,107694,0,42.599998,Medium,2025-11-29 23:12:14
115683,115683,0,97.900002,Low,2025-11-29 23:12:14
140830,140830,0,70.699997,Low,2025-11-29 23:12:14
285090,285090,0,74.699997,Low,2025-11-29 23:12:14
