In [24]:
import re

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold


In [25]:
# Load the cohort table; the path is relative to the notebook's working directory.
df = pd.read_csv("Pathology_Cohorts.csv")

# Normalise column names so they are valid, predictable identifiers:
# trim surrounding whitespace, then turn spaces and dots into underscores.
# regex=False is passed explicitly: the default for `regex` differs between
# pandas versions, and as a regular expression "." matches *every* character.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
)

print(df.shape)
# Last expression of the cell, so the preview is actually rendered and shows
# the normalised column names.
df.head()


(3651, 93)


In [26]:
# Alcohol Option A (Yes/No): 1 = reported alcohol history, 0 = none.
# Only the exact spellings listed here are recognised; every other value
# (including missing ones) becomes NaN.
df['alcohol_binary'] = df['exposures_alcohol_history'].map({
    **dict.fromkeys(('YES', 'Yes', 'yes'), 1),
    **dict.fromkeys(('NO', 'No', 'no'), 0),
})

# Stage binary early/late → requires your stage definition
# We convert the AJCC clinical stage into Early/Late
# Early = I/II, Late = III/IV
def simplify_stage(stage):
    """Collapse a stage label into 'Early' (I/II) or 'Late' (III/IV).

    Parameters
    ----------
    stage : object
        Raw stage value such as "Stage IIB" or "stage iv"; matching is
        case-insensitive. Missing values are accepted.

    Returns
    -------
    str or float
        'Early' for stage I/II, 'Late' for stage III/IV, and ``np.nan`` when
        the value is missing or contains no stand-alone Roman numeral I-IV
        (e.g. "Stage X", "Not Reported").
    """
    if pd.isna(stage):
        return np.nan

    # Match a whole Roman-numeral token, longest alternatives first, with an
    # optional sub-stage suffix (A-D and/or a digit, e.g. "IIIC", "IA1").
    # Plain substring checks are wrong here: "IV" contains "I" and "III"
    # contains "II", so they would classify every late stage as early.
    found = re.search(r"\b(IV|III|II|I)[A-D]?\d?\b", str(stage).upper())
    if found is None:
        return np.nan

    return 'Late' if found.group(1) in ("III", "IV") else 'Early'

# Encode the Early/Late label as 0/1; rows for which simplify_stage returns
# NaN stay NaN because they have no entry in the mapping.
df['stage_early_late'] = (
    df['diagnoses_ajcc_clinical_stage']
    .apply(simplify_stage)
    .map(dict(Early=0, Late=1))
)

# Multiclass stage (I, II, III, IV)
def stage_simple(stage):
    """Reduce a stage label to its main Roman numeral: "I", "II", "III" or "IV".

    Parameters
    ----------
    stage : object
        Raw stage value such as "Stage IIB" or "stage iv"; matching is
        case-insensitive. Missing values are accepted.

    Returns
    -------
    str or float
        The main stage numeral with any sub-stage suffix removed, or
        ``np.nan`` when the value is missing or contains no stand-alone
        Roman numeral I-IV (e.g. "Stage X", "Not Reported").
    """
    if pd.isna(stage):
        return np.nan

    # Match a whole Roman-numeral token, longest alternatives first, with an
    # optional sub-stage suffix (A-D and/or a digit, e.g. "IIIC", "IA1").
    # Plain substring checks are wrong here: "IV" contains "I" and "III"
    # contains "II", so stages III and IV could never be returned.
    found = re.search(r"\b(IV|III|II|I)[A-D]?\d?\b", str(stage).upper())
    return found.group(1) if found is not None else np.nan

# Element-wise conversion of the clinical stage column with stage_simple.
df['stage_simple'] = df['diagnoses_ajcc_clinical_stage'].map(stage_simple)

# Smoking (ever vs never): 0 = lifelong non-smoker, 1 = current or former smoker.
# Statuses that are not listed below (and missing values) become NaN.
df['smoking_binary'] = df['exposures_tobacco_smoking_status'].map({
    'Lifelong Non-smoker': 0,
    **dict.fromkeys(
        (
            'Current smoker',
            'Current Smoker',
            'Smoker',
            'Former smoker',
            'Current reformed smoker for > 15 years',
        ),
        1,
    ),
})


In [27]:
def make_X_y(df, target_col, drop_cols=None):
    """Split a frame into a feature matrix, target vector and categorical indices.

    Rows whose target is missing are dropped. Text columns are cast to ``str``
    with missing entries replaced by the literal token ``"NaN"``. Numeric
    columns are left untouched, so their missing values stay real NaN and the
    columns keep a numeric dtype instead of becoming mixed float/str object
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    target_col : str
        Name of the column to predict.
    drop_cols : str or iterable of str, optional
        Extra columns to exclude from the features, e.g. columns the target
        was derived from. Defaults to dropping nothing beyond ``target_col``.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values, free of missing entries.
    cat_features : list of int
        Positional indices of the text columns in ``X``.
    """
    if drop_cols is None:
        extra_cols = []
    elif isinstance(drop_cols, str):
        extra_cols = [drop_cols]
    else:
        extra_cols = list(drop_cols)

    # dropna/drop return new objects, so the caller's frame is never mutated.
    labelled = df.dropna(subset=[target_col])
    y = labelled[target_col]
    X = labelled.drop(columns=[target_col, *extra_cols]).copy()

    cat_features = []
    for position, col in enumerate(X.columns):
        column = X[col]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        # Record missingness before the cast: astype(str) can turn NaN/None
        # into the strings "nan"/"None", after which they can no longer be
        # told apart from real values.
        missing = column.isna()
        X[col] = column.astype(str).mask(missing, "NaN")
        cat_features.append(position)

    return X, y, cat_features


In [28]:
def train_catboost_cv(X, y, cat_features, model_name="model", loss_function="Logloss",
                      n_splits=5, random_state=42):
    """Cross-validate a CatBoostClassifier with stratified, shuffled K-fold splits.

    Prints the accuracy of every fold and the mean accuracy; when
    ``loss_function`` is "Logloss", ROC AUC is printed as well.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels, aligned with ``X``.
    cat_features : list of int
        Positional indices of the categorical columns in ``X``.
    model_name : str
        Label used in the printed header only.
    loss_function : str
        CatBoost loss function passed to the classifier.
    n_splits : int
        Number of cross-validation folds.
    random_state : int
        Seed for both the fold shuffling and the CatBoost models.

    Returns
    -------
    CatBoostClassifier
        The model fitted on the LAST fold only; it is not refitted on the
        full data set.

    Notes
    -----
    NOTE(review): each validation fold is passed as ``eval_set`` to the model
    that uses an overfitting detector (``od_type``/``od_wait``), and the same
    fold is then used for scoring. The printed metrics may therefore be
    optimistic; confirm whether a separate early-stopping split is wanted.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    report_auc = loss_function == "Logloss"

    fold_acc = []
    fold_auc = []
    model = None

    print(f"\n===== Training {model_name} =====")

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"\n--- Fold {fold} ---")

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)

        model = CatBoostClassifier(
            loss_function=loss_function,
            depth=4,
            learning_rate=0.03,
            iterations=600,
            l2_leaf_reg=9,
            random_seed=random_state,
            od_type='Iter',
            od_wait=30,
            verbose=False
        )

        model.fit(train_pool, eval_set=val_pool)

        # Flatten so a column-shaped prediction array is scored as a plain
        # 1-D label vector.
        preds = np.ravel(model.predict(val_pool))
        acc = accuracy_score(y_val, preds)
        fold_acc.append(acc)

        if report_auc:
            preds_proba = model.predict_proba(val_pool)[:, 1]
            auc = roc_auc_score(y_val, preds_proba)
            fold_auc.append(auc)
            print(f"Accuracy={acc:.3f} | AUC={auc:.3f}")
        else:
            print(f"Accuracy={acc:.3f}")

    print("\n===== FINAL RESULTS =====")
    print("Mean Accuracy:", np.mean(fold_acc))
    if report_auc:
        print("Mean AUC:", np.mean(fold_auc))

    return model
