In [1]:
# ---------------------------
# 1) Load dependencies and data
# ---------------------------
import logging
import numpy as np
import pandas as pd
import joblib, os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.frozen import FrozenEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from joblib import Parallel, delayed


# Logging: timestamped, INFO-level messages shared by every cell below
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Report which torch device is available in this environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("PyTorch path:", torch.__path__ )

# Read the CSV in small row chunks and stitch the pieces back into one frame
logger.info("Loading dataset...")
chunksize = 100
list_of_dataframes = list(
    pd.read_csv('DarpaQCGenoPheno.csv', chunksize=chunksize, index_col=0)
)
df = pd.concat(list_of_dataframes)

# Sample identifiers, feature matrix (every column whose name starts with
# 'AX') and target vector (the "Status" column), all as NumPy arrays
ids = df["ID"].values
ax_columns = list(filter(lambda col: col.startswith('AX'), df.columns))
X = df[ax_columns].to_numpy()
y = df["Status"].to_numpy()

# Standardize every feature column.
# NOTE(review): the scaler is fit on all rows before the cross-validation
# split performed later in this file, so held-out rows contribute to the
# scaling statistics — consider fitting the scaler inside each fold.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Keys of the model families that the later cells tune and evaluate
models = ["LR", "RF", "GB", "MLP"]


2025-09-03 10:43:40,105 - INFO - Loading dataset...


Using device: cuda
PyTorch path: ['/hpc/group/schultzlab/hs325/miniconda3/envs/gsAI/lib/python3.12/site-packages/torch']


In [None]:
# ---------------------------
# 2) Define search spaces and run Bayesian optimization (smaller/fewer iterations)
# ---------------------------

def get_small_search_spaces(random_state=42, xgb_device=None):
    """Build the (estimator, search space) pair for each model family.

    Parameters
    ----------
    random_state : int or None, default=42
        Seed handed to every base estimator so that repeated runs of the
        notebook produce the same fits. Pass ``None`` to leave them unseeded.
    xgb_device : str or None, default=None
        Value for XGBoost's ``device`` parameter. When ``None`` it is set to
        ``"cuda"`` if ``torch.cuda.is_available()`` and ``"cpu"`` otherwise,
        so the notebook also runs on machines without a GPU.
        NOTE(review): torch's CUDA check is only a proxy for whether the
        installed XGBoost build can use the GPU — pass the value explicitly
        if the two disagree.

    Returns
    -------
    dict
        Maps "LR", "RF", "GB" and "MLP" to a tuple of
        ``(unfitted estimator, dict of skopt search dimensions)``.
    """
    if xgb_device is None:
        xgb_device = "cuda" if torch.cuda.is_available() else "cpu"

    return {
        "LR": (
            LogisticRegression(max_iter=200, solver="saga", random_state=random_state),
            {
                "C": Real(1e-2, 1.0, prior="log-uniform"),
                "penalty": Categorical(["l1", "l2"]),
            },
        ),
        "RF": (
            RandomForestClassifier(n_jobs=-1, random_state=random_state),
            {
                "n_estimators": Integer(50, 200),
                "max_depth": Integer(2, 10),
            },
        ),
        "GB": (
            # `use_label_encoder` is intentionally not passed: XGBoost
            # releases that accept `device=` no longer use it and only warn
            # about an unused parameter.
            XGBClassifier(
                tree_method="hist",
                device=xgb_device,
                eval_metric="logloss",
                random_state=random_state,
            ),
            {
                "n_estimators": Integer(50, 200),
                "max_depth": Integer(2, 6),
                "learning_rate": Real(0.05, 0.3, prior="log-uniform"),
            },
        ),
        "MLP": (
            MLPClassifier(max_iter=200, random_state=random_state),
            {
                "hidden_layer_sizes": Categorical([(32,), (64,)]),
                "alpha": Real(1e-5, 1e-2, prior="log-uniform"),
                "learning_rate_init": Real(1e-3, 1e-2, prior="log-uniform"),
            },
        ),
    }

def tune_model(X, y, model_name, n_iter=5, cv=3, scoring="roc_auc", n_jobs=-1, random_state=42):
    """Tune one model family with Bayesian optimization.

    Parameters
    ----------
    X, y : array-like
        Features and labels passed straight to ``BayesSearchCV.fit``.
    model_name : str
        Key into the dictionary returned by ``get_small_search_spaces()``.
    n_iter : int, default=5
        Number of parameter settings the search evaluates.
    cv : int or CV splitter, default=3
        Cross-validation strategy used inside the search.
    scoring : str, default="roc_auc"
        Metric the search maximizes.
    n_jobs : int, default=-1
        Parallelism of the search.
    random_state : int or None, default=42
        Seed for the optimizer so the sequence of sampled settings is
        repeatable; ``None`` restores unseeded behaviour.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the configured model families.
    """
    search_spaces = get_small_search_spaces()
    if model_name not in search_spaces:
        # Same exception type as a plain dict lookup, but with a usable message.
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(search_spaces)}"
        )
    base_model, search_space = search_spaces[model_name]

    logger.info(f"Bayesian optimization for {model_name}")
    opt = BayesSearchCV(
        estimator=base_model,
        search_spaces=search_space,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=0,
        random_state=random_state,
    )
    opt.fit(X, y)
    logger.info(f"Best {model_name} params: {opt.best_params_}")
    return opt.best_estimator_

# Tune every configured model family once, keyed by its short name
tuned_models = dict(
    (model_key, tune_model(X, y, model_key)) for model_key in models
)


2025-09-03 11:43:43,259 - INFO - Bayesian optimization for LR


In [None]:
# ---------------------------
# 3) Run simplified cross-validation (3 folds instead of 10)
# ---------------------------
# Outer stratified CV: each fold is split into train / calibration / test.
# NOTE(review): the hyperparameters in `tuned_models` were selected on the
# full (X, y) in the tuning cell, so the test folds here are not independent
# of model selection — the AUCs below are optimistic estimates.
skf_outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
n_outer_folds = skf_outer.get_n_splits()

results = []    # one {"fold", "model", "auc"} record per fold and model
all_preds = []  # one DataFrame of calibrated test-fold probabilities per fold

for fold, (train_val_idx, test_idx) in enumerate(skf_outer.split(X, y)):
    logger.info(f"Outer Fold {fold+1}/{n_outer_folds}")

    X_train_val, X_test = X[train_val_idx], X[test_idx]
    y_train_val, y_test = y[train_val_idx], y[test_idx]
    ids_test = ids[test_idx]

    # Hold out 20% of the non-test rows purely for probability calibration
    X_train, X_cal, y_train, y_cal = train_test_split(
        X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=42
    )

    fold_df = pd.DataFrame({"ID": ids_test, "true_label": y_test, "fold": fold+1})

    for name, tuned_model in tuned_models.items():
        # Fit a fresh copy with the tuned hyperparameters. Refitting the
        # shared object in place would leave `tuned_models` holding
        # estimators trained only on the last fold's training split.
        fold_model = clone(tuned_model)
        fold_model.fit(X_train, y_train)

        # Wrapping in FrozenEstimator keeps the fitted model untouched while
        # the calibrator is learned on the calibration split; this replaces
        # the deprecated cv="prefit" option.
        # NOTE(review): with the default cv the calibration split needs at
        # least 5 rows of each class — confirm for small datasets.
        calibrated = CalibratedClassifierCV(FrozenEstimator(fold_model), method="isotonic")
        calibrated.fit(X_cal, y_cal)

        probs = calibrated.predict_proba(X_test)[:, 1]
        fold_df[name] = probs

        auc = roc_auc_score(y_test, probs)
        results.append({"fold": fold + 1, "model": name, "auc": auc})
        logger.info(f"{name} Fold {fold+1} AUC={auc:.3f}")

    all_preds.append(fold_df)

# Stack the per-fold tables into one out-of-fold prediction table
prob_df = pd.concat(all_preds, axis=0).sort_values("ID")
prob_df.head()


In [None]:
# ---------------------------
# 4) Save model output
# ---------------------------
# Save predictions
# Write the per-sample predicted probabilities to CSV (row index omitted)
prob_df.to_csv("debug_predicted_probabilities.csv", index=False)
logger.info("Saved debug predictions to debug_predicted_probabilities.csv")

# Serialize each entry of `tuned_models` into the "models" directory.
# NOTE(review): what is written is whatever fitted state each estimator has
# when this cell runs; any earlier cell that calls .fit() on these objects in
# place changes what gets saved.
os.makedirs("models", exist_ok=True)
for name in tuned_models:
    model = tuned_models[name]
    path = os.path.join("models", f"{name}_debug_model.joblib")
    joblib.dump(model, path)
    logger.info(f"Saved {name} model to {path}")
