## Loading Data

In [44]:
# ---------------------------
# 1) Load dependencies and data
# ---------------------------
import logging
import numpy as np
import pandas as pd
import joblib, os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt


from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.frozen import FrozenEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from scipy.stats import pearsonr


from xgboost import XGBClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from joblib import Parallel, delayed


# Logging: timestamped, INFO-level messages
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)
print("PyTorch path:", torch.__path__)

logger.info("Loading dataset...")
# Read the CSV in 100-row chunks and stitch the pieces back into one frame
chunksize = 100
list_of_dataframes = list(
    pd.read_csv('DarpaQCGenoPheno.csv', chunksize=chunksize, index_col=0)
)
df = pd.concat(list_of_dataframes)

# Sample identifiers, feature matrix (every column whose name starts with 'AX')
# and the "Status" target, all as NumPy arrays
ids = df["ID"].values
ax_columns = [column for column in df.columns if column.startswith('AX')]
X = df[ax_columns].to_numpy()
y = df["Status"].to_numpy()

# Standardise each feature column using statistics from the full dataset
scaler = StandardScaler().fit(X)
X = scaler.transform(X)


2025-09-11 10:06:33,143 - INFO - Loading dataset...


Using device: cuda
PyTorch path: ['/hpc/group/schultzlab/hs325/miniconda3/envs/gsAI/lib/python3.12/site-packages/torch']


## Testing NN probability output

In [31]:
# Build a dedicated train/test split for the PyTorch model from the raw
# (unscaled) feature columns.
#
# The full arrays get their own names (X_nn / y_nn) instead of rebinding the
# module-level X / y: the loading cell standardises X, and the tuning and CV
# cells further down read X and y, so overwriting them here would silently
# feed those cells unscaled data when the notebook is run top to bottom.
ids = df["ID"].values
X_nn = df[ax_columns].values
y_nn = df["Status"].values

# 80/20 split; the IDs are split alongside so predictions can be traced back
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X_nn, y_nn, ids, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to tensors; targets get a trailing dimension so they line up with
# the single-column output of the network
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# Shuffled mini-batches of 32 for training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [32]:
class SimpleMLP(nn.Module):
    """Single-hidden-layer perceptron with a sigmoid output.

    Architecture: Linear(input_dim -> 256) -> ReLU -> Linear(256 -> 1) -> Sigmoid.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with ``input_dim`` features."""
        super().__init__()
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the stack and return the sigmoid outputs."""
        stack = self.network
        return stack(x)

# Network on the selected device, binary cross-entropy loss, Adam at lr=1e-3
model = SimpleMLP(input_dim=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

n_epochs = 20

for epoch in range(n_epochs):
    model.train()
    batch_losses = []
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Report the mean per-batch loss for this epoch
    mean_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {mean_loss:.4f}")


Epoch 1/20, Loss: 2.8400
Epoch 2/20, Loss: 0.5109
Epoch 3/20, Loss: 0.1881
Epoch 4/20, Loss: 0.0817
Epoch 5/20, Loss: 0.0283
Epoch 6/20, Loss: 0.0102
Epoch 7/20, Loss: 0.0074
Epoch 8/20, Loss: 0.0079
Epoch 9/20, Loss: 0.0237
Epoch 10/20, Loss: 0.0634
Epoch 11/20, Loss: 0.1137
Epoch 12/20, Loss: 0.3005
Epoch 13/20, Loss: 0.2760
Epoch 14/20, Loss: 0.1159
Epoch 15/20, Loss: 0.0557
Epoch 16/20, Loss: 0.0276
Epoch 17/20, Loss: 0.0362
Epoch 18/20, Loss: 0.0068
Epoch 19/20, Loss: 0.0011
Epoch 20/20, Loss: 0.0003


In [45]:
# Score the held-out split with gradients disabled
model.eval()
with torch.no_grad():
    test_output = model(X_test_tensor.to(device))

# Drop singleton dimensions, move to host memory, threshold at 0.5
probs = test_output.squeeze().cpu().numpy()
preds = (probs > 0.5).astype(int)

# Accuracy on the thresholded classes; AUC and Pearson r on the probabilities
acc = accuracy_score(y_test, preds)
auc = roc_auc_score(y_test, probs)
r_val = pearsonr(probs, y_test)[0]

print(f"Accuracy: {acc:.3f}, AUC: {auc:.3f}, R-value: {r_val:.3f}")


Accuracy: 0.574, AUC: 0.605, R-value: 0.160


In [43]:
# One row per held-out sample: identifier, true label, predicted probability
# and the class obtained by thresholding that probability
results_df = pd.DataFrame(
    dict(ID=id_test, TrueLabel=y_test, PredProb=probs, PredClass=preds)
)
results_df


Unnamed: 0,ID,TrueLabel,PredProb,PredClass
0,P444,0,0.019979,0
1,O_1318,1,0.902031,1
2,P1334,0,0.000052,0
3,Y_520,0,0.586152,1
4,B-463,1,0.832218,1
...,...,...,...,...
467,GS822,1,0.999996,1
468,B-529,0,0.417637,0
469,P452,1,0.991505,1
470,GS1289,1,0.997775,1


## Old Training Debugging

### Hyperparameter Tuning

In [None]:
# ---------------------------
# 2) Define search spaces and run Bayesian optimization (smaller/fewer iterations)
# ---------------------------

# Short names of the estimators to tune; each one is looked up as a key in
# the dict returned by get_small_search_spaces().
models = ["LR", "RF", "GB"]

def get_small_search_spaces():
    """Return the reduced Bayesian-search configuration for each model.

    Returns:
        dict: maps a short model name ("LR", "RF", "GB") to an
        ``(estimator, search_space)`` tuple, where ``search_space`` maps a
        hyperparameter name to its search dimension. New estimator and
        dimension objects are created on every call.
    """
    logistic_entry = (
        LogisticRegression(max_iter=200, solver="saga"),
        {
            "C": Real(1e-2, 1.0, prior="log-uniform"),
            "penalty": Categorical(["l1", "l2"]),
        },
    )

    forest_entry = (
        RandomForestClassifier(n_jobs=-1),
        {
            "n_estimators": Integer(50, 200),
            "max_depth": Integer(2, 10),
        },
    )

    boosting_entry = (
        XGBClassifier(
            tree_method="hist",
            device="cuda",
            eval_metric="logloss",
        ),
        {
            "n_estimators": Integer(50, 200),
            "max_depth": Integer(2, 6),
            "learning_rate": Real(0.05, 0.3, prior="log-uniform"),
        },
    )

    spaces = {}
    spaces["LR"] = logistic_entry
    spaces["RF"] = forest_entry
    spaces["GB"] = boosting_entry
    # The MLP entry is currently disabled:
    # "MLP": (
    #     MLPClassifier(max_iter=200),
    #     {
    #         "hidden_layer_sizes": Categorical([32, 64, (32, 16), (64, 32), (128,)]),
    #         "alpha": Real(1e-5, 1e-2, prior="log-uniform"),
    #         "learning_rate_init": Real(1e-3, 1e-2, prior="log-uniform"),
    #     },
    # ),
    return spaces

def tune_model(X, y, model_name, n_iter=5, cv=3, scoring="roc_auc", random_state=None):
    """Tune one model with Bayesian optimisation and return the best estimator.

    Args:
        X: Feature matrix passed to ``BayesSearchCV.fit``.
        y: Target vector passed to ``BayesSearchCV.fit``.
        model_name: Key into the dict returned by ``get_small_search_spaces()``.
        n_iter: Number of parameter settings the search samples.
        cv: Cross-validation setting forwarded to ``BayesSearchCV``
            (defaults to the previously hard-coded 3).
        scoring: Scoring metric forwarded to ``BayesSearchCV``
            (defaults to the previously hard-coded "roc_auc").
        random_state: Seed forwarded to ``BayesSearchCV``; pass an integer to
            make the search repeatable. ``None`` keeps the former unseeded
            behaviour.

    Returns:
        The ``best_estimator_`` of the fitted search.

    Raises:
        KeyError: If ``model_name`` has no configured search space.
    """
    search_spaces = get_small_search_spaces()
    if model_name not in search_spaces:
        # Same exception type as a plain dict lookup, but names the valid options
        raise KeyError(
            f"Unknown model_name {model_name!r}; expected one of {sorted(search_spaces)}"
        )
    base_model, search_space = search_spaces[model_name]

    logger.info(f"Bayesian optimization for {model_name}")
    opt = BayesSearchCV(
        estimator=base_model,
        search_spaces=search_space,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=0,
        random_state=random_state,
    )
    opt.fit(X, y)
    logger.info(f"Best {model_name} params: {opt.best_params_}")
    return opt.best_estimator_

# Tune every model listed in `models`, keyed by its short name.
# NOTE(review): tuning runs on the full X, y, which are the same arrays the
# 3-fold CV cell later splits, so rows that end up in its test folds have
# already been seen during hyperparameter selection.
tuned_models = {name: tune_model(X, y, name) for name in models}


### 3-fold CV

In [16]:
# ---------------------------
# 3) Run simplified cross-validation (3 folds instead of 10)
# ---------------------------
# For every outer fold: hold out the test rows, split the remaining rows 80/20
# into a training part and a calibration part, fit each tuned model on the
# training part, calibrate it (isotonic) on the calibration part, and record
# the calibrated class-1 probabilities for the test rows.
skf_outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
n_outer_folds = skf_outer.get_n_splits()

results = []
all_preds = []

for fold, (train_val_idx, test_idx) in enumerate(skf_outer.split(X, y)):
    logger.info(f"Outer Fold {fold+1}/{n_outer_folds}")

    X_train_val, X_test = X[train_val_idx], X[test_idx]
    y_train_val, y_test = y[train_val_idx], y[test_idx]
    ids_test = ids[test_idx]

    X_train, X_cal, y_train, y_cal = train_test_split(
        X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=42
    )

    fold_df = pd.DataFrame({"ID": ids_test, "true_label": y_test, "fold": fold+1})

    for name, tuned_model in tuned_models.items():
        # NOTE(review): this refits the shared estimator in place, so after the
        # loop each entry of `tuned_models` holds the fit from the last fold.
        tuned_model.fit(X_train, y_train)
        frozen = FrozenEstimator(tuned_model)
        # `cv="prefit"` is deprecated in scikit-learn 1.6; wrapping the model in
        # FrozenEstimator is the replacement. `ensemble=False` keeps a single
        # calibrator fitted on the whole calibration split, as "prefit" did.
        calibrated = CalibratedClassifierCV(frozen, method="isotonic", ensemble=False)
        calibrated.fit(X_cal, y_cal)

        probs = calibrated.predict_proba(X_test)[:, 1]
        fold_df[name] = probs

        auc = roc_auc_score(y_test, probs)
        logger.info(f"{name} Fold {fold+1} AUC={auc:.3f}")

    all_preds.append(fold_df)

# Stack the per-fold frames and order the rows by sample ID
prob_df = pd.concat(all_preds, axis=0).sort_values("ID")
prob_df.head()


2025-09-03 15:50:01,640 - INFO - Outer Fold 1/3
2025-09-03 15:52:21,120 - INFO - LR Fold 1 AUC=0.619
2025-09-03 15:52:22,055 - INFO - RF Fold 1 AUC=0.583
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
2025-09-03 15:52:24,450 - INFO - GB Fold 1 AUC=0.616
2025-09-03 15:52:24,452 - INFO - Outer Fold 2/3
2025-09-03 15:54:36,086 - INFO - LR Fold 2 AUC=0.605
2025-09-03 15:54:36,873 - INFO - RF Fold 2 AUC=0.547
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
2025-09-03 15:54:39,168 - INFO - GB Fold 2 AUC=0.605
2025-09-03 15:54:39,169 - INFO - Outer Fold 3/3
2025-09-03 15:56:47,334 - INFO - LR Fold 3 AUC=0.592
2025-09-03 15:56:48,229 - INFO - RF Fold 3 AUC=0.578
Parameters: { "use_label_encoder" } are not used.

  bst.update

Unnamed: 0,ID,true_label,fold,LR,RF,GB
29,B-1,1,2,0.495726,0.53527,0.48538
0,B-1000,0,1,0.208333,0.542857,0.40678
0,B-1002,0,2,0.495726,0.53527,0.48538
1,B-1003,0,2,0.495726,0.272727,0.48538
2,B-1005,0,2,0.495726,0.53527,0.48538


### Save data

In [20]:
# ---------------------------
# 4) Save model output
# ---------------------------
# Save predictions
# The CSV export is switched off by default; set the flag to True to write the
# file. The "Saved ..." message is logged only when the file is really written,
# so the log no longer claims a save that did not happen.
save_predictions = False
if save_predictions:
    prob_df.to_csv("debug_predicted_probabilities.csv", index=False)
    logger.info("Saved debug predictions to debug_predicted_probabilities.csv")
else:
    logger.info("Skipped saving debug predictions (save_predictions is False)")

# Save models
os.makedirs("models/debug", exist_ok=True)
# The loop variable is deliberately not called `model`, so it cannot overwrite
# the PyTorch `model` defined in the NN training cell.
for name, fitted_estimator in tuned_models.items():
    path = os.path.join("models/debug/", f"{name}_debug_model.joblib")
    joblib.dump(fitted_estimator, path)
    logger.info(f"Saved {name} model to {path}")

prob_df

2025-09-03 15:58:12,351 - INFO - Saved debug predictions to debug_predicted_probabilities.csv
2025-09-03 15:58:12,367 - INFO - Saved LR model to models/debug/LR_debug_model.joblib
2025-09-03 15:58:12,422 - INFO - Saved RF model to models/debug/RF_debug_model.joblib
2025-09-03 15:58:12,433 - INFO - Saved GB model to models/debug/GB_debug_model.joblib


Unnamed: 0,ID,true_label,fold,LR,RF,GB
29,B-1,1,2,0.495726,0.535270,0.485380
0,B-1000,0,1,0.208333,0.542857,0.406780
0,B-1002,0,2,0.495726,0.535270,0.485380
1,B-1003,0,2,0.495726,0.272727,0.485380
2,B-1005,0,2,0.495726,0.535270,0.485380
...,...,...,...,...,...,...
782,Y_988,1,3,0.396552,0.432836,0.491803
785,Y_991,1,2,0.571429,0.535270,0.538462
783,Y_992,0,3,0.576642,0.531746,0.602941
784,Y_995,1,3,0.402923,0.700000,0.727273
