### Import packages and standardize data

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from scipy.stats import pearsonr
from sklearn.frozen import FrozenEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb
from xgboost import XGBClassifier
import multiprocessing
import logging
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [3]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print(torch.__path__ )



Using device: cuda
['/hpc/group/schultzlab/hs325/miniconda3/envs/gsAI/lib/python3.12/site-packages/torch']


In [4]:
### Update with joined geno-pheno data

chunksize = 100

list_of_dataframes = []

for df in pd.read_csv('DarpaQCGenoPheno.csv', chunksize=chunksize, index_col=0):
    list_of_dataframes.append(df)

result = pd.concat(list_of_dataframes)
df = result
df

Unnamed: 0,ID,AX-574114010,AX-564298109,AX-574114011,AX-577073921,AX-564298209,AX-574114024,AX-576891682,AX-564298228,AX-576891683,...,AX-568138441,AX-574093181,AX-574093191,AX-574093227,Pop,Plate,Status,WWt,Group,Generation
0,B-1000,-1,-1,-1,-1,-1,-1,-1,1,1,...,0,-1,0,0,Training,RU11,0,,F0,F0
1,B-1002,-1,-1,-1,1,1,1,1,1,1,...,0,0,-1,0,Training,RU10,0,,F0,F0
2,B-1003,-1,-1,0,-1,0,0,0,0,0,...,0,-1,-1,1,Training,RU10,0,,F0,F0
3,B-1005,-1,-1,1,1,0,0,0,0,0,...,1,-1,0,1,Training,RU11,0,,F0,F0
4,B-1006,-1,-1,1,1,1,1,1,1,1,...,0,-1,-1,1,Training,RU10,1,,F0,F0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,Y_988,-1,-1,0,0,-1,-1,0,0,0,...,-1,0,0,1,Training,RU28,1,12.73,FLGS23,F2
2353,Y_991,0,0,-1,-1,-1,-1,-1,1,1,...,-1,-1,0,1,Training,RU28,1,11.30,FLGS23,F2
2354,Y_992,-1,-1,0,1,-1,0,0,0,0,...,1,0,0,0,Training,RU28,0,9.48,FLGS23,F2
2355,Y_995,-1,-1,-1,-1,-1,-1,-1,1,1,...,0,0,0,1,Training,RU28,1,7.81,FLGS23,F2


In [5]:
ax_columns = [col for col in df.columns if col.startswith('AX')]
# len(ax_columns)
X = df[ax_columns]
y = df["Status"]
X = X.to_numpy()
y = y.to_numpy()

scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[-0.81869812, -0.76517567, -0.94206249, ..., -1.01229399,
         1.03438912, -1.87421104],
       [-0.81869812, -0.76517567, -0.94206249, ...,  0.40793037,
        -0.76491358, -1.87421104],
       [-0.81869812, -0.76517567,  0.58611458, ..., -1.01229399,
        -0.76491358,  0.43017885],
       ...,
       [-0.81869812, -0.76517567,  0.58611458, ...,  0.40793037,
         1.03438912, -1.87421104],
       [-0.81869812, -0.76517567, -0.94206249, ...,  0.40793037,
         1.03438912,  0.43017885],
       [ 1.8782571 ,  2.16499337, -0.94206249, ...,  1.82815473,
        -0.76491358, -1.87421104]])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

## XGB hp tuning

In [14]:
class ProgressPrinter:
    def __init__(self, total_iter):
        self.total_iter = total_iter
        self.current_iter = 0

    def __call__(self, optim_result):
        self.current_iter += 1
        print(f"Iteration {self.current_iter}/{self.total_iter} finished")
        return False  # continue optimization
        
if __name__ == "__main__":
    print("Starting Bayesian optimization for XGBoost HP tuning")

    clf = XGBClassifier(
        random_state=8,
        tree_method="hist",  # efficient with many features
        n_jobs=multiprocessing.cpu_count() // 2,
        use_label_encoder=False,
        eval_metric="logloss"
    )

    # Param search space
    search_space = {
        'max_depth': Integer(2, 8),
        'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
        'subsample': Real(0.5, 1.0),
        'colsample_bytree': Real(0.5, 1.0),
        'colsample_bylevel': Real(0.5, 1.0),
        'colsample_bynode': Real(0.5, 1.0),
        'reg_alpha': Real(0.0, 10.0),
        'reg_lambda': Real(0.0, 10.0),
        'gamma': Real(0.0, 10.0),
        'n_estimators': Integer(100, 1000)
    }


    # 3-fold CV with Bayesian optimization
    opt = BayesSearchCV(
        estimator=clf,
        cv=3,
        search_spaces=search_space,
        n_iter=25, #adjust as needed
        scoring='roc_auc',
        n_jobs=2,
        random_state=8,
        verbose=0  
    )

    print("Fitting BayesSearchCV...")
    opt.fit(X_train, y_train, callback=[ProgressPrinter(n_iter)])

    print(f"Best cross-validated AUC: {opt.best_score_:.4f}")
    print(f"Best parameters: {opt.best_params_}")

    # Evaluate on held-out test set
    y_pred_proba = opt.best_estimator_.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Test AUC: {test_auc:.4f}")

    best_model = opt.best_estimator_


Starting Bayesian optimization for XGBoost HP tuning
Fitting BayesSearchCV...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 1/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 2/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 3/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 4/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 5/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 6/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 7/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 8/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 9/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 10/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 11/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 12/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 13/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 14/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 15/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 16/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 17/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 18/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 19/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 20/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 21/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 22/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 23/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 24/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Iteration 25/50 finished


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best cross-validated AUC: 0.6074
Best parameters: OrderedDict({'colsample_bylevel': 0.5331080642783614, 'colsample_bynode': 0.7262320733948379, 'colsample_bytree': 0.5362052518135637, 'gamma': 7.436578137661654, 'learning_rate': 0.0011002938624638172, 'max_depth': 5, 'n_estimators': 551, 'reg_alpha': 1.964788283871643, 'reg_lambda': 0.5172779921798499, 'subsample': 0.8441395984251803})
Test AUC: 0.6726


In [None]:
## now: test best model 10x StratifiedKFold
'''
Best cross-validated AUC: 0.6074
Best parameters: OrderedDict({'colsample_bylevel': 0.5331080642783614, 'colsample_bynode': 0.7262320733948379, 'colsample_bytree': 0.5362052518135637, 'gamma': 7.436578137661654, 'learning_rate': 0.0011002938624638172, 'max_depth': 5, 'n_estimators': 551, 'reg_alpha': 1.964788283871643, 'reg_lambda': 0.5172779921798499, 'subsample': 0.8441395984251803})
Test AUC: 0.6726
'''
