In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, log_loss, recall_score
from tqdm import tqdm
import logging
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from optuna.logging import get_logger

Log data preperation

In [2]:
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create a custom file handler
file_handler = logging.FileHandler('optuna_logs_XGBoost.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

# Add file handler to Optuna's logger
logger = optuna.logging.get_logger('optuna')
logger.addHandler(file_handler)

Data preperation

In [3]:
# Load the dataset using Pandas
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved.xlsx')
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved_MICEimputed.xlsx')
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.2b_MICEimputed.xlsx')
df = pd.read_excel('../../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved_refined_MedianImputation.xlsx')

# Specify the target column name
target_name = 'Cancer Label'

# Separate features and target
X = df.drop(columns=[target_name])
y = df[target_name]

# Encode categorical variables if needed
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

Ultiparameters

In [4]:
# Number of random splits for HPO
HPO_splits = 10

# Number of random splits for evaluation of HPO-adjusted model
Model_evaluation_splits = 30

Optuna HPO & evaluation

In [5]:
# Function to optimize hyperparameters using Optuna
def objective(trial, state):
    param_grid = {
        'gpu_id': 0,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.008, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'gamma': trial.suggest_float('gamma', 0.1, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0, log=True),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'verbose':0
    }

    # Create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    
    model = xgb.XGBClassifier(**param_grid)
    
    # Use cross-validation to evaluate model with recall
    recall_scorer = make_scorer(recall_score)  # Recall (sensitivity) scorer
    recall = cross_val_score(model, X_train, y_train, cv=5, scoring=recall_scorer).mean()

    return recall


model_overall_best_Hparams_model_performance_metrics = []
model_overall_best_Hparams_model_auc_roc = 0
model_overall_best_Hparams_list = []
model_overall_best_Hparams_splitIndex = 0

# Main evaluation loop for finding best Hp and overall performance
for h_split in tqdm(range(HPO_splits), desc="Hyperparameter Optimization Progress"):
    print(f"Processing split state: {h_split}")
    
    # Optimize hyperparameters for the split (state)
    study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
    study.optimize(lambda trial: objective(trial, h_split), n_trials=100)

    # Get the best hyperparameters for the split in 100 trials
    best_params = study.best_params
    print(f"Best parameters for split state {h_split}: {best_params}")

    all_metrics = []
    
    # Evaluate the split-based optimized hyperparameters 
    for e_split in tqdm(range(Model_evaluation_splits), desc="Evaluation Progress", leave=False):
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=e_split)
        
        model = xgb.XGBClassifier(**best_params)
        model.fit(X_train, y_train)
        y_pred_test = model.predict_proba(X_test)[:, 1]
        y_pred_bin_test = model.predict(X_test)
        
        # Evaluate on the test set
        test_loss = log_loss(y_test, y_pred_test)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        auc_roc = roc_auc_score(y_test, y_pred_test)
        f1 = f1_score(y_test, model.predict(X_test))
        sensitivity = recall_score(y_test, y_pred_bin_test) 
        
        all_metrics.append((test_loss, accuracy, sensitivity, auc_roc, f1))
        
    # Calculate average metrics
    all_metrics = np.array(all_metrics)
    avg_metrics = np.mean(all_metrics, axis=0)

    if avg_metrics[2] > model_overall_best_Hparams_model_auc_roc:
        model_overall_best_Hparams_model_auc_roc = avg_metrics[2]
        model_overall_best_Hparams_list = best_params
        model_overall_best_Hparams_model_performance_metrics = avg_metrics
        model_overall_best_Hparams_splitIndex = h_split



[I 2024-11-11 17:57:30,716] Trial 47 finished with value: 0.7227856090889965 and parameters: {'booster': 'dart', 'max_depth': 5, 'learning_rate': 0.17438575777437146, 'n_estimators': 252, 'min_child_weight': 10, 'subsample': 0.603616472937582, 'colsample_bytree': 0.7600706908966082, 'gamma': 0.3886445345631777, 'lambda': 0.030544878931287336, 'alpha': 0.6245300509138169}. Best is trial 46 with value: 0.7473385230380812.
Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

[I 2024-11-11 17:57:59,023] Trial 48 finished with value: 0.7391752577319588 and parameters: {'booster': 'dart', 'max_depth': 4, 'learning_rate': 0.18209698757423268, 'n_estimators': 238, 'min_child_weight': 10, 'subsample': 0.7378348562812174, 'colsample_bytree': 0.6403658941340229, 'gamma': 0.4404622916768427, 'lambda': 0.024479908188896615, 'alpha': 0.88667574566209

Best parameters for split state 9: {'booster': 'dart', 'max_depth': 9, 'learning_rate': 0.16757526880337048, 'n_estimators': 271, 'min_child_weight': 10, 'subsample': 0.6462807491201441, 'colsample_bytree': 0.899034861491886, 'gamma': 0.5134323315817606, 'lambda': 0.33831616398436576, 'alpha': 0.11680506336726469}



Evaluation Progress:   0%|                                                                                                                             | 0/30 [00:00<?, ?it/s][A
Evaluation Progress:   3%|███▉                                                                                                                 | 1/30 [00:22<10:38, 22.03s/it][A
Evaluation Progress:   7%|███████▊                                                                                                             | 2/30 [00:45<10:33, 22.62s/it][A
Evaluation Progress:  10%|███████████▋                                                                                                         | 3/30 [01:07<10:11, 22.64s/it][A
Evaluation Progress:  13%|███████████████▌                                                                                                     | 4/30 [01:31<10:00, 23.11s/it][A
Evaluation Progress:  17%|███████████████████▌                                                               

In [6]:
print(f"Best Hp is from split: {model_overall_best_Hparams_splitIndex}")
print(f"Best Hp are: {model_overall_best_Hparams_list}")
print(f"Best Hp tuned model performance on {Model_evaluation_splits} splits: {model_overall_best_Hparams_model_performance_metrics}")


Best Hp is from split: 9
Best Hp are: {'booster': 'dart', 'max_depth': 9, 'learning_rate': 0.16757526880337048, 'n_estimators': 271, 'min_child_weight': 10, 'subsample': 0.6462807491201441, 'colsample_bytree': 0.899034861491886, 'gamma': 0.5134323315817606, 'lambda': 0.33831616398436576, 'alpha': 0.11680506336726469}
Best Hp tuned model performance on 30 splits: [0.41344806 0.82947577 0.74320587 0.89839639 0.76482592]
