In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, log_loss, recall_score
from tqdm import tqdm
import logging
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from optuna.logging import get_logger

Log data preperation

In [2]:
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create a custom file handler
file_handler = logging.FileHandler('optuna_logs_XGBoost.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

# Add file handler to Optuna's logger
logger = optuna.logging.get_logger('optuna')
logger.addHandler(file_handler)

Data preperation

In [3]:
# Load the dataset using Pandas
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved.xlsx')
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved_MICEimputed.xlsx')
#df = pd.read_excel('../step1 (Datasets)/PHR_1.0.2b_MICEimputed.xlsx')
df = pd.read_excel('../../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved_refined_MedianImputation.xlsx')

# Specify the target column name
target_name = 'Cancer Label'

# Separate features and target
X = df.drop(columns=[target_name])
y = df[target_name]

# Encode categorical variables if needed
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

Ultiparameters

In [4]:
# Number of random splits for HPO
HPO_splits = 10

# Number of random splits for evaluation of HPO-adjusted model
Model_evaluation_splits = 30

Optuna HPO & evaluation

In [None]:
# Function to optimize hyperparameters using Optuna
def objective(trial, state):
    param_grid = {
        'gpu_id': 1,  # Using GPU with device ID 2
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss', 'rf']),  # LightGBM's boosting types
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # Depth of trees
        'learning_rate': trial.suggest_float('learning_rate', 0.008, 0.2),  # Learning rate
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),  # Number of trees
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # Minimum child weight for splits
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Fraction of samples for building each tree
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),  # Fraction of features for each tree
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 1.0, log=True),  # L1 regularization
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 1.0, log=True),  # L2 regularization
        'max_bin': trial.suggest_int('max_bin', 100, 500),  # Max number of bins for feature discretization
        'objective': 'binary',  # Binary classification task
        'metric': 'binary_logloss',  # Loss metric for binary classification
        'verbose': -1 # Suppress logging
    }

    # Create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    
    # Create the LightGBM classifier & train & predict
    model = lgb.LGBMClassifier(**param_grid)
    
    # Use cross-validation to evaluate model with recall
    recall_scorer = make_scorer(recall_score)  # Recall (sensitivity) scorer
    recall = cross_val_score(model, X_train, y_train, cv=5, scoring=recall_scorer).mean()

    return recall


model_overall_best_Hparams_model_performance_metrics = []
model_overall_best_Hparams_model_recall = 0
model_overall_best_Hparams_list = []
model_overall_best_Hparams_splitIndex = 0

# Main evaluation loop for finding best Hp and overall performance
for h_split in tqdm(range(HPO_splits), desc="Hyperparameter Optimization Progress"):
    print(f"Processing split state: {h_split}")
    
    # Optimize hyperparameters for the split (state)
    study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
    study.optimize(lambda trial: objective(trial, h_split), n_trials=100)

    # Get the best hyperparameters for the split in 100 trials
    best_params = study.best_params
    print(f"Best parameters for split state {h_split}: {best_params}")

    all_metrics = []
    
    # Evaluate the split-based optimized hyperparameters 
    for e_split in tqdm(range(Model_evaluation_splits), desc="Evaluation Progress", leave=False):
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=e_split)
        
        # Create the LightGBM classifier & train & predict
        model = lgb.LGBMClassifier(**best_params)
        model.fit(X_train, y_train)
        y_pred_test = model.predict_proba(X_test)[:, 1]
        y_pred_bin_test = model.predict(X_test)
        
        # Evaluate on the test set
        test_loss = log_loss(y_test, y_pred_test)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        auc_roc = roc_auc_score(y_test, y_pred_test)
        f1 = f1_score(y_test, model.predict(X_test))
        sensitivity = recall_score(y_test, y_pred_bin_test)  
        
        all_metrics.append((test_loss, accuracy, sensitivity, auc_roc, f1))
        
    # Calculate average metrics
    all_metrics = np.array(all_metrics)
    avg_metrics = np.mean(all_metrics, axis=0)

    if avg_metrics[2] > model_overall_best_Hparams_model_recall:
        model_overall_best_Hparams_model_auc_roc = avg_metrics[2]
        model_overall_best_Hparams_list = best_params
        model_overall_best_Hparams_model_performance_metrics = avg_metrics
        model_overall_best_Hparams_splitIndex = h_split



In [6]:
print(f"Best Hp is from split: {model_overall_best_Hparams_splitIndex}")
print(f"Best Hp are: {model_overall_best_Hparams_list}")
print(f"Best Hp tuned model performance on {Model_evaluation_splits} splits: {model_overall_best_Hparams_model_performance_metrics}")


Best Hp is from split: 9
Best Hp are: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.11874834409580293, 'n_estimators': 262, 'min_child_weight': 10, 'subsample': 0.7692529288310478, 'colsample_bytree': 0.7086822061982037, 'lambda_l1': 0.7504480117007168, 'lambda_l2': 0.25455736309470217, 'max_bin': 306}
Best Hp tuned model performance on 30 splits: [0.39617785 0.82987141 0.73518969 0.90047873 0.76332499]
