Package Import

In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, log_loss, recall_score
from tqdm import tqdm
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

User Configuration

In [23]:
# Specify your datset location and label column name
dataset = "Your dataset location"
label_column = 'Your label column name'

# Specify the number of random splits for HPO
HPO_splits = 10
# Specify the number of random splits for evaluation of HPO-adjusted model
Model_evaluation_splits = 30

# Dynamic ranges for LightGBM hyperparameters
boosting_type_options = ['gbdt', 'dart', 'goss', 'rf']
max_depth_range = (3, 10)
learning_rate_range = (0.008, 0.2)
n_estimators_range = (50, 300)
min_child_weight_range = (1, 10)
subsample_range = (0.5, 1.0)
colsample_bytree_range = (0.3, 1.0)
lambda_l1_range = (1e-3, 1.0)
lambda_l2_range = (1e-3, 1.0)
max_bin_range = (100, 500)
metric = 'binary_logloss'
verbose = -1
gpu_id = 1

Data Preperation

In [19]:
# Load the dataset using Pandas
df = pd.read_excel(dataset)

# Separate features and target
X = df.drop(columns=[label_column])
y = df[label_column]

# Encode categorical variables if needed
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

Optuna HPO initiation

In [None]:
# Function to optimize hyperparameters using Optuna
def objective(trial, state):
    
    param_grid = {
        'gpu_id': gpu_id,  # Specify GPU ID
        'boosting_type': trial.suggest_categorical('boosting_type', boosting_type_options),  # LightGBM's boosting types
        'max_depth': trial.suggest_int('max_depth', *max_depth_range),  # Use unpacked range
        'learning_rate': trial.suggest_float('learning_rate', *learning_rate_range),
        'n_estimators': trial.suggest_int('n_estimators', *n_estimators_range),
        'min_child_weight': trial.suggest_int('min_child_weight', *min_child_weight_range),
        'subsample': trial.suggest_float('subsample', *subsample_range),
        'colsample_bytree': trial.suggest_float('colsample_bytree', *colsample_bytree_range),
        'lambda_l1': trial.suggest_float('lambda_l1', lambda_l1_range[0], lambda_l1_range[1], log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', lambda_l2_range[0], lambda_l2_range[1], log=True),
        'max_bin': trial.suggest_int('max_bin', *max_bin_range),
        'objective': 'binary',
        'metric': metric,
        'verbose': verbose  # Suppress logging
    }

    # Create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    
    model = lgb.LGBMClassifier(**param_grid)
    
    # Use cross-validation to evaluate model with recall
    recall_scorer = make_scorer(recall_score)  # Recall (sensitivity) scorer
    recall = cross_val_score(model, X_train, y_train, cv=5, scoring=recall_scorer).mean()

    return recall


model_overall_best_Hparams_model_performance_metrics = []
model_overall_best_Hparams_model_sensitivity = 0
model_overall_best_Hparams_list = []
model_overall_best_Hparams_splitIndex = 0

# Specify metric names for printing
metric_labels = ['Log Loss', 'Accuracy', 'Sensitivity (Recall)', 'AUC-ROC', 'F1-Score']

# Main evaluation loop for finding best Hp and overall performance
for h_split in tqdm(range(HPO_splits), desc="Hyperparameter Optimization Progress"):
    print(f"Processing split state: {h_split}")
    
    # Optimize hyperparameters for the split (state)
    study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
    study.optimize(lambda trial: objective(trial, h_split), n_trials=100)

    # Get the best hyperparameters for the split in 100 trials
    best_params = study.best_params
    print(f"Best parameters for split state {h_split}: {best_params}")

    all_metrics = []
    
    # Evaluate the split-based optimized hyperparameters 
    for e_split in tqdm(range(Model_evaluation_splits), desc="Evaluation Progress", leave=False):
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=e_split)
        
        model = lgb.LGBMClassifier(**best_params)
        model.fit(X_train, y_train)
        y_pred_test = model.predict_proba(X_test)[:, 1]
        y_pred_bin_test = model.predict(X_test)
        
        # Evaluate on the test set
        test_loss = log_loss(y_test, y_pred_test)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        auc_roc = roc_auc_score(y_test, y_pred_test)
        f1 = f1_score(y_test, model.predict(X_test))
        sensitivity = recall_score(y_test, y_pred_bin_test) 
        
        all_metrics.append((test_loss, accuracy, sensitivity, auc_roc, f1))
        
    # Calculate average metrics
    all_metrics = np.array(all_metrics)
    avg_metrics = np.mean(all_metrics, axis=0)

    if avg_metrics[2] > model_overall_best_Hparams_model_sensitivity:
        model_overall_best_Hparams_model_sensitivity = avg_metrics[2]
        model_overall_best_Hparams_list = best_params
        model_overall_best_Hparams_model_performance_metrics = avg_metrics
        model_overall_best_Hparams_splitIndex = h_split

Result of HPO

In [None]:
print("============================== LightGBM HPO results ==============================")
print(f"Best Hp is from split: {model_overall_best_Hparams_splitIndex}")
print(f"Best Hp are: {model_overall_best_Hparams_list}")
print("\nBest HP Tuned Model Performance:")
for label, value in zip(metric_labels, model_overall_best_Hparams_model_performance_metrics):
    print(f"{label}: {value:.4f}")