In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, log_loss, recall_score
from tqdm import tqdm
import logging
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from optuna.logging import get_logger

Log data preperation

In [2]:
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create a custom file handler
file_handler = logging.FileHandler('optuna_logs_CatBoost.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

# Add file handler to Optuna's logger
logger = optuna.logging.get_logger('optuna')
logger.addHandler(file_handler)


Data preperation

In [4]:
# Load the dataset using Pandas
df = pd.read_excel('../../step1 (Datasets)/PHR_1.0.3b_LabResultsMVRemoved_refined_MedianImputation.xlsx')

# Specify the target column name
target_name = 'Cancer Label'

# Separate features and target
X = df.drop(columns=[target_name])
y = df[target_name]

# Encode categorical variables if needed
cat_features = []
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    cat_features.append(column)

Ultraparameters

In [5]:
# Number of random splits for HPO
HPO_splits = 10

# Number of random splits for evaluation of HPO-adjusted model
Model_evaluation_splits = 30

Optuna HPO & evaluation

In [6]:
# Function to optimize hyperparameters using Optuna
def objective(trial, state):
    param_grid = {
        'task_type' : 'GPU',
        'devices' : '2',
        'bootstrap_type' : 'Bayesian',
        'iterations': trial.suggest_int('iterations', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'od_type': 'Iter',
        'od_wait': 50,
        'verbose': False
    }

    if param_grid['bootstrap_type'] == 'Bayesian':
        param_grid['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    elif param_grid['bootstrap_type'] == 'Bernoulli':
        param_grid['subsample'] = trial.suggest_float('subsample', 0.1, 1.0)

    # Create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
    
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    
    model = CatBoostClassifier(**param_grid)
    
    # Use cross-validation to evaluate model with recall
    recall_scorer = make_scorer(recall_score)  # Recall (sensitivity) scorer
    recall = cross_val_score(model, X_train, y_train, cv=5, scoring=recall_scorer).mean()

    return recall

model_overall_best_Hparams_model_performance_metrics = []
model_overall_best_Hparams_model_auc_roc = 0
model_overall_best_Hparams_list = []
model_overall_best_Hparams_splitIndex = 0

# Main evaluation loop for finding best Hp and overall performance
for h_split in tqdm(range(HPO_splits), desc="Hyperparameter Optimization Progress"):
    print(f"Processing split state: {h_split}")
    
    # Optimize hyperparameters for the split (state)
    study = optuna.create_study(direction='maximize', sampler=TPESampler(), pruner=MedianPruner())
    study.optimize(lambda trial: objective(trial, h_split), n_trials=100)

    # Get the best hyperparameters for the split in 100 trials
    best_params = study.best_params
    print(f"Best parameters for split state {h_split}: {best_params}")

    all_metrics = []
    
    # Evaluate the split-based optimized hyperparameters 
    for e_split in tqdm(range(Model_evaluation_splits), desc="Evaluation Progress", leave=False):
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=e_split)
        
        train_pool = Pool(X_train, y_train, cat_features=cat_features)
        test_pool = Pool(X_test, y_test, cat_features=cat_features)
        
        model = CatBoostClassifier(**best_params)
        model.fit(train_pool, verbose=False)
        y_pred_test = model.predict_proba(X_test)[:, 1]
        y_pred_bin_test = model.predict(X_test)
        
        # Evaluate on the test set
        test_loss = log_loss(y_test, y_pred_test)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        auc_roc = roc_auc_score(y_test, y_pred_test)
        f1 = f1_score(y_test, model.predict(X_test))
        sensitivity = recall_score(y_test, y_pred_bin_test) 
        
        all_metrics.append((test_loss, accuracy, sensitivity, auc_roc, f1))
        
    # Calculate average metrics
    all_metrics = np.array(all_metrics)
    avg_metrics = np.mean(all_metrics, axis=0)

    if avg_metrics[2] > model_overall_best_Hparams_model_auc_roc:
        model_overall_best_Hparams_model_auc_roc = avg_metrics[2]
        model_overall_best_Hparams_list = best_params
        model_overall_best_Hparams_model_performance_metrics = avg_metrics
        model_overall_best_Hparams_splitIndex = h_split


[I 2024-11-11 17:05:31,703] Trial 64 finished with value: 0.6745292176276452 and parameters: {'iterations': 228, 'learning_rate': 0.17818099654276345, 'depth': 7, 'l2_leaf_reg': 1.5012107648391222, 'random_strength': 0.004373072099631633, 'bagging_temperature': 0.21639557540743007}. Best is trial 24 with value: 0.7100368860415454.
[I 2024-11-11 17:05:35,780] Trial 65 finished with value: 0.6666860803727431 and parameters: {'iterations': 151, 'learning_rate': 0.027349480296992468, 'depth': 5, 'l2_leaf_reg': 0.027249147332896014, 'random_strength': 0.023992740462391818, 'bagging_temperature': 0.09690071168517467}. Best is trial 24 with value: 0.7100368860415454.
[I 2024-11-11 17:05:39,310] Trial 66 finished with value: 0.6665501844302077 and parameters: {'iterations': 89, 'learning_rate': 0.19308579419557226, 'depth': 6, 'l2_leaf_reg': 0.22855334189605747, 'random_strength': 0.00712645303635379, 'bagging_temperature': 0.5677806516813173}. Best is trial 24 with value: 0.7100368860415454.


Best parameters for split state 7: {'iterations': 225, 'learning_rate': 0.16375732432868179, 'depth': 5, 'l2_leaf_reg': 0.06557838260639168, 'random_strength': 0.03154800614243475, 'bagging_temperature': 0.5507852973993383}



Evaluation Progress:   0%|                                                                                                                             | 0/30 [00:00<?, ?it/s][A
Evaluation Progress:   3%|███▉                                                                                                                 | 1/30 [00:02<01:16,  2.64s/it][A
Evaluation Progress:   7%|███████▊                                                                                                             | 2/30 [00:05<01:10,  2.50s/it][A
Evaluation Progress:  10%|███████████▋                                                                                                         | 3/30 [00:07<01:06,  2.48s/it][A
Evaluation Progress:  13%|███████████████▌                                                                                                     | 4/30 [00:09<01:03,  2.46s/it][A
Evaluation Progress:  17%|███████████████████▌                                                               

Processing split state: 8


[I 2024-11-11 17:10:32,635] Trial 0 finished with value: 0.6823719779173806 and parameters: {'iterations': 229, 'learning_rate': 0.04679245268669562, 'depth': 7, 'l2_leaf_reg': 0.7635584679804905, 'random_strength': 0.0057776532378432605, 'bagging_temperature': 0.654764691664512}. Best is trial 0 with value: 0.6823719779173806.
[I 2024-11-11 17:10:37,660] Trial 1 finished with value: 0.6589377498572244 and parameters: {'iterations': 253, 'learning_rate': 0.030557104528603742, 'depth': 4, 'l2_leaf_reg': 5.669938569960652, 'random_strength': 0.007190149417659419, 'bagging_temperature': 0.25885002620199915}. Best is trial 0 with value: 0.6823719779173806.
[I 2024-11-11 17:10:41,448] Trial 2 finished with value: 0.6823339044355606 and parameters: {'iterations': 170, 'learning_rate': 0.0669433883290083, 'depth': 4, 'l2_leaf_reg': 0.001949531919113018, 'random_strength': 0.004594363645559278, 'bagging_temperature': 0.5032289394042421}. Best is trial 0 with value: 0.6823719779173806.
[I 2024-

Best parameters for split state 8: {'iterations': 254, 'learning_rate': 0.19228438833160327, 'depth': 5, 'l2_leaf_reg': 1.4123398102304356, 'random_strength': 0.01595509086818344, 'bagging_temperature': 0.5929781150397694}



Evaluation Progress:   0%|                                                                                                                             | 0/30 [00:00<?, ?it/s][A
Evaluation Progress:   3%|███▉                                                                                                                 | 1/30 [00:03<01:35,  3.30s/it][A
Evaluation Progress:   7%|███████▊                                                                                                             | 2/30 [00:06<01:23,  2.98s/it][A
Evaluation Progress:  10%|███████████▋                                                                                                         | 3/30 [00:08<01:17,  2.87s/it][A
Evaluation Progress:  13%|███████████████▌                                                                                                     | 4/30 [00:11<01:14,  2.85s/it][A
Evaluation Progress:  17%|███████████████████▌                                                               

Processing split state: 9


[I 2024-11-11 17:22:10,831] Trial 0 finished with value: 0.6960235640648011 and parameters: {'iterations': 166, 'learning_rate': 0.11164311290511894, 'depth': 5, 'l2_leaf_reg': 0.0037138133092436236, 'random_strength': 5.401653247071952, 'bagging_temperature': 0.17391417281020782}. Best is trial 0 with value: 0.6960235640648011.
[I 2024-11-11 17:22:16,326] Trial 1 finished with value: 0.6940037870818431 and parameters: {'iterations': 275, 'learning_rate': 0.056255568711288576, 'depth': 4, 'l2_leaf_reg': 0.08719290876154115, 'random_strength': 0.0010314179218319838, 'bagging_temperature': 0.5207335651641212}. Best is trial 0 with value: 0.6960235640648011.
[I 2024-11-11 17:22:27,795] Trial 2 finished with value: 0.6590784767515253 and parameters: {'iterations': 213, 'learning_rate': 0.10620223334928942, 'depth': 9, 'l2_leaf_reg': 0.01798679395857335, 'random_strength': 0.006602010675262447, 'bagging_temperature': 0.596157397907584}. Best is trial 0 with value: 0.6960235640648011.
[I 202

Best parameters for split state 9: {'iterations': 181, 'learning_rate': 0.09684661833140838, 'depth': 4, 'l2_leaf_reg': 2.5168938138963797, 'random_strength': 0.7948227813370159, 'bagging_temperature': 0.6167656443744117}



Evaluation Progress:   0%|                                                                                                                             | 0/30 [00:00<?, ?it/s][A
Evaluation Progress:   3%|███▉                                                                                                                 | 1/30 [00:00<00:23,  1.23it/s][A
Evaluation Progress:   7%|███████▊                                                                                                             | 2/30 [00:01<00:22,  1.27it/s][A
Evaluation Progress:  10%|███████████▋                                                                                                         | 3/30 [00:02<00:21,  1.24it/s][A
Evaluation Progress:  13%|███████████████▌                                                                                                     | 4/30 [00:03<00:21,  1.23it/s][A
Evaluation Progress:  17%|███████████████████▌                                                               

In [8]:
print(f"Best Hp is from split: {model_overall_best_Hparams_splitIndex}")
print(f"Best Hp are: {model_overall_best_Hparams_list}")
print(f"Best Hp tuned model performance on {Model_evaluation_splits} splits: {model_overall_best_Hparams_model_performance_metrics}")


Best Hp is from split: 6
Best Hp are: {'iterations': 241, 'learning_rate': 0.12880007267833182, 'depth': 5, 'l2_leaf_reg': 2.344766822412313, 'random_strength': 0.015809299525362315, 'bagging_temperature': 0.4928820155688858}
Best Hp tuned model performance on 30 splits: [0.40056181 0.83679525 0.72614741 0.90103372 0.76865519]
