# Machine Learning: Tuning with Optuna
Author: Joel Enrique Díaz Villanueva   
Organisation: Universidad de Monterrey   

---

# Machine Learning

In [None]:
!pip install scikit-optimize
!pip install catboost
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


## Importing the libraries

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Read the raw CSV, then build the working frame without the 'Paciente' column
# (it is removed before any feature/label split is made).
raw_df = pd.read_csv('/content/ADNI_24month_RFECV.csv')

df = raw_df.drop(columns=['Paciente'])

In [None]:
# Force every column to a numeric dtype; entries that cannot be parsed become
# NaN (errors='coerce') and any row containing a NaN is then discarded.
numeric_df = df.apply(pd.to_numeric, errors='coerce')
df = numeric_df.dropna()

In [None]:
# 'Target' is the label; every other remaining column is used as a feature.
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
# Hold out 35% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions; the classes are imbalanced
# (the classification reports in this notebook show 243 vs 137 samples).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
    shuffle=True,
    stratify=y,
)

## Feature Scaling

In [None]:
# Learn the standardization statistics from the training split only, then apply
# the same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Baseline comparison: fit each candidate classifier on the training split and
# score it on the held-out split (ROC AUC from positive-class probabilities,
# accuracy from hard predictions).
classifiers = {
    "LogisticRegression": LogisticRegression(random_state=1),
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=1),
    "GaussianNB": GaussianNB(),
    "XGBoost": XGBClassifier(random_state=1),
    "CatBoost": CatBoostClassifier(random_state=1, verbose=0),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=1),
    "GradientBoosting": GradientBoostingClassifier(random_state=1),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True, random_state=1),
    "DecisionTree": DecisionTreeClassifier(random_state=1)
}

# These models are trained on the standardized features; all others are
# trained on the raw feature values.
SCALED_MODELS = {"LogisticRegression", "KNN", "SVC"}

results = []

for name, clf in classifiers.items():
    if name in SCALED_MODELS:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    clf.fit(train_features, y_train)
    y_proba = clf.predict_proba(test_features)[:, 1]
    y_pred = clf.predict(test_features)

    # Named `roc_auc` rather than `auc` so the `auc` function imported from
    # sklearn.metrics at the top of the notebook is not overwritten.
    roc_auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)

    results.append({
        "Model": name,
        "ROC AUC": roc_auc,
        "Accuracy": accuracy
    })

results_df = pd.DataFrame(results).sort_values("ROC AUC", ascending=False)
print(results_df)

                  Model   ROC AUC  Accuracy
2            GaussianNB  0.764008  0.721805
1          RandomForest  0.754392  0.676692
4              CatBoost  0.746676  0.691729
3               XGBoost  0.738604  0.706767
5  HistGradientBoosting  0.731481  0.661654
6      GradientBoosting  0.726496  0.676692
7                   KNN  0.712369  0.699248
8                   SVC  0.711538  0.669173
0    LogisticRegression  0.682336  0.661654
9          DecisionTree  0.620370  0.646617


## Evaluating Top Models

## CatBoost Classifier

In [None]:
def objective(trial):
    """Optuna objective for CatBoost.

    Samples one hyperparameter configuration from the trial, builds a
    CatBoostClassifier with it and scores it with 3-fold cross-validation
    on the notebook-level X and y.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean ROC AUC across the 3 cross-validation folds.
    """
    # Hyperparameters explored by the study.
    searched = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-6, 10, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-6, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 2.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise']),
    }
    # Settings held constant for every trial.
    fixed = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': False,
    }

    candidate = CatBoostClassifier(**searched, **fixed)
    fold_auc = cross_val_score(candidate, X, y, scoring='roc_auc', cv=3, n_jobs=-1)
    return fold_auc.mean()

In [None]:
# The pruner is configured on the study itself: Study.optimize() does not
# accept a `pruner` argument. The sampler gets a fixed seed so that its own
# randomness does not change between runs.
study = optuna.create_study(
    direction="maximize",
    study_name="catboost",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)

# Stop after 100 trials or 1800 seconds, whichever comes first.
study.optimize(objective, n_trials=100, timeout=1800, show_progress_bar=True)

print("Best parameters:")
print(study.best_params)
print(f"\nBest AUC: {study.best_value:.4f}")

[I 2025-06-02 05:21:37,081] A new study created in memory with name: catboost


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-02 05:21:48,403] Trial 0 finished with value: 0.7816643804298126 and parameters: {'learning_rate': 0.191698632382241, 'depth': 7, 'l2_leaf_reg': 0.2973492164651126, 'border_count': 217, 'subsample': 0.6576441002167643, 'random_strength': 5.831867681080175, 'bagging_temperature': 0.5165623490148564, 'scale_pos_weight': 1.794369143498908, 'min_data_in_leaf': 18, 'grow_policy': 'Depthwise'}. Best is trial 0 with value: 0.7816643804298126.
[I 2025-06-02 05:21:54,204] Trial 1 finished with value: 0.7839426651557623 and parameters: {'learning_rate': 0.1268871947475493, 'depth': 5, 'l2_leaf_reg': 2.915989618057166, 'border_count': 190, 'subsample': 0.6178442312202117, 'random_strength': 8.790272293240527, 'bagging_temperature': 0.027434736667191317, 'scale_pos_weight': 0.9059474156782027, 'min_data_in_leaf': 19, 'grow_policy': 'Depthwise'}. Best is trial 1 with value: 0.7839426651557623.
[I 2025-06-02 05:22:03,426] Trial 2 finished with value: 0.8022603924375261 and parameters: {'l

In [None]:
# Rebuild CatBoost with the best hyperparameters found by the study plus the
# same fixed settings used inside the objective, then fit it on all of X, y.
best_model = CatBoostClassifier(
    **study.best_params,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    # Log every 100 iterations instead of printing one line per boosting
    # iteration, which floods the cell output.
    verbose=100,
)

best_model.fit(X, y)

0:	total: 4.96ms	remaining: 4.96s
1:	total: 35.7ms	remaining: 17.8s
2:	total: 59.1ms	remaining: 19.6s
3:	total: 81.5ms	remaining: 20.3s
4:	total: 104ms	remaining: 20.7s
5:	total: 126ms	remaining: 21s
6:	total: 150ms	remaining: 21.2s
7:	total: 175ms	remaining: 21.7s
8:	total: 199ms	remaining: 21.9s
9:	total: 225ms	remaining: 22.3s
10:	total: 248ms	remaining: 22.3s
11:	total: 270ms	remaining: 22.3s
12:	total: 293ms	remaining: 22.3s
13:	total: 316ms	remaining: 22.2s
14:	total: 339ms	remaining: 22.3s
15:	total: 364ms	remaining: 22.4s
16:	total: 390ms	remaining: 22.5s
17:	total: 413ms	remaining: 22.5s
18:	total: 437ms	remaining: 22.6s
19:	total: 460ms	remaining: 22.5s
20:	total: 483ms	remaining: 22.5s
21:	total: 505ms	remaining: 22.4s
22:	total: 527ms	remaining: 22.4s
23:	total: 550ms	remaining: 22.4s
24:	total: 572ms	remaining: 22.3s
25:	total: 599ms	remaining: 22.4s
26:	total: 628ms	remaining: 22.6s
27:	total: 655ms	remaining: 22.7s
28:	total: 677ms	remaining: 22.7s
29:	total: 699ms	remai

<catboost.core.CatBoostClassifier at 0x7e7dc57e0390>

In [None]:
# NOTE: X, y are the same rows best_model was fitted on, so these are
# in-sample (training) metrics, not an estimate of generalization.
train_pred = best_model.predict(X)
# ROC AUC measures ranking quality, so it is computed from the positive-class
# probability rather than from hard 0/1 predictions.
train_proba = best_model.predict_proba(X)[:, 1]

print(f"\nROC_AUC Score: {roc_auc_score(y, train_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y, train_pred)}")


ROC_AUC Score: 1.0

Clasification Report: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       243
           1       1.00      1.00      1.00       137

    accuracy                           1.00       380
   macro avg       1.00      1.00      1.00       380
weighted avg       1.00      1.00      1.00       380



In [None]:
import joblib

# Persist the fitted model to disk; joblib.dump returns the list of files written.
model_path = "catboost optuna.pkl"
joblib.dump(best_model, model_path)

['catboost optuna.pkl']

## GaussianNB

In [None]:
def objective(trial):
    """Optuna objective for Gaussian Naive Bayes.

    Draws a value for `var_smoothing` on a log scale and scores the resulting
    model with 3-fold cross-validation on the notebook-level X and y.

    Args:
        trial: Optuna trial used to draw the hyperparameter value.

    Returns:
        Mean ROC AUC across the 3 cross-validation folds.
    """
    smoothing = trial.suggest_float('var_smoothing', 1e-10, 1e-2, log=True)

    candidate = GaussianNB(var_smoothing=smoothing)
    fold_auc = cross_val_score(candidate, X, y, scoring='roc_auc', cv=3, n_jobs=-1)
    return fold_auc.mean()

In [None]:
# Maximize the cross-validated ROC AUC returned by the objective. The sampler
# gets a fixed seed so that its own randomness does not change between runs.
study = optuna.create_study(
    direction="maximize",
    study_name="GaussianNB",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)

# Stop after 50 trials or 1800 seconds, whichever comes first.
study.optimize(objective, n_trials=50, timeout=1800, show_progress_bar=True)

print("Best parameters:")
print(study.best_params)
print(f"\nBest AUC: {study.best_value:.4f}")

[I 2025-06-02 06:02:14,239] A new study created in memory with name: GaussianNB


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-06-02 06:02:17,961] Trial 0 finished with value: 0.763118029462635 and parameters: {'var_smoothing': 0.0027556156199870503}. Best is trial 0 with value: 0.763118029462635.
[I 2025-06-02 06:02:18,053] Trial 1 finished with value: 0.7840798393670106 and parameters: {'var_smoothing': 2.385548880728004e-07}. Best is trial 1 with value: 0.7840798393670106.
[I 2025-06-02 06:02:18,105] Trial 2 finished with value: 0.763118029462635 and parameters: {'var_smoothing': 0.0008963665315736362}. Best is trial 1 with value: 0.7840798393670106.
[I 2025-06-02 06:02:18,159] Trial 3 finished with value: 0.7909802986024136 and parameters: {'var_smoothing': 6.780784551960684e-08}. Best is trial 3 with value: 0.7909802986024136.
[I 2025-06-02 06:02:18,205] Trial 4 finished with value: 0.7666428102821018 and parameters: {'var_smoothing': 4.031057475933189e-06}. Best is trial 3 with value: 0.7909802986024136.
[I 2025-06-02 06:02:18,269] Trial 5 finished with value: 0.7651080495417585 and parameters: {

In [None]:
# Rebuild GaussianNB with the best hyperparameters found by the study and fit
# it on all of X, y.
tuned_params = dict(study.best_params)
best_model = GaussianNB(**tuned_params)

best_model.fit(X, y)

In [None]:
# NOTE: X, y are the same rows best_model was fitted on, so these are
# in-sample (training) metrics, not an estimate of generalization.
train_pred = best_model.predict(X)
# ROC AUC measures ranking quality, so it is computed from the positive-class
# probability rather than from hard 0/1 predictions.
train_proba = best_model.predict_proba(X)[:, 1]

print(f"\nROC_AUC Score: {roc_auc_score(y, train_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y, train_pred)}")


ROC_AUC Score: 0.7399897870295274

Clasification Report: 

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       243
           1       0.68      0.66      0.67       137

    accuracy                           0.76       380
   macro avg       0.74      0.74      0.74       380
weighted avg       0.76      0.76      0.76       380



## Random Forest

In [None]:
def objective(trial):
    """Optuna objective for Random Forest.

    Samples one hyperparameter configuration from the trial, builds a
    RandomForestClassifier with it and scores it with 3-fold cross-validation
    on the notebook-level X and y.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Mean ROC AUC across the 3 cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
    }

    # Fixed seed: without it the forest is built with fresh randomness on every
    # call, so score differences between trials would mix hyperparameter
    # effects with seed noise.
    model = RandomForestClassifier(**params, random_state=42)

    scores = cross_val_score(model, X, y, cv=3, n_jobs=-1, scoring='roc_auc')

    return np.mean(scores)

In [None]:
# Maximize the cross-validated ROC AUC returned by the objective. The sampler
# gets a fixed seed so that its own randomness does not change between runs.
study = optuna.create_study(
    direction="maximize",
    study_name="Random Forest",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)

# Stop after 100 trials or 1800 seconds, whichever comes first.
study.optimize(objective, n_trials=100, timeout=1800, show_progress_bar=True)

print("Best parameters:")
print(study.best_params)
print(f"\nBest AUC: {study.best_value:.4f}")

[I 2025-06-02 06:09:03,110] A new study created in memory with name: Random Forest


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-02 06:09:07,960] Trial 0 finished with value: 0.8169579133615633 and parameters: {'n_estimators': 102, 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.8169579133615633.
[I 2025-06-02 06:09:09,932] Trial 1 finished with value: 0.8193674082026203 and parameters: {'n_estimators': 286, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_features': 'sqrt', 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.8193674082026203.
[I 2025-06-02 06:09:12,117] Trial 2 finished with value: 0.8245621359416314 and parameters: {'n_estimators': 443, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 17, 'max_features': 'log2', 'bootstrap': True, 'class_weight': None}. Best is trial 2 with value: 0.8245621359416314.
[I 2025-06-02 06:09:13,138] Trial 3 finished with value: 0.8159678734021192 and parameters: {'n_estimators

In [None]:
# Rebuild the Random Forest with the best hyperparameters found by the study
# and fit it on all of X, y. The seed is fixed so re-running this cell
# reproduces the same fitted model.
best_model = RandomForestClassifier(**study.best_params, random_state=42)

best_model.fit(X, y)

In [None]:
# NOTE: X, y are the same rows best_model was fitted on, so these are
# in-sample (training) metrics, not an estimate of generalization.
train_pred = best_model.predict(X)
# ROC AUC measures ranking quality, so it is computed from the positive-class
# probability rather than from hard 0/1 predictions.
train_proba = best_model.predict_proba(X)[:, 1]

print(f"\nROC_AUC Score: {roc_auc_score(y, train_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y, train_pred)}")


ROC_AUC Score: 0.7979934516836382

Clasification Report: 

              precision    recall  f1-score   support

           0       0.85      0.88      0.86       243
           1       0.77      0.72      0.74       137

    accuracy                           0.82       380
   macro avg       0.81      0.80      0.80       380
weighted avg       0.82      0.82      0.82       380

