# Machine Learning: Bayesian Search
Author: Joel Enrique Díaz Villanueva   
Organisation: Universidad de Monterrey   

---

# Machine Learning

In [None]:
!pip install scikit-optimize
!pip install catboost

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


## Importing the libraries

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Read the dataset and discard the 'Paciente' column in one chained step
# (presumably a patient identifier — confirm; it is not used as a feature).
df = pd.read_csv('/content/ADNI_24month_RFECV.csv').drop(columns=['Paciente'])

In [None]:
# Convert every column to a numeric dtype; values that cannot be parsed become
# NaN (errors='coerce'), and any row containing a NaN is then discarded.
df = df.apply(pd.to_numeric, errors='coerce').dropna()

In [None]:
# 'Target' is the label; every remaining column is a feature.
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
# Hold out 35% of the rows for testing; the fixed seed keeps the shuffled
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    shuffle=True,
    random_state=42,
)

## Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Baseline models, all with (mostly) default hyper-parameters, keyed by display name.
classifiers = {
    "LogisticRegression": LogisticRegression(random_state=1),
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=1),
    "GaussianNB": GaussianNB(),
    "XGBoost": XGBClassifier(random_state=1),
    "CatBoost": CatBoostClassifier(random_state=1, verbose=0),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=1),
    "GradientBoosting": GradientBoostingClassifier(random_state=1),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True, random_state=1),
    "DecisionTree": DecisionTreeClassifier(random_state=1)
}

# These models are trained and evaluated on the standardised features;
# every other model uses the raw features.
scaled_models = {"LogisticRegression", "KNN", "SVC"}

results = []

for name, clf in classifiers.items():
    # Pick the matching train/test matrices once instead of duplicating the
    # fit/predict code in two branches.
    if name in scaled_models:
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test

    clf.fit(X_fit, y_train)
    y_proba = clf.predict_proba(X_eval)[:, 1]  # probability of the positive class
    y_pred = clf.predict(X_eval)

    # The scores are stored directly in the record: binding the AUC to a
    # variable called `auc` would overwrite the `auc` function imported from
    # sklearn.metrics at the top of the notebook.
    results.append({
        "Model": name,
        "ROC AUC": roc_auc_score(y_test, y_proba),
        "Accuracy": accuracy_score(y_test, y_pred),
    })

# One row per model, best test ROC AUC first.
results_df = pd.DataFrame(results).sort_values("ROC AUC", ascending=False)
print(results_df)

                  Model   ROC AUC  Accuracy
2            GaussianNB  0.764008  0.721805
1          RandomForest  0.754392  0.676692
4              CatBoost  0.746676  0.691729
3               XGBoost  0.738604  0.706767
5  HistGradientBoosting  0.731481  0.661654
6      GradientBoosting  0.726496  0.676692
7                   KNN  0.712369  0.699248
8                   SVC  0.711538  0.669173
0    LogisticRegression  0.682336  0.661654
9          DecisionTree  0.620370  0.646617


## Evaluating Top Models

## CatBoost Classifier

In [None]:
# Hyper-parameter ranges explored by the Bayesian search for CatBoost.
catboost_space = {
    'learning_rate': Real(0.005, 0.5, prior='log-uniform'),  # sampled on a log scale
    'depth': Integer(4, 10),  # tree depth
    'l2_leaf_reg': Real(1e-6, 10, prior='log-uniform'),  # L2 regularisation strength
    'border_count': Integer(32, 255),  # number of splits for numerical features
    'subsample': Real(0.6, 1.0),  # fraction of the data sampled for each tree
    'random_strength': Real(1e-6, 10),  # randomness added to split scoring
    'bagging_temperature': Real(0.0, 1.0),  # Bayesian bootstrap setting
    'scale_pos_weight': Real(0.5, 2.0),  # weight of the positive class
    'min_data_in_leaf': Integer(1, 20),  # minimum samples per leaf
    'grow_policy': Categorical(['SymmetricTree', 'Depthwise'])  # tree growth strategy
}

# Fixed (non-searched) settings shared by every candidate.
catboost_base = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    thread_count=-1,
    verbose=1
)

# 50 search iterations, each scored by 5-fold cross-validated ROC AUC; the best
# configuration is refitted on the full training split (refit=True).
# NOTE(review): thread_count=-1 on the estimator combined with n_jobs=-1 on the
# search may oversubscribe the CPU — confirm this is intended.
catboost = BayesSearchCV(
    estimator=catboost_base,
    search_spaces=catboost_space,
    scoring='roc_auc',
    cv=5,
    n_iter=50,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=3
)

# verbose=10 is forwarded to the estimator's fit() as a fit parameter.
catboost.fit(X_train, y_train, verbose=10)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
# ROC AUC measures how well the model ranks samples, so it must be computed
# from the positive-class probabilities. Feeding it hard 0/1 predictions
# collapses the curve to a single threshold and understates the score.
catboost_test_proba = catboost.predict_proba(X_test)[:, 1]
catboost_test_pred = catboost.predict(X_test)

print("Best parameters:", catboost.best_params_)
print("\nBest validation AUC:", catboost.best_score_)
print(f"\nROC_AUC Score: {roc_auc_score(y_test, catboost_test_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y_test, catboost_test_pred)}")

Best parameters: OrderedDict([('bagging_temperature', 1.0), ('border_count', 32), ('depth', 7), ('grow_policy', 'Depthwise'), ('l2_leaf_reg', 10.0), ('learning_rate', 0.005), ('min_data_in_leaf', 1), ('random_strength', 10.0), ('scale_pos_weight', 1.4905428631727293), ('subsample', 1.0)])

Best validation AUC: 0.8516154188948306

ROC_AUC Score: 0.6842355175688509

Clasification Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        81
           1       0.62      0.62      0.62        52

    accuracy                           0.70       133
   macro avg       0.68      0.68      0.68       133
weighted avg       0.70      0.70      0.70       133



## Random Forest Classifier

In [None]:
# Hyper-parameter ranges explored by the Bayesian search for the random forest.
rf_space = {
    'n_estimators': Integer(50, 500),          # number of trees
    'max_depth': Integer(3, 20),               # maximum depth of each tree
    'min_samples_split': Integer(2, 20),       # samples needed to split a node
    'min_samples_leaf': Integer(1, 20),        # samples needed at a leaf
    'max_features': Categorical(['sqrt', 'log2', None]),  # features tried per split
    'bootstrap': Categorical([True, False]),   # bootstrap sampling on/off
    'class_weight': Categorical(['balanced', None])  # optional class re-weighting
}

# Fixed (non-searched) settings shared by every candidate.
rf_base = RandomForestClassifier(
    random_state=42,
    verbose=1,
    n_jobs=-1
)

# 100 search iterations, each scored by 5-fold cross-validated ROC AUC; the best
# configuration is refitted on the full training split (refit=True).
# NOTE(review): n_jobs=-1 on both the forest and the search nests two levels of
# parallelism, and verbose=1 on the forest produces the joblib progress lines
# seen on every later predict call — confirm both are intended.
rf = BayesSearchCV(
    estimator=rf_base,
    search_spaces=rf_space,
    scoring='roc_auc',
    cv=5,
    n_iter=100,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=3
)

rf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.2s finished


In [None]:
# ROC AUC measures how well the model ranks samples, so it must be computed
# from the positive-class probabilities. Feeding it hard 0/1 predictions
# collapses the curve to a single threshold and understates the score.
rf_test_proba = rf.predict_proba(X_test)[:, 1]
rf_test_pred = rf.predict(X_test)

print("Best parameters:", rf.best_params_)
print("\nBest validation AUC:", rf.best_score_)
print(f"\nROC_AUC Score: {roc_auc_score(y_test, rf_test_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y_test, rf_test_pred)}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s


Best parameters: OrderedDict([('bootstrap', True), ('class_weight', 'balanced'), ('max_depth', 4), ('max_features', 'sqrt'), ('min_samples_leaf', 8), ('min_samples_split', 20), ('n_estimators', 500)])

Best validation AUC: 0.8476158645276293

ROC_AUC Score: 0.7384852801519469


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s



Clasification Report: 

              precision    recall  f1-score   support

           0       0.81      0.77      0.78        81
           1       0.66      0.71      0.69        52

    accuracy                           0.74       133
   macro avg       0.73      0.74      0.73       133
weighted avg       0.75      0.74      0.75       133



[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


## GaussianNB

In [None]:
# GaussianNB exposes a single tunable here: the variance-smoothing term,
# sampled on a log scale between 1e-10 and 1e-2.
gnb_space = {
    'var_smoothing': Real(1e-10, 1e-2, prior='log-uniform')
}

gnb_base = GaussianNB()

# 100 search iterations, each scored by 5-fold cross-validated ROC AUC.
gnb = BayesSearchCV(
    estimator=gnb_base,
    search_spaces=gnb_space,
    scoring='roc_auc',
    cv=5,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

gnb.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
# ROC AUC measures how well the model ranks samples, so it must be computed
# from the positive-class probabilities. Feeding it hard 0/1 predictions
# collapses the curve to a single threshold and understates the score.
gnb_test_proba = gnb.predict_proba(X_test)[:, 1]
gnb_test_pred = gnb.predict(X_test)

print("Best parameters:", gnb.best_params_)
print("\nBest validation AUC:", gnb.best_score_)
print(f"\nROC_AUC Score: {roc_auc_score(y_test, gnb_test_proba)}")
print(f"\nClasification Report: \n\n{classification_report(y_test, gnb_test_pred)}")

Best parameters: OrderedDict([('var_smoothing', 1.0044439876998996e-10)])

Best validation AUC: 0.8450868983957219

ROC_AUC Score: 0.7061965811965811

Clasification Report: 

              precision    recall  f1-score   support

           0       0.77      0.78      0.77        81
           1       0.65      0.63      0.64        52

    accuracy                           0.72       133
   macro avg       0.71      0.71      0.71       133
weighted avg       0.72      0.72      0.72       133



## Voting Classifier w/ best models

### Training with X_train and y_train

In [None]:
# Members of the ensemble: the best configuration found by each search.
tuned_members = [
    ('catboost', catboost.best_estimator_),
    ('rf', rf.best_estimator_),
    ('gnb', gnb.best_estimator_),
]

# Soft voting combines the members' predicted class probabilities.
# The ensemble is fitted on the training split only.
eclf = VotingClassifier(estimators=tuned_members, voting='soft').fit(X_train, y_train)

0:	total: 2.84ms	remaining: 2.83s
1:	total: 5.5ms	remaining: 2.75s
2:	total: 8.75ms	remaining: 2.91s
3:	total: 18.3ms	remaining: 4.56s
4:	total: 21.8ms	remaining: 4.35s
5:	total: 24.1ms	remaining: 4s
6:	total: 27.5ms	remaining: 3.9s
7:	total: 29.9ms	remaining: 3.71s
8:	total: 33.9ms	remaining: 3.73s
9:	total: 37ms	remaining: 3.66s
10:	total: 39.8ms	remaining: 3.58s
11:	total: 42.2ms	remaining: 3.47s
12:	total: 45ms	remaining: 3.42s
13:	total: 47.8ms	remaining: 3.37s
14:	total: 50.5ms	remaining: 3.32s
15:	total: 53.3ms	remaining: 3.28s
16:	total: 56.3ms	remaining: 3.25s
17:	total: 63.8ms	remaining: 3.48s
18:	total: 66.7ms	remaining: 3.44s
19:	total: 69.2ms	remaining: 3.39s
20:	total: 71.8ms	remaining: 3.35s
21:	total: 74.7ms	remaining: 3.32s
22:	total: 77.9ms	remaining: 3.31s
23:	total: 80.5ms	remaining: 3.27s
24:	total: 83.4ms	remaining: 3.25s
25:	total: 86.8ms	remaining: 3.25s
26:	total: 89.6ms	remaining: 3.23s
27:	total: 93.1ms	remaining: 3.23s
28:	total: 96.5ms	remaining: 3.23s
29:	

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.9s finished


In [None]:
model_scores = []

# Score every fitted member of the ensemble on the held-out test split.
for name, model in eclf.named_estimators_.items():
    if not hasattr(model, "predict_proba"):
        continue  # ROC AUC needs probability estimates

    y_proba = model.predict_proba(X_test)[:, 1]
    # Bound to `roc_auc`, not `auc`, so the `auc` function imported from
    # sklearn.metrics at the top of the notebook is not overwritten.
    roc_auc = roc_auc_score(y_test, y_proba)
    score = model.score(X_test, y_test)
    model_scores.append((name, roc_auc, score))
    print(f"{name}: AUC = {roc_auc:.4f} | Accuracy = {score:.4f}")

# Rank members by test ROC AUC, best first.
model_scores.sort(key=lambda x: x[1], reverse=True)

print("\nTop AUC models:")
for i, (name, roc_auc, score) in enumerate(model_scores[:3], 1):
    print(f"{i}. {name}: AUC = {roc_auc:.4f} | Accuracy = {score:.4f}")

# Finally score the soft-voting ensemble itself on the same test split.
y_proba_ensemble = eclf.predict_proba(X_test)[:, 1]
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
print(f"\nVotingClassifier: AUC = {auc_ensemble:.4f}")
print(f"\nClasification Report: \n\n{classification_report(y_test,eclf.predict(X_test))}")

catboost: AUC = 0.7566 | Accuracy = 0.6992


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[

rf: AUC = 0.7804 | Accuracy = 0.7444
gnb: AUC = 0.7671 | Accuracy = 0.7218

Top AUC models:
1. rf: AUC = 0.7804 | Accuracy = 0.7444
2. gnb: AUC = 0.7671 | Accuracy = 0.7218
3. catboost: AUC = 0.7566 | Accuracy = 0.6992

VotingClassifier: AUC = 0.7723

Clasification Report: 

              precision    recall  f1-score   support

           0       0.77      0.78      0.77        81
           1       0.65      0.63      0.64        52

    accuracy                           0.72       133
   macro avg       0.71      0.71      0.71       133
weighted avg       0.72      0.72      0.72       133



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


### Training with all data

In [None]:
# Alias for the complete feature matrix (all rows, before the train/test
# split); used below when refitting the selected models on all data.
X_all = X

In [None]:
# Same three tuned members as before, this time fitted on every available row
# (X, y) rather than on the training split alone.
all_data_members = [
    ('catboost', catboost.best_estimator_),
    ('rf', rf.best_estimator_),
    ('gnb', gnb.best_estimator_),
]

eclf_all_data = VotingClassifier(estimators=all_data_members, voting='soft').fit(X, y)

0:	total: 9.61ms	remaining: 9.6s
1:	total: 28ms	remaining: 14s
2:	total: 38.9ms	remaining: 12.9s
3:	total: 48.1ms	remaining: 12s
4:	total: 58.4ms	remaining: 11.6s
5:	total: 77.7ms	remaining: 12.9s
6:	total: 86.2ms	remaining: 12.2s
7:	total: 92ms	remaining: 11.4s
8:	total: 100ms	remaining: 11s
9:	total: 109ms	remaining: 10.8s
10:	total: 112ms	remaining: 10.1s
11:	total: 117ms	remaining: 9.61s
12:	total: 126ms	remaining: 9.6s
13:	total: 137ms	remaining: 9.63s
14:	total: 147ms	remaining: 9.67s
15:	total: 157ms	remaining: 9.66s
16:	total: 170ms	remaining: 9.82s
17:	total: 186ms	remaining: 10.1s
18:	total: 192ms	remaining: 9.91s
19:	total: 196ms	remaining: 9.62s
20:	total: 201ms	remaining: 9.36s
21:	total: 205ms	remaining: 9.11s
22:	total: 211ms	remaining: 8.98s
23:	total: 223ms	remaining: 9.07s
24:	total: 233ms	remaining: 9.08s
25:	total: 244ms	remaining: 9.16s
26:	total: 256ms	remaining: 9.21s
27:	total: 266ms	remaining: 9.24s
28:	total: 282ms	remaining: 9.43s
29:	total: 291ms	remaining: 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.0s finished


In [None]:
# CAVEAT: eclf_all_data was fitted on the full dataset (X, y), which contains
# the X_test rows. The scores below are therefore computed on data the models
# have already seen; they are optimistic and are NOT an estimate of
# generalisation performance.
print("NOTE: X_test rows were part of the training data for these models; "
      "the scores below are optimistic.\n")

model_scores = []

for name, model in eclf_all_data.named_estimators_.items():
    if not hasattr(model, "predict_proba"):
        continue  # ROC AUC needs probability estimates

    y_proba = model.predict_proba(X_test)[:, 1]
    # Bound to `roc_auc`, not `auc`, so the `auc` function imported from
    # sklearn.metrics at the top of the notebook is not overwritten.
    roc_auc = roc_auc_score(y_test, y_proba)
    score = model.score(X_test, y_test)
    model_scores.append((name, roc_auc, score))
    print(f"{name}: AUC = {roc_auc:.4f} | Accuracy = {score:.4f}")

# Rank members by ROC AUC, best first.
model_scores.sort(key=lambda x: x[1], reverse=True)

print("\nTop 5 AUC models:")
for i, (name, roc_auc, score) in enumerate(model_scores[:5], 1):
    print(f"{i}. {name}: AUC = {roc_auc:.4f} | Accuracy = {score:.4f}")

y_proba_ensemble = eclf_all_data.predict_proba(X_test)[:, 1]
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
print(f"\nVotingClassifier: AUC = {auc_ensemble:.4f}")
print(f"\nClasification Report: \n\n{classification_report(y_test,eclf_all_data.predict(X_test))}")

catboost: AUC = 0.9684 | Accuracy = 0.8797


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s


rf: AUC = 0.8796 | Accuracy = 0.7820
gnb: AUC = 0.7804 | Accuracy = 0.7293

Top 5 AUC models:
1. catboost: AUC = 0.9684 | Accuracy = 0.8797
2. rf: AUC = 0.8796 | Accuracy = 0.7820
3. gnb: AUC = 0.7804 | Accuracy = 0.7293

VotingClassifier: AUC = 0.8894

Clasification Report: 

              precision    recall  f1-score   support

           0       0.83      0.79      0.81        81
           1       0.70      0.75      0.72        52

    accuracy                           0.77       133
   macro avg       0.76      0.77      0.77       133
weighted avg       0.78      0.77      0.78       133



[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


Random Forest is superior to the VotingClassifier model on the held-out test set (AUC 0.7804 vs. 0.7723 when both are trained on the training split only).

## Training the best model individually with all data

In [None]:
# Take an unfitted copy of the tuned random forest (same hyper-parameters)
# and train it on every available row.
best_model_rf = clone(rf.best_estimator_).fit(X_all, y)
best_model_rf

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.2s finished


In [None]:
# Take an unfitted copy of the tuned CatBoost model (same hyper-parameters)
# and train it on every available row.
best_model_catboost = clone(catboost.best_estimator_).fit(X_all, y)
best_model_catboost

0:	total: 3.55ms	remaining: 3.55s
1:	total: 6.99ms	remaining: 3.49s
2:	total: 18.6ms	remaining: 6.17s
3:	total: 21.6ms	remaining: 5.38s
4:	total: 24.6ms	remaining: 4.9s
5:	total: 28ms	remaining: 4.63s
6:	total: 31ms	remaining: 4.4s
7:	total: 33.9ms	remaining: 4.21s
8:	total: 37.4ms	remaining: 4.12s
9:	total: 40.5ms	remaining: 4s
10:	total: 43.4ms	remaining: 3.9s
11:	total: 46.6ms	remaining: 3.83s
12:	total: 49.7ms	remaining: 3.77s
13:	total: 52.7ms	remaining: 3.71s
14:	total: 55.9ms	remaining: 3.67s
15:	total: 58.9ms	remaining: 3.62s
16:	total: 62.3ms	remaining: 3.6s
17:	total: 65.6ms	remaining: 3.58s
18:	total: 69.4ms	remaining: 3.58s
19:	total: 73.3ms	remaining: 3.59s
20:	total: 76.8ms	remaining: 3.58s
21:	total: 80.1ms	remaining: 3.56s
22:	total: 82.4ms	remaining: 3.5s
23:	total: 85.8ms	remaining: 3.49s
24:	total: 88.8ms	remaining: 3.46s
25:	total: 92.1ms	remaining: 3.45s
26:	total: 95.6ms	remaining: 3.44s
27:	total: 98.8ms	remaining: 3.43s
28:	total: 102ms	remaining: 3.41s
29:	tota

<catboost.core.CatBoostClassifier at 0x7eaa7d84d250>

## Saving the most important models

In [None]:
import joblib

# Output files for the two models that were refitted on all data.
rf_model_path = "model rf.pkl"
catboost_model_path = "model catboost.pkl"

# Persist both models with joblib (pickle-based; only load these files from a
# trusted source).
joblib.dump(best_model_rf, rf_model_path)
joblib.dump(best_model_catboost, catboost_model_path)

['model catboost.pkl']

Bibliography

- VotingClassifier. (n. a.). Scikit-learn. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html