# Machine Learning: Bayesian Search
Author: Joel Enrique Díaz Villanueva   
Organisation: Universidad de Monterrey   

---

# Machine Learning

In [None]:
!pip install scikit-optimize
!pip install catboost

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


## Importing the libraries

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Read the dataset and discard the 'Paciente' column
# (presumably a patient identifier - it is not used as a model input).
df = pd.read_csv('/content/ADNI_24month_RFECV.csv').drop(columns='Paciente')

In [None]:
# Force every column to a numeric dtype: values that cannot be parsed become
# NaN, and every row that contains a NaN is then dropped.
df = df.apply(pd.to_numeric, errors='coerce').dropna()

In [None]:
# Separate the label column ('Target') from the predictor columns.
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
# Hold out 35% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    shuffle=True,
    random_state=42,
)

## Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
# NOTE: this cell overwrites X_train / X_test, so re-running it without first
# re-running the split cell would fit the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Voting Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Baseline ensemble: ten classifiers with (mostly) default hyper-parameters.
# Seeds are fixed wherever the estimator accepts one so the run is repeatable.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = XGBClassifier(random_state=1)
# verbose=0: without it CatBoost prints one log line per boosting iteration,
# which buries the rest of the notebook output.
clf5 = CatBoostClassifier(random_state=1, verbose=0)
clf6 = HistGradientBoostingClassifier(random_state=1)
clf7 = GradientBoostingClassifier(random_state=1)
clf8 = KNeighborsClassifier()
# probability=True gives SVC a predict_proba method, which soft voting needs.
clf9 = SVC(probability=True, random_state=1)
clf10 = DecisionTreeClassifier(random_state=1)

# Soft voting: the ensemble combines the members' predicted class probabilities.
eclf1 = VotingClassifier(
    estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3), ('xgb', clf4), ('cb', clf5),
        ('hg', clf6), ('gb', clf7), ('kn', clf8), ('svc', clf9), ('dt', clf10),
    ],
    voting='soft',
)
eclf1 = eclf1.fit(X_train, y_train)

Learning rate set to 0.00567
0:	learn: 0.6898276	total: 53.9ms	remaining: 53.8s
1:	learn: 0.6859822	total: 67.3ms	remaining: 33.6s
2:	learn: 0.6828586	total: 76.3ms	remaining: 25.4s
3:	learn: 0.6795196	total: 90.7ms	remaining: 22.6s
4:	learn: 0.6760737	total: 101ms	remaining: 20s
5:	learn: 0.6728911	total: 118ms	remaining: 19.6s
6:	learn: 0.6694395	total: 134ms	remaining: 19s
7:	learn: 0.6651664	total: 145ms	remaining: 17.9s
8:	learn: 0.6614987	total: 153ms	remaining: 16.9s
9:	learn: 0.6574267	total: 162ms	remaining: 16.1s
10:	learn: 0.6537826	total: 167ms	remaining: 15s
11:	learn: 0.6500563	total: 171ms	remaining: 14.1s
12:	learn: 0.6470263	total: 175ms	remaining: 13.3s
13:	learn: 0.6439815	total: 179ms	remaining: 12.6s
14:	learn: 0.6409097	total: 183ms	remaining: 12s
15:	learn: 0.6376325	total: 187ms	remaining: 11.5s
16:	learn: 0.6345401	total: 191ms	remaining: 11s
17:	learn: 0.6317588	total: 195ms	remaining: 10.6s
18:	learn: 0.6282911	total: 202ms	remaining: 10.4s
19:	learn: 0.62510

In [None]:
# Score every fitted member of the baseline ensemble on the held-out split,
# collecting (name, AUC, accuracy) triples.
model_scores = []

for label, fitted in eclf1.named_estimators_.items():
    if not hasattr(fitted, "predict_proba"):
        continue  # AUC below is computed from class probabilities
    test_auc = roc_auc_score(y_test, fitted.predict_proba(X_test)[:, 1])
    test_acc = fitted.score(X_test, y_test)
    model_scores.append((label, test_auc, test_acc))
    print(f"{label}: AUC = {test_auc:.4f} | Accuracy = {test_acc:.4f}")

# Rank the members from highest to lowest AUC.
model_scores = sorted(model_scores, key=lambda row: row[1], reverse=True)

print("\nTop 5 AUC models:")
for rank, (label, test_auc, test_acc) in enumerate(model_scores[:5], start=1):
    print(f"{rank}. {label}: AUC = {test_auc:.4f} | Accuracy = {test_acc:.4f}")

# Finally, score the soft-voting ensemble itself.
y_proba_ensemble = eclf1.predict_proba(X_test)[:, 1]
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
print(f"\nVotingClassifier: AUC = {auc_ensemble:.4f}")
print(f"\nClasification Report: \n\n{classification_report(y_test,eclf1.predict(X_test))}")

lr: AUC = 0.6823 | Accuracy = 0.6617
rf: AUC = 0.7542 | Accuracy = 0.6767
gnb: AUC = 0.7614 | Accuracy = 0.7068
xgb: AUC = 0.7386 | Accuracy = 0.7068
cb: AUC = 0.7467 | Accuracy = 0.6917
hg: AUC = 0.7315 | Accuracy = 0.6617
gb: AUC = 0.7265 | Accuracy = 0.6767
kn: AUC = 0.7124 | Accuracy = 0.6992
svc: AUC = 0.7115 | Accuracy = 0.6692
dt: AUC = 0.6204 | Accuracy = 0.6466

Top 5 AUC models:
1. gnb: AUC = 0.7614 | Accuracy = 0.7068
2. rf: AUC = 0.7542 | Accuracy = 0.6767
3. cb: AUC = 0.7467 | Accuracy = 0.6917
4. xgb: AUC = 0.7386 | Accuracy = 0.7068
5. hg: AUC = 0.7315 | Accuracy = 0.6617

VotingClassifier: AUC = 0.7493

Clasification Report: 

              precision    recall  f1-score   support

           0       0.71      0.79      0.75        81
           1       0.60      0.50      0.55        52

    accuracy                           0.68       133
   macro avg       0.66      0.65      0.65       133
weighted avg       0.67      0.68      0.67       133



## Extreme Gradient Boosting Classifier

In [None]:
# Search space explored by the Bayesian optimiser for XGBoost.
xgb_space = dict(
    learning_rate=Real(0.01, 0.3, prior='log-uniform'),
    max_depth=Integer(3, 8),
    # Sampling fractions (rows per tree, columns per tree / level / node).
    subsample=Real(0.6, 1.0, prior='uniform'),
    colsample_bytree=Real(0.6, 1.0, prior='uniform'),
    colsample_bylevel=Real(0.6, 1.0, prior='uniform'),
    colsample_bynode=Real(0.6, 1.0, prior='uniform'),
    gamma=Real(0, 5),
    # L1 / L2 regularisation strengths, searched on a log scale.
    reg_alpha=Real(1e-6, 10, prior='log-uniform'),
    reg_lambda=Real(1e-6, 10, prior='log-uniform'),
    n_estimators=Integer(100, 500),
    min_child_weight=Integer(1, 5),      # overfitting control
    scale_pos_weight=Real(0.5, 2.0),     # class-imbalance handling
)

# 100 optimisation steps, each scored by 5-fold cross-validated ROC AUC;
# refit=True retrains the best configuration on the full training split.
xgb = BayesSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=False,
        random_state=42,
    ),
    search_spaces=xgb_space,
    scoring='roc_auc',
    cv=5,
    n_iter=100,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=3,
)

xgb.fit(X_train, y_train, verbose=10)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
# Summary of the tuned XGBoost search: best hyper-parameters, cross-validated
# AUC, and performance on the held-out test split.
print(f"Best parameters: {xgb.best_params_}")
print(f"Best AUC on validation: {xgb.best_score_:.4f}")
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; feeding it the thresholded labels from predict() understates it.
print(f"\nROC_AUC Score: {roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1])}")
print(f"\nClasification Report: \n\n{classification_report(y_test, xgb.predict(X_test))}")

Best parameters: OrderedDict([('colsample_bylevel', 0.6), ('colsample_bynode', 1.0), ('colsample_bytree', 0.6), ('gamma', 0.0), ('learning_rate', 0.024768610656106062), ('max_depth', 8), ('min_child_weight', 5), ('n_estimators', 500), ('reg_alpha', 1e-06), ('reg_lambda', 10.0), ('scale_pos_weight', 0.7838099137227368), ('subsample', 0.6)])
Best AUC on validation: 0.8465

ROC_AUC Score: 0.6381766381766383

Clasification Report: 

              precision    recall  f1-score   support

           0       0.70      0.81      0.75        81
           1       0.62      0.46      0.53        52

    accuracy                           0.68       133
   macro avg       0.66      0.64      0.64       133
weighted avg       0.67      0.68      0.67       133



## CatBoost Classifier

In [None]:
# Search space explored by the Bayesian optimiser for CatBoost.
catboost_space = dict(
    learning_rate=Real(0.005, 0.5, prior='log-uniform'),
    depth=Integer(4, 10),                                # tree depth
    l2_leaf_reg=Real(1e-6, 10, prior='log-uniform'),     # L2 regularisation strength
    border_count=Integer(32, 255),                       # splits for numerical features
    subsample=Real(0.6, 1.0),                            # fraction of rows per tree
    random_strength=Real(1e-6, 10),                      # randomness added to split scores
    bagging_temperature=Real(0.0, 1.0),                  # Bayesian bootstrap control
    scale_pos_weight=Real(0.5, 2.0),                     # class-imbalance handling
    min_data_in_leaf=Integer(1, 20),
    grow_policy=Categorical(['SymmetricTree', 'Depthwise']),
)

# 50 optimisation steps, each scored by 5-fold cross-validated ROC AUC;
# refit=True retrains the best configuration on the full training split.
catboost = BayesSearchCV(
    estimator=CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        thread_count=-1,
        random_seed=42,
        verbose=1,
    ),
    search_spaces=catboost_space,
    scoring='roc_auc',
    cv=5,
    n_iter=50,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=3,
)

catboost.fit(X_train, y_train, verbose=10)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
# Summary of the tuned CatBoost search: best hyper-parameters, cross-validated
# AUC, and performance on the held-out test split.
print("Best parameters:", catboost.best_params_)
print("\nBest validation AUC:", catboost.best_score_)
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; feeding it the thresholded labels from predict() understates it.
print(f"\nROC_AUC Score: {roc_auc_score(y_test, catboost.predict_proba(X_test)[:, 1])}")
print(f"\nClasification Report: \n\n{classification_report(y_test, catboost.predict(X_test))}")

Best parameters: OrderedDict([('bagging_temperature', 0.0), ('border_count', 255), ('depth', 4), ('grow_policy', 'Depthwise'), ('l2_leaf_reg', 10.0), ('learning_rate', 0.005), ('min_data_in_leaf', 7), ('random_strength', 8.726294865741208), ('scale_pos_weight', 2.0), ('subsample', 1.0)])

Best validation AUC: 0.8511252228163994

ROC_AUC Score: 0.7199667616334283

Clasification Report: 

              precision    recall  f1-score   support

           0       0.80      0.73      0.76        81
           1       0.63      0.71      0.67        52

    accuracy                           0.72       133
   macro avg       0.71      0.72      0.71       133
weighted avg       0.73      0.72      0.72       133



## Random Forest Classifier

In [None]:
# Search space explored by the Bayesian optimiser for the random forest.
rf_space = {
    'n_estimators': Integer(50, 500),          # Number of trees in the forest
    'max_depth': Integer(3, 20),               # Maximum tree depth
    'min_samples_split': Integer(2, 20),       # Minimum samples required to split a node
    'min_samples_leaf': Integer(1, 20),        # Minimum samples required at a leaf node
    'max_features': Categorical(['sqrt', 'log2', None]),  # Features considered per split
    'bootstrap': Categorical([True, False]),   # Whether bootstrap samples are used
    'class_weight': Categorical(['balanced', None])  # Class-imbalance handling
}

# 100 optimisation steps, each scored by 5-fold cross-validated ROC AUC;
# refit=True retrains the best configuration on the full training split.
rf = BayesSearchCV(
    # The estimator is left at its default verbosity: with verbose=1 the tuned
    # forest (and every clone of it) prints joblib progress lines on each
    # fit/predict call, which get interleaved with the evaluation reports.
    estimator=RandomForestClassifier(
        random_state=42,
        n_jobs=-1
    ),
    search_spaces=rf_space,
    n_iter=100,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=3,
    random_state=42,
    refit=True
)

rf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.8s finished


In [None]:
# Summary of the tuned random-forest search: best hyper-parameters,
# cross-validated AUC, and performance on the held-out test split.
print("Best parameters:", rf.best_params_)
print("\nBest validation AUC:", rf.best_score_)
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; feeding it the thresholded labels from predict() understates it.
print(f"\nROC_AUC Score: {roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])}")
print(f"\nClasification Report: \n\n{classification_report(y_test, rf.predict(X_test))}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s


Best parameters: OrderedDict([('bootstrap', True), ('class_weight', 'balanced'), ('max_depth', 5), ('max_features', 'sqrt'), ('min_samples_leaf', 8), ('min_samples_split', 2), ('n_estimators', 500)])

Best validation AUC: 0.8476938502673796

ROC_AUC Score: 0.7384852801519469


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s



Clasification Report: 

              precision    recall  f1-score   support

           0       0.81      0.77      0.78        81
           1       0.66      0.71      0.69        52

    accuracy                           0.74       133
   macro avg       0.73      0.74      0.73       133
weighted avg       0.75      0.74      0.75       133



[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


## GaussianNB

In [None]:
# Only var_smoothing is tuned for GaussianNB; it is searched on a log scale.
gnb_space = dict(var_smoothing=Real(1e-10, 1e-2, prior='log-uniform'))

# 100 optimisation steps, each scored by 5-fold cross-validated ROC AUC.
gnb = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=gnb_space,
    scoring='roc_auc',
    cv=5,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

gnb.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Summary of the tuned GaussianNB search: best hyper-parameters,
# cross-validated AUC, and performance on the held-out test split.
print("Best parameters:", gnb.best_params_)
print("\nBest validation AUC:", gnb.best_score_)
# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; feeding it the thresholded labels from predict() understates it.
print(f"\nROC_AUC Score: {roc_auc_score(y_test, gnb.predict_proba(X_test)[:, 1])}")
print(f"\nClasification Report: \n\n{classification_report(y_test, gnb.predict(X_test))}")

Best parameters: OrderedDict([('var_smoothing', 1.9091131576909444e-07)])

Best validation AUC: 0.841488413547237

ROC_AUC Score: 0.686965811965812

Clasification Report: 

              precision    recall  f1-score   support

           0       0.75      0.78      0.76        81
           1       0.63      0.60      0.61        52

    accuracy                           0.71       133
   macro avg       0.69      0.69      0.69       133
weighted avg       0.70      0.71      0.71       133



## Voting Classifier w/ best models

### Training with X_train and y_train

In [None]:
# Soft-voting ensemble built from the best estimator found by each Bayesian
# search, fitted on the training split.
eclf = VotingClassifier(
    estimators=[
        (label, search.best_estimator_)
        for label, search in (
            ('xgb', xgb),
            ('catboost', catboost),
            ('rf', rf),
            ('gnb', gnb),
        )
    ],
    voting='soft',
).fit(X_train, y_train)

0:	total: 6.78ms	remaining: 6.77s
1:	total: 9.8ms	remaining: 4.89s
2:	total: 18.9ms	remaining: 6.3s
3:	total: 27.3ms	remaining: 6.79s
4:	total: 36.7ms	remaining: 7.31s
5:	total: 40.8ms	remaining: 6.76s
6:	total: 49.3ms	remaining: 6.99s
7:	total: 57.3ms	remaining: 7.1s
8:	total: 66.2ms	remaining: 7.29s
9:	total: 70ms	remaining: 6.93s
10:	total: 79.2ms	remaining: 7.12s
11:	total: 87.3ms	remaining: 7.18s
12:	total: 94.5ms	remaining: 7.17s
13:	total: 102ms	remaining: 7.18s
14:	total: 105ms	remaining: 6.89s
15:	total: 113ms	remaining: 6.92s
16:	total: 120ms	remaining: 6.95s
17:	total: 124ms	remaining: 6.74s
18:	total: 137ms	remaining: 7.06s
19:	total: 140ms	remaining: 6.85s
20:	total: 151ms	remaining: 7.03s
21:	total: 159ms	remaining: 7.07s
22:	total: 163ms	remaining: 6.9s
23:	total: 172ms	remaining: 6.99s
24:	total: 181ms	remaining: 7.04s
25:	total: 189ms	remaining: 7.08s
26:	total: 197ms	remaining: 7.08s
27:	total: 205ms	remaining: 7.12s
28:	total: 213ms	remaining: 7.13s
29:	total: 218ms	

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.8s finished


In [None]:
# Score every fitted member of the tuned ensemble (eclf) on the held-out
# split, collecting (name, AUC, accuracy) triples.
model_scores = []

for name, model in eclf.named_estimators_.items():
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
        score = model.score(X_test, y_test)
        model_scores.append((name, auc, score))
        print(f"{name}: AUC = {auc:.4f} | Accuracy = {score:.4f}")

# Rank the members from highest to lowest AUC.
model_scores.sort(key=lambda x: x[1], reverse=True)

print("\nTop 5 AUC models:")
for i, (name, auc, score) in enumerate(model_scores[:5], 1):
    print(f"{i}. {name}: AUC = {auc:.4f} | Accuracy = {score:.4f}")

y_proba_ensemble = eclf.predict_proba(X_test)[:, 1]
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
print(f"\nVotingClassifier: AUC = {auc_ensemble:.4f}")
# The report must describe the tuned ensemble (eclf); the previous version
# printed the report of the baseline ensemble (eclf1) by mistake.
print(f"\nClasification Report: \n\n{classification_report(y_test, eclf.predict(X_test))}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s


xgb: AUC = 0.7502 | Accuracy = 0.6767
catboost: AUC = 0.7545 | Accuracy = 0.7218


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s


rf: AUC = 0.7809 | Accuracy = 0.7444
gnb: AUC = 0.7614 | Accuracy = 0.7068

Top 5 AUC models:
1. rf: AUC = 0.7809 | Accuracy = 0.7444
2. gnb: AUC = 0.7614 | Accuracy = 0.7068
3. catboost: AUC = 0.7545 | Accuracy = 0.7218
4. xgb: AUC = 0.7502 | Accuracy = 0.6767

VotingClassifier: AUC = 0.7631

Clasification Report: 

              precision    recall  f1-score   support

           0       0.71      0.79      0.75        81
           1       0.60      0.50      0.55        52

    accuracy                           0.68       133
   macro avg       0.66      0.65      0.65       133
weighted avg       0.67      0.68      0.67       133



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


### Training with all data

In [None]:
# Scale the complete feature matrix with the scaler that was fitted on the
# training split, so the full-data models see the same scaling as before.
X_all = scaler.transform(X)

In [None]:
# Same tuned soft-voting ensemble as before, this time fitted on every
# available row (training and test splits together).
eclf_all_data = VotingClassifier(
    estimators=[
        (label, search.best_estimator_)
        for label, search in (
            ('xgb', xgb),
            ('catboost', catboost),
            ('rf', rf),
            ('gnb', gnb),
        )
    ],
    voting='soft',
).fit(X_all, y)

0:	total: 3.45ms	remaining: 3.44s
1:	total: 5.75ms	remaining: 2.87s
2:	total: 7.83ms	remaining: 2.6s
3:	total: 9.39ms	remaining: 2.34s
4:	total: 12.1ms	remaining: 2.42s
5:	total: 14.3ms	remaining: 2.37s
6:	total: 17ms	remaining: 2.41s
7:	total: 19.6ms	remaining: 2.43s
8:	total: 22.2ms	remaining: 2.45s
9:	total: 24.6ms	remaining: 2.44s
10:	total: 27.4ms	remaining: 2.46s
11:	total: 29.9ms	remaining: 2.46s
12:	total: 32.4ms	remaining: 2.46s
13:	total: 34.9ms	remaining: 2.46s
14:	total: 37.2ms	remaining: 2.44s
15:	total: 39.7ms	remaining: 2.44s
16:	total: 42.1ms	remaining: 2.43s
17:	total: 44.5ms	remaining: 2.43s
18:	total: 46.8ms	remaining: 2.42s
19:	total: 49.4ms	remaining: 2.42s
20:	total: 51.8ms	remaining: 2.41s
21:	total: 54.4ms	remaining: 2.42s
22:	total: 56.5ms	remaining: 2.4s
23:	total: 58.7ms	remaining: 2.38s
24:	total: 60.4ms	remaining: 2.35s
25:	total: 62.6ms	remaining: 2.35s
26:	total: 65.4ms	remaining: 2.36s
27:	total: 68.1ms	remaining: 2.36s
28:	total: 70.7ms	remaining: 2.37s

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.9s finished


In [None]:
# Score the ensemble that was fitted on all data (eclf_all_data). The previous
# version of this cell re-evaluated eclf / eclf1, so it merely repeated the
# numbers of the earlier section.
# CAUTION: eclf_all_data has already seen the test rows during fitting, so
# every metric printed here is in-sample and must not be compared with the
# held-out results above.
print("NOTE: eclf_all_data was fitted on all rows; the scores below are in-sample (optimistic).\n")

model_scores = []

for name, model in eclf_all_data.named_estimators_.items():
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
        score = model.score(X_test, y_test)
        model_scores.append((name, auc, score))
        print(f"{name}: AUC = {auc:.4f} | Accuracy = {score:.4f}")

# Rank the members from highest to lowest AUC.
model_scores.sort(key=lambda x: x[1], reverse=True)

print("\nTop 5 AUC models:")
for i, (name, auc, score) in enumerate(model_scores[:5], 1):
    print(f"{i}. {name}: AUC = {auc:.4f} | Accuracy = {score:.4f}")

y_proba_ensemble = eclf_all_data.predict_proba(X_test)[:, 1]
auc_ensemble = roc_auc_score(y_test, y_proba_ensemble)
print(f"\nVotingClassifier: AUC = {auc_ensemble:.4f}")
print(f"\nClasification Report: \n\n{classification_report(y_test, eclf_all_data.predict(X_test))}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s


xgb: AUC = 0.7502 | Accuracy = 0.6767
catboost: AUC = 0.7545 | Accuracy = 0.7218


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s


rf: AUC = 0.7809 | Accuracy = 0.7444
gnb: AUC = 0.7614 | Accuracy = 0.7068

Top 5 AUC models:
1. rf: AUC = 0.7809 | Accuracy = 0.7444
2. gnb: AUC = 0.7614 | Accuracy = 0.7068
3. catboost: AUC = 0.7545 | Accuracy = 0.7218
4. xgb: AUC = 0.7502 | Accuracy = 0.6767

VotingClassifier: AUC = 0.7631

Clasification Report: 

              precision    recall  f1-score   support

           0       0.71      0.79      0.75        81
           1       0.60      0.50      0.55        52

    accuracy                           0.68       133
   macro avg       0.66      0.65      0.65       133
weighted avg       0.67      0.68      0.67       133



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.1s finished


On the held-out test set, the tuned Random Forest (AUC = 0.7809) outperforms the VotingClassifier (AUC = 0.7631), so it is selected as the final model.

## Training the best model individually with all data

In [None]:
# Take an unfitted copy of the tuned random forest (same hyper-parameters)
# and train it on every available row.
best_model = clone(rf.best_estimator_).fit(X_all, y)
best_model

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    0.9s finished


## Saving the most important models

In [None]:
import joblib

# Persist the artefacts needed to reuse the pipeline:
#   - the scaler fitted on the training split,
#   - the complete random-forest search object,
#   - the final random forest trained on all data.
joblib.dump(value=scaler, filename="Scaler.pkl")
joblib.dump(value=rf, filename="Random Forest.pkl")
joblib.dump(value=best_model, filename="model rf.pkl")

['model rf.pkl']

Bibliography

- VotingClassifier. (n. a.). Scikit-learn. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html