# 1. Introdução
O notebook abaixo apresenta e implementa o spot-checking de modelos preditivos supervisionados, desenvolvido para primeiro trabalho da disciplina Aprendizado de Máquina da Universidade Federal do Rio Grande do Sul (2024/2).

Neste trabalho, buscamos analisar a relação de diversos fatores, como gênero e notas do primeiro semestre, com a taxa de desistência de alunos. No modelo abaixo usamos o dataset carregado nesse notebook, analisamos quais os fatores que de fato influenciam na desistência dos alunos e possibilitamos que inputs personalizados sejam adicionados ao modelo para que seja calculado a probabilidade de um aluno desistir do curso.


## Setup

In [1]:
!pip install pandas plotly matplotlib seaborn scikit-learn xgboost optuna hyperopt setuptools nbformat



In [2]:
# Módulo para leitura e manipulação dos dados
import pandas as pd

# Módulo para manipulação de arrays e matrizes
import numpy as np

# Módulos para visualização de dados e plotagem de gráficos
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Módulos específicos da sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Biblioteca com algoritmos específicos de machine learning
from xgboost import XGBClassifier

# Módulo para balanceamento de classes
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import optuna
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

  from .autonotebook import tqdm as notebook_tqdm


# 1. Carregamento dos dados

Dataset pré-processado no notebook [t1-spot-checking.ipynb](./t1-spot-checking.ipynb)

---
Dataset obtido em https://www.kaggle.com/datasets/thedevastator/higher-education-predictors-of-student-retention/data

Original: https://zenodo.org/records/5777340#.Y7FJotJBwUE

In [3]:
data = pd.read_csv("../data/clean-dataset.csv")

In [4]:
# Separa atributos preditivos e atributo alvo
X = data.drop('Target', axis=1)
y = data['Target']

In [5]:
# Algoritmos selecionados para treinamento
dtree = DecisionTreeClassifier(random_state=0)
dtree2 = DecisionTreeClassifier(random_state=0, max_depth=10)
rfc_gini = RandomForestClassifier(random_state=2)
rfc_entropy = RandomForestClassifier(random_state=2, criterion='entropy')
lr = LogisticRegression(random_state=42)
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_5 = KNeighborsClassifier(n_neighbors=5)
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1, random_state=0, algorithm='SAMME')
svmachine = svm.SVC(kernel='linear',probability=True)

algo_dict = {'Decision Tree': dtree, 'Decision Tree Max depth 5': dtree2, 'Random Forest gini': rfc_gini, 'Random Forest entropy': rfc_entropy, 'Logistic Regression': lr, '3-Nearest Neighbors': knn_3, '5-Nearest Neighbors': knn_5, 'AdaBoost': abc, 'SVM': svmachine}

In [6]:
# Referências
# https://machinelearningmastery.com/spot-check-machine-learning-algorithms-in-python/
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
def make_pipeline(model):
    steps = list()

    steps.append(('Normalização', StandardScaler()))
    steps.append(('Balanceamento da classe minoritária', SMOTE(sampling_strategy='minority')))
    steps.append(('Modelo', model))

    # Cria a pipeline
    pipe = Pipeline(steps=steps)

    return pipe

In [7]:
# Define hyperparameter grids for each model
def define_grid_search_params(model_name):
    if model_name == 'Random Forest gini' or model_name == 'Random Forest entropy':
        return {
            'Modelo__n_estimators': [50, 100, 150],
            'Modelo__max_depth': [10, 20, 30]
        }
    elif model_name == 'Decision Tree' or model_name == 'Decision Tree Max depth 5':
        return {
            'Modelo__max_depth': [5, 10, 20, 30]
        }
    elif model_name == 'Logistic Regression':
        return {
            'Modelo__C': [0.01, 0.1, 1, 10, 100]
        }
    elif model_name == '3-Nearest Neighbors' or model_name == '5-Nearest Neighbors':
        return {
            'Modelo__n_neighbors': [3, 5, 7, 10]
        }
    elif model_name == 'AdaBoost':
        return {
            'Modelo__n_estimators': [50, 100, 150],
            'Modelo__learning_rate': [0.01, 0.1, 1]
        }
    elif model_name == 'SVM':
        return {
            'Modelo__C': [0.01, 0.1, 1, 10],
            'Modelo__kernel': ['linear', 'rbf']
        }
    else:
        raise ValueError(f"Não foi definido parâmetro para o modelo: {model_name}")

In [8]:
def nested_cv_with_gridsearch(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
    outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    results = []

    for model_name, model in models.items():
        param_grid = define_grid_search_params(model_name)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            pipeline = make_pipeline(model)

            grid_search = GridSearchCV(
                estimator=pipeline,
                param_grid=param_grid,
                cv=StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42),
                scoring=metric,
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)

            best_pipeline = grid_search.best_estimator_
            y_pred = best_pipeline.predict(X_test)

            metrics = {
                'Model': model_name,
                'F1 Score': f1_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'ROC AUC': roc_auc_score(y_test, best_pipeline.predict_proba(X_test)[:, 1])
            }
            results.append(metrics)

    return pd.DataFrame(results)

In [9]:
results_df = nested_cv_with_gridsearch(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.738411,0.69906,0.782456,0.874693
1,Decision Tree,0.761905,0.713846,0.816901,0.888604
2,Decision Tree,0.777003,0.768966,0.785211,0.893994
3,Decision Tree,0.751286,0.732441,0.771127,0.864068
4,Decision Tree,0.766031,0.754266,0.778169,0.888207
5,Decision Tree Max depth 5,0.736301,0.719064,0.754386,0.873006
6,Decision Tree Max depth 5,0.770519,0.734824,0.809859,0.892928
7,Decision Tree Max depth 5,0.770053,0.779783,0.760563,0.889943
8,Decision Tree Max depth 5,0.749568,0.735593,0.764085,0.877531
9,Decision Tree Max depth 5,0.769517,0.814961,0.728873,0.891329


In [10]:
def define_hyperopt_space(model_name):
    if model_name == 'Random Forest gini' or model_name == 'Random Forest entropy':
        return {
            'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
            'max_depth': hp.quniform('max_depth', 5, 50, 1),
        }
    elif model_name == 'Decision Tree' or model_name == 'Decision Tree Max depth 5':
        return {
            'max_depth': hp.quniform('max_depth', 5, 50, 1),
        }
    elif model_name == 'Logistic Regression':
        return {
            'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
        }
    elif model_name == '3-Nearest Neighbors' or model_name == '5-Nearest Neighbors':
        return {
            'n_neighbors': hp.quniform('n_neighbors', 3, 20, 1),
        }
    elif model_name == 'AdaBoost':
        return {
            'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(2)),
        }
    elif model_name == 'SVM':
        return {
            'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
            'kernel': hp.choice('kernel', ['linear', 'rbf']),
        }

In [11]:
def nested_cv_with_hyperopt(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
    outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    results = []

    for model_name, model in models.items():

        # Define objective function for Hyperopt
        def objective(params):
            # If SVM, map the kernel index to string for correct cross-validation
            if 'kernel' in params and model_name == 'SVM':
                kernel_mapping = ['linear', 'rbf']
                if isinstance(params['kernel'], int):
                    params['kernel'] = kernel_mapping[params['kernel']]

            pipeline = make_pipeline(model)

            # Convert integer-like floats to int
            final_params = {}
            for k, v in params.items():
                if isinstance(v, float) and v.is_integer():
                    v = int(v)  # Convert to int if it is a whole number float
                final_params[f'Modelo__{k}'] = v

            pipeline.set_params(**final_params)

            inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)
            scores = cross_val_score(pipeline, X_train, y_train, cv=inner_cv, scoring=metric, n_jobs=-1)
            return {'loss': -np.mean(scores), 'status': STATUS_OK}

        # Outer loop
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            search_space = define_hyperopt_space(model_name)
            trials = Trials()

            best_params = fmin(
                fn=objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=20,
                trials=trials,
            )

            # Map SVM kernel back if needed
            if model_name == 'SVM' and 'kernel' in best_params:
                kernel_mapping = ['linear', 'rbf']
                best_params['kernel'] = kernel_mapping[best_params['kernel']]

            # Convert parameters and retrain pipeline
            final_params = {}
            for k, v in best_params.items():
                if isinstance(v, float) and v.is_integer():
                    v = int(v)
                final_params[f'Modelo__{k}'] = v

            # Train the pipeline with the best parameters
            pipeline = make_pipeline(model)
            pipeline.set_params(**final_params)
            pipeline.fit(X_train, y_train)

            # Evaluate on the test set
            y_pred = pipeline.predict(X_test)
            metrics = {
                'Model': model_name,
                'F1 Score': f1_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
            }
            results.append(metrics)

    return pd.DataFrame(results)

In [12]:
results_df = nested_cv_with_hyperopt(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

100%|██████████| 20/20 [00:00<00:00, 41.36trial/s, best loss: -0.7466157096378799]
100%|██████████| 20/20 [00:00<00:00, 37.78trial/s, best loss: -0.756714256258778]
100%|██████████| 20/20 [00:00<00:00, 40.87trial/s, best loss: -0.7438711420197831]
100%|██████████| 20/20 [00:00<00:00, 40.07trial/s, best loss: -0.7439041160657812]
100%|██████████| 20/20 [00:00<00:00, 41.84trial/s, best loss: -0.753807524716202]
100%|██████████| 20/20 [00:00<00:00, 39.84trial/s, best loss: -0.7505056523252992]
100%|██████████| 20/20 [00:00<00:00, 40.01trial/s, best loss: -0.7433640545561461]
100%|██████████| 20/20 [00:00<00:00, 37.46trial/s, best loss: -0.7402313435176732]
100%|██████████| 20/20 [00:00<00:00, 39.98trial/s, best loss: -0.7484820417175051]
100%|██████████| 20/20 [00:00<00:00, 37.16trial/s, best loss: -0.7488543712722627]
100%|██████████| 20/20 [00:08<00:00,  2.39trial/s, best loss: -0.7750283324947119]
100%|██████████| 20/20 [00:09<00:00,  2.10trial/s, best loss: -0.7592205532517687]
100%|█

Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.734918,0.767176,0.705263,0.859977
1,Decision Tree,0.761092,0.738411,0.785211,0.882432
2,Decision Tree,0.77087,0.777778,0.764085,0.891601
3,Decision Tree,0.749568,0.735593,0.764085,0.864838
4,Decision Tree,0.769517,0.814961,0.728873,0.889158
5,Decision Tree Max depth 5,0.717391,0.741573,0.694737,0.856909
6,Decision Tree Max depth 5,0.767974,0.716463,0.827465,0.878562
7,Decision Tree Max depth 5,0.764505,0.741722,0.788732,0.8823
8,Decision Tree Max depth 5,0.729452,0.71,0.75,0.848718
9,Decision Tree Max depth 5,0.769231,0.747508,0.792254,0.889196


In [13]:
def define_optuna_space(model_name, trial):
    if model_name == 'Random Forest gini' or model_name == 'Random Forest entropy':
        return {
            'Modelo__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'Modelo__max_depth': trial.suggest_int('max_depth', 5, 50)
        }
    elif model_name == 'Decision Tree' or model_name == 'Decision Tree Max depth 5':
        return {
            'Modelo__max_depth': trial.suggest_int('max_depth', 5, 50)
        }
    elif model_name == 'Logistic Regression':
        return {
            'Modelo__C': trial.suggest_loguniform('C', 1e-4, 1e2)
        }
    elif model_name == '3-Nearest Neighbors' or model_name == '5-Nearest Neighbors':
        return {
            'Modelo__n_neighbors': trial.suggest_int('n_neighbors', 3, 20)
        }
    elif model_name == 'AdaBoost':
        return {
            'Modelo__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'Modelo__learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 2)
        }
    elif model_name == 'SVM':
        return {
            'Modelo__C': trial.suggest_loguniform('C', 1e-4, 1e2),
            'Modelo__kernel': trial.suggest_categorical('kernel', ['linear', 'rbf'])
        }

In [14]:
# def nested_cv_with_optuna(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
#     outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
#     results = []

#     for model_name, model in models.items():
#         # Define objective for Optuna
#         def objective(trial):
#             params = define_optuna_space(model_name, trial)

#             pipeline = make_pipeline(model)
#             pipeline.set_params(**params)
#             inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)

#             return cross_val_score(pipeline, X_train, y_train, cv=inner_cv, scoring=metric, n_jobs=-1).mean()

#         # Outer loop
#         for train_idx, test_idx in outer_cv.split(X, y):
#             X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#             y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

#             study = optuna.create_study(direction="maximize")
#             study.optimize(objective, n_trials=20)

#             # Add the 'Modelo__' prefix to all best_params keys
#             best_params = {f'Modelo__{key}': value for key, value in study.best_params.items()}

#             pipeline = make_pipeline(model)
#             pipeline.set_params(**best_params)
#             pipeline.fit(X_train, y_train)

#             # Evaluate on test set
#             y_pred = pipeline.predict(X_test)
#             metrics = {
#                 'Model': model_name,
#                 'F1 Score': f1_score(y_test, y_pred),
#                 'Precision': precision_score(y_test, y_pred),
#                 'Recall': recall_score(y_test, y_pred),
#                 'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
#             }
#             results.append(metrics)

#     return pd.DataFrame(results)

In [20]:
# results_df = nested_cv_with_optuna(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
# display(results_df)

In [26]:
import optuna
from optuna.visualization import (
    plot_optimization_history, 
    plot_slice, 
    plot_contour, 
    plot_parallel_coordinate, 
    plot_param_importances
)

In [27]:
def nested_cv_with_optuna(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
    outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    results = []
    studies = {}  # Dictionary to store the last study for each model

    for model_name, model in models.items():
        print(f"Optimizing and evaluating model: {model_name}")

        def objective(trial):
            params = define_optuna_space(model_name, trial)

            pipeline = make_pipeline(model)
            pipeline.set_params(**params)
            inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)

            return cross_val_score(pipeline, X_train, y_train, cv=inner_cv, scoring=metric, n_jobs=-1).mean()

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            study = optuna.create_study(direction="maximize")
            study.optimize(objective, n_trials=20)
            studies[model_name] = study  # Store the last study for this model

            best_params = {f'Modelo__{key}': value for key, value in study.best_params.items()}

            pipeline = make_pipeline(model)
            pipeline.set_params(**best_params)
            pipeline.fit(X_train, y_train)

            y_pred = pipeline.predict(X_test)
            metrics = {
                'Model': model_name,
                'F1 Score': f1_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
            }
            results.append(metrics)

    return pd.DataFrame(results), studies  # Return results and all studies


In [28]:
results_df, studies = nested_cv_with_optuna(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

[I 2024-12-07 15:15:54,857] A new study created in memory with name: no-name-7979fbaa-c20e-4889-920c-9facddb3ca78


Optimizing and evaluating model: Decision Tree


[I 2024-12-07 15:15:55,922] Trial 0 finished with value: 0.7087434977801946 and parameters: {'max_depth': 32}. Best is trial 0 with value: 0.7087434977801946.
[I 2024-12-07 15:15:56,552] Trial 1 finished with value: 0.706601594888553 and parameters: {'max_depth': 29}. Best is trial 0 with value: 0.7087434977801946.
[I 2024-12-07 15:15:57,234] Trial 2 finished with value: 0.7506669911893842 and parameters: {'max_depth': 5}. Best is trial 2 with value: 0.7506669911893842.
[I 2024-12-07 15:15:57,878] Trial 3 finished with value: 0.7495220948105342 and parameters: {'max_depth': 6}. Best is trial 2 with value: 0.7506669911893842.
[I 2024-12-07 15:15:58,500] Trial 4 finished with value: 0.7092261022048625 and parameters: {'max_depth': 22}. Best is trial 2 with value: 0.7506669911893842.
[I 2024-12-07 15:15:59,130] Trial 5 finished with value: 0.705472565520385 and parameters: {'max_depth': 29}. Best is trial 2 with value: 0.7506669911893842.
[I 2024-12-07 15:15:59,765] Trial 6 finished with 

Optimizing and evaluating model: Decision Tree Max depth 5


[I 2024-12-07 15:16:03,999] Trial 9 finished with value: 0.7067029264754955 and parameters: {'max_depth': 23}. Best is trial 6 with value: 0.7574313148797418.
[I 2024-12-07 15:16:04,015] Trial 10 finished with value: 0.7621921413150651 and parameters: {'max_depth': 5}. Best is trial 10 with value: 0.7621921413150651.
[I 2024-12-07 15:16:04,030] Trial 11 finished with value: 0.7595770344025027 and parameters: {'max_depth': 5}. Best is trial 10 with value: 0.7621921413150651.
[I 2024-12-07 15:16:04,055] Trial 12 finished with value: 0.7176299076591665 and parameters: {'max_depth': 13}. Best is trial 10 with value: 0.7621921413150651.
[I 2024-12-07 15:16:04,071] Trial 13 finished with value: 0.7535002699989485 and parameters: {'max_depth': 5}. Best is trial 10 with value: 0.7621921413150651.
[I 2024-12-07 15:16:04,096] Trial 14 finished with value: 0.7082101206391216 and parameters: {'max_depth': 20}. Best is trial 10 with value: 0.7621921413150651.
[I 2024-12-07 15:16:04,121] Trial 15 fi

Optimizing and evaluating model: Random Forest gini


[I 2024-12-07 15:16:06,216] Trial 0 finished with value: 0.7602682906974194 and parameters: {'n_estimators': 99, 'max_depth': 15}. Best is trial 0 with value: 0.7602682906974194.
[I 2024-12-07 15:16:06,575] Trial 1 finished with value: 0.7652914055391141 and parameters: {'n_estimators': 203, 'max_depth': 13}. Best is trial 1 with value: 0.7652914055391141.
[I 2024-12-07 15:16:06,808] Trial 2 finished with value: 0.7622157521034328 and parameters: {'n_estimators': 130, 'max_depth': 23}. Best is trial 1 with value: 0.7652914055391141.
[I 2024-12-07 15:16:07,062] Trial 3 finished with value: 0.7597578710145311 and parameters: {'n_estimators': 138, 'max_depth': 49}. Best is trial 1 with value: 0.7652914055391141.
[I 2024-12-07 15:16:07,232] Trial 4 finished with value: 0.7615717733344183 and parameters: {'n_estimators': 97, 'max_depth': 13}. Best is trial 1 with value: 0.7652914055391141.
[I 2024-12-07 15:16:07,706] Trial 5 finished with value: 0.7614316100162433 and parameters: {'n_estima

Optimizing and evaluating model: Random Forest entropy


[I 2024-12-07 15:16:37,748] Trial 0 finished with value: 0.7580978316815932 and parameters: {'n_estimators': 133, 'max_depth': 50}. Best is trial 0 with value: 0.7580978316815932.
[I 2024-12-07 15:16:37,882] Trial 1 finished with value: 0.7559968606274884 and parameters: {'n_estimators': 62, 'max_depth': 46}. Best is trial 0 with value: 0.7580978316815932.
[I 2024-12-07 15:16:38,001] Trial 2 finished with value: 0.7552658178816442 and parameters: {'n_estimators': 55, 'max_depth': 39}. Best is trial 0 with value: 0.7580978316815932.
[I 2024-12-07 15:16:38,464] Trial 3 finished with value: 0.7669026916961051 and parameters: {'n_estimators': 251, 'max_depth': 13}. Best is trial 3 with value: 0.7669026916961051.
[I 2024-12-07 15:16:39,012] Trial 4 finished with value: 0.7645595025620966 and parameters: {'n_estimators': 286, 'max_depth': 16}. Best is trial 3 with value: 0.7669026916961051.
[I 2024-12-07 15:16:39,214] Trial 5 finished with value: 0.7596444403050443 and parameters: {'n_estima

Optimizing and evaluating model: Logistic Regression



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:12,571] Trial 13 finished with value: 0.7571775922760682 and parameters: {'C': 0.021538982964718414}. Best is trial 3 with value: 0.7592400244978332.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:12,587] Trial 14 finished with value: 0.753396752043415 and parameters: {'C': 1.525566778952835}. Best is trial 3 with value: 0.7592400244978332.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:12,602] Trial 15 finished with value: 0.7505334588044478 and paramet

Optimizing and evaluating model: 3-Nearest Neighbors


[I 2024-12-07 15:17:14,051] Trial 5 finished with value: 0.7359682302166247 and parameters: {'n_neighbors': 13}. Best is trial 2 with value: 0.748656685930921.
[I 2024-12-07 15:17:14,085] Trial 6 finished with value: 0.7463530286075383 and parameters: {'n_neighbors': 15}. Best is trial 2 with value: 0.748656685930921.
[I 2024-12-07 15:17:14,119] Trial 7 finished with value: 0.729601891684791 and parameters: {'n_neighbors': 6}. Best is trial 2 with value: 0.748656685930921.
[I 2024-12-07 15:17:14,152] Trial 8 finished with value: 0.7358056415096348 and parameters: {'n_neighbors': 7}. Best is trial 2 with value: 0.748656685930921.
[I 2024-12-07 15:17:14,186] Trial 9 finished with value: 0.7408859099995898 and parameters: {'n_neighbors': 13}. Best is trial 2 with value: 0.748656685930921.
[I 2024-12-07 15:17:14,222] Trial 10 finished with value: 0.7535946549645179 and parameters: {'n_neighbors': 20}. Best is trial 10 with value: 0.7535946549645179.
[I 2024-12-07 15:17:14,259] Trial 11 fin

Optimizing and evaluating model: 5-Nearest Neighbors


[I 2024-12-07 15:17:17,768] Trial 5 finished with value: 0.7342777710667354 and parameters: {'n_neighbors': 9}. Best is trial 1 with value: 0.7444404054397528.
[I 2024-12-07 15:17:17,802] Trial 6 finished with value: 0.7439431318471027 and parameters: {'n_neighbors': 11}. Best is trial 1 with value: 0.7444404054397528.
[I 2024-12-07 15:17:17,836] Trial 7 finished with value: 0.7410874325141324 and parameters: {'n_neighbors': 11}. Best is trial 1 with value: 0.7444404054397528.
[I 2024-12-07 15:17:17,870] Trial 8 finished with value: 0.7486548717998591 and parameters: {'n_neighbors': 18}. Best is trial 8 with value: 0.7486548717998591.
[I 2024-12-07 15:17:17,904] Trial 9 finished with value: 0.741480822028611 and parameters: {'n_neighbors': 17}. Best is trial 8 with value: 0.7486548717998591.
[I 2024-12-07 15:17:17,940] Trial 10 finished with value: 0.746014499454421 and parameters: {'n_neighbors': 19}. Best is trial 8 with value: 0.7486548717998591.
[I 2024-12-07 15:17:17,976] Trial 11

Optimizing and evaluating model: AdaBoost


[I 2024-12-07 15:17:21,466] Trial 0 finished with value: 0.759910182829557 and parameters: {'n_estimators': 179, 'learning_rate': 0.0423230241324649}. Best is trial 0 with value: 0.759910182829557.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:21,542] Trial 1 finished with value: 0.7385488132371902 and parameters: {'n_estimators': 57, 'learning_rate': 0.012383140222232402}. Best is trial 0 with value: 0.759910182829557.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:21,817] Trial 2 finished with value: 0.764911435198213 and parameters: {'n_estimators': 230, 'learning_rate': 1.2475084699126462}. Best is trial 2 with value: 0.764911435198213.

suggest_loguniform 

Optimizing and evaluating model: SVM



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:46,288] Trial 0 finished with value: 0.7721963864210014 and parameters: {'C': 0.718583186298238, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7721963864210014.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:47,213] Trial 1 finished with value: 0.7674328461497746 and parameters: {'C': 36.272373730747034, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7721963864210014.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.

[I 2024-12-07 15:17:47,521] Trial 2 finished with value: 

Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.734545,0.762264,0.708772,0.864889
1,Decision Tree,0.765306,0.740132,0.792254,0.886337
2,Decision Tree,0.773196,0.755034,0.792254,0.890523
3,Decision Tree,0.758148,0.73913,0.778169,0.881055
4,Decision Tree,0.768142,0.772242,0.764085,0.893298
5,Decision Tree Max depth 5,0.743945,0.733788,0.754386,0.87555
6,Decision Tree Max depth 5,0.769492,0.74183,0.799296,0.890675
7,Decision Tree Max depth 5,0.774081,0.770035,0.778169,0.894668
8,Decision Tree Max depth 5,0.746946,0.740484,0.753521,0.866408
9,Decision Tree Max depth 5,0.77643,0.764505,0.788732,0.888621


In [29]:
# Generate plots for each model
for model_name, study in studies.items():
    print(f"Plots for model: {model_name}")
    
    # Optimization history
    fig_opt_history = plot_optimization_history(study)
    fig_opt_history.show()

    # Slice plot
    fig_slice = plot_slice(study)
    fig_slice.show()

    # Parallel coordinate plot
    fig_parallel = plot_parallel_coordinate(study)
    fig_parallel.show()

    # Contour plot
    fig_contour = plot_contour(study)
    fig_contour.show()

    # Parameter importances
    fig_param_importances = plot_param_importances(study)
    fig_param_importances.show()


Plots for model: Decision Tree


Plots for model: Decision Tree Max depth 5


Plots for model: Random Forest gini


Plots for model: Random Forest entropy


Plots for model: Logistic Regression


Plots for model: 3-Nearest Neighbors


Plots for model: 5-Nearest Neighbors


Plots for model: AdaBoost


Plots for model: SVM
