# 1. Introdução
O notebook abaixo apresenta e implementa o spot-checking de modelos preditivos supervisionados, desenvolvido para o primeiro trabalho da disciplina Aprendizado de Máquina da Universidade Federal do Rio Grande do Sul (2024/2).

Neste trabalho, buscamos analisar a relação de diversos fatores, como gênero e notas do primeiro semestre, com a taxa de desistência de alunos. No modelo abaixo, usamos o dataset carregado neste notebook, analisamos quais fatores de fato influenciam a desistência dos alunos e possibilitamos que inputs personalizados sejam adicionados ao modelo para que seja calculada a probabilidade de um aluno desistir do curso.


## Setup

In [268]:
!pip install pandas plotly matplotlib seaborn scikit-learn xgboost optuna hyperopt setuptools nbformat

Collecting nbformat
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Using cached fastjsonschema-2.21.1-py3-none-any.whl.metadata (2.2 kB)
Collecting jsonschema>=2.6 (from nbformat)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting attrs>=22.2.0 (from jsonschema>=2.6->nbformat)
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat)
  Using cached jsonschema_specifications-2024.10.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat)
  Using cached referencing-0.35.1-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat)
  Downloading rpds_py-0.22.3-cp312-cp312-win_amd64.whl.metadata (4.2 kB)
Using cached nbformat-5.10.4-py3-none-any.whl (78 kB)
Using cached fastjsonschema-2.21.1-py3-none-any.whl (23 kB)
Using cached json

In [249]:
# Módulo para leitura e manipulação dos dados
import pandas as pd

# Módulo para manipulação de arrays e matrizes
import numpy as np

# Módulos para visualização de dados e plotagem de gráficos
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Módulos específicos da sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Biblioteca com algoritmos específicos de machine learning
from xgboost import XGBClassifier

# Módulo para balanceamento de classes
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import optuna
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# 2. Carregamento dos dados

Dataset pré-processado no notebook [t1-spot-checking.ipynb](./t1-spot-checking.ipynb)

---
Dataset obtido em https://www.kaggle.com/datasets/thedevastator/higher-education-predictors-of-student-retention/data

Original: https://zenodo.org/records/5777340#.Y7FJotJBwUE

In [250]:
# Load the pre-processed dataset; the path is relative to this notebook's directory
data = pd.read_csv("../data/clean-dataset.csv")

In [251]:
# Split the dataset into the target attribute and the predictive attributes
y = data['Target']
X = data.drop(columns=['Target'])

In [252]:
# Algorithms selected for training
dtree = DecisionTreeClassifier(random_state=0)
# max_depth now matches the 'Decision Tree Max depth 5' label this estimator is
# registered under (it was 10, contradicting the label shown in every results table).
dtree2 = DecisionTreeClassifier(random_state=0, max_depth=5)
rfc_gini = RandomForestClassifier(random_state=2)
rfc_entropy = RandomForestClassifier(random_state=2, criterion='entropy')
lr = LogisticRegression(random_state=42)
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_5 = KNeighborsClassifier(n_neighbors=5)
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0, algorithm='SAMME')
# probability=True is required because the evaluation code calls predict_proba
svmachine = svm.SVC(kernel='linear', probability=True)

# Display name -> estimator. The names are also the keys that the
# define_grid_search_params / define_hyperopt_space / define_optuna_space
# helpers dispatch on, so they must not be renamed in isolation.
algo_dict = {
    'Decision Tree': dtree,
    'Decision Tree Max depth 5': dtree2,
    'Random Forest gini': rfc_gini,
    'Random Forest entropy': rfc_entropy,
    'Logistic Regression': lr,
    '3-Nearest Neighbors': knn_3,
    '5-Nearest Neighbors': knn_5,
    'AdaBoost': abc,
    'SVM': svmachine,
}

In [253]:
# References
# https://machinelearningmastery.com/spot-check-machine-learning-algorithms-in-python/
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
def make_pipeline(model, random_state=42):
    """Build the scaling -> SMOTE -> estimator pipeline used by every search.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, registered under the step name 'Modelo'
        (hyperparameters are therefore addressed as 'Modelo__<param>').
    random_state : int or None, default=42
        Seed forwarded to SMOTE so the synthetic minority samples, and hence
        the cross-validation scores, are reproducible between runs. Pass None
        to restore the previous unseeded behaviour.

    Returns
    -------
    Pipeline
        An unfitted pipeline built with the `Pipeline` class imported from
        imblearn at the top of the notebook, so the SMOTE step is accepted as
        an intermediate resampling step.
    """
    steps = [
        ('Normalização', StandardScaler()),
        (
            'Balanceamento da classe minoritária',
            SMOTE(sampling_strategy='minority', random_state=random_state),
        ),
        ('Modelo', model),
    ]

    return Pipeline(steps=steps)

In [254]:
# Hyperparameter grids explored by GridSearchCV, one per model family
def define_grid_search_params(model_name):
    """Return the GridSearchCV parameter grid for the given model name.

    Keys carry the 'Modelo__' prefix so they address the final step of the
    pipeline. A fresh dict is built on every call.

    Raises
    ------
    ValueError
        If no grid is defined for `model_name`.
    """
    if model_name in ('Random Forest gini', 'Random Forest entropy'):
        grid = {}
        grid['Modelo__n_estimators'] = [50, 100, 150]
        grid['Modelo__max_depth'] = [10, 20, 30]
        return grid

    if model_name in ('Decision Tree', 'Decision Tree Max depth 5'):
        return {'Modelo__max_depth': [5, 10, 20, 30]}

    if model_name == 'Logistic Regression':
        return {'Modelo__C': [0.01, 0.1, 1, 10, 100]}

    if model_name in ('3-Nearest Neighbors', '5-Nearest Neighbors'):
        return {'Modelo__n_neighbors': [3, 5, 7, 10]}

    if model_name == 'AdaBoost':
        grid = {}
        grid['Modelo__n_estimators'] = [50, 100, 150]
        grid['Modelo__learning_rate'] = [0.01, 0.1, 1]
        return grid

    if model_name == 'SVM':
        grid = {}
        grid['Modelo__C'] = [0.01, 0.1, 1, 10]
        grid['Modelo__kernel'] = ['linear', 'rbf']
        return grid

    raise ValueError(f"Não foi definido parâmetro para o modelo: {model_name}")

In [255]:
def nested_cv_with_gridsearch(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
    """Evaluate each model with nested cross-validation, tuning via GridSearchCV.

    For every model and every outer stratified fold, a grid search with
    `inner_folds` stratified folds is run on the outer training split, and the
    refitted best pipeline is scored on the outer test split.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with `.iloc`
        Predictive attributes and target.
    models : dict
        Maps model names (as understood by `define_grid_search_params`) to estimators.
    outer_folds, inner_folds : int
        Number of splits of the outer and inner StratifiedKFold.
    metric : str
        Scoring name given to GridSearchCV to select the best parameters.

    Returns
    -------
    pandas.DataFrame
        One row per (model, outer fold) with the columns 'Model', 'F1 Score',
        'Precision', 'Recall' and 'ROC AUC'.
    """
    outer_splitter = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    fold_records = []

    for model_name, model in models.items():
        grid = define_grid_search_params(model_name)

        for fit_rows, eval_rows in outer_splitter.split(X, y):
            X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
            X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]

            inner_splitter = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)
            search = GridSearchCV(
                estimator=make_pipeline(model),
                param_grid=grid,
                cv=inner_splitter,
                scoring=metric,
                n_jobs=-1,
            )
            search.fit(X_fit, y_fit)

            # best_estimator_ is the pipeline refitted on the outer training split
            tuned = search.best_estimator_
            predicted = tuned.predict(X_eval)
            # Column 1 of predict_proba is used as the score for ROC AUC
            positive_scores = tuned.predict_proba(X_eval)[:, 1]

            fold_records.append({
                'Model': model_name,
                'F1 Score': f1_score(y_eval, predicted),
                'Precision': precision_score(y_eval, predicted),
                'Recall': recall_score(y_eval, predicted),
                'ROC AUC': roc_auc_score(y_eval, positive_scores),
            })

    return pd.DataFrame(fold_records)

In [256]:
# Run nested CV (5 outer / 3 inner folds) with grid search over every model in algo_dict,
# selecting hyperparameters by F1, and show the per-fold metrics.
# NOTE(review): `results_df` is reassigned by the hyperopt and Optuna runs further down,
# so these grid-search results are lost once those cells execute.
results_df = nested_cv_with_gridsearch(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.756014,0.740741,0.77193,0.869082
1,Decision Tree,0.763458,0.711246,0.823944,0.890403
2,Decision Tree,0.769231,0.747508,0.792254,0.892761
3,Decision Tree,0.752166,0.740614,0.764085,0.881102
4,Decision Tree,0.776173,0.796296,0.757042,0.894621
5,Decision Tree Max depth 5,0.735751,0.72449,0.747368,0.877702
6,Decision Tree Max depth 5,0.761905,0.693642,0.84507,0.892468
7,Decision Tree Max depth 5,0.767918,0.745033,0.792254,0.891618
8,Decision Tree Max depth 5,0.756014,0.738255,0.774648,0.878351
9,Decision Tree Max depth 5,0.757856,0.797665,0.721831,0.883847


In [257]:
def define_hyperopt_space(model_name):
    """Return the hyperopt search space for the given model name.

    Keys are bare estimator parameter names (no 'Modelo__' prefix); the caller
    is responsible for prefixing them before passing them to the pipeline.
    `hp.quniform` yields floats, so integer-valued parameters come back as
    whole-number floats.

    Raises
    ------
    ValueError
        If no search space is defined for `model_name`, mirroring
        `define_grid_search_params` instead of silently returning None.
    """
    if model_name in ('Random Forest gini', 'Random Forest entropy'):
        return {
            'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
            'max_depth': hp.quniform('max_depth', 5, 50, 1),
        }
    elif model_name in ('Decision Tree', 'Decision Tree Max depth 5'):
        return {
            'max_depth': hp.quniform('max_depth', 5, 50, 1),
        }
    elif model_name == 'Logistic Regression':
        return {
            'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
        }
    elif model_name in ('3-Nearest Neighbors', '5-Nearest Neighbors'):
        return {
            'n_neighbors': hp.quniform('n_neighbors', 3, 20, 1),
        }
    elif model_name == 'AdaBoost':
        return {
            'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(2)),
        }
    elif model_name == 'SVM':
        return {
            'C': hp.loguniform('C', np.log(1e-4), np.log(1e2)),
            'kernel': hp.choice('kernel', ['linear', 'rbf']),
        }
    else:
        raise ValueError(f"Não foi definido espaço de busca para o modelo: {model_name}")

In [258]:
def _hyperopt_to_pipeline_params(params):
    """Prefix hyperopt parameter names with 'Modelo__' and cast whole-number floats to int."""
    converted = {}
    for name, value in params.items():
        # hp.quniform produces floats such as 10.0; cast them back to int
        # before they reach the estimator (e.g. max_depth, n_estimators).
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        converted[f'Modelo__{name}'] = value
    return converted


def nested_cv_with_hyperopt(X, y, models, outer_folds=5, inner_folds=3, metric='f1',
                            max_evals=20, seed=42):
    """Evaluate each model with nested cross-validation, tuning via hyperopt (TPE).

    For every model and every outer stratified fold, hyperopt minimises the
    negative mean inner-CV score on the outer training split; a pipeline with
    the best parameters is then fitted on that split and scored on the outer
    test split.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with `.iloc`
        Predictive attributes and target.
    models : dict
        Maps model names (as understood by `define_hyperopt_space`) to
        estimators. The estimators are cloned, so the objects in this dict are
        neither re-parameterised nor fitted by this function.
    outer_folds, inner_folds : int
        Number of splits of the outer and inner StratifiedKFold.
    metric : str
        Scoring name given to `cross_val_score` in the inner loop.
    max_evals : int, default=20
        Number of hyperopt evaluations per outer fold.
    seed : int or None, default=42
        Seed of the random generator shared by all `fmin` calls, making the
        search reproducible. None leaves hyperopt unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per (model, outer fold) with the columns 'Model', 'F1 Score',
        'Precision', 'Recall' and 'ROC AUC'.
    """
    # Local imports: these names are not part of the notebook's import cell.
    from hyperopt import space_eval
    from sklearn.base import clone

    outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    # NOTE(review): recent hyperopt releases expect a numpy Generator for `rstate`
    # (older ones expected RandomState) - confirm against the installed version.
    rstate = None if seed is None else np.random.default_rng(seed)
    results = []

    for model_name, model in models.items():
        search_space = define_hyperopt_space(model_name)

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Bind the current fold's data through default arguments so the
            # objective does not depend on variables reassigned by the loop.
            def objective(params, X_fit=X_train, y_fit=y_train, estimator=model):
                pipeline = make_pipeline(clone(estimator))
                pipeline.set_params(**_hyperopt_to_pipeline_params(params))

                inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)
                scores = cross_val_score(pipeline, X_fit, y_fit, cv=inner_cv, scoring=metric, n_jobs=-1)
                # hyperopt minimises, so the score is negated
                return {'loss': -np.mean(scores), 'status': STATUS_OK}

            best_assignment = fmin(
                fn=objective,
                space=search_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=Trials(),
                rstate=rstate,
            )

            # fmin reports hp.choice parameters as indices; space_eval maps the
            # assignment back to actual values (e.g. the SVM kernel name)
            # without a hand-maintained copy of the choice list.
            best_params = space_eval(search_space, best_assignment)

            # Train a fresh pipeline with the best parameters on the outer training split
            pipeline = make_pipeline(clone(model))
            pipeline.set_params(**_hyperopt_to_pipeline_params(best_params))
            pipeline.fit(X_train, y_train)

            # Evaluate on the outer test split
            y_pred = pipeline.predict(X_test)
            results.append({
                'Model': model_name,
                'F1 Score': f1_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]),
            })

    return pd.DataFrame(results)

In [259]:
# Run nested CV (5 outer / 3 inner folds) with hyperopt over every model in algo_dict,
# selecting hyperparameters by F1, and show the per-fold metrics.
# NOTE(review): this overwrites the `results_df` produced by the grid-search run above.
results_df = nested_cv_with_hyperopt(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

100%|██████████| 20/20 [00:00<00:00, 43.05trial/s, best loss: -0.7407741353597684]
100%|██████████| 20/20 [00:00<00:00, 43.89trial/s, best loss: -0.7509230039026257]
100%|██████████| 20/20 [00:00<00:00, 45.29trial/s, best loss: -0.7491329836294941]
100%|██████████| 20/20 [00:00<00:00, 43.97trial/s, best loss: -0.7447137289316732]
100%|██████████| 20/20 [00:00<00:00, 42.74trial/s, best loss: -0.7077243377352112]
100%|██████████| 20/20 [00:00<00:00, 42.94trial/s, best loss: -0.7224152171523617]
100%|██████████| 20/20 [00:00<00:00, 42.92trial/s, best loss: -0.748804323654029] 
100%|██████████| 20/20 [00:00<00:00, 42.99trial/s, best loss: -0.7387256113257581]
100%|██████████| 20/20 [00:00<00:00, 43.61trial/s, best loss: -0.7536323145592467]
100%|██████████| 20/20 [00:00<00:00, 43.02trial/s, best loss: -0.7317576889602071]
100%|██████████| 20/20 [00:05<00:00,  3.47trial/s, best loss: -0.7755260812858292]
100%|██████████| 20/20 [00:05<00:00,  3.60trial/s, best loss: -0.7660643036125215]
100%

Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.711744,0.722022,0.701754,0.779348
1,Decision Tree,0.758065,0.699405,0.827465,0.882262
2,Decision Tree,0.772174,0.762887,0.78169,0.891935
3,Decision Tree,0.727273,0.70903,0.746479,0.837841
4,Decision Tree,0.746269,0.705329,0.792254,0.834319
5,Decision Tree Max depth 5,0.692308,0.689895,0.694737,0.763719
6,Decision Tree Max depth 5,0.774411,0.741935,0.809859,0.887992
7,Decision Tree Max depth 5,0.765432,0.766784,0.764085,0.881483
8,Decision Tree Max depth 5,0.74606,0.74216,0.75,0.858733
9,Decision Tree Max depth 5,0.745299,0.724252,0.767606,0.847227


In [263]:
def define_optuna_space(model_name, trial):
    """Sample one set of pipeline parameters for `model_name` from an Optuna trial.

    The trial parameters are registered under bare names ('max_depth', 'C', ...),
    while the returned dict uses 'Modelo__'-prefixed keys so it can be passed
    directly to `pipeline.set_params`.

    Log-scaled parameters use `trial.suggest_float(..., log=True)`, the
    replacement for the deprecated `trial.suggest_loguniform`.

    Raises
    ------
    ValueError
        If no search space is defined for `model_name`, mirroring
        `define_grid_search_params` instead of silently returning None.
    """
    if model_name in ('Random Forest gini', 'Random Forest entropy'):
        return {
            'Modelo__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'Modelo__max_depth': trial.suggest_int('max_depth', 5, 50)
        }
    elif model_name in ('Decision Tree', 'Decision Tree Max depth 5'):
        return {
            'Modelo__max_depth': trial.suggest_int('max_depth', 5, 50)
        }
    elif model_name == 'Logistic Regression':
        return {
            'Modelo__C': trial.suggest_float('C', 1e-4, 1e2, log=True)
        }
    elif model_name in ('3-Nearest Neighbors', '5-Nearest Neighbors'):
        return {
            'Modelo__n_neighbors': trial.suggest_int('n_neighbors', 3, 20)
        }
    elif model_name == 'AdaBoost':
        return {
            'Modelo__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'Modelo__learning_rate': trial.suggest_float('learning_rate', 0.01, 2, log=True)
        }
    elif model_name == 'SVM':
        return {
            'Modelo__C': trial.suggest_float('C', 1e-4, 1e2, log=True),
            'Modelo__kernel': trial.suggest_categorical('kernel', ['linear', 'rbf'])
        }
    else:
        raise ValueError(f"Não foi definido espaço de busca para o modelo: {model_name}")

In [264]:
# def nested_cv_with_optuna(X, y, models, outer_folds=5, inner_folds=3, metric='f1'):
#     outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
#     results = []

#     for model_name, model in models.items():
#         # Define objective for Optuna
#         def objective(trial):
#             params = define_optuna_space(model_name, trial)

#             pipeline = make_pipeline(model)
#             pipeline.set_params(**params)
#             inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)

#             return cross_val_score(pipeline, X_train, y_train, cv=inner_cv, scoring=metric, n_jobs=-1).mean()

#         # Outer loop
#         for train_idx, test_idx in outer_cv.split(X, y):
#             X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#             y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

#             study = optuna.create_study(direction="maximize")
#             study.optimize(objective, n_trials=20)

#             # Add the 'Modelo__' prefix to all best_params keys
#             best_params = {f'Modelo__{key}': value for key, value in study.best_params.items()}

#             pipeline = make_pipeline(model)
#             pipeline.set_params(**best_params)
#             pipeline.fit(X_train, y_train)

#             # Evaluate on test set
#             y_pred = pipeline.predict(X_test)
#             metrics = {
#                 'Model': model_name,
#                 'F1 Score': f1_score(y_test, y_pred),
#                 'Precision': precision_score(y_test, y_pred),
#                 'Recall': recall_score(y_test, y_pred),
#                 'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
#             }
#             results.append(metrics)

#     return pd.DataFrame(results)

In [269]:
# results_df = nested_cv_with_optuna(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
# display(results_df)

In [275]:
import optuna
from optuna.visualization import (
    plot_optimization_history, 
    plot_slice, 
    plot_contour, 
    plot_parallel_coordinate, 
    plot_param_importances
)

In [276]:
def nested_cv_with_optuna(X, y, models, outer_folds=5, inner_folds=3, metric='f1',
                          n_trials=20, seed=42):
    """Evaluate each model with nested cross-validation, tuning via Optuna.

    For every model and every outer stratified fold, an Optuna study maximises
    the mean inner-CV score on the outer training split; a pipeline with the
    study's best parameters is then fitted on that split and scored on the
    outer test split.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with `.iloc`
        Predictive attributes and target.
    models : dict
        Maps model names (as understood by `define_optuna_space`) to
        estimators. The estimators are cloned, so the objects in this dict are
        neither re-parameterised nor fitted by this function.
    outer_folds, inner_folds : int
        Number of splits of the outer and inner StratifiedKFold.
    metric : str
        Scoring name given to `cross_val_score` in the inner loop.
    n_trials : int, default=20
        Number of Optuna trials per outer fold.
    seed : int or None, default=42
        Seed of each study's TPE sampler, making the search reproducible.
        None leaves the sampler unseeded.

    Returns
    -------
    tuple
        `(results, last_study)`: a DataFrame with one row per (model, outer
        fold) and the columns 'Model', 'F1 Score', 'Precision', 'Recall',
        'ROC AUC'; and the study of the last fold of the last model (None when
        `models` is empty).
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=42)
    results = []
    last_study = None  # Most recent study, returned for visualisation

    for model_name, model in models.items():
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Bind the current fold's data through default arguments so the
            # objective does not depend on variables reassigned by the loop.
            def objective(trial, X_fit=X_train, y_fit=y_train, name=model_name, estimator=model):
                params = define_optuna_space(name, trial)

                pipeline = make_pipeline(clone(estimator))
                pipeline.set_params(**params)
                inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=42)

                return cross_val_score(pipeline, X_fit, y_fit, cv=inner_cv, scoring=metric, n_jobs=-1).mean()

            study = optuna.create_study(
                direction="maximize",
                sampler=optuna.samplers.TPESampler(seed=seed),
            )
            study.optimize(objective, n_trials=n_trials)
            last_study = study

            # study.best_params uses the bare trial names; add the pipeline step prefix
            best_params = {f'Modelo__{key}': value for key, value in study.best_params.items()}

            pipeline = make_pipeline(clone(model))
            pipeline.set_params(**best_params)
            pipeline.fit(X_train, y_train)

            y_pred = pipeline.predict(X_test)
            results.append({
                'Model': model_name,
                'F1 Score': f1_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'ROC AUC': roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]),
            })

    return pd.DataFrame(results), last_study

In [277]:
# Run nested CV (5 outer / 3 inner folds) with Optuna over every model in algo_dict,
# selecting hyperparameters by F1, and show the per-fold metrics.
# `last_study` is the most recent Optuna study created inside the function.
# NOTE(review): this overwrites the `results_df` produced by the earlier searches.
results_df, last_study = nested_cv_with_optuna(X, y, algo_dict, outer_folds=5, inner_folds=3, metric='f1')
display(results_df)

[I 2024-12-07 14:35:31,789] A new study created in memory with name: no-name-06bf27ea-8430-44a6-b060-ea1cc01dad60
[I 2024-12-07 14:35:31,813] Trial 0 finished with value: 0.7073247291693893 and parameters: {'max_depth': 31}. Best is trial 0 with value: 0.7073247291693893.
[I 2024-12-07 14:35:31,837] Trial 1 finished with value: 0.7409529859228353 and parameters: {'max_depth': 9}. Best is trial 1 with value: 0.7409529859228353.
[I 2024-12-07 14:35:31,860] Trial 2 finished with value: 0.6975176248153234 and parameters: {'max_depth': 23}. Best is trial 1 with value: 0.7409529859228353.
[I 2024-12-07 14:35:31,883] Trial 3 finished with value: 0.7039240403795591 and parameters: {'max_depth': 35}. Best is trial 1 with value: 0.7409529859228353.
[I 2024-12-07 14:35:31,908] Trial 4 finished with value: 0.6958521612261479 and parameters: {'max_depth': 45}. Best is trial 1 with value: 0.7409529859228353.
[I 2024-12-07 14:35:31,931] Trial 5 finished with value: 0.7129454661964623 and parameters: 

Unnamed: 0,Model,F1 Score,Precision,Recall,ROC AUC
0,Decision Tree,0.735346,0.744604,0.726316,0.869254
1,Decision Tree,0.759868,0.712963,0.81338,0.889624
2,Decision Tree,0.77265,0.750831,0.795775,0.892907
3,Decision Tree,0.7487,0.737201,0.760563,0.863736
4,Decision Tree,0.765568,0.79771,0.735915,0.888864
5,Decision Tree Max depth 5,0.746167,0.725166,0.768421,0.873617
6,Decision Tree Max depth 5,0.762393,0.740864,0.785211,0.887596
7,Decision Tree Max depth 5,0.777202,0.762712,0.792254,0.89441
8,Decision Tree Max depth 5,0.73719,0.694704,0.785211,0.872586
9,Decision Tree Max depth 5,0.772964,0.761092,0.785211,0.886758


In [278]:

# Visualise the Optuna study returned by nested_cv_with_optuna as `last_study`,
# i.e. the most recent study it created (last outer fold of the last model tuned).
# NOTE(review): the saved output of this cell shows plotly's `.show()` raising because
# nbformat was not available; presumably the kernel must be restarted after installing
# it - confirm.

# Optimization history
fig_opt_history = plot_optimization_history(last_study)
fig_opt_history.show()

# Slice plot (shows how the objective value changes with respect to a single parameter)
fig_slice = plot_slice(last_study)
fig_slice.show()

# Parallel coordinate plot (relationship among parameters and objective)
fig_parallel = plot_parallel_coordinate(last_study)
fig_parallel.show()

# Contour plot (shows objective values across two parameters)
fig_contour = plot_contour(last_study)
fig_contour.show()

# Parameter importances (which parameters matter most?)
fig_param_importances = plot_param_importances(last_study)
fig_param_importances.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed