## Registar um pipeline no MLflow

In [1]:
import mlflow
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.metrics import  precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score,auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from mlflow.models import infer_signature



In [2]:
# Notebook-wide configuration shared by the cells below.
target_col = "default.payment.next.month"  # name of the column used as the label
seed = 42  # random seed passed to the split and to the estimators
root_path = "../data/"  # relative directory holding the input CSV

## Definir o servidor de tracking onde as experiências são guardadas

In [3]:
# NOTE(review): Path is not used in any cell visible here — confirm whether this import is still needed.
from pathlib import Path

# Address of the MLflow tracking server that runs are logged to (localhost, port 5000).
uri = "http://127.0.0.1:5000"

# Point the MLflow client at that server for all subsequent MLflow calls.
mlflow.set_tracking_uri(uri)

## Definir a experiência ativa

In [4]:
# Select the experiment that subsequent runs are recorded under; the returned
# Experiment object is what the cell displays.
mlflow.set_experiment("Lending Prediction Experiment")

2025/04/06 18:36:09 INFO mlflow.tracking.fluent: Experiment with name 'Lending Prediction Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/851929301218985109', creation_time=1743960969662, experiment_id='851929301218985109', last_update_time=1743960969662, lifecycle_stage='active', name='Lending Prediction Experiment', tags={}>

## Criar os datasets


In [5]:
# Load the raw lending data from the configured data directory.
file_path = root_path + 'lending_data.csv'
df = pd.read_csv(file_path)

# Drop the 'ID' column so it does not end up among the features.
df = df.drop('ID', axis = 1)

# Hold out 20% of the rows as the test set; the split is seeded so it is repeatable.
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = seed)

# Train on a 20% subsample of the training split. The seed is passed explicitly:
# without it the subsample differs on every execution, so the logged models and
# metrics could not be reproduced from the notebook.
train_set_staged = train_set.sample(frac=0.2, random_state=seed)

# Features / label for training (subsample) ...
X_train = train_set_staged.drop([target_col], axis = 'columns')
y_train = train_set_staged[target_col]

# ... and for evaluation (full held-out test split).
X_test = test_set.drop([target_col], axis = 1)
y_test = test_set[target_col]

X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
10858,40000.0,1,1,1,34,-1,-1,-1,-1,-1,...,8374.0,7505.0,7515.0,9120.0,7419.0,8382.0,7505.0,7515.0,9120.0,6180.0
13742,90000.0,2,1,1,38,-1,-1,0,0,-1,...,16360.0,41936.0,538.0,2716.0,15271.0,2000.0,30000.0,538.0,2716.0,652.0
2296,300000.0,2,3,2,38,0,0,0,0,-1,...,44508.0,67484.0,500.0,7391.0,10000.0,6000.0,30000.0,500.0,7391.0,7000.0
7878,130000.0,1,2,2,29,0,0,0,0,0,...,130128.0,101794.0,101826.0,101763.0,6658.0,5146.0,4000.0,4000.0,3866.0,3435.0
8399,170000.0,1,1,1,38,-2,-2,-2,-2,-2,...,0.0,1012.0,0.0,0.0,0.0,0.0,1012.0,0.0,0.0,492.0


In [6]:
# Candidate models and the hyper-parameter grid searched for each one.
# Every entry is a tuple: (run/model name, estimator or pipeline, parameter grid).
# For pipelines, grid keys use the "<step name>__<parameter>" form.
models_and_params = [
    (
        'ann',
        Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('Classifier', MLPClassifier(solver = 'lbfgs',  random_state = seed, max_iter = 1000))
        ]),
        {
          # learning_rate_init is deliberately not tuned: scikit-learn only uses it
          # with the 'sgd' and 'adam' solvers, so with solver='lbfgs' every value
          # yields the same fit and merely multiplied the number of fits by four.
          "Classifier__hidden_layer_sizes": [(20,), (20,10), (20, 10, 2)]
        }
    ),
    (
        'random_forest',
        RandomForestClassifier(random_state = seed,  class_weight = 'balanced'),
        {
            'n_estimators':[10, 100, 300, 1000]
        }
    ),
    (
        'decision_tree',
        tree.DecisionTreeClassifier(random_state = seed,  class_weight = 'balanced'),
        {
            'max_depth':[3, 6],
            'min_samples_split': [2, 4, 10]
        }
    ),
    (
        'svm',
        Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('Classifier', SVC(random_state = seed, class_weight = 'balanced', gamma = 'scale', probability = True, verbose = True))
        ]),
        {
          "Classifier__C": [0.1, 1, 10],
          "Classifier__kernel": ["linear","rbf"]
        }
    ),
    (
        'knn',
        Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('Classifier', KNeighborsClassifier())
        ]),
        {
          "Classifier__n_neighbors": [1,10,100],
        }
    ),
    (
        'lr',
        Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            ('Classifier', LogisticRegression(max_iter = 500, solver = 'lbfgs', random_state = seed, class_weight = 'balanced'))
        ]),
        {
          "Classifier__C": [0.001, 0.01, 0.1, 1, 10, 100]
        }
    )
]

In [7]:
# Grid-search every candidate model, then log its best parameters, scores and the
# fitted estimator to MLflow, one run per model.
for model_str, clf, params in models_and_params:
    print(f"Currently assessing {model_str}")

    # One MLflow run per model, named after the model.
    with mlflow.start_run(run_name=model_str):

        # 1. Run a 5-fold cross-validated grid search on the training data.
        grid_search = GridSearchCV(clf, params, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # 2. Retrieve the best parameter combination and its cross-validation score.
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # 3. Log the selected hyper-parameters and the CV score.
        mlflow.log_params(best_params)
        mlflow.log_metric("best_cv_score", best_score)

        # 4. Log and register the refitted best estimator.
        #    Only a few rows are passed as the input example: it is stored with the
        #    model, so passing the whole training frame would copy the training
        #    data into every logged model.
        mlflow.sklearn.log_model(
            grid_search.best_estimator_,
            artifact_path="sklearn-model",
            input_example=X_train.head(),
            registered_model_name=model_str,
        )

        # 5. Predict on the held-out test set.
        y_preds = grid_search.best_estimator_.predict(X_test)

        # 6. Log the test-set accuracy.
        acc = accuracy_score(y_test, y_preds)
        mlflow.log_metric("accuracy", acc)

        print(f"✅ Model {model_str} completed with best score: {best_score:.4f}")


print("🎯 All models have been evaluated and logged in MLflow!")

Currently assessing ann


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
Successfully registered model 'ann'.
2025/04/06 18:38:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ann, version 1
Created version '1' of model 'ann'.


✅ Model ann completed with best score: 0.8119
🏃 View run ann at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/e8195474cab34731b27335fbb13ef972
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
Currently assessing random_forest


Successfully registered model 'random_forest'.
2025/04/06 18:39:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 1
Created version '1' of model 'random_forest'.


✅ Model random_forest completed with best score: 0.8181
🏃 View run random_forest at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/ae9f7f221b5248558dd51f6f2121a315
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
Currently assessing decision_tree


Successfully registered model 'decision_tree'.
2025/04/06 18:40:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree, version 1
Created version '1' of model 'decision_tree'.


✅ Model decision_tree completed with best score: 0.7504
🏃 View run decision_tree at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/ee0dd96b03514a0aa2575c70ccfa9219
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
Currently assessing svm
[LibSVM]

Successfully registered model 'svm'.
2025/04/06 18:43:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm, version 1
Created version '1' of model 'svm'.


✅ Model svm completed with best score: 0.8087
🏃 View run svm at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/4fb1f7ac17534ff284b8eb826cc7019c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
Currently assessing knn


Successfully registered model 'knn'.
2025/04/06 18:43:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: knn, version 1
Created version '1' of model 'knn'.


✅ Model knn completed with best score: 0.8081
🏃 View run knn at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/c533f7cde41142f680fb198f90849c05
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
Currently assessing lr


Successfully registered model 'lr'.
2025/04/06 18:43:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lr, version 1
Created version '1' of model 'lr'.


✅ Model lr completed with best score: 0.7042
🏃 View run lr at: http://127.0.0.1:5000/#/experiments/851929301218985109/runs/878fff7d043847e099d38e2264cd3fe5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/851929301218985109
🎯 All models have been evaluated and logged in MLflow!
