# Objetivo do Projeto

O objetivo do projeto é demonstrar a aplicação das ferramentas de tracking do MLflow e da otimização de hiperparâmetros com Optuna, aplicadas ao modelo XGBoost.

# Bibliotecas

In [None]:
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from eda_utils import plot_correlation_with_demand
from eda_utils import plot_feature_importance
from eda_utils import plot_residuals
import xgboost as xgb
import mlflow
import optuna
from sklearn.metrics import mean_squared_error
from modeling_utils import champion_callback

import warnings
warnings.filterwarnings("ignore")

# Carregando o dataset

In [None]:
# Load the raw bike dataset from CSV and print its column / dtype / non-null summary.
data = pd.read_csv('../data/raw/bike.csv')
data.info()

In [None]:
# Preview the first rows of the dataset.
data.head()

# Análise Exploratória

In [None]:
# Convert the 'dteday' column to datetime so it can be used for date-based
# resampling and for the chronological train/validation split.
data['dteday'] = pd.to_datetime(data['dteday'])

In [None]:
# Aggregate the rental count 'cnt' to its daily mean, indexed by date, and plot it.
# The trailing semicolon suppresses the notebook's textual echo of the Axes object.
daily_demand = data.set_index('dteday')['cnt'].resample('D').mean()
daily_demand.plot(title='Demanda de Bikes');

In [None]:
# Plot the correlation of the independent variables with the target using the
# project helper from eda_utils. The returned object is kept in `correlation_plot`
# because it is logged to MLflow later as a figure.
# NOTE(review): `save_path` presumably also writes the figure to disk — confirm in eda_utils.
correlation_plot = plot_correlation_with_demand(data, save_path="../report/figures/correlation_plot.png")
correlation_plot

In [None]:
# Heatmap of the pairwise correlation between the numeric columns.
# The frame still contains 'dteday', which is not numeric (string or datetime).
# pandas >= 2.0 no longer drops such columns implicitly in corr(), so the
# numeric-only restriction is requested explicitly.
plt.figure(figsize=(12, 8))
plt.title('Matriz de Correlação', fontsize=18)
sns.heatmap(data.corr(numeric_only=True), annot=True, fmt=".1f");

# Pré-Processamento

In [None]:
# Chronological split at 2012-08-10: earlier rows form the training set and the
# remaining rows the validation set (intended as roughly 80% / 20%).
split_date = '2012-08-10'
train = data[data['dteday'] < split_date]
valid = data[data['dteday'] >= split_date]

# Fraction of the rows that ended up in the training set
round(len(train) / len(data), 2)

In [None]:
# Remove the 'instant' and 'dteday' columns before modeling.
# `train` and `valid` are row selections of `data`; rebinding them to the frames
# returned by drop() avoids mutating those selections in place, which can
# trigger pandas' SettingWithCopyWarning (hidden here because warnings are
# globally suppressed).
train = train.drop(columns=['instant', 'dteday'])
valid = valid.drop(columns=['instant', 'dteday'])

In [None]:
# Persist the pre-processed training and validation sets as CSV, without the index.
for frame, destination in (
    (train, '../data/processed/train.csv'),
    (valid, '../data/processed/valid.csv'),
):
    frame.to_csv(destination, index=False)

In [None]:
# Split each set into predictors (every column except the target) and the target 'cnt'.
target_column = 'cnt'
train_x, train_y = train.drop(columns=target_column), train[target_column]
valid_x, valid_y = valid.drop(columns=target_column), valid[target_column]

# Wrap both sets in XGBoost's DMatrix container, which xgb.train / predict consume.
dtrain = xgb.DMatrix(data=train_x, label=train_y)
dvalid = xgb.DMatrix(data=valid_x, label=valid_y)

# Configurando o MLflow Experiment

In [None]:
# Point the MLflow client at the tracking server expected at http://localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
# Create the "Bike Demand" experiment, or reuse it when it already exists.
# mlflow.create_experiment raises when the name is already taken, which made
# this cell fail on every re-run of the notebook.
experiment_name = "Bike Demand"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = existing_experiment.experiment_id

In [None]:
# Display the experiment id.
experiment_id

In [None]:
# Select the experiment that subsequent runs will be recorded under.
mlflow.set_experiment(experiment_id=experiment_id)

# Hyperparameter Tuning

**Logging para gerenciar as saídas**

À medida que realizamos o ajuste de hiperparâmetros com o Optuna, é essencial entender que o processo pode gerar uma infinidade de execuções. Na verdade, tantas que a saída padrão (stdout) do registrador padrão pode rapidamente ficar inundada, produzindo páginas e mais páginas de relatórios de registro.

Embora a verbosidade da configuração de registro padrão seja inegavelmente valiosa durante a fase de desenvolvimento do código, iniciar um teste em grande escala pode resultar em uma quantidade avassaladora de informações. Considerando isso, registrar cada detalhe no stdout se torna menos prático, especialmente quando temos ferramentas dedicadas como o MLflow para rastrear meticulosamente nossos experimentos.

In [None]:
# Override Optuna's default logging so that only ERROR-level messages are emitted;
# per-trial details are tracked in MLflow instead of stdout.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [None]:
def objective(trial):
    """Run one Optuna trial: train XGBoost with sampled hyperparameters.

    Each call opens a nested MLflow run, samples a hyperparameter set from
    ``trial``, trains a booster on the module-level ``dtrain`` and scores it
    against ``dvalid`` / ``valid_y``. The parameters used and the resulting
    MSE and RMSE are logged to that nested run.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The mean squared error on the validation set, used as the trial's
        objective value.
    """
    with mlflow.start_run(nested=True):
        # Fixed settings shared by every trial.
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
        }

        # Booster type; "gblinear" fits linear functions instead of trees.
        params["booster"] = trial.suggest_categorical(
            "booster", ["gbtree", "gblinear", "dart"]
        )
        # L2 ("lambda") and L1 ("alpha") regularization weights, log-uniform.
        for reg_name in ("lambda", "alpha"):
            params[reg_name] = trial.suggest_float(reg_name, 1e-8, 1.0, log=True)
        # Row sampling ratio ("subsample") and per-tree column sampling ratio
        # ("colsample_bytree").
        for ratio_name in ("subsample", "colsample_bytree"):
            params[ratio_name] = trial.suggest_float(ratio_name, 0.2, 1.0)

        # The remaining knobs are only sampled for the tree-based boosters.
        if params["booster"] in ("gbtree", "dart"):
            # Maximum tree depth: higher values mean more complex trees.
            params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            # Minimum child weight: larger values make the tree more conservative.
            params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
            # Learning rate, then "gamma" (controls pruning), both log-uniform.
            for tree_name in ("learning_rate", "gamma"):
                params[tree_name] = trial.suggest_float(tree_name, 1e-8, 1.0, log=True)

        # Fit on the training matrix and score on the validation matrix.
        booster_model = xgb.train(params, dtrain)
        predictions = booster_model.predict(dvalid)
        mse = mean_squared_error(valid_y, predictions)

        # Record the trial's configuration and both error metrics in MLflow.
        mlflow.log_params(params)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", math.sqrt(mse))

    return mse


# Orchestrating Hyperparameter Tuning, Model Training, and Logging with MLflow

In [None]:
# Name given to the parent MLflow run that groups the tuning trials.
run_name = "first_attempt"

In [None]:
# Open the parent run; every call to `objective` opens its own nested run inside it.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study; the objective value (validation MSE) is minimized.
    study = optuna.create_study(direction="minimize")

    # Execute the hyperparameter optimization trials.
    # `champion_callback` (from modeling_utils) is passed as an Optuna callback;
    # presumably it reports new best trials to keep the output short — confirm there.
    study.optimize(objective, n_trials=500, callbacks=[champion_callback])

    # Record the best trial's sampled parameters and its MSE / RMSE on the parent run.
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))

    # Tag the parent run with descriptive metadata.
    mlflow.set_tags(
        tags={
            "project": "Bike Demand Project",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Train the model that will be logged, using the best trial's parameters.
    # NOTE(review): study.best_params holds only the values suggested through
    # `trial`, so the fixed "objective" / "eval_metric" entries set inside
    # `objective` are not passed here — confirm that relying on the XGBoost
    # defaults is intended.
    model = xgb.train(study.best_params, dtrain)

    # Log the correlation plot created during the exploratory analysis.
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    # Log the feature importances plot for the trained model.
    importances = plot_feature_importance(model, booster=study.best_params.get("booster"))
    mlflow.log_figure(figure=importances, artifact_file="feature_importances.png")

    # Log the residuals plot computed on the validation set.
    residuals = plot_residuals(model, dvalid, valid_y)
    mlflow.log_figure(figure=residuals, artifact_file="residuals.png")

    # Artifact sub-path under which the model is stored in this run.
    artifact_path = "model"

    # Log the trained model together with a one-row input example and metadata.
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        input_example=train_x.iloc[[0]],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store.
    # This must run while the parent run is still active.
    model_uri = mlflow.get_artifact_uri(artifact_path)


In [None]:
# Show the ten trials with the lowest objective value.
study.trials_dataframe().sort_values(by='value', ascending=True).head(10)

In [None]:
# Display the hyperparameters sampled by the best trial.
study.best_params

# Loading the Trained Model with MLflow

In [None]:
# Display the artifact URI of the logged model.
model_uri

In [None]:
# Load the logged XGBoost model back from the artifact store via its URI.
loaded = mlflow.xgboost.load_model(model_uri)

In [None]:
# Score the validation features with the reloaded model.
batch_dmatrix = xgb.DMatrix(data=valid_x)
inference = loaded.predict(batch_dmatrix)

# Work on a copy of the validation set with the predictions appended as a new
# column, leaving `valid` itself untouched.
infer_data = valid.assign(predicted_demand=inference)


In [None]:
# Display the raw predictions.
inference

In [None]:
# Cast the predictions to integers.
# NOTE(review): astype(int) truncates toward zero rather than rounding to the
# nearest integer — confirm this is the intended conversion.
infer_data["predicted_demand"] = infer_data["predicted_demand"].astype(int)

In [None]:
# Display the validation rows together with their predicted demand.
infer_data

In [None]:
# Compare summary statistics of the actual count and the predicted demand.
infer_data[['cnt','predicted_demand']].describe()

In [None]:
# List the rows whose predicted demand is zero or negative.
infer_data[infer_data['predicted_demand'] <= 0]