### Working with databricks

In [1]:
import os, mlflow
from dotenv import load_dotenv

load_dotenv(override=True) # Cargar las variables de entorno desde el archivo .env
EXPERIMENT_NAME = "/Users/ivan.morales@iteso.mx/nyc-taxi-experiments" 

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

2025/10/21 20:44:27 INFO mlflow.tracking.fluent: Experiment with name '/Users/ivan.morales@iteso.mx/nyc-taxi-experiments' does not exist. Creating a new experiment.


### Importamos las librerias necesarias

In [5]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

In [6]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
df_train = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-02.parquet')

Feature Engineering + One Hot Encoding, pipeline simple inicial

In [8]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

In [9]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

Target

In [10]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

Definir los dataset como objetos de *mlflow* para poderlos trackear

In [11]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

Tuning de hiperparametros para un modelo `XGBoost` con *Optuna*

In [14]:
# Liberias necesarias

import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

In [13]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

Funcion objetivo

In [17]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-3), 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha",   math.exp(-5), math.exp(-1), log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", math.exp(-6), math.exp(-1), log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", math.exp(-1), math.exp(3), log=True),
        "objective": "reg:squarederror",  
        "seed": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con early stopping en el conjunto de validación
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=10,
        )

        # Predicción y métrica en validación
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.xgboost.log_model(
            booster,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    # Optuna minimiza el valor retornado
    return rmse

Flujo de Busqueda

In [18]:
mlflow.xgboost.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="XGBoost Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "xgboost",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=10,
    )

    # Evaluar y registrar la métrica final en validación
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.xgboost.log_model(
        booster,
        name="model",
        input_example=input_example,
        signature=signature
    )

[I 2025-10-21 21:01:51,755] A new study created in memory with name: no-name-dfea3b72-4fb8-4a8a-b9cd-33a793244e79


[0]	validation-rmse:5.97618
[1]	validation-rmse:5.96293
[2]	validation-rmse:5.97541
[3]	validation-rmse:5.97576
[4]	validation-rmse:5.97576
[5]	validation-rmse:5.97574
[6]	validation-rmse:5.97540
[7]	validation-rmse:5.97555
[8]	validation-rmse:5.97610
[9]	validation-rmse:5.97557
[10]	validation-rmse:5.97620


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
2025/10/21 21:02:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-21 21:02:19,018] Trial 0 finished with value: 5.976289297510136 and parameters: {'max_depth': 40, 'learning_rate': 0.8625543817410922, 'reg_alpha': 0.12593061066249622, 'reg_lambda': 0.049454235173237264, 'min_child_weight': 0.6866535292359801}. Best is trial 0 with value: 5.976289297510136.


🏃 View run big-hound-863 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/bf0c35cad04f4cefa2cc9fad1f79ead8
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.75443
[1]	validation-rmse:8.44234
[2]	validation-rmse:8.15748
[3]	validation-rmse:7.89941
[4]	validation-rmse:7.66447
[5]	validation-rmse:7.45313
[6]	validation-rmse:7.26115
[7]	validation-rmse:7.09232
[8]	validation-rmse:6.94195
[9]	validation-rmse:6.80677
[10]	validation-rmse:6.68661
[11]	validation-rmse:6.57883
[12]	validation-rmse:6.48359
[13]	validation-rmse:6.40011
[14]	validation-rmse:6.32434
[15]	validation-rmse:6.25910
[16]	validation-rmse:6.20305
[17]	validation-rmse:6.15157
[18]	validation-rmse:6.10657
[19]	validation-rmse:6.06838
[20]	validation-rmse:6.03423
[21]	validation-rmse:6.00470
[22]	validation-rmse:5.98021
[23]	validation-rmse:5.95935
[24]	validation-rmse:5.94300
[25]	validation-rmse:5.92949
[26]	vali

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:02:33,921] Trial 1 finished with value: 5.871727173718048 and parameters: {'max_depth': 19, 'learning_rate': 0.059264241587996896, 'reg_alpha': 0.21539205131792016, 'reg_lambda': 0.05006540936006931, 'min_child_weight': 6.248180561354165}. Best is trial 1 with value: 5.871727173718048.


🏃 View run skittish-vole-193 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/6953c5a4fdeb42399e79b0d97ab0014a
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:5.88480
[1]	validation-rmse:5.87918
[2]	validation-rmse:5.87752
[3]	validation-rmse:5.87758
[4]	validation-rmse:5.87414
[5]	validation-rmse:5.87534
[6]	validation-rmse:5.87592
[7]	validation-rmse:5.87536
[8]	validation-rmse:5.87471
[9]	validation-rmse:5.87431
[10]	validation-rmse:5.87445
[11]	validation-rmse:5.87422
[12]	validation-rmse:5.87418
[13]	validation-rmse:5.87385
[14]	validation-rmse:5.86068
[15]	validation-rmse:5.86163
[16]	validation-rmse:5.86347
[17]	validation-rmse:5.86320
[18]	validation-rmse:5.86328
[19]	validation-rmse:5.86315
[20]	validation-rmse:5.86330
[21]	validation-rmse:5.86192
[22]	validation-rmse:5.86069
[23]	validation-rmse:5.86122
[24]	validation-rmse:5.86108


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:02:45,823] Trial 2 finished with value: 5.861080956290079 and parameters: {'max_depth': 5, 'learning_rate': 0.9136840519292247, 'reg_alpha': 0.18820387978911576, 'reg_lambda': 0.007166739666045858, 'min_child_weight': 0.7613210498541186}. Best is trial 2 with value: 5.861080956290079.


🏃 View run lyrical-penguin-816 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/888a034bcd1242898fc0a9810997bd01
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.40909
[1]	validation-rmse:7.84573
[2]	validation-rmse:7.39572
[3]	validation-rmse:7.04071
[4]	validation-rmse:6.74761
[5]	validation-rmse:6.52071
[6]	validation-rmse:6.34414
[7]	validation-rmse:6.21028
[8]	validation-rmse:6.11245
[9]	validation-rmse:6.03994
[10]	validation-rmse:5.97905
[11]	validation-rmse:5.93916
[12]	validation-rmse:5.90916
[13]	validation-rmse:5.88816
[14]	validation-rmse:5.87149
[15]	validation-rmse:5.86217
[16]	validation-rmse:5.85431
[17]	validation-rmse:5.84908
[18]	validation-rmse:5.84536
[19]	validation-rmse:5.84496
[20]	validation-rmse:5.84430
[21]	validation-rmse:5.84316
[22]	validation-rmse:5.84282
[23]	validation-rmse:5.84295
[24]	validation-rmse:5.84285
[25]	validation-rmse:5.84337
[26

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:03:02,160] Trial 3 finished with value: 5.846650179216916 and parameters: {'max_depth': 21, 'learning_rate': 0.12402485733085497, 'reg_alpha': 0.054969638498598095, 'reg_lambda': 0.02148769342025257, 'min_child_weight': 1.1792947151892554}. Best is trial 3 with value: 5.846650179216916.


🏃 View run placid-hound-697 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/e4254815c81a4cedb7489ce99f426bac
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.63843
[1]	validation-rmse:8.23475
[2]	validation-rmse:7.88255
[3]	validation-rmse:7.57015
[4]	validation-rmse:7.29596
[5]	validation-rmse:7.06555
[6]	validation-rmse:6.86743
[7]	validation-rmse:6.70250
[8]	validation-rmse:6.56831
[9]	validation-rmse:6.45505
[10]	validation-rmse:6.36710
[11]	validation-rmse:6.29865
[12]	validation-rmse:6.23694
[13]	validation-rmse:6.19174
[14]	validation-rmse:6.15459
[15]	validation-rmse:6.12625
[16]	validation-rmse:6.10614
[17]	validation-rmse:6.09503
[18]	validation-rmse:6.09011
[19]	validation-rmse:6.08472
[20]	validation-rmse:6.08592
[21]	validation-rmse:6.08870
[22]	validation-rmse:6.09851
[23]	validation-rmse:6.11276
[24]	validation-rmse:6.12300
[25]	validation-rmse:6.13269
[26]	v

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:03:33,757] Trial 4 finished with value: 6.1879650227845096 and parameters: {'max_depth': 63, 'learning_rate': 0.07565903471570516, 'reg_alpha': 0.021678779375600917, 'reg_lambda': 0.015480241912324163, 'min_child_weight': 2.2802382585441565}. Best is trial 3 with value: 5.846650179216916.


🏃 View run melodic-skink-174 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/e1fbad0d47574545b2bf5638c088bafc
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.55025
[1]	validation-rmse:8.08926
[2]	validation-rmse:7.70090
[3]	validation-rmse:7.36877
[4]	validation-rmse:7.10317
[5]	validation-rmse:6.87696
[6]	validation-rmse:6.69591
[7]	validation-rmse:6.54729
[8]	validation-rmse:6.43376
[9]	validation-rmse:6.34405
[10]	validation-rmse:6.26905
[11]	validation-rmse:6.21067
[12]	validation-rmse:6.16674
[13]	validation-rmse:6.13451
[14]	validation-rmse:6.11458
[15]	validation-rmse:6.09688
[16]	validation-rmse:6.09145
[17]	validation-rmse:6.08687
[18]	validation-rmse:6.08824
[19]	validation-rmse:6.08446
[20]	validation-rmse:6.08665
[21]	validation-rmse:6.09259
[22]	validation-rmse:6.09975
[23]	validation-rmse:6.10610
[24]	validation-rmse:6.11013
[25]	validation-rmse:6.11605
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:04:23,551] Trial 5 finished with value: 6.143141049015581 and parameters: {'max_depth': 80, 'learning_rate': 0.0906292152736207, 'reg_alpha': 0.05270408847118816, 'reg_lambda': 0.04793414660944966, 'min_child_weight': 0.4429943118354462}. Best is trial 3 with value: 5.846650179216916.


🏃 View run bustling-kit-243 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/0309bc3fb92e4831be6233f802d07a01
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.62391
[1]	validation-rmse:8.21018
[2]	validation-rmse:7.84974
[3]	validation-rmse:7.53188
[4]	validation-rmse:7.26013
[5]	validation-rmse:7.02725
[6]	validation-rmse:6.82746
[7]	validation-rmse:6.66305
[8]	validation-rmse:6.52048
[9]	validation-rmse:6.40161
[10]	validation-rmse:6.30692
[11]	validation-rmse:6.22890
[12]	validation-rmse:6.16775
[13]	validation-rmse:6.11905
[14]	validation-rmse:6.08234
[15]	validation-rmse:6.05303
[16]	validation-rmse:6.03194
[17]	validation-rmse:6.01691
[18]	validation-rmse:6.00887
[19]	validation-rmse:6.00468
[20]	validation-rmse:6.00513
[21]	validation-rmse:6.00844
[22]	validation-rmse:6.02110
[23]	validation-rmse:6.03277
[24]	validation-rmse:6.04577
[25]	validation-rmse:6.05854
[26]	v

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:04:38,707] Trial 6 finished with value: 6.10488312838917 and parameters: {'max_depth': 62, 'learning_rate': 0.08304043435235499, 'reg_alpha': 0.008740449782948887, 'reg_lambda': 0.28491274207986833, 'min_child_weight': 17.505727836123448}. Best is trial 3 with value: 5.846650179216916.


🏃 View run amusing-conch-165 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/9a3dbc3ba4aa46e28a6d3da74c1076ac
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.33007
[1]	validation-rmse:7.72054
[2]	validation-rmse:7.24069
[3]	validation-rmse:6.88367
[4]	validation-rmse:6.60014
[5]	validation-rmse:6.42148
[6]	validation-rmse:6.29091
[7]	validation-rmse:6.20743
[8]	validation-rmse:6.16511
[9]	validation-rmse:6.14034
[10]	validation-rmse:6.15159
[11]	validation-rmse:6.18146
[12]	validation-rmse:6.20450
[13]	validation-rmse:6.23558
[14]	validation-rmse:6.27294
[15]	validation-rmse:6.30535
[16]	validation-rmse:6.33939
[17]	validation-rmse:6.37846
[18]	validation-rmse:6.41587


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:05:07,695] Trial 7 finished with value: 6.453628057951985 and parameters: {'max_depth': 82, 'learning_rate': 0.12416316985362412, 'reg_alpha': 0.009958672056108932, 'reg_lambda': 0.0758623422350637, 'min_child_weight': 2.1395809133199974}. Best is trial 3 with value: 5.846650179216916.


🏃 View run calm-newt-367 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/4a7782d6554049d0b4f03705273cd16d
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:7.90086
[1]	validation-rmse:7.10776
[2]	validation-rmse:6.60035
[3]	validation-rmse:6.26634
[4]	validation-rmse:6.07716
[5]	validation-rmse:5.96639
[6]	validation-rmse:5.90866
[7]	validation-rmse:5.87440
[8]	validation-rmse:5.85545
[9]	validation-rmse:5.84629
[10]	validation-rmse:5.84413
[11]	validation-rmse:5.84210
[12]	validation-rmse:5.84137
[13]	validation-rmse:5.84109
[14]	validation-rmse:5.84131
[15]	validation-rmse:5.84484
[16]	validation-rmse:5.84484
[17]	validation-rmse:5.84465
[18]	validation-rmse:5.84423
[19]	validation-rmse:5.84364
[20]	validation-rmse:5.84347
[21]	validation-rmse:5.84339
[22]	validation-rmse:5.84329


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:05:20,652] Trial 8 finished with value: 5.843410858063445 and parameters: {'max_depth': 15, 'learning_rate': 0.21992487468175848, 'reg_alpha': 0.007731550026907306, 'reg_lambda': 0.23377457337376373, 'min_child_weight': 1.0357439143907545}. Best is trial 8 with value: 5.843410858063445.


🏃 View run funny-dolphin-192 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/294d519bb150447e907b4eddadec305e
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.35943
[1]	validation-rmse:7.78324
[2]	validation-rmse:7.31316
[3]	validation-rmse:6.96617
[4]	validation-rmse:6.68598
[5]	validation-rmse:6.50056
[6]	validation-rmse:6.34935
[7]	validation-rmse:6.24739
[8]	validation-rmse:6.17133
[9]	validation-rmse:6.12732
[10]	validation-rmse:6.09083
[11]	validation-rmse:6.06449
[12]	validation-rmse:6.05188
[13]	validation-rmse:6.05277
[14]	validation-rmse:6.04414
[15]	validation-rmse:6.04347
[16]	validation-rmse:6.04031
[17]	validation-rmse:6.04716
[18]	validation-rmse:6.05163
[19]	validation-rmse:6.05908
[20]	validation-rmse:6.06734
[21]	validation-rmse:6.07233
[22]	validation-rmse:6.08056
[23]	validation-rmse:6.08381
[24]	validation-rmse:6.08952
[25]	validation-rmse:6.09652


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-21 21:06:00,852] Trial 9 finished with value: 6.099423741638298 and parameters: {'max_depth': 68, 'learning_rate': 0.1268351874747755, 'reg_alpha': 0.05394836382863035, 'reg_lambda': 0.03814164293595655, 'min_child_weight': 0.7706028272535065}. Best is trial 8 with value: 5.843410858063445.


🏃 View run resilient-yak-932 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/32bba52d25c5416397e76f3d81c4226a
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:7.90086
[1]	validation-rmse:7.10776
[2]	validation-rmse:6.60035
[3]	validation-rmse:6.26634
[4]	validation-rmse:6.07716
[5]	validation-rmse:5.96639
[6]	validation-rmse:5.90866
[7]	validation-rmse:5.87440
[8]	validation-rmse:5.85545
[9]	validation-rmse:5.84629
[10]	validation-rmse:5.84413
[11]	validation-rmse:5.84210
[12]	validation-rmse:5.84137
[13]	validation-rmse:5.84109
[14]	validation-rmse:5.84131
[15]	validation-rmse:5.84484
[16]	validation-rmse:5.84484
[17]	validation-rmse:5.84465
[18]	validation-rmse:5.84423
[19]	validation-rmse:5.84364
[20]	validation-rmse:5.84347
[21]	validation-rmse:5.84339
[22]	validation-rmse:5.84329
[23]	validation-rmse:5.84341


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


🏃 View run XGBoost Hyperparameter Optimization (Optuna) at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/8daec81555674b8885ef792bd8f8351d
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596


Registrar modelo en `model registry`

In [19]:
model_name = "workspace.default.nyc-taxi-model"

Metodo 1 : `manual`

In [20]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="workspace.default.nyc-taxi-model"
)

Successfully registered model 'workspace.default.nyc-taxi-model'.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '1' of model 'workspace.default.nyc-taxi-model'.


Metodo 2 : `automatico`

In [21]:
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"],
    output_format="list"
)

# Obtener el mejor run
if len(runs) > 0:
    best_run = runs[0]
    print("🏆 Champion Run encontrado:")
    print(f"Run ID: {best_run.info.run_id}")
    print(f"RMSE: {best_run.data.metrics['rmse']}")
    print(f"Params: {best_run.data.params}")
else:
    print("⚠️ No se encontraron runs con métrica RMSE.")

🏆 Champion Run encontrado:
Run ID: 294d519bb150447e907b4eddadec305e
RMSE: 5.843410858063445
Params: {'custom_metric': 'None', 'early_stopping_rounds': '10', 'learning_rate': '0.21992487468175848', 'max_depth': '15', 'maximize': 'None', 'min_child_weight': '1.0357439143907545', 'num_boost_round': '100', 'objective': 'reg:squarederror', 'reg_alpha': '0.007731550026907306', 'reg_lambda': '0.23377457337376373', 'seed': '42', 'verbose_eval': 'True'}


In [22]:
best_run

<Run: data=<RunData: metrics={'best_iteration': 13.0,
 'rmse': 5.843410858063445,
 'stopped_iteration': 22.0,
 'validation-rmse': 5.8410895189847905}, params={'custom_metric': 'None',
 'early_stopping_rounds': '10',
 'learning_rate': '0.21992487468175848',
 'max_depth': '15',
 'maximize': 'None',
 'min_child_weight': '1.0357439143907545',
 'num_boost_round': '100',
 'objective': 'reg:squarederror',
 'reg_alpha': '0.007731550026907306',
 'reg_lambda': '0.23377457337376373',
 'seed': '42',
 'verbose_eval': 'True'}, tags={'mlflow.parentRunId': '8daec81555674b8885ef792bd8f8351d',
 'mlflow.rootRunId': '8daec81555674b8885ef792bd8f8351d',
 'mlflow.runColor': '#5bc5db',
 'mlflow.runName': 'funny-dolphin-192',
 'mlflow.source.name': 'c:\\Users\\ivanm\\5to Semestre\\Proyecto de Ciencia de '
                       'Datos\\nyc-taxi-predictions-2025\\.venv\\Lib\\site-packages\\ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'ivan.morales@iteso.mx',
 'model_family': 'xgboost'

In [23]:
best_params

{'max_depth': 15,
 'learning_rate': 0.21992487468175848,
 'reg_alpha': 0.007731550026907306,
 'reg_lambda': 0.23377457337376373,
 'min_child_weight': 1.0357439143907545,
 'seed': 42,
 'objective': 'reg:squarederror'}

In [24]:
best_run.info.run_id

'294d519bb150447e907b4eddadec305e'

In [25]:
result = mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name=model_name
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '2' of model 'workspace.default.nyc-taxi-model'.


Asignar Alias

In [27]:
from mlflow import MlflowClient
from datetime import datetime

client = MlflowClient()

model_version = result.version
new_alias = "Champion"

client.set_registered_model_alias(
    name=model_name,
    alias=new_alias,
    version=result.version
)

date = datetime.today()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=[], creation_timestamp=1761103456679, current_stage=None, deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='The model version 2 was transitioned to Champion on 2025-10-21 21:25:50.778872', last_updated_timestamp=1761103550857, metrics=[<Metric: dataset_digest='', dataset_name='', key='best_iteration', model_id='m-6ed9b2791ef145a19597e5a3802fe516', run_id='294d519bb150447e907b4eddadec305e', step=0, timestamp=1761102308699, value=13.0>,
 <Metric: dataset_digest='', dataset_name='', key='rmse', model_id='m-6ed9b2791ef145a19597e5a3802fe516', run_id='294d519bb150447e907b4eddadec305e', step=0, timestamp=1761102312135, value=5.843410858063445>,
 <Metric: dataset_digest='', dataset_name='', key='stopped_iteration', model_id='m-6ed9b2791ef145a19597e5a3802fe516', run_id='294d519bb150447e907b4eddadec305e', s

Obtener los modelos del `model registry`

In [28]:
import mlflow.pyfunc

model_version_uri = f"models:/{model_name}@Champion"

champion_version = mlflow.pyfunc.load_model(model_version_uri)
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


array([ 5.177853, 25.42249 , 28.921564, ..., 20.84267 , 13.736658,
       23.873058], shape=(44218,), dtype=float32)