# Tarea 5
- Ivan Morales, 10 de Octubre de 2025

Realizaremos dos experimentos, cada uno con child experiments

    - Gradient Boost
    - Random Forest

### Importar las librerias necesarias y establecer conexion con Databricks

In [4]:
import os, mlflow
from dotenv import load_dotenv
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

load_dotenv(override=True) # Cargar las variables de entorno desde el archivo .env
EXPERIMENT_NAME = "/Users/ivan.morales@iteso.mx/nyc-taxi-experiments" 

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

Preparacion de los datos

In [5]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

df_train_ene = read_dataframe('../data/green_tripdata_2025-01.parquet')
df_train_feb = read_dataframe('../data/green_tripdata_2025-02.parquet')
df_val = read_dataframe('../data/green_tripdata_2025-03.parquet')

In [6]:
df_train_ene.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44218 entries, 0 to 46620
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               44218 non-null  int32         
 1   lpep_pickup_datetime   44218 non-null  datetime64[us]
 2   lpep_dropoff_datetime  44218 non-null  datetime64[us]
 3   store_and_fwd_flag     42295 non-null  object        
 4   RatecodeID             42295 non-null  float64       
 5   PULocationID           44218 non-null  object        
 6   DOLocationID           44218 non-null  object        
 7   passenger_count        42295 non-null  float64       
 8   trip_distance          44218 non-null  float64       
 9   fare_amount            44218 non-null  float64       
 10  extra                  44218 non-null  float64       
 11  mta_tax                44218 non-null  float64       
 12  tip_amount             44218 non-null  float64       
 13  tolls_

In [7]:
df_train = pd.concat([df_train_ene, df_train_feb])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88436 entries, 0 to 46620
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               88436 non-null  int32         
 1   lpep_pickup_datetime   88436 non-null  datetime64[us]
 2   lpep_dropoff_datetime  88436 non-null  datetime64[us]
 3   store_and_fwd_flag     84590 non-null  object        
 4   RatecodeID             84590 non-null  float64       
 5   PULocationID           88436 non-null  object        
 6   DOLocationID           88436 non-null  object        
 7   passenger_count        84590 non-null  float64       
 8   trip_distance          88436 non-null  float64       
 9   fare_amount            88436 non-null  float64       
 10  extra                  88436 non-null  float64       
 11  mta_tax                88436 non-null  float64       
 12  tip_amount             88436 non-null  float64       
 13  tolls_

Concatemos los datasets de enero y febrero para construir una base mas robusta para validar con los datos de marzo.

Feature Engineering + One Hot Encoding, pipeline simple inicial

In [8]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

Definir los dataset como objetos de *mlflow* para poderlos trackear

In [9]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01&02")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-03")

# Modelos

In [10]:
# XGBOOST 
import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

In [11]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [12]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-3), 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha",   math.exp(-5), math.exp(-1), log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", math.exp(-6), math.exp(-1), log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", math.exp(-1), math.exp(3), log=True),
        "objective": "reg:squarederror",  
        "seed": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con early stopping en el conjunto de validación
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=10,
        )

        # Predicción y métrica en validación
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.xgboost.log_model(
            booster,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    # Optuna minimiza el valor retornado
    return rmse

Flujo de Busqueda

In [13]:
mlflow.xgboost.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="XGBoost Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "xgboost",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=10,
    )

    # Evaluar y registrar la métrica final en validación
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.xgboost.log_model(
        booster,
        name="model",
        input_example=input_example,
        signature=signature
    )

[I 2025-10-28 20:31:52,111] A new study created in memory with name: no-name-7ffa3aec-134d-4296-a5ea-77545d4ce509


[0]	validation-rmse:6.50020
[1]	validation-rmse:6.44713
[2]	validation-rmse:6.44851
[3]	validation-rmse:6.44788
[4]	validation-rmse:6.44758
[5]	validation-rmse:6.44908
[6]	validation-rmse:6.44861
[7]	validation-rmse:6.44855
[8]	validation-rmse:6.44929
[9]	validation-rmse:6.44927
[10]	validation-rmse:6.44808
[11]	validation-rmse:6.44815


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
2025/10/28 20:32:15 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-28 20:32:18,318] Trial 0 finished with value: 6.448153607973611 and parameters: {'max_depth': 40, 'learning_rate': 0.8625543817410922, 'reg_alpha': 0.12593061066249622, 'reg_lambda': 0.049454235173237264, 'min_child_weight': 0.6866535292359801}. Best is trial 0 with value: 6.448153607973611.


🏃 View run stylish-squirrel-627 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/5d85fbd06a79494db6c1f06671b3e02e
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:9.39331
[1]	validation-rmse:9.06545
[2]	validation-rmse:8.77322
[3]	validation-rmse:8.50858
[4]	validation-rmse:8.26164
[5]	validation-rmse:8.04744
[6]	validation-rmse:7.85184
[7]	validation-rmse:7.67441
[8]	validation-rmse:7.51775
[9]	validation-rmse:7.37410
[10]	validation-rmse:7.24928
[11]	validation-rmse:7.13915
[12]	validation-rmse:7.04022
[13]	validation-rmse:6.95339
[14]	validation-rmse:6.87086
[15]	validation-rmse:6.80144
[16]	validation-rmse:6.74283
[17]	validation-rmse:6.68987
[18]	validation-rmse:6.64133
[19]	validation-rmse:6.59762
[20]	validation-rmse:6.56128
[21]	validation-rmse:6.52847
[22]	validation-rmse:6.49955
[23]	validation-rmse:6.47610
[24]	validation-rmse:6.45523
[25]	validation-rmse:6.43780
[2

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:32:35,280] Trial 1 finished with value: 6.342751049216642 and parameters: {'max_depth': 19, 'learning_rate': 0.059264241587996896, 'reg_alpha': 0.21539205131792016, 'reg_lambda': 0.05006540936006931, 'min_child_weight': 6.248180561354165}. Best is trial 1 with value: 6.342751049216642.


🏃 View run fortunate-whale-764 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/9bdbe9a32f014581aeb6f8adb4360a9a
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:6.40088
[1]	validation-rmse:6.37426
[2]	validation-rmse:6.37688
[3]	validation-rmse:6.37673
[4]	validation-rmse:6.37484
[5]	validation-rmse:6.37329
[6]	validation-rmse:6.37279
[7]	validation-rmse:6.37328
[8]	validation-rmse:6.37399
[9]	validation-rmse:6.37447
[10]	validation-rmse:6.37435
[11]	validation-rmse:6.37470
[12]	validation-rmse:6.37477
[13]	validation-rmse:6.37541
[14]	validation-rmse:6.36442
[15]	validation-rmse:6.36243
[16]	validation-rmse:6.36139
[17]	validation-rmse:6.36159
[18]	validation-rmse:6.36161
[19]	validation-rmse:6.36188
[20]	validation-rmse:6.36180
[21]	validation-rmse:6.35968
[22]	validation-rmse:6.35933
[23]	validation-rmse:6.36098
[24]	validation-rmse:6.36094
[25]	validation-rmse:6.36090
[26

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:32:48,248] Trial 2 finished with value: 6.362498610758229 and parameters: {'max_depth': 5, 'learning_rate': 0.9136840519292247, 'reg_alpha': 0.18820387978911576, 'reg_lambda': 0.007166739666045858, 'min_child_weight': 0.7613210498541186}. Best is trial 1 with value: 6.342751049216642.


🏃 View run rare-smelt-432 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/911f83c347e74616ad2b11d55591d2da
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:9.04875
[1]	validation-rmse:8.48014
[2]	validation-rmse:8.00174
[3]	validation-rmse:7.63227
[4]	validation-rmse:7.33562
[5]	validation-rmse:7.09551
[6]	validation-rmse:6.91687
[7]	validation-rmse:6.76362
[8]	validation-rmse:6.65808
[9]	validation-rmse:6.57570
[10]	validation-rmse:6.52171
[11]	validation-rmse:6.47661
[12]	validation-rmse:6.44133
[13]	validation-rmse:6.41692
[14]	validation-rmse:6.39647
[15]	validation-rmse:6.38382
[16]	validation-rmse:6.37170
[17]	validation-rmse:6.36426
[18]	validation-rmse:6.35918
[19]	validation-rmse:6.35468
[20]	validation-rmse:6.35233
[21]	validation-rmse:6.35011
[22]	validation-rmse:6.34931
[23]	validation-rmse:6.34779
[24]	validation-rmse:6.34820
[25]	validation-rmse:6.34779
[26]	val

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:33:07,171] Trial 3 finished with value: 6.349492328945935 and parameters: {'max_depth': 21, 'learning_rate': 0.12402485733085497, 'reg_alpha': 0.054969638498598095, 'reg_lambda': 0.02148769342025257, 'min_child_weight': 1.1792947151892554}. Best is trial 1 with value: 6.342751049216642.


🏃 View run whimsical-jay-309 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/2611f4f463fa48cdbc407ec017a760b4
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:9.28312
[1]	validation-rmse:8.87218
[2]	validation-rmse:8.51389
[3]	validation-rmse:8.19673
[4]	validation-rmse:7.93536
[5]	validation-rmse:7.69780
[6]	validation-rmse:7.48470
[7]	validation-rmse:7.32400
[8]	validation-rmse:7.18195
[9]	validation-rmse:7.06050
[10]	validation-rmse:6.96124
[11]	validation-rmse:6.88069
[12]	validation-rmse:6.81242
[13]	validation-rmse:6.74873
[14]	validation-rmse:6.70301
[15]	validation-rmse:6.66827
[16]	validation-rmse:6.64371
[17]	validation-rmse:6.62069
[18]	validation-rmse:6.60329
[19]	validation-rmse:6.59396
[20]	validation-rmse:6.58741
[21]	validation-rmse:6.58493
[22]	validation-rmse:6.58219
[23]	validation-rmse:6.58118
[24]	validation-rmse:6.57783
[25]	validation-rmse:6.57520
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:33:52,279] Trial 4 finished with value: 6.603076047502704 and parameters: {'max_depth': 63, 'learning_rate': 0.07565903471570516, 'reg_alpha': 0.021678779375600917, 'reg_lambda': 0.015480241912324163, 'min_child_weight': 2.2802382585441565}. Best is trial 1 with value: 6.342751049216642.


🏃 View run mysterious-tern-886 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/fc8d838081994bfd9cf7b9bd48c7f27f
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:9.20277
[1]	validation-rmse:8.74255
[2]	validation-rmse:8.34117
[3]	validation-rmse:8.00725
[4]	validation-rmse:7.73580
[5]	validation-rmse:7.50561
[6]	validation-rmse:7.30006
[7]	validation-rmse:7.14771
[8]	validation-rmse:7.01802
[9]	validation-rmse:6.92437
[10]	validation-rmse:6.85499
[11]	validation-rmse:6.79653
[12]	validation-rmse:6.75475
[13]	validation-rmse:6.71663
[14]	validation-rmse:6.68683
[15]	validation-rmse:6.66302
[16]	validation-rmse:6.64757
[17]	validation-rmse:6.63263
[18]	validation-rmse:6.62531
[19]	validation-rmse:6.62252
[20]	validation-rmse:6.61705
[21]	validation-rmse:6.61592
[22]	validation-rmse:6.61780
[23]	validation-rmse:6.61948
[24]	validation-rmse:6.62396
[25]	validation-rmse:6.62790
[26

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:34:51,263] Trial 5 finished with value: 6.650083402109347 and parameters: {'max_depth': 80, 'learning_rate': 0.0906292152736207, 'reg_alpha': 0.05270408847118816, 'reg_lambda': 0.04793414660944966, 'min_child_weight': 0.4429943118354462}. Best is trial 1 with value: 6.342751049216642.


🏃 View run bedecked-sponge-998 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/41ad06ca3b454ab6aa3717f379cdf3cf
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:9.23235
[1]	validation-rmse:8.78476
[2]	validation-rmse:8.39317
[3]	validation-rmse:8.05754
[4]	validation-rmse:7.77172
[5]	validation-rmse:7.53408
[6]	validation-rmse:7.32957
[7]	validation-rmse:7.15367
[8]	validation-rmse:7.01487
[9]	validation-rmse:6.89655
[10]	validation-rmse:6.80227
[11]	validation-rmse:6.72713
[12]	validation-rmse:6.66966
[13]	validation-rmse:6.62126
[14]	validation-rmse:6.58492
[15]	validation-rmse:6.55773
[16]	validation-rmse:6.54286
[17]	validation-rmse:6.53413
[18]	validation-rmse:6.53031
[19]	validation-rmse:6.52684
[20]	validation-rmse:6.52843
[21]	validation-rmse:6.53742
[22]	validation-rmse:6.54782
[23]	validation-rmse:6.56139
[24]	validation-rmse:6.57214
[25]	validation-rmse:6.58336
[26

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:35:11,726] Trial 6 finished with value: 6.629101820763897 and parameters: {'max_depth': 62, 'learning_rate': 0.08304043435235499, 'reg_alpha': 0.008740449782948887, 'reg_lambda': 0.28491274207986833, 'min_child_weight': 17.505727836123448}. Best is trial 1 with value: 6.342751049216642.


🏃 View run placid-lark-432 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/e2df8cf66fa44618a9d23112aacf545b
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.95615
[1]	validation-rmse:8.31807
[2]	validation-rmse:7.81873
[3]	validation-rmse:7.43947
[4]	validation-rmse:7.15846
[5]	validation-rmse:6.96133
[6]	validation-rmse:6.82004
[7]	validation-rmse:6.72364
[8]	validation-rmse:6.66616
[9]	validation-rmse:6.63605
[10]	validation-rmse:6.61378
[11]	validation-rmse:6.61490
[12]	validation-rmse:6.63143
[13]	validation-rmse:6.65196
[14]	validation-rmse:6.67964
[15]	validation-rmse:6.69743
[16]	validation-rmse:6.72889
[17]	validation-rmse:6.74373
[18]	validation-rmse:6.76521
[19]	validation-rmse:6.78378
[20]	validation-rmse:6.79300


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 20:35:46,375] Trial 7 finished with value: 6.792999954456445 and parameters: {'max_depth': 82, 'learning_rate': 0.12416316985362412, 'reg_alpha': 0.009958672056108932, 'reg_lambda': 0.0758623422350637, 'min_child_weight': 2.1395809133199974}. Best is trial 1 with value: 6.342751049216642.


🏃 View run clean-bee-301 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/0402fbe2cd7546abad320c23adfb907f
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596
[0]	validation-rmse:8.52376
[1]	validation-rmse:7.69808
[2]	validation-rmse:7.19196
[3]	validation-rmse:6.84405
[4]	validation-rmse:6.63461
[5]	validation-rmse:6.50842
[6]	validation-rmse:6.44035
[7]	validation-rmse:6.39662
[8]	validation-rmse:6.37150
[9]	validation-rmse:6.35578
[10]	validation-rmse:6.34785
[11]	validation-rmse:6.34450
[12]	validation-rmse:6.34313
[13]	validation-rmse:6.34291
[14]	validation-rmse:6.34245
[15]	validation-rmse:6.34176
[16]	validation-rmse:6.34227
[17]	validation-rmse:6.34252
[18]	validation-rmse:6.34298
[19]	validation-rmse:6.34306
[20]	validation-rmse:6.34306
[21]	validation-rmse:6.34337
[22]	validation-rmse:6.34348
[23]	validation-rmse:6.34357
[24]	validation-rmse:6.34355
[25]	validation-rmse:6.34399


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


🏃 View run enchanting-shad-960 at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/5a8d8c52039340dd921ca1632d565da4
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596


[W 2025-10-28 20:36:00,435] Trial 8 failed with parameters: {'max_depth': 15, 'learning_rate': 0.21992487468175848, 'reg_alpha': 0.007731550026907306, 'reg_lambda': 0.23377457337376373, 'min_child_weight': 1.0357439143907545} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\ivanm\5to Semestre\Proyecto de Ciencia de Datos\nyc-taxi-predictions-2025\.venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ivanm\AppData\Local\Temp\ipykernel_13348\3894802457.py", line 48, in objective
    mlflow.xgboost.log_model(
    ~~~~~~~~~~~~~~~~~~~~~~~~^
        booster,
        ^^^^^^^^
    ...<2 lines>...
        signature=signature
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\ivanm\5to Semestre\Proyecto de Ciencia de Datos\nyc-taxi-predictions-2025\.venv\Lib\site-packages\mlflow\xgboost\__init__.py", line 279, in log_model
    return Model.log(
           ~~~~~~~

🏃 View run XGBoost Hyperparameter Optimization (Optuna) at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596/runs/5370f6320cd8408b96f24ea9f9076423
🧪 View experiment at: https://dbc-ed7122c4-c5b7.cloud.databricks.com/ml/experiments/2414141065980596


KeyboardInterrupt: 

In [14]:
model_name = "workspace.default.nyc-taxi-model-xgboost-tarea-5"

In [15]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name=model_name
)

Successfully registered model 'workspace.default.nyc-taxi-model-xgboost-tarea-5'.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '1' of model 'workspace.default.nyc-taxi-model-xgboost-tarea-5'.


Asignamos alias

In [16]:
from mlflow import MlflowClient

client = MlflowClient()

In [17]:
model_version = result.version
new_alias = "Challenger"

client.set_registered_model_alias(
    name=model_name,
    alias=new_alias,
    version=result.version
)

In [18]:
from datetime import datetime

date = datetime.today()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=[], creation_timestamp=1761705419748, current_stage=None, deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description=('The model version 1 was transitioned to Challenger on 2025-10-28 '
 '20:43:30.198948'), last_updated_timestamp=1761705811659, metrics=[<Metric: dataset_digest='', dataset_name='', key='best_iteration', model_id='m-8c6d8c4c60374495807456c18eb16872', run_id='9bdbe9a32f014581aeb6f8adb4360a9a', step=0, timestamp=1761705140751, value=48.0>,
 <Metric: dataset_digest='', dataset_name='', key='rmse', model_id='m-8c6d8c4c60374495807456c18eb16872', run_id='9bdbe9a32f014581aeb6f8adb4360a9a', step=0, timestamp=1761705143719, value=6.342751049216642>,
 <Metric: dataset_digest='', dataset_name='', key='stopped_iteration', model_id='m-8c6d8c4c60374495807456c18eb16872', run_id='9bdbe9a32f014581aeb6f8adb436