In [27]:
import os
from dotenv import load_dotenv
import mlflow
import pathlib

load_dotenv(override=True)

EXPERIMENT_NAME = "/Users/isabel.valladolid@iteso.mx/nyc-taxi-experiments"
mlflow.set_tracking_uri("databricks")  
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1858882611825285', creation_time=1761102889708, experiment_id='1858882611825285', last_update_time=1761617996764, lifecycle_stage='active', name='/Users/isabel.valladolid@iteso.mx/nyc-taxi-experiments', tags={'mlflow.experiment.sourceName': '/Users/isabel.valladolid@iteso.mx/nyc-taxi-experiments',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'isabel.valladolid@iteso.mx',
 'mlflow.ownerId': '72142664924861'}>

#### Descargar en la carpeta data el conjunto de datos correspondiente a marzo del 2025

In [28]:
import requests
import pathlib

DATA_DIR = pathlib.Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2025-03.parquet"
TARGET_PATH = os.path.join(DATA_DIR, 'green_tripdata_2025-03.parquet')

if not os.path.exists(TARGET_PATH):
    if URL:
        print(f"Descargando {URL} -> {TARGET_PATH} ...")
        r = requests.get(URL, stream=True)
        r.raise_for_status()
        with open(TARGET_PATH, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                f.write(chunk)
        print("Descarga completa.")
    else:
        print("No se proporcionó DOWNLOAD_URL. Por favor coloca la URL del parquet en la variable de entorno GREEN_2025_03_URL o descarga manualmente a data/.")
else:
    print(f"El archivo ya existe en {TARGET_PATH}")

Descargando https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2025-03.parquet -> ../data/green_tripdata_2025-03.parquet ...
Descarga completa.


#### Leer y preprocesar los datos

In [33]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")

In [34]:
# Feature Engineering + One Hot Encoding

def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

In [35]:
# Target
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [36]:
# Definir los dataset como objetos de mlflow para poderlos trackear
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

#### Primer parent experiment: **Gradient Boost (XGBoost) con Optuna**

In [38]:
import xgboost as xgb
import optuna
import pickle
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature
from sklearn.metrics import mean_squared_error

In [39]:
PARENT_XGB = "XGBoost_Parent_Experiment"

if X_train is not None:
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    def objective(trial: optuna.trial.Trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 4, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
            'objective': 'reg:squarederror',
            'seed': 42
        }

        with mlflow.start_run(run_name='xgboost_optuna_trial', nested=True):
            mlflow.set_tag('model_family', 'xgboost')
            mlflow.log_params(params)

            booster = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=200,
                evals=[(dvalid, 'validation')],
                early_stopping_rounds=10,
                verbose_eval=False
            )

            y_pred = booster.predict(dvalid)
            rmse = mean_squared_error(y_val, y_pred, squared=False)
            mlflow.log_metric('rmse', rmse)

            # guardar modelo
            signature = infer_signature(X_val[:5].toarray() if hasattr(X_val, 'toarray') else X_val[:5], y_pred[:5])
            mlflow.xgboost.log_model(booster, artifact_path='model', input_example=X_val[:5].toarray() if hasattr(X_val, 'toarray') else X_val[:5], signature=signature)

        return rmse
    
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction='minimize', sampler=sampler)

    with mlflow.start_run(run_name='XGBoost Hyperparameter Optimization Parent', nested=False):
        study.optimize(objective, n_trials=5)

        best_params = study.best_params
        best_params['seed'] = 42
        best_params['objective'] = 'reg:squarederror'

        # Entrenar modelo final con best_params
        booster = xgb.train(
            params=best_params,
            dtrain=dtrain,
            num_boost_round=200,
            evals=[(dvalid, 'validation')],
            early_stopping_rounds=10,
            verbose_eval=False
        )

        y_pred = booster.predict(dvalid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric('rmse', rmse)

        pathlib.Path('preprocessor').mkdir(exist_ok=True)
        with open('preprocessor/dv_xgb.b', 'wb') as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact('preprocessor/dv_xgb.b', artifact_path='preprocessor')

        # Registrar artefacto modelo
        feature_names = dv.get_feature_names_out()
        input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
        signature = infer_signature(input_example, y_pred[:5])
        mlflow.xgboost.log_model(booster, artifact_path='model', input_example=input_example, signature=signature, registered_model_name=None)

[I 2025-10-28 12:31:34,074] A new study created in memory with name: no-name-1b8d8a16-4a5f-4bdb-a6bd-88836f79446e
[W 2025-10-28 12:31:37,479] Trial 0 failed with parameters: {'max_depth': 10, 'learning_rate': 0.2536999076681771, 'reg_alpha': 0.03872090295370417, 'reg_lambda': 0.0024430162614261434, 'min_child_weight': 0.004207988669606638} because of the following error: TypeError("got an unexpected keyword argument 'squared'").
Traceback (most recent call last):
  File "/Users/isabelvalladolid/Documents/ProyectoCienciaDatos/nyc-taxi-predictions-2025/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/ct/mykpk3h93yd67cvslfdfmmjm0000gn/T/ipykernel_90847/1757626162.py", line 32, in objective
    rmse = mean_squared_error(y_val, y_pred, squared=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/isabelvalladolid/Documents/ProyectoCiencia

🏃 View run xgboost_optuna_trial at: https://dbc-00d2933e-633b.cloud.databricks.com/ml/experiments/1858882611825285/runs/7d0bce8ad8504d9eb3158f061eb7b2f7
🧪 View experiment at: https://dbc-00d2933e-633b.cloud.databricks.com/ml/experiments/1858882611825285
🏃 View run XGBoost Hyperparameter Optimization Parent at: https://dbc-00d2933e-633b.cloud.databricks.com/ml/experiments/1858882611825285/runs/268c4250c58642f0b12c4e7214ec70f8
🧪 View experiment at: https://dbc-00d2933e-633b.cloud.databricks.com/ml/experiments/1858882611825285


TypeError: got an unexpected keyword argument 'squared'