In [1]:
import pickle
import pandas as pd

from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [2]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Steps performed:
      * add a ``duration`` column: drop-off minus pick-up time, in minutes;
      * keep only trips whose duration is between 1 and 60 minutes (inclusive);
      * cast ``PULocationID`` and ``DOLocationID`` to strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new ``pandas.DataFrame`` containing the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised replacement for applying ``td.total_seconds() / 60`` row by row.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below writes to an independent frame instead of a
    # slice (which triggers pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [3]:
# Fit on the January 2024 trips and validate on the February 2024 trips.
train_path = '../data/green_tripdata_2024-01.parquet'
val_path = '../data/green_tripdata_2024-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [4]:
# Combine pick-up and drop-off zone ids into a single "<pickup>_<dropoff>"
# feature, applied identically to the training and validation frames.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [5]:
# Model inputs: the combined pick-up/drop-off pair is the only categorical
# feature (the two individual location ids are not fed to the model).
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One dict per trip, which is the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Regression target: the trip duration column built in read_dataframe.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

Different from here on: MLflow experiment tracking through DagsHub

In [7]:
import dagshub
import mlflow


# Connect this session to the DagsHub repository with MLflow integration on.
dagshub.init(
    url="https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction",
    mlflow=True,
)

# Read back whatever tracking URI is currently configured and set it
# explicitly, keeping it available under a named constant.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Last expression of the cell, so the selected experiment is displayed.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='mlflow-artifacts:/2755643640c14fd6bcf983414f96147c', creation_time=1726630094230, experiment_id='0', last_update_time=1726630094230, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [8]:
# Describe the train/validation splits as MLflow datasets.
#
# The tabular feature columns plus the target column are used here rather than
# X_train.data / X_val.data: X_train and X_val are the sparse matrices produced
# by DictVectorizer, and `.data` on a sparse matrix is only the flat array of
# stored values, not a rows-by-features matrix, so it does not line up with
# the per-row targets.
training_dataset = mlflow.data.from_pandas(
    df_train[categorical + numerical + [target]],
    targets=target,
    name="green_tripdata_2024-01",
)
validation_dataset = mlflow.data.from_pandas(
    df_val[categorical + numerical + [target]],
    targets=target,
    name="green_tripdata_2024-02",
)

In [15]:
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np

In [16]:
# Turn on MLflow's scikit-learn autologging for the estimators fitted below.
mlflow.sklearn.autolog()

def objective_rf(params):
    """Hyperopt objective: fit a random forest and score it on the validation set.

    Each call runs inside its own nested MLflow run tagged
    ``model_family=random_forest``.

    Args:
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``. Values are cast to
            ``int`` before they are used.

    Returns:
        dict with ``loss`` (validation RMSE) and ``status`` (``STATUS_OK``).
    """
    # Cast once, up front, so the values logged to MLflow are exactly the ones
    # the estimator is built with (the search space yields floats like 73.0).
    int_names = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    rf_params = {name: int(params[name]) for name in int_names}

    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "random_forest")

        # Log parameters (any extra keys in `params` are logged unchanged)
        mlflow.log_params({**params, **rf_params})

        # Train RandomForest model
        rf_model = RandomForestRegressor(random_state=42, **rf_params)
        rf_model.fit(X_train, y_train)

        # Predict on validation dataset
        y_pred = rf_model.predict(X_val)

        # Calculate RMSE with the helper imported at the top of the notebook
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log RMSE metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Define search space for RandomForest
# Every hyperparameter is sampled uniformly on an integer grid (step 1)
# between the inclusive bounds listed here.
_rf_integer_bounds = {
    'n_estimators': (50, 100),
    'max_depth': (5, 15),
    'min_samples_split': (2, 5),
    'min_samples_leaf': (1, 2),
}
search_space_rf = {
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in _rf_integer_bounds.items()
}


# Run hyperparameter optimization
# Keep the Trials object so the best loss can be read back afterwards, and
# seed the search so the same hyperparameter suggestions are drawn on re-runs.
rf_trials = Trials()

# Parent run grouping the nested per-trial runs started by objective_rf.
with mlflow.start_run(run_name="Parent Random Forest", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=rf_trials,
        rstate=np.random.default_rng(42),
    )

    # Log best parameters; fmin returns floats (e.g. 73.0) but every
    # random-forest hyperparameter in the search space is integer-valued.
    mlflow.log_params({name: int(value) for name, value in best_params_rf.items()})

    # Record the best validation RMSE of the search on the parent run.
    mlflow.log_metric("best_rmse", rf_trials.best_trial['result']['loss'])


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:40:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-grouse-217 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/1eb19fc356c24631b5e02542fdeab3fa.

2024/09/20 15:40:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:26<03:55, 26.14s/trial, best loss: 5.385345964611815]



2024/09/20 15:40:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run enchanting-hen-616 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/375e6c8f9f314f2aa3ecb05c88b5ed3b.

2024/09/20 15:40:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:42<02:44, 20.55s/trial, best loss: 5.385345964611815]



2024/09/20 15:40:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-zebra-286 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/924c3c0164de431296da06149594e88f.

2024/09/20 15:40:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:58<02:09, 18.46s/trial, best loss: 5.385345964611815]



2024/09/20 15:41:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run mysterious-owl-901 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/4f6e3f1a08a74ec38e7430f2eaed5f0c.

2024/09/20 15:41:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [01:11<01:37, 16.27s/trial, best loss: 5.385345964611815]



2024/09/20 15:41:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run treasured-fly-23 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6f8dbb6701bc47058a2b319ab099cc9e.

2024/09/20 15:41:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [01:22<01:11, 14.30s/trial, best loss: 5.385345964611815]



2024/09/20 15:41:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-perch-932 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/446185ed1b1544dea6474e6cf845980f.

2024/09/20 15:41:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [01:33<00:52, 13.11s/trial, best loss: 5.385345964611815]



2024/09/20 15:41:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run luxuriant-lark-312 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2f1a7f9fa37e402ea45a5b57dbd49ced.

2024/09/20 15:41:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [01:44<00:37, 12.58s/trial, best loss: 5.385345964611815]



2024/09/20 15:41:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run angry-croc-856 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8aaadd7a2926498f870f80b5e995d077.

2024/09/20 15:41:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [01:57<00:25, 12.77s/trial, best loss: 5.385345964611815]



2024/09/20 15:42:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run ambitious-dog-972 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/633c6546010e43fba43ad3bb3428848f.

2024/09/20 15:42:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [02:16<00:14, 14.48s/trial, best loss: 5.385345964611815]



2024/09/20 15:42:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-mule-819 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/e7bf7712c89a44ae9122a99e1aba0203.

2024/09/20 15:42:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [02:33<00:00, 15.34s/trial, best loss: 5.385345964611815]


2024/09/20 15:42:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run Parent Random Forest at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/97fe1685462644759fd000e6067cd0ed.
2024/09/20 15:42:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [17]:
# Pick the random-forest run with the lowest validation RMSE instead of a
# hardcoded run id, so the cell still registers the right model after the
# search above is re-run (every re-run creates runs with new ids).
best_rf_runs = mlflow.search_runs(
    experiment_names=["nyc-taxi-experiment"],
    filter_string="tags.model_family = 'random_forest'",
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
if best_rf_runs.empty:
    raise RuntimeError(
        "No runs tagged model_family='random_forest' were found in "
        "experiment 'nyc-taxi-experiment'; run the search cell first."
    )

run_id = best_rf_runs.iloc[0]["run_id"]
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 15:47:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 2
Created version '2' of model 'nyc-taxi-model'.


In [18]:
# Enable MLflow's scikit-learn autologging for the estimators fitted in this
# cell (the same call is also made in an earlier cell of this notebook).
mlflow.sklearn.autolog()
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

def objective_gb(params):
    """Hyperopt objective: fit gradient boosting and score it on the validation set.

    Each call runs inside its own nested MLflow run tagged
    ``model_family=gradient_boosting``.

    Args:
        params: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf`` (cast to ``int``) and
            ``learning_rate`` (cast to ``float``).

    Returns:
        dict with ``loss`` (validation RMSE) and ``status`` (``STATUS_OK``).
    """
    # Cast once, up front, so the values logged to MLflow are exactly the ones
    # the estimator is built with (the search space yields floats like 73.0).
    int_names = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    gb_params = {name: int(params[name]) for name in int_names}
    gb_params['learning_rate'] = float(params['learning_rate'])

    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "gradient_boosting")

        # Log parameters (any extra keys in `params` are logged unchanged)
        mlflow.log_params({**params, **gb_params})

        # Train GradientBoosting model
        gb_model = GradientBoostingRegressor(random_state=42, **gb_params)
        gb_model.fit(X_train, y_train)

        # Predict on validation dataset
        y_pred = gb_model.predict(X_val)

        # Calculate RMSE with the helper imported at the top of the notebook
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log RMSE metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Define search space for GradientBoosting
# Integer-valued hyperparameters: uniform on an integer grid (step 1) between
# the inclusive bounds listed here.
_gb_integer_bounds = {
    'n_estimators': (50, 100),
    'max_depth': (3, 10),
    'min_samples_split': (2, 5),
    'min_samples_leaf': (1, 2),
}
search_space_gb = {
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in _gb_integer_bounds.items()
}
# The learning rate is the only continuous hyperparameter.
search_space_gb['learning_rate'] = hp.uniform('learning_rate', 0.01, 0.1)

# Run hyperparameter optimization for GradientBoosting
# Keep the Trials object so the best loss can be read back afterwards, and
# seed the search so the same hyperparameter suggestions are drawn on re-runs.
gb_trials = Trials()

# Parent run grouping the nested per-trial runs started by objective_gb.
with mlflow.start_run(run_name="Parent Gradient Boosting", nested=True):
    best_params_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=10,
        trials=gb_trials,
        rstate=np.random.default_rng(42),
    )

    # Log best parameters; fmin returns floats (e.g. 73.0), so cast everything
    # except the continuous learning rate back to the integers actually used.
    mlflow.log_params({
        name: (float(value) if name == 'learning_rate' else int(value))
        for name, value in best_params_gb.items()
    })

    # Record the best validation RMSE of the search on the parent run.
    mlflow.log_metric("best_rmse", gb_trials.best_trial['result']['loss'])


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 15:55:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-elk-761 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/22b99b4ba29c47f9a078c2ef6141dc01.

2024/09/20 15:55:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:09<01:21,  9.01s/trial, best loss: 5.493179839791273]



2024/09/20 15:55:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run unruly-skink-903 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6b87fa459d074386bc1beb2c7d9aa86a.

2024/09/20 15:55:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:19<01:19,  9.99s/trial, best loss: 5.493179839791273]



2024/09/20 15:56:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run bright-yak-244 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f46b162483f44d8ab69ceecb419c6a2f.

2024/09/20 15:56:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:30<01:12, 10.34s/trial, best loss: 5.382779180821664]



2024/09/20 15:56:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-skunk-794 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f079dad64aa34b2881dcacb8c59fec92.

2024/09/20 15:56:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [00:38<00:56,  9.46s/trial, best loss: 5.382779180821664]



2024/09/20 15:56:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run salty-grouse-956 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/7d7f7546810d4975b8bfd894f1696618.

2024/09/20 15:56:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [00:47<00:47,  9.42s/trial, best loss: 5.382779180821664]



2024/09/20 15:56:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-gull-357 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/d5b7d04877fa40eb93fa4d7439a30417.

2024/09/20 15:56:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [00:56<00:36,  9.23s/trial, best loss: 5.382779180821664]



2024/09/20 15:56:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run stately-mouse-131 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/27d2979a31f5435398dd78a57ec37071.

2024/09/20 15:56:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [01:07<00:28,  9.57s/trial, best loss: 5.382779180821664]



2024/09/20 15:56:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-skunk-399 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8224fd55e46644249809b99ab73b931a.

2024/09/20 15:56:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [01:17<00:19,  9.98s/trial, best loss: 5.331962173877195]



2024/09/20 15:57:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run silent-worm-611 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/3f2bb4a2df1a440ea1d80afeb84635bc.

2024/09/20 15:57:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [01:28<00:10, 10.24s/trial, best loss: 5.331962173877195]



2024/09/20 15:57:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run overjoyed-mouse-991 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/5776d235b06e4215a1652d3db2c00f1a.

2024/09/20 15:57:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [01:38<00:00,  9.84s/trial, best loss: 5.331962173877195]


2024/09/20 15:57:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run Parent Gradient Boosting at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/620f3d98f4d74578bf614c7681441d3b.
2024/09/20 15:57:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [19]:
# Pick the gradient-boosting run with the lowest validation RMSE instead of a
# hardcoded run id, so the cell still registers the right model after the
# search above is re-run (every re-run creates runs with new ids).
best_gb_runs = mlflow.search_runs(
    experiment_names=["nyc-taxi-experiment"],
    filter_string="tags.model_family = 'gradient_boosting'",
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
if best_gb_runs.empty:
    raise RuntimeError(
        "No runs tagged model_family='gradient_boosting' were found in "
        "experiment 'nyc-taxi-experiment'; run the search cell first."
    )

run_id = best_gb_runs.iloc[0]["run_id"]
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 15:58:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 3
Created version '3' of model 'nyc-taxi-model'.
