In [1]:
# Create the directory if it doesn't exist
!mkdir -p ../data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1330k  100 1330k    0     0   314k      0  0:00:04  0:00:04 --:--:--  314k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1253k  100 1253k    0     0   459k      0  0:00:02  0:00:02 --:--:--  459k


In [2]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [4]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [6]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [9]:
import dagshub
import mlflow


dagshub.init(url="https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=9c8dec60-20ad-41d0-b8b2-dcaa6229dda7&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=495144d026ac82a46bc7faa4a7450368932df961fa0a663fbdfb25f49f9a8696




2024/09/17 21:28:14 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/2755643640c14fd6bcf983414f96147c', creation_time=1726630094230, experiment_id='0', last_update_time=1726630094230, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [10]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [13]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("ioSoyPato/nyc-taxi-time-prediction")

# Upload file
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-01.parquet",  # local path of file to upload
    Key="train_data.parquet",  # remote path where to upload the file
)

# Download file
# s3.download_file(
#     Bucket="nyc-taxi-time-prediction",  # name of the repo
#     Key="train_data.parquet",  #  remote path from where to download the file
#     Filename="local.csv",  # local path where to download the file
# )

In [14]:
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-02.parquet",  # local path of file to upload
    Key="eval_data.parquet",  # remote path where to upload the file
)

# Download file
# s3.download_file(
#     Bucket="nyc-taxi-time-prediction",  # name of the repo
#     Key="train_data.parquet",  #  remote path from where to download the file
#     Filename="local.csv",  # local path where to download the file
# )

In [15]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [16]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [17]:
def objective(params):
    with mlflow.start_run(nested=True):
         
        # Tag model
        mlflow.set_tag("model_family", "xgboost")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Train model
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )
        
        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")
         
        # Predict in the val dataset
        y_pred = booster.predict(valid)
        
        # Calculate metric
        rmse = root_mean_squared_error(y_val, y_pred)
        
        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [18]:
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }
    
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"
    
    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )
        
    y_pred = booster.predict(valid)
    
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:5.35989                           
[1]	validation-rmse:5.29673                           
[2]	validation-rmse:5.29706                           
[3]	validation-rmse:5.28990                           
[4]	validation-rmse:5.28564                           
[5]	validation-rmse:5.28006                           
[6]	validation-rmse:5.27749                           
[7]	validation-rmse:5.27772                           
[8]	validation-rmse:5.27519                           
[9]	validation-rmse:5.27929                           
[10]	validation-rmse:5.28243                          
[11]	validation-rmse:5.28494                          
[12]	validation-rmse:5.28301                          
[13]	validation-rmse:5.28077                          
[14]	validation-rmse:5.26949                          
[15]	validation-rmse:5.26551                          
[16]	validation-rmse:5.26567                          
[17]	validation-rmse:5.24999                          
[18]	valid






2024/09/17 21:38:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-dove-82 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/15ad68e1a5fb493294e1444770c3eaa0.

2024/09/17 21:38:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.76173                                                    
[1]	validation-rmse:5.85074                                                    
[2]	validation-rmse:5.51398                                                    
[3]	validation-rmse:5.40313                                                    
[4]	validation-rmse:5.33934                                                    
[5]	validation-rmse:5.32669                                                    
[6]	validation-rmse:5.32501                                                    
[7]	validation-rmse:5.30067                                                    
[8]	validation-rmse:5.30010                                                    
[9]	validation-rmse:5.29805                                                    
[10]	validation-rmse:5.29645                                                   
[11]	validation-rmse:5.29501                                                   
[12]	validation-rmse:5.29210            






2024/09/17 21:39:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run stately-sheep-231 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/fa28a7d76f54446091bb99df3d992e5c.

2024/09/17 21:39:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.08709                                                    
[1]	validation-rmse:7.30741                                                    
[2]	validation-rmse:6.72505                                                    
[3]	validation-rmse:6.29302                                                    
[4]	validation-rmse:5.98119                                                    
[5]	validation-rmse:5.75932                                                    
[6]	validation-rmse:5.60156                                                    
[7]	validation-rmse:5.48804                                                    
[8]	validation-rmse:5.40985                                                    
[9]	validation-rmse:5.35543                                                    
[10]	validation-rmse:5.31789                                                   
[11]	validation-rmse:5.29041                                                   
[12]	validation-rmse:5.26921            






2024/09/17 21:39:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-swan-254 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/b454f00224034da8bcb45a746fd5040a.

2024/09/17 21:39:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.28727                                                    
[1]	validation-rmse:6.28947                                                   
[2]	validation-rmse:5.77677                                                   
[3]	validation-rmse:5.52223                                                   
[4]	validation-rmse:5.38601                                                   
[5]	validation-rmse:5.31448                                                   
[6]	validation-rmse:5.27675                                                   
[7]	validation-rmse:5.24933                                                   
[8]	validation-rmse:5.23153                                                   
[9]	validation-rmse:5.22261                                                   
[10]	validation-rmse:5.21660                                                  
[11]	validation-rmse:5.21387                                                  
[12]	validation-rmse:5.21276                       






2024/09/17 21:39:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run gaudy-mare-824 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/d969c789b90841e7ac7afc729cd48a00.

2024/09/17 21:39:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:5.81681                                                    
[1]	validation-rmse:5.47785                                                    
[2]	validation-rmse:5.44198                                                    
[3]	validation-rmse:5.43082                                                    
[4]	validation-rmse:5.40249                                                    
[5]	validation-rmse:5.40122                                                    
[6]	validation-rmse:5.39759                                                    
[7]	validation-rmse:5.40201                                                    
[8]	validation-rmse:5.40285                                                    
[9]	validation-rmse:5.40198                                                    
[10]	validation-rmse:5.40474                                                   
[11]	validation-rmse:5.40584                                                   
[12]	validation-rmse:5.40710            






2024/09/17 21:40:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run treasured-foal-720 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/ffaf130037a14acf98267b859790339d.

2024/09/17 21:40:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.64895                                                    
[1]	validation-rmse:8.23732                                                    
[2]	validation-rmse:7.87353                                                    
[3]	validation-rmse:7.55195                                                    
[4]	validation-rmse:7.26855                                                    
[5]	validation-rmse:7.01945                                                    
[6]	validation-rmse:6.80114                                                    
[7]	validation-rmse:6.60912                                                    
[8]	validation-rmse:6.44315                                                    
[9]	validation-rmse:6.29856                                                    
[10]	validation-rmse:6.17229                                                   
[11]	validation-rmse:6.06365                                                   
[12]	validation-rmse:5.96957            






2024/09/17 21:40:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run nervous-bass-462 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8c6a0549770b4e399b17daef31049daa.

2024/09/17 21:40:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.62060                                                    
[1]	validation-rmse:8.18539                                                    
[2]	validation-rmse:7.80156                                                    
[3]	validation-rmse:7.46203                                                    
[4]	validation-rmse:7.16524                                                    
[5]	validation-rmse:6.90278                                                    
[6]	validation-rmse:6.67564                                                    
[7]	validation-rmse:6.47813                                                    
[8]	validation-rmse:6.30363                                                    
[9]	validation-rmse:6.15657                                                    
[10]	validation-rmse:6.02556                                                   
[11]	validation-rmse:5.91531                                                   
[12]	validation-rmse:5.81985            






2024/09/17 21:41:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run glamorous-shark-506 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/90033cca74d14473a86eb3b16abf195c.

2024/09/17 21:41:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.73315                                                    
[1]	validation-rmse:6.84423                                                    
[2]	validation-rmse:6.29191                                                    
[3]	validation-rmse:5.95770                                                    
[4]	validation-rmse:5.75969                                                    
[5]	validation-rmse:5.64279                                                    
[6]	validation-rmse:5.57285                                                    
[7]	validation-rmse:5.53006                                                    
[8]	validation-rmse:5.49725                                                    
[9]	validation-rmse:5.47521                                                    
[10]	validation-rmse:5.46083                                                   
[11]	validation-rmse:5.45062                                                   
[12]	validation-rmse:5.44458            






2024/09/17 21:41:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run learned-dove-681 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/1ada0b22f17a4917a8061d73c88d5218.

2024/09/17 21:41:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.70183                                                    
[1]	validation-rmse:8.33071                                                    
[2]	validation-rmse:7.99620                                                    
[3]	validation-rmse:7.69386                                                    
[4]	validation-rmse:7.42201                                                    
[5]	validation-rmse:7.17808                                                    
[6]	validation-rmse:6.95942                                                    
[7]	validation-rmse:6.76304                                                    
[8]	validation-rmse:6.58847                                                    
[9]	validation-rmse:6.43272                                                    
[10]	validation-rmse:6.29391                                                   
[11]	validation-rmse:6.17180                                                   
[12]	validation-rmse:6.06359            






2024/09/17 21:42:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run calm-rat-692 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/61040929a3c947b59f665d910ac65f42.

2024/09/17 21:42:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.72010                                                    
[1]	validation-rmse:8.36511                                                    
[2]	validation-rmse:8.04399                                                    
[3]	validation-rmse:7.75388                                                    
[4]	validation-rmse:7.49245                                                    
[5]	validation-rmse:7.25694                                                    
[6]	validation-rmse:7.04608                                                    
[7]	validation-rmse:6.85703                                                    
[8]	validation-rmse:6.68778                                                    
[9]	validation-rmse:6.53727                                                    
[10]	validation-rmse:6.40347                                                   
[11]	validation-rmse:6.28402                                                   
[12]	validation-rmse:6.17835            






2024/09/17 21:42:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run skittish-jay-519 at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2eb4fc62169b463a83651ed7d4bead9a.

2024/09/17 21:42:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [04:47<00:00, 28.78s/trial, best loss: 5.151615112025083]
[0]	validation-rmse:7.28727
[1]	validation-rmse:6.28947
[2]	validation-rmse:5.77677
[3]	validation-rmse:5.52223
[4]	validation-rmse:5.38601
[5]	validation-rmse:5.31448
[6]	validation-rmse:5.27675
[7]	validation-rmse:5.24933
[8]	validation-rmse:5.23153
[9]	validation-rmse:5.22261
[10]	validation-rmse:5.21660
[11]	validation-rmse:5.21387
[12]	validation-rmse:5.21276
[13]	validation-rmse:5.20988
[14]	validation-rmse:5.20710
[15]	validation-rmse:5.20485
[16]	validation-rmse:5.20252
[17]	validation-rmse:5.20087
[18]	validation-rmse:5.19961
[19]	validation-rmse:5.19827
[20]	validation-rmse:5.19595
[21]	validation-rmse:5.19495
[22]	validation-rmse:5.19386
[23]	validation-rmse:5.19381
[24]	validation-rmse:5.19229
[25]	validation-rmse:5.19156
[26]	validation-rmse:5.18695
[27]	validation-rmse:5.18632
[28]	validation-rmse:5.18529
[29]	validation-rmse:5.18423
[30]	validation-rmse:5.18089
[31]	validation-rmse:5.17981
[

2024/09/17 21:43:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/9b6218cdbb634ea5b108d5e63aace0ae.
2024/09/17 21:43:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [20]:
best_params

{'learning_rate': np.float64(0.33034006068911675),
 'max_depth': 28,
 'min_child_weight': np.float64(2.7031631135855614),
 'reg_alpha': np.float64(0.15969441855675173),
 'reg_lambda': np.float64(0.00568542691145429),
 'seed': 42,
 'objective': 'reg:squarederror'}

el mas perro:

15ad68e1a5fb493294e1444770c3eaa0 

In [21]:
run_id = "15ad68e1a5fb493294e1444770c3eaa0"
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Successfully registered model 'nyc-taxi-model'.
2024/09/17 21:48:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 1
Created version '1' of model 'nyc-taxi-model'.


In [22]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
model_version = "1"

# create "champion" alias for version 1 of model "nyc-taxi-model"
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726631297076, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-17 21:49:24.314807', last_updated_timestamp=1726631364648, name='nyc-taxi-model', run_id='15ad68e1a5fb493294e1444770c3eaa0', run_link='', source='mlflow-artifacts:/2755643640c14fd6bcf983414f96147c/15ad68e1a5fb493294e1444770c3eaa0/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

import mlflow.pyfunc

model_name = "nyc-taxi-model"
alias = "champion"

model_uri = f"models:/{model_name}@{alias}"

champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

champion_version.predict(X_val)