In [5]:
# Create the directory if it doesn't exist
!mkdir ..\data

# Download files using curl
!curl -o ../data/green_tripdata_2024-01.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet
!curl -o ../data/green_tripdata_2024-02.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 87 1330k   87 1167k    0     0  3115k      0 --:--:-- --:--:-- --:--:-- 3129k
100 1330k  100 1330k    0     0  3363k      0 --:--:-- --:--:-- --:--:-- 3385k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1253k  100 1253k    0     0  2195k      0 --:--:-- --:--:-- --:--:-- 2207k


In [8]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [6]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off time minus pick-up time, in minutes),
    keeps only trips lasting between 1 and 60 minutes inclusive, and converts
    the ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        pandas.DataFrame: The filtered trips as an independent copy, so
        callers can add columns without writing into a slice of the raw data.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the raw frame; without it the
    # column assignment below (and any later one on the returned frame)
    # writes to a slice and triggers SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [9]:
# January 2024 trips are used for fitting, February 2024 trips for validation.
df_train, df_val = map(
    read_dataframe,
    (
        '../data/green_tripdata_2024-01.parquet',
        '../data/green_tripdata_2024-02.parquet',
    ),
)

In [10]:
# Combine the pick-up and drop-off location IDs into one "<PU>_<DO>" feature
# on both the training and the validation frame.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [11]:
# Model inputs: the combined pick-up/drop-off pair plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Turn each row into a {column: value} record for the DictVectorizer.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only, then apply the fitted
# vectorizer to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [12]:
# Target column: the trip duration computed in read_dataframe.
target = 'duration'
y_train, y_val = (trips[target].values for trips in (df_train, df_val))

In [13]:
import dagshub
import mlflow


# Connect this session to the DagsHub repository with its MLflow integration enabled.
dagshub.init(url="https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction", mlflow=True)

# Read back the tracking URI that is active after dagshub.init; the value is
# kept in a module-level name because the registry client cell reuses it.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

# NOTE(review): this re-applies the value just read from mlflow, so it looks
# redundant — confirm before removing.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Select the experiment for the runs started below; as the last expression of
# the cell, the returned Experiment object is displayed.
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/9e991ac0af45417cb6a8c46fbaf1900e', creation_time=1726630091495, experiment_id='0', last_update_time=1726630091495, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [14]:
# Describe the train / validation data as MLflow datasets.
# They are built from the prepared DataFrames rather than from X_train.data /
# X_val.data: X_train and X_val come from DictVectorizer (sparse output by
# default), and the `.data` attribute of a sparse matrix is only the flat
# array of stored non-zero values — not a (rows x features) matrix aligned
# with the targets.
dataset_columns = categorical + numerical + [target]
training_dataset = mlflow.data.from_pandas(
    df_train[dataset_columns], targets=target, name="green_tripdata_2024-01"
)
validation_dataset = mlflow.data.from_pandas(
    df_val[dataset_columns], targets=target, name="green_tripdata_2024-02"
)

In [15]:
from dagshub import get_repo_bucket_client
# boto3 client object for the DagsHub repository bucket.
s3 = get_repo_bucket_client("PacoTinoco/nyc-taxi-time-prediction")

# (local file to upload, remote key in the bucket) for the train and eval data.
uploads = (
    ("../data/green_tripdata_2024-01.parquet", "train_data.parquet"),
    ("../data/green_tripdata_2024-02.parquet", "eval_data.parquet"),
)

for local_path, remote_key in uploads:
    s3.upload_file(
        Bucket="nyc-taxi-time-prediction",  # name of the repo
        Filename=local_path,
        Key=remote_key,
    )


In [16]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib

In [17]:
# Wrap the vectorized features and duration targets in XGBoost DMatrix objects
# (training split first, validation split second).
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [18]:
def objective(params):
    """Hyperopt objective: train one XGBoost booster and report its validation RMSE.

    Every call opens its own nested MLflow run, tags it with the model family,
    records ``params``, trains on the module-level ``train`` DMatrix while
    evaluating on ``valid``, stores the booster under the ``model`` artifact
    path and logs the RMSE measured against ``y_val``.

    Args:
        params: Mapping of XGBoost training parameters for this trial.

    Returns:
        dict: ``{'loss': <validation rmse>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")
        mlflow.log_params(params)

        # Up to 100 boosting rounds, with early stopping after 10 rounds.
        watchlist = [(valid, 'validation')]
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=watchlist,
            early_stopping_rounds=10,
        )

        # Store the trained booster with this trial's run.
        mlflow.xgboost.log_model(model, artifact_path="model")

        # Score the booster on the validation split and record the metric.
        validation_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

In [19]:
# Enable MLflow's XGBoost autologging before any xgb.train call made below.
mlflow.xgboost.autolog()

# Parent run for the whole search; objective() opens its own nested run for
# every trial.
# NOTE(review): nested=True is also passed for this outer run — confirm that
# is intended when no other run is active.
with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    # Hyperopt search space: five tuned hyperparameters plus two fixed
    # entries (objective, seed). max_depth is drawn with quniform and wrapped
    # in scope.int; the remaining tuned values use loguniform, whose bounds
    # are presumably given in log space (see the hyperopt docs).
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }
    
    # Run 10 evaluations of objective() using the TPE suggestion algorithm.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    # Post-process the returned dict before reuse: cast max_depth to int and
    # set the fixed seed / objective entries explicitly.
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"
    
    # Record the selected hyperparameters on the parent run.
    mlflow.log_params(best_params)

    # Log tags describing the project, optimizer and feature set on the parent run.
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Retrain with the selected parameters, using the same round limit and
    # early-stopping setting as objective(). There is no explicit log_model
    # call here — presumably the autologging enabled at the top of this cell
    # records the booster; verify in the MLflow UI.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )
        
    # Validation RMSE of the retrained booster, logged on the parent run.
    y_pred = booster.predict(valid)
    
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    # Pickle the fitted DictVectorizer (dv) to models/preprocessor.b, then
    # attach that file to the run under the "preprocessor" artifact path.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

[0]	validation-rmse:6.34602                           
[1]	validation-rmse:5.59131                           
[2]	validation-rmse:5.38253                           
[3]	validation-rmse:5.34096                           
[4]	validation-rmse:5.31669                           
[5]	validation-rmse:5.31087                           
[6]	validation-rmse:5.30307                           
[7]	validation-rmse:5.28951                           
[8]	validation-rmse:5.28666                           
[9]	validation-rmse:5.28697                           
[10]	validation-rmse:5.28584                          
[11]	validation-rmse:5.28643                          
[12]	validation-rmse:5.28572                          
[13]	validation-rmse:5.28447                          
[14]	validation-rmse:5.28297                          
[15]	validation-rmse:5.28224                          
[16]	validation-rmse:5.28272                          
[17]	validation-rmse:5.28207                          
[18]	valid






2024/09/20 17:13:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run amazing-skink-664 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/64b0fb509f1149e18ad1e23d0de464e5.

2024/09/20 17:13:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.71265                                                    
[1]	validation-rmse:8.35247                                                    
[2]	validation-rmse:8.02808                                                    
[3]	validation-rmse:7.73636                                                    
[4]	validation-rmse:7.47496                                                    
[5]	validation-rmse:7.24131                                                    
[6]	validation-rmse:7.03284                                                    
[7]	validation-rmse:6.84749                                                    
[8]	validation-rmse:6.68250                                                    
[9]	validation-rmse:6.53631                                                    
[10]	validation-rmse:6.40725                                                   
[11]	validation-rmse:6.29303                                                   
[12]	validation-rmse:6.19229            






2024/09/20 17:13:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-cow-418 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/d5b22ef581e944e68a1261b12850c6d8.

2024/09/20 17:13:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.89004                                                    
[1]	validation-rmse:7.03968                                                    
[2]	validation-rmse:6.46258                                                    
[3]	validation-rmse:6.08060                                                    
[4]	validation-rmse:5.83390                                                    
[5]	validation-rmse:5.67641                                                    
[6]	validation-rmse:5.57585                                                    
[7]	validation-rmse:5.50642                                                    
[8]	validation-rmse:5.46394                                                    
[9]	validation-rmse:5.43338                                                    
[10]	validation-rmse:5.41250                                                   
[11]	validation-rmse:5.39837                                                   
[12]	validation-rmse:5.38582            






2024/09/20 17:14:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-shrimp-764 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a621f03bf02a4206a16f36177232a812.

2024/09/20 17:14:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:6.04750                                                    
[1]	validation-rmse:5.42246                                                    
[2]	validation-rmse:5.31073                                                    
[3]	validation-rmse:5.28133                                                    
[4]	validation-rmse:5.26843                                                    
[5]	validation-rmse:5.25812                                                    
[6]	validation-rmse:5.24758                                                    
[7]	validation-rmse:5.23863                                                    
[8]	validation-rmse:5.23874                                                    
[9]	validation-rmse:5.23578                                                    
[10]	validation-rmse:5.23483                                                   
[11]	validation-rmse:5.23425                                                   
[12]	validation-rmse:5.23212            






2024/09/20 17:15:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run traveling-shrew-431 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/c7ea0ec48bec493e9f47e8baa233d4d2.

2024/09/20 17:15:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.66585                                                    
[1]	validation-rmse:8.26497                                                    
[2]	validation-rmse:7.90558                                                    
[3]	validation-rmse:7.58444                                                    
[4]	validation-rmse:7.29747                                                    
[5]	validation-rmse:7.04266                                                    
[6]	validation-rmse:6.81561                                                    
[7]	validation-rmse:6.61422                                                    
[8]	validation-rmse:6.43643                                                    
[9]	validation-rmse:6.28018                                                    
[10]	validation-rmse:6.14268                                                   
[11]	validation-rmse:6.02382                                                   
[12]	validation-rmse:5.91787            






2024/09/20 17:16:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run brawny-sloth-686 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/8eb7b1b720774c6d898b94c5ed2a82f7.

2024/09/20 17:16:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:7.89646                                                    
[1]	validation-rmse:7.04315                                                    
[2]	validation-rmse:6.46075                                                    
[3]	validation-rmse:6.07410                                                    
[4]	validation-rmse:5.81723                                                    
[5]	validation-rmse:5.64922                                                    
[6]	validation-rmse:5.54245                                                    
[7]	validation-rmse:5.47166                                                    
[8]	validation-rmse:5.42603                                                    
[9]	validation-rmse:5.39694                                                    
[10]	validation-rmse:5.37627                                                   
[11]	validation-rmse:5.36071                                                   
[12]	validation-rmse:5.34981            






2024/09/20 17:17:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run enchanting-deer-780 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/ca770457c55e48e7b7b64d5ac7faebcf.

2024/09/20 17:17:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.67016                                                    
[1]	validation-rmse:8.27599                                                    
[2]	validation-rmse:7.92423                                                    
[3]	validation-rmse:7.60949                                                    
[4]	validation-rmse:7.33469                                                    
[5]	validation-rmse:7.08269                                                    
[6]	validation-rmse:6.86711                                                    
[7]	validation-rmse:6.67337                                                    
[8]	validation-rmse:6.50540                                                    
[9]	validation-rmse:6.35246                                                    
[10]	validation-rmse:6.22224                                                   
[11]	validation-rmse:6.10903                                                   
[12]	validation-rmse:6.00496            






2024/09/20 17:18:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-perch-188 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/29f155145fbf4dea8f9bbe8522ed5bc2.

2024/09/20 17:18:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.46427                                                    
[1]	validation-rmse:7.91326                                                    
[2]	validation-rmse:7.44770                                                    
[3]	validation-rmse:7.05480                                                    
[4]	validation-rmse:6.72592                                                    
[5]	validation-rmse:6.45270                                                    
[6]	validation-rmse:6.22719                                                    
[7]	validation-rmse:6.04032                                                    
[8]	validation-rmse:5.88911                                                    
[9]	validation-rmse:5.76427                                                    
[10]	validation-rmse:5.66183                                                   
[11]	validation-rmse:5.57811                                                   
[12]	validation-rmse:5.51123            






2024/09/20 17:19:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-snipe-687 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/50fea87f4e244c2ea0bc83ef836a0257.

2024/09/20 17:19:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:5.73007                                                    
[1]	validation-rmse:5.46262                                                    
[2]	validation-rmse:5.43490                                                    
[3]	validation-rmse:5.42455                                                    
[4]	validation-rmse:5.39766                                                    
[5]	validation-rmse:5.39070                                                    
[6]	validation-rmse:5.38283                                                    
[7]	validation-rmse:5.38293                                                    
[8]	validation-rmse:5.38165                                                    
[9]	validation-rmse:5.37859                                                    
[10]	validation-rmse:5.37482                                                   
[11]	validation-rmse:5.37392                                                   
[12]	validation-rmse:5.37446            






2024/09/20 17:20:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run bold-grub-35 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/fee4736c45df4b36be73f482427b34da.

2024/09/20 17:20:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



[0]	validation-rmse:8.43130                                                    
[1]	validation-rmse:7.86246                                                    
[2]	validation-rmse:7.39039                                                    
[3]	validation-rmse:7.00104                                                    
[4]	validation-rmse:6.68261                                                    
[5]	validation-rmse:6.42379                                                    
[6]	validation-rmse:6.21505                                                    
[7]	validation-rmse:6.04690                                                    
[8]	validation-rmse:5.91297                                                    
[9]	validation-rmse:5.80652                                                    
[10]	validation-rmse:5.72198                                                   
[11]	validation-rmse:5.65425                                                   
[12]	validation-rmse:5.60080            






2024/09/20 17:20:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-horse-942 at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/2b131f799f4b4d80a19f16a84aa220ff.

2024/09/20 17:20:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [08:26<00:00, 50.62s/trial, best loss: 5.182616336428882]
[0]	validation-rmse:8.66585
[1]	validation-rmse:8.26497
[2]	validation-rmse:7.90558
[3]	validation-rmse:7.58444
[4]	validation-rmse:7.29747
[5]	validation-rmse:7.04266
[6]	validation-rmse:6.81561
[7]	validation-rmse:6.61422
[8]	validation-rmse:6.43643
[9]	validation-rmse:6.28018
[10]	validation-rmse:6.14268
[11]	validation-rmse:6.02382
[12]	validation-rmse:5.91787
[13]	validation-rmse:5.82600
[14]	validation-rmse:5.74457
[15]	validation-rmse:5.67291
[16]	validation-rmse:5.61022
[17]	validation-rmse:5.55724
[18]	validation-rmse:5.51052
[19]	validation-rmse:5.47105
[20]	validation-rmse:5.43513
[21]	validation-rmse:5.40462
[22]	validation-rmse:5.37791
[23]	validation-rmse:5.35578
[24]	validation-rmse:5.33522
[25]	validation-rmse:5.31814
[26]	validation-rmse:5.30322
[27]	validation-rmse:5.28904
[28]	validation-rmse:5.27761
[29]	validation-rmse:5.26730
[30]	validation-rmse:5.25786
[31]	validation-rmse:5.24976
[

2024/09/20 17:22:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run Xgboost Hyper-parameter Optimization at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a617ce05516545e0869842bc9def7eee.
2024/09/20 17:22:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/PacoTinoco/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [21]:
# Display the parameter dict produced by the hyper-parameter search cell.
best_params

{'learning_rate': np.float64(0.14599396824069683),
 'max_depth': 42,
 'min_child_weight': np.float64(1.7855145549554956),
 'reg_alpha': np.float64(0.05020597797620721),
 'reg_lambda': np.float64(0.0036879070748615603),
 'seed': 42,
 'objective': 'reg:squarederror'}

In [20]:
import mlflow

In [21]:
# Ask for the ID of the MLflow run whose "model" artifact should be registered.
# Whitespace around a pasted ID is stripped and an empty answer is rejected
# here, so a malformed "runs:/.../model" URI is never sent to the registry.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Register the run's model under the "nyc-taxi-model" registry entry.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 17:24:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 4
Created version '4' of model 'nyc-taxi-model'.


In [22]:
from datetime import datetime
from mlflow import MlflowClient

# Registry client pointed at the tracking URI captured during MLflow setup.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
registry_name = "nyc-taxi-model"

# Describe the registered model as a whole.
client.update_registered_model(
    name=registry_name,
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "champion"
date = datetime.today()
# NOTE(review): the version is hard-coded to "1", while the registration cell
# output above reports creating version 4 — confirm which version should
# carry the alias.
model_version = "1"

# Point the alias at the chosen version of the registered model.
client.set_registered_model_alias(name=registry_name, alias=new_alias, version=model_version)

# Record on that version when the alias was assigned; as the last expression
# of the cell, the returned ModelVersion is displayed.
client.update_model_version(
    name=registry_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1726631483611, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-20 17:24:54.731946', last_updated_timestamp=1726874694532, name='nyc-taxi-model', run_id='23b66a83aacb4d04a076a7275dfc65e3', run_link='', source='mlflow-artifacts:/9e991ac0af45417cb6a8c46fbaf1900e/23b66a83aacb4d04a076a7275dfc65e3/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [23]:
import mlflow.pyfunc

# Registered model name and alias used to build the model URI below.
model_name = "nyc-taxi-model"
alias = "champion"

# URI of the form "models:/<name>@<alias>".
model_uri = f"models:/{model_name}@{alias}"

# Load the aliased model through the generic pyfunc interface.
champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

# Predict on the vectorized validation features; as the last expression of
# the cell, the prediction array is displayed.
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 - mlflow (current: 2.16.2, required: mlflow==2.16.1)
 - scikit-learn (current: 1.5.1, required: scikit-learn==1.5.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([17.868977 , 27.244492 ,  7.5599394, ..., 36.802193 , 10.982415 ,
       19.203161 ], dtype=float32)