In [84]:
from mlflow.tracking import MlflowClient

# SQLite database file used as the MLflow tracking URI for this notebook.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client bound to that URI; the cells below use it for experiment and registry calls.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [85]:
# List the experiments known to the tracking database.
client.search_experiments()

[<Experiment: artifact_location='/Users/kennethleo/Documents/GitHub/mlops-zoomcamp/02-experiment-tracking/mlruns/2', creation_time=1761678509229, experiment_id='2', last_update_time=1761678509229, lifecycle_stage='active', name='my-cool-experiment', tags={}>,
 <Experiment: artifact_location='/Users/kennethleo/Documents/GitHub/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1761635392108, experiment_id='1', last_update_time=1761635392108, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='/Users/kennethleo/Documents/GitHub/mlops-zoomcamp/02-experiment-tracking/mlruns/0', creation_time=1761635392105, experiment_id='0', last_update_time=1761635392105, lifecycle_stage='active', name='Default', tags={}>]

In [87]:
# Create the experiment only when it does not exist yet, so re-running this
# cell against the same tracking database does not fail on a duplicate name.
# Either way the cell displays the experiment ID.
experiment_name = "another-cool-experiment"
existing_experiment = client.get_experiment_by_name(experiment_name)
(
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name=experiment_name)
)

'3'

In [88]:
from  mlflow.entities import ViewType

# Up to five active runs of experiment "1" with RMSE below 6.8, best (lowest) first.
runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="metrics.rmse < 6.8",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [89]:
# Print one line per run returned by the search: its run ID and logged RMSE.
for run in runs:
    print(f"Run ID: {run.info.run_id}, RMSE: {run.data.metrics['rmse']}")

Run ID: 5ef86dad014342e59f774415cb739edc, RMSE: 5.127653127776889
Run ID: af631cc5e7ce4a24bdc2951de45ac1d7, RMSE: 5.127653127776889
Run ID: dd93577001584a039074992b537e9890, RMSE: 5.127653127776889
Run ID: 685a2d9fd49d4450864b5927b74896df, RMSE: 5.128611834046107
Run ID: effc63a1295845b09dff940970e1a873, RMSE: 5.128639627775001


In [90]:
# Promote some of these models to the model registry.

import mlflow

# Point the module-level mlflow API at the same database the client uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [91]:
# Run whose logged "model" artifact is registered below (hard-coded run ID).
run_id = "5ef86dad014342e59f774415cb739edc"
model_uri = f"runs:/{run_id}/model"

# NOTE: when the registered model name already exists, this call adds a new
# version to it, so every execution of this cell creates another version.
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-xgboost-latest")

Registered model 'nyc-taxi-xgboost-latest' already exists. Creating a new version of this model...
Created version '5' of model 'nyc-taxi-xgboost-latest'.


<ModelVersion: aliases=[], creation_timestamp=1761709478185, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1761709478185, metrics=None, model_id=None, name='nyc-taxi-xgboost-latest', params=None, run_id='5ef86dad014342e59f774415cb739edc', run_link=None, source='models:/m-e63222b9b0074f779cb1f4b57ccc4d02', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [92]:
# List the registered models together with their aliases and latest versions.
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1761678934995, deployment_job_id=None, deployment_job_state=None, description=None, last_updated_timestamp=1761679105664, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1761679105664, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1761679105664, metrics=None, model_id=None, name='nyc-taxi-duration-model', params=None, run_id='af631cc5e7ce4a24bdc2951de45ac1d7', run_link=None, source='models:/m-944bc58df8364c65aaaf93741da67b2d', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='nyc-taxi-duration-model', tags={}>,
 <RegisteredModel: aliases={'candidate': 3, 'champion': 1}, creation_timestamp=1761677924672, deployment_job_id=None, deployment_job_state=None, description='NYC Taxi Predictor', last_updated_timestamp=1761709478185, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1761709478185, current_stage='None', deployment_job_state=None, d

In [93]:
model_name = "nyc-taxi-xgboost-latest"

versions = client.search_model_versions(f"name='{model_name}'")

# The highest version number becomes "candidate"; version 1 is pinned as "champion".
latest_version = max(int(v.version) for v in versions)
client.set_registered_model_alias(model_name, "candidate", latest_version)
client.set_registered_model_alias(model_name, "champion", 1)

# Resolve each alias once up front instead of querying the registry again
# for every (version, alias) pair inside the loop below.
alias_targets = {
    alias: client.get_model_version_by_alias(model_name, alias).version
    for alias in ["champion", "candidate"]
}

# Report, for every version, which of the two aliases currently point at it.
for v in versions:
    version_aliases = [
        alias
        for alias, target_version in alias_targets.items()
        if target_version == v.version
    ]
    print(f"Version {v.version}: Aliases = {version_aliases}, Run ID: {v.run_id}")

Version 5: Aliases = ['candidate'], Run ID: 5ef86dad014342e59f774415cb739edc
Version 4: Aliases = [], Run ID: 5ef86dad014342e59f774415cb739edc
Version 3: Aliases = [], Run ID: 
Version 2: Aliases = [], Run ID: af631cc5e7ce4a24bdc2951de45ac1d7
Version 1: Aliases = ['champion'], Run ID: 


In [94]:
# Move the "champion" alias of the registered model to version 4.
# model_version and new_alias are reused by the description update that follows.
model_version = 4
new_alias = "champion"
client.set_registered_model_alias(
    name=model_name,
    version=model_version,
    alias=new_alias,
)

In [95]:
from datetime import datetime

# Record on the model version itself when it received its new alias.
date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}"
)

<ModelVersion: aliases=['champion'], creation_timestamp=1761709432794, current_stage='None', deployment_job_state=None, description='The model version 4 was transitioned to champion on 2025-10-29', last_updated_timestamp=1761709501111, metrics=None, model_id=None, name='nyc-taxi-xgboost-latest', params=None, run_id='5ef86dad014342e59f774415cb739edc', run_link=None, source='models:/m-e63222b9b0074f779cb1f4b57ccc4d02', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [96]:
# Re-query the versions of the registered model and show the tags on each one.
versions = client.search_model_versions(f"name='{model_name}'")
for v in versions:
    print(f"Version {v.version} | Tags: {v.tags}")

Version 4 | Tags: {'stage': 'staging'}
Version 5 | Tags: {'stage': 'staging'}
Version 3 | Tags: {'stage': 'staging'}
Version 2 | Tags: {'stage': 'staging'}
Version 1 | Tags: {'stage': 'production'}


In [64]:
# Tag version 2 as "staging" and version 1 as "production" via the "stage" tag.
# NOTE(review): this cell's execution count is lower than that of the tag
# listing before it, so it was run out of document order.
for tagged_version, stage_value in (("2", "staging"), ("1", "production")):
    client.set_model_version_tag(
        name=model_name,
        version=tagged_version,
        key="stage",
        value=stage_value,
    )

In [105]:
from sklearn.metrics import root_mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """Load a trip parquet file and prepare it for duration modelling.

    Parameters
    ----------
    filename : path or file-like
        Passed straight to ``pd.read_parquet``. The data must contain the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        An independent frame with an added ``duration`` column (trip length
        in minutes), restricted to trips lasting between 1 and 60 minutes
        inclusive, with both location ID columns converted to strings.
    """
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() makes the filtered result its own frame, so the column assignment
    # below does not write into a slice of the unfiltered data.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trip frame into the feature matrix produced by ``dv``.

    Adds a ``PU_DO`` column to ``df`` in place (pickup and drop-off IDs joined
    with an underscore), converts the ``PU_DO`` and ``trip_distance`` columns
    to a list of per-row dicts, and returns ``dv.transform`` of that list.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO'] + ['trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    return dv.transform(records)


def test_model(name, stage_tag, X_test, y_test):
    client = mlflow.tracking.MlflowClient()
    
    # Find model version with the given tag
    version = next(
        v.version for v in client.search_model_versions(f"name='{name}'")
        if v.tags.get("stage") == stage_tag
    )
    
    model = mlflow.pyfunc.load_model(f"models:/{name}/{version}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}

In [98]:
# Load and clean the March 2024 green-taxi trips used as the evaluation set.
df = read_dataframe("data/green_tripdata_2024-03.parquet")

In [99]:
# Preview the first rows, including the derived duration column.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.8,1.0,0.5,3.06,0.0,,1.0,18.36,1.0,1.0,0.0,15.333333
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.7,1.0,0.5,0.0,0.0,,1.0,20.2,2.0,1.0,0.0,12.9
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.3,1.0,0.5,3.5,0.0,,1.0,32.05,1.0,1.0,2.75,19.083333
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.0,22.5,0.0,1.5,0.0,0.0,,1.0,24.0,1.0,1.0,0.0,21.75
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.6,1.0,0.5,1.0,0.0,,1.0,12.1,1.0,1.0,0.0,6.666667


In [101]:
# Download the 'preprocessor' artifact of the run in `run_id` into the current directory.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/Users/kennethleo/Documents/GitHub/mlops-zoomcamp/02-experiment-tracking/preprocessor'

In [102]:
import pickle

# Load the object stored in the downloaded preprocessor file; it is passed to
# preprocess() as `dv`.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# artifacts that come from a trusted run.
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [103]:
# Build the feature matrix with the downloaded preprocessor.
# Note that preprocess() also adds a PU_DO column to df in place.
X_test = preprocess(df, dv)

# Ground-truth trip durations computed by read_dataframe().
target = "duration"
y_test = df[target].values

In [106]:
# Time and score the model version tagged stage="production".
%time test_model(name=model_name, stage_tag="production", X_test=X_test, y_test=y_test)


CPU times: user 4.35 s, sys: 280 ms, total: 4.63 s
Wall time: 1.8 s


{'rmse': 5.182301539260736}

In [107]:
# Time and score a model version tagged stage="staging".
# NOTE(review): the tag listing earlier shows several versions tagged "staging";
# test_model() evaluates only the first match returned by the registry search.
%time test_model(name=model_name, stage_tag="staging", X_test=X_test, y_test=y_test)


CPU times: user 4.01 s, sys: 70.3 ms, total: 4.08 s
Wall time: 792 ms


{'rmse': 5.182301539260736}