In [32]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import mlflow
from datetime import datetime

In [2]:
ML_FLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=ML_FLOW_TRACKING_URI)
client.search_experiments()

[<Experiment: artifact_location='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/2', creation_time=1707206536138, experiment_id='2', last_update_time=1707206536138, lifecycle_stage='active', name='my-experiment', tags={}>,
 <Experiment: artifact_location='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/1', creation_time=1706873287173, experiment_id='1', last_update_time=1706873287173, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1706873036521, experiment_id='0', last_update_time=1706873036521, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
client.create_experiment(name="my-experiment")

In [8]:
runs = client.search_runs(
    experiment_ids='1',
    filter_string ="metrics.rmse < 5.19",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [9]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse {run.data.metrics['rmse']:.4f}")

run id: c2650cd34c0f4f938d0e530ac96d5cec, rmse 5.1562
run id: b7234f4608524718b3216e5b38fae812, rmse 5.1602
run id: 208c1e1c6c214f77b2d338738373ab67, rmse 5.1602


In [11]:
mlflow.set_tracking_uri(ML_FLOW_TRACKING_URI)


In [12]:
run_id = "b7234f4608524718b3216e5b38fae812"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-xgboost")

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-taxi-xgboost'.


<ModelVersion: aliases=[], creation_timestamp=1707207552937, current_stage='None', description=None, last_updated_timestamp=1707207552937, name='nyc-taxi-xgboost', run_id='b7234f4608524718b3216e5b38fae812', run_link=None, source='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/1/b7234f4608524718b3216e5b38fae812/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [20]:
model_name = "nyc-taxi-xgboost"
client.search_registered_models()
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: Staging
version: 2, stage: None


  latest_versions = client.get_latest_versions(name=model_name)


In [36]:
model_version = 2
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1707207552937, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2024-02-06', last_updated_timestamp=1707208997923, name='nyc-taxi-xgboost', run_id='b7234f4608524718b3216e5b38fae812', run_link=None, source='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/1/b7234f4608524718b3216e5b38fae812/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [37]:

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1707207552937, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2024-02-06', last_updated_timestamp=1707209002586, name='nyc-taxi-xgboost', run_id='b7234f4608524718b3216e5b38fae812', run_link=None, source='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/1/b7234f4608524718b3216e5b38fae812/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [47]:
from sklearn.metrics import mean_squared_error
import pandas as pd

def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [48]:
df = read_dataframe("data/green_tripdata_2023-05.parquet")

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2023-05-01 00:52:10,2023-05-01 01:05:26,N,1.0,244,213,1.0,6.99,28.90,...,0.5,0.00,0.00,,1.0,31.40,1.0,1.0,0.00,13.266667
1,2,2023-05-01 00:29:49,2023-05-01 00:50:11,N,1.0,33,100,1.0,6.60,30.30,...,0.5,5.00,0.00,,1.0,40.55,1.0,1.0,2.75,20.366667
2,2,2023-05-01 00:25:19,2023-05-01 00:32:12,N,1.0,244,244,1.0,1.34,9.30,...,0.5,2.36,0.00,,1.0,14.16,1.0,1.0,0.00,6.883333
3,2,2023-05-01 00:07:06,2023-05-01 00:27:33,N,5.0,82,75,1.0,7.79,22.73,...,0.0,2.29,6.55,,1.0,32.57,1.0,1.0,0.00,20.450000
4,2,2023-05-01 00:43:31,2023-05-01 00:46:59,N,1.0,69,169,1.0,0.70,6.50,...,0.5,0.00,0.00,,1.0,9.00,2.0,1.0,0.00,3.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69169,2,2023-05-31 23:45:00,2023-06-01 00:04:00,,,106,225,,4.39,21.44,...,0.0,0.00,0.00,,1.0,22.44,,,,19.000000
69170,2,2023-05-31 23:59:00,2023-06-01 00:22:00,,,17,133,,5.06,19.03,...,0.0,4.01,0.00,,1.0,24.04,,,,23.000000
69171,2,2023-05-31 23:39:00,2023-05-31 23:55:00,,,66,189,,2.18,15.31,...,0.0,3.26,0.00,,1.0,19.57,,,,16.000000
69172,2,2023-05-31 23:42:00,2023-06-01 00:11:00,,,129,62,,8.68,31.83,...,0.0,6.57,0.00,,1.0,39.40,,,,29.000000


In [49]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path=".")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 2207.53it/s] 


'/home/osama/Mlops/Part-2/experiment_tracking/preprocessor'

In [51]:
import pickle
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [52]:
X_test = preprocess(df, dv)

In [54]:
target = "duration"
y_test = df[target].values

In [60]:
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage="Production",
    archive_existing_versions=True
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1707207552937, current_stage='Production', description='The model version 2 was transitioned to Staging on 2024-02-06', last_updated_timestamp=1707210784644, name='nyc-taxi-xgboost', run_id='b7234f4608524718b3216e5b38fae812', run_link=None, source='/home/osama/Mlops/Part-2/experiment_tracking/mlruns/1/b7234f4608524718b3216e5b38fae812/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [61]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

CPU times: user 12.8 s, sys: 47.8 ms, total: 12.8 s
Wall time: 2.61 s


{'rmse': 5.745792473978856}

In [57]:
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


CPU times: user 12.2 s, sys: 67.8 ms, total: 12.3 s
Wall time: 2.37 s


{'rmse': 5.745792473978856}