## MLflow's Model Registry

In [43]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [4]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

In [5]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [11]:
# Rechercher toutes les expériences
experiments = client.search_experiments()

In [55]:
# Afficher les détails de chaque expérience
for experiment in experiments:
    print(f"Experiment ID: {experiment.experiment_id}, Name: {experiment.name}")

Experiment ID: 2, Name: nouvelle-experiment-api
Experiment ID: 1, Name: nyc-taxi-experiment-tracking
Experiment ID: 0, Name: Default


In [10]:
# Définir le nom de la nouvelle expérience
experiment_name = "nouvelle-experiment-api"
# Créer la nouvelle expérience
experiment_id = client.create_experiment(experiment_name)
print(f"Experiment ID: {experiment_id}")

Experiment ID: 2


In [56]:
experiment_name = 'nyc-taxi-experiment-tracking'
# Récupérer l'ID de l'expérience
experiment = client.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id
print(f"Experiment ID: {experiment_id}")

Experiment ID: 1


In [23]:
# Rechercher les runs associés à l'expérience avec les conditions spécifiées
runs = client.search_runs(
    experiment_ids=[experiment_id],  # Liste des IDs des expériences à rechercher
    filter_string="metrics.rmse < 6.5",  # Filtre pour sélectionner les runs avec une métrique rmse inférieure à 7
    run_view_type=ViewType.ACTIVE_ONLY,  # Type de vue pour les runs (ACTIVE_ONLY, DELETED_ONLY, ou ALL)
    max_results=5,  # Limite le nombre maximum de résultats retournés
    order_by=["metrics.rmse ASC"]  # Trie les résultats par la métrique rmse en ordre croissant
)

In [24]:
# Afficher les détails de chaque run
for run in runs:
    print(f"Run ID: {run.info.run_id}")
    print(f"Start Time: {run.info.start_time}")
    print(f"Metrics: {run.data.metrics}")
    # print(f"Params: {run.data.params}")
    # print(f"Tags: {run.data.tags}")
    print("-" * 40)

Run ID: f47df67f15a743b29caf3e36284d76f2
Start Time: 1720007345591
Metrics: {'rmse': 6.312369926567225}
----------------------------------------
Run ID: 7d5852625ea245948607eef735da896c
Start Time: 1720007085478
Metrics: {'rmse': 6.312369926567225}
----------------------------------------
Run ID: 21dc170202d64b0393a4582b54381d2f
Start Time: 1720003377217
Metrics: {'rmse': 6.312369926567225}
----------------------------------------
Run ID: 6ade143924e941628a9092a869e00b2f
Start Time: 1720001814353
Metrics: {'validation-rmse': 6.3118810366834115, 'stopped_iteration': 940.0, 'best_iteration': 890.0, 'rmse': 6.312369926567225}
----------------------------------------
Run ID: 81140a48618b442fb950bae2302effaf
Start Time: 1719961694746
Metrics: {'rmse': 6.312369926567225}
----------------------------------------


In [26]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [30]:
run_id = "f47df67f15a743b29caf3e36284d76f2"
model_uri = f"runs:/{run_id}/model_mlflow"
registered_model = mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-regressor'.


In [28]:
# Ajouter une description au modèle
client.update_registered_model(
    name="nyc-taxi-regressor",
    description="Modèle de régression pour prédire les tarifs des taxis à New York"
)

<RegisteredModel: aliases={}, creation_timestamp=1720008192900, description='Modèle de régression pour prédire les tarifs des taxis à New York', last_updated_timestamp=1720016036273, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1720015804668, current_stage='None', description=None, last_updated_timestamp=1720015804668, name='nyc-taxi-regressor', run_id='f47df67f15a743b29caf3e36284d76f2', run_link=None, source='/Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/f47df67f15a743b29caf3e36284d76f2/artifacts/model_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=2>], name='nyc-taxi-regressor', tags={}>

In [31]:
# Ajouter des tags à une version spécifique du modèle
client.set_model_version_tag(
    name="nyc-taxi-regressor",
    version=registered_model.version,
    key="algorithm",
    value="xgboost"
)

client.set_model_version_tag(
    name="nyc-taxi-regressor",
    version=registered_model.version,
    key="dataset",
    value="NYC Taxi Data"
)

In [34]:
# Lister tous les modèles enregistrés
registered_models = client.search_registered_models()
# pprint(dict(rm), indent=4)

In [35]:
# Afficher les détails de chaque modèle enregistré
for model in registered_models:
    print(f"Model Name: {model.name}")
    print(f"Creation Timestamp: {model.creation_timestamp}")
    print(f"Last Updated Timestamp: {model.last_updated_timestamp}")
    print(f"Description: {model.description}")
    print(f"Latest Versions: {model.latest_versions}")
    print("-" * 40)

Model Name: nyc-taxi-regressor
Creation Timestamp: 1720008192900
Last Updated Timestamp: 1720016137277
Description: Modèle de régression pour prédire les tarifs des taxis à New York
Latest Versions: [<ModelVersion: aliases=[], creation_timestamp=1720016137277, current_stage='None', description=None, last_updated_timestamp=1720016137277, name='nyc-taxi-regressor', run_id='f47df67f15a743b29caf3e36284d76f2', run_link=None, source='/Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/f47df67f15a743b29caf3e36284d76f2/artifacts/model_mlflow', status='READY', status_message=None, tags={'algorithm': 'xgboost', 'dataset': 'NYC Taxi Data'}, user_id=None, version=3>]
----------------------------------------


In [39]:
# Obtenir les dernières versions du modèle spécifié
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

# Afficher les détails de chaque version
for version in latest_versions:
    print(f"Version: {version.version}")
    print(f"Stage: {version.current_stage}")
    print(f"Run ID: {version.run_id}")
    print(f"Status: {version.status}")
    print(f"Description: {version.description}")
    print(f"Tags: {version.tags}")
    print("-" * 40)

Version: 3
Stage: None
Run ID: f47df67f15a743b29caf3e36284d76f2
Status: READY
Description: None
Tags: {'algorithm': 'xgboost', 'dataset': 'NYC Taxi Data'}
----------------------------------------


  latest_versions = client.get_latest_versions(name=model_name)


In [41]:
model_version = 3
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1720016137277, current_stage='Staging', description=None, last_updated_timestamp=1720017555155, name='nyc-taxi-regressor', run_id='f47df67f15a743b29caf3e36284d76f2', run_link=None, source='/Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/f47df67f15a743b29caf3e36284d76f2/artifacts/model_mlflow', status='READY', status_message=None, tags={'algorithm': 'xgboost', 'dataset': 'NYC Taxi Data'}, user_id=None, version=3>

In [42]:
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1720016137277, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2024-07-03', last_updated_timestamp=1720018221217, name='nyc-taxi-regressor', run_id='f47df67f15a743b29caf3e36284d76f2', run_link=None, source='/Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/f47df67f15a743b29caf3e36284d76f2/artifacts/model_mlflow', status='READY', status_message=None, tags={'algorithm': 'xgboost', 'dataset': 'NYC Taxi Data'}, user_id=None, version=3>

In [54]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    # Créer une nouvelle colonne 'PU_DO' en combinant 'PULocationID' et 'DOLocationID'
    df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)

    # Variables catégorielles et numériques à utiliser
    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # Convertir les colonnes sélectionnées en dictionnaires (records)
    train_dicts = df[categorical + numerical].to_dict(orient='records')

    # Utiliser l'objet dv pour transformer les données
    transformed_data = dv.transform(train_dicts)

    return transformed_data



def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error (y_test, y_pred)}

In [48]:
df = read_dataframe("../../DataSets/green_tripdata_2021-03.parquet")

In [50]:
df.shape

(80372, 21)

In [59]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

MlflowException: The following failures occurred while downloading one or more artifacts from /Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/55a3ef95d88a4079b99c13b240a56216/artifacts:
##### File preprocessor #####
[Errno 2] No such file or directory: '/Users/jeanmermozeffi/DataspellProjects/MLOps/02-experiment-tracking/mlruns/1/55a3ef95d88a4079b99c13b240a56216/artifacts/preprocessor'

In [60]:
import pickle

with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [61]:
X_test = preprocess(df, dv)

AttributeError: 'tuple' object has no attribute 'transform'