# MLflow Example

In [1]:
import os
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

In [2]:
# Set MLflow tracking folder locally
mlflow.set_tracking_uri("file://" + os.path.abspath("mlruns"))

# Define ONE experiment name for all runs
experiment_name = "rf_random_forest_experiments"
mlflow.set_experiment(experiment_name)

# Define parameter sets for three runs
params_list = [
    {"n_estimators": 200, "max_depth": 5,  "min_samples_split": 2},
    {"n_estimators": 300, "max_depth": 10, "min_samples_split": 4},
    {"n_estimators": 400, "max_depth": 15, "min_samples_split": 6},
]

# Load data
X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [3]:
# Loop over the parameter sets
for i in range(len(params_list)):
    params = params_list[i]

    with mlflow.start_run(run_name=f"rf_run_{i+1}"):
        # Train Random Forest
        model = RandomForestRegressor(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            min_samples_split=params["min_samples_split"],
            random_state=42,
            n_jobs=-1,
        )
        model.fit(X_train, y_train)

        # Predict and evaluate
        preds = model.predict(X_test)
        rmse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        # Log parameters and metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Log the model with a signature
        sig = infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=model,
            name="model",
            signature=sig,
            input_example=X_train.head(5)
        )

        print(f"Run {i+1} | RMSE: {rmse:.4f} | R2: {r2:.4f}")

2025/11/04 15:39:26 INFO mlflow.models.model: Found the following environment variables used during model inference: [API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


Run 1 | RMSE: 2860.4226 | R2: 0.4827
Run 2 | RMSE: 2899.7989 | R2: 0.4756
Run 3 | RMSE: 2902.9488 | R2: 0.4750


In [4]:
# Find the best run by RMSE
exp = mlflow.get_experiment_by_name(experiment_name)
df = mlflow.search_runs(experiment_ids=[exp.experiment_id])
best = df.sort_values("metrics.rmse", ascending=True).iloc[0]
best

run_id                                            3617330497e24fc2a0bbae15cdbdbd96
experiment_id                                                   337544647433960240
status                                                                    FINISHED
artifact_uri                     file:///Users/jordan.bakerman/Library/CloudSto...
start_time                                        2025-11-04 20:08:36.501000+00:00
end_time                                          2025-11-04 20:08:38.490000+00:00
metrics.r2                                                                0.482716
metrics.rmse                                                           2860.422576
params.max_depth                                                                 5
params.min_samples_split                                                         2
params.n_estimators                                                            200
tags.mlflow.user                                                   jordan.bakerman
tags

In [5]:
best_run_id = best["run_id"]
best_rmse = best["metrics.rmse"]
best_params = {
    "n_estimators": best["params.n_estimators"],
    "max_depth": best["params.max_depth"],
    "min_samples_split": best["params.min_samples_split"],
}

print("Best run:", best_run_id, "| RMSE:", best_rmse, "| Params:", best_params)

Best run: 3617330497e24fc2a0bbae15cdbdbd96 | RMSE: 2860.422576052037 | Params: {'n_estimators': '200', 'max_depth': '5', 'min_samples_split': '2'}


In [6]:
model_uri = f"runs:/{best_run_id}/model"
registered_name = "RF_Diabetes"  # <-- choose a stable registry name for this project

result = mlflow.register_model(model_uri=model_uri, name=registered_name)

Registered model 'RF_Diabetes' already exists. Creating a new version of this model...
Created version '3' of model 'RF_Diabetes'.


In [7]:
# (Optional) wait until the model version is READY
client = MlflowClient()
version = result.version
status = client.get_model_version(name=registered_name, version=version).status

while status == "PENDING_REGISTRATION":
    time.sleep(1)
    status = client.get_model_version(name=registered_name, version=version).status

# Add helpful metadata
client.update_registered_model(
    name=registered_name,
    description="Random Forest for diabetes dataset; grid over n_estimators, max_depth, min_samples_split."
)
client.update_model_version(
    name=registered_name,
    version=version,
    description=f"Best RMSE={best_rmse:.4f} with params={best_params}"
)

# Promote to Staging (or Production if youâ€™re ready)
client.transition_model_version_stage(
    name=registered_name,
    version=version,
    stage="Staging",
    archive_existing_versions=False
)

print(f"Registered {registered_name} v{version} and moved to Staging.")
print(f"Load by name: models:/{registered_name}/Staging")

Registered RF_Diabetes v3 and moved to Staging.
Load by name: models:/RF_Diabetes/Staging


  client.transition_model_version_stage(


In [8]:
# Load best model to score new data
model = mlflow.pyfunc.load_model(model_uri)
preds = model.predict(X_test)
preds[:5]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

array([145.60709079, 175.22779878, 149.40908064, 244.23071269,
       120.42422891])

In [9]:
# Load a registered model to score new data
registered_name = "RF_Diabetes"
stage = "Staging"

model = mlflow.pyfunc.load_model(f"models:/{registered_name}/{stage}")

preds = model.predict(X_test)
preds[:5]

array([145.60709079, 175.22779878, 149.40908064, 244.23071269,
       120.42422891])

In [10]:
# Bash command to run in terminal
# mlflow ui