In [1]:
import mlflow
# Route every run started in this notebook to one named MLflow experiment.
mlflow.set_experiment(experiment_name="Hyperparameter Tuning Experiment")

<Experiment: artifact_location='/Users/kowshid/Documents/Repos/grad/ai-545-machine-learning-operations/classworks/mlflow-deep-dive-1/mlruns/3', creation_time=1770238778978, experiment_id='3', last_update_time=1770238778978, lifecycle_stage='active', name='Hyperparameter Tuning Experiment', tags={}>

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

# Hold out a validation split for scoring each trial. test_size=0.25 spells out
# the library default; random_state pins the shuffle so the split is the same
# on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Optuna is a framework for automatically finding good hyperparameters by running many smart experiments. You first define an objective function, which trains a model and returns a single score, such as validation error. Each time Optuna calls this function, it creates a trial, meaning one concrete set of hyperparameters and one model training attempt.

### A collection of trials forms a study, which represents the full tuning process. Optuna does not just try parameters randomly; it uses past trial results to decide which values to try next, gradually focusing on better regions of the search space. Poor trials can be stopped early using pruning, saving computation (pruning is not used in this notebook).

### A study is the whole optimization process. Inside that study, you then create many trials. Each of these trials is a single MLflow run with a trial number in its name. We are doing 30 of these child runs/trials.

In [3]:
import mlflow
import optuna
import sklearn


def objective(trial, random_state=0):
    """Train and score one random-forest candidate inside a nested MLflow run.

    Samples ``max_depth``, ``n_estimators`` and ``max_features`` from the
    trial, fits a ``RandomForestRegressor`` on the notebook-level
    ``X_train``/``y_train`` and scores it on ``X_val``/``y_val``. The
    parameters, the error metric and the fitted model are logged to a run
    named ``trial_<number>``, and that run's ID is stored on the trial under
    the ``"run_id"`` user attribute.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        random_state: Seed passed to the forest so that a given set of
            hyperparameters always yields the same score. Without it, trials
            differ by seed noise as well as by hyperparameters.

    Returns:
        Mean squared error of the fitted model on the validation split.
    """
    # Import the submodules explicitly rather than relying on them being
    # reachable as attributes after a bare `import sklearn`.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error

    # Setting nested=True will create a child run under the parent run.
    with mlflow.start_run(nested=True, run_name=f"trial_{trial.number}") as child_run:
        params = {
            "max_depth": trial.suggest_int("rf_max_depth", 2, 32),
            "n_estimators": trial.suggest_int("rf_n_estimators", 50, 300, step=10),
            "max_features": trial.suggest_float("rf_max_features", 0.2, 1.0),
            "random_state": random_state,
        }
        # Log current trial's parameters (including the seed, for provenance)
        mlflow.log_params(params)

        regressor_obj = RandomForestRegressor(**params)
        regressor_obj.fit(X_train, y_train)

        y_pred = regressor_obj.predict(X_val)
        error = mean_squared_error(y_val, y_pred)
        # Log current trial's error metric
        mlflow.log_metrics({"error": error})

        # Log the model file
        mlflow.sklearn.log_model(regressor_obj, name="model")
        # Make it easy to retrieve the best-performing child run later
        trial.set_user_attr("run_id", child_run.info.run_id)
        return error

### Create a study with direction "minimize" (a study is an Optuna concept, not an MLflow one). The direction is minimize because we are trying to minimize the validation loss. Set how many trials/child runs you want it to have. Use the MLflow logging calls accordingly to log whatever you want, e.g. metrics, params, etc., or use autolog (`mlflow.autolog()`) to log everything.

In [4]:
# Create a parent run that contains all child runs for different trials
with mlflow.start_run(run_name="study") as run:
    # Log the experiment settings
    n_trials = 30
    sampler_seed = 0
    mlflow.log_param("n_trials", n_trials)
    mlflow.log_param("sampler_seed", sampler_seed)

    # Seed the sampler so the sequence of suggested hyperparameters can be
    # reproduced on a fresh kernel. TPE is also what create_study uses when no
    # sampler is given, so only the seeding changes.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=sampler_seed),
    )
    study.optimize(objective, n_trials=n_trials)

    # Log the best trial and its run ID
    mlflow.log_params(study.best_trial.params)
    mlflow.log_metrics({"best_error": study.best_value})
    # The run ID is only present if the objective stored it on the trial.
    if best_run_id := study.best_trial.user_attrs.get("run_id"):
        mlflow.log_param("best_child_run_id", best_run_id)

[32m[I 2026-02-04 16:04:11,791][0m Trial 29 finished with value: 0.2957123823778443 and parameters: {'rf_max_depth': 16, 'rf_n_estimators': 250, 'rf_max_features': 0.21674326379642894}. Best is trial 17 with value: 0.24456309456152023.[0m


### After the whole tuning is over, you can click on those trials, sort them by the validation error (or whatever objective function you have), and then register the one you want in the model registry! Then you should be able to see this registered model in the Models tab. You can add some metadata to that model as well, e.g. which dataset it was trained on, who created it, etc.

In [6]:
# Register the model logged by the study's best trial. The run ID is read from
# the study instead of being pasted in, so this cell still points at a run
# produced by this execution after Restart & Run All.
best_trial_run_id = study.best_trial.user_attrs["run_id"]
mlflow.register_model(
    model_uri=f"runs:/{best_trial_run_id}/model",
    name="housing-price-predictor",
)

<ModelVersion: aliases=[], creation_timestamp=1770239222461, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1770239222461, metrics=None, model_id=None, name='housing-price-predictor', params=None, run_id='2ab4afd0c2d849fb82b182896a2b0aaa', run_link=None, source='models:/m-7fcfa63d0d86499caeda6a0e4fb9eaad', status='READY', status_message=None, tags={}, user_id=None, version=1>