In [7]:
"""
Executable end-to-end ML pipeline using sklearn Iris dataset.
Simulates solutions for:
1. I/O bottlenecks (compact dataset)
2. Feature duplication (feature store reuse)
3. Slow sweeps (Optuna tuning)
4. Dataset lineage (log dataset version)
5. Model governance (MLflow registry)
"""
#pip install scikit-learn optuna mlflow pandas
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn
import optuna
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings("ignore", message=".*")

# === Configuration ===
MODEL_NAME = "iris_rf_model"
DATASET_VERSION = "iris_v1"  # Simulated dataset version
MLFLOW_EXPERIMENT = "iris_mlflow_experiment"
# Set remote/local MLflow tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(MLFLOW_EXPERIMENT)
client = MlflowClient()

# -----------------------------------------------------------
# 1. FIX I/O bottlenecks (simulate compact dataset)
# -----------------------------------------------------------
iris = load_iris(as_frame=True)
df = iris.frame.copy()
df["target"] = iris.target
print("step1")
# Instead of many small files, keep one clean DataFrame
print(f"Dataset compacted: {df.shape[0]} rows, {df.shape[1]} columns")

# -----------------------------------------------------------
# 2. FIX DUPLICATE FEATURES (simulate Feature Store reuse)
# -----------------------------------------------------------
# Simulate "feature store" by creating reusable feature DataFrame
def create_feature_store(df: pd.DataFrame) -> pd.DataFrame:
    feature_df = df.copy()
    feature_df["sepal_area"] = df["sepal length (cm)"] * df["sepal width (cm)"]
    feature_df["petal_area"] = df["petal length (cm)"] * df["petal width (cm)"]
    return feature_df

feature_store_df = create_feature_store(df)

# Reuse features in training
X = feature_store_df.drop(columns=["target"])
y = feature_store_df["target"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("step2")
# -----------------------------------------------------------
# 3. FIX SLOW SWEEPS (Optuna hyperparameter tuning)
# -----------------------------------------------------------
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 200)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    return accuracy_score(y_val, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5, n_jobs=1)
print("step3")
best_params = study.best_params
print("Best params found:", best_params)

# -----------------------------------------------------------
# 4 & 5. FIX LINEAGE + GOVERNANCE (MLflow registry)
# -----------------------------------------------------------
with mlflow.start_run() as run:
    # Log dataset lineage
    mlflow.set_tag("dataset_version", DATASET_VERSION)
    mlflow.set_tag("feature_table", "iris_feature_store")

    # Train final model
    final_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
    final_model.fit(X_train, y_train)
    preds = final_model.predict(X_val)
    acc = accuracy_score(y_val, preds)

    mlflow.log_metric("val_accuracy", acc)
    mlflow.log_params(best_params)

    # Log + Register Model
    mlflow.sklearn.log_model(final_model, "model")
    print("step4")
    mv = mlflow.register_model(
        model_uri = f"runs:/{run.info.run_id}/model",
        name = MODEL_NAME
    )
    print("step5")
    # Governance tags
    client.set_model_version_tag(MODEL_NAME, mv.version, "owner", "ml_team")
    client.set_model_version_tag(MODEL_NAME, mv.version, "dataset_version", DATASET_VERSION)
    client.set_model_version_tag(MODEL_NAME, mv.version, "feature_table", "iris_feature_store")

# Remove this if no DB registry available
# client.transition_model_version_stage(
#     name=MODEL_NAME,
#     version=mv.version,
#     stage="Staging"
# )

print(f"✅ Model {MODEL_NAME} version {mv.version} logged with accuracy={acc:.4f}")


[I 2025-09-10 14:15:08,672] A new study created in memory with name: no-name-b05e7b23-4be5-4ef6-9623-fc20001e975c


step1
Dataset compacted: 150 rows, 5 columns
step2


[I 2025-09-10 14:15:08,914] Trial 0 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 94, 'max_depth': 7}. Best is trial 0 with value: 0.9333333333333333.
[I 2025-09-10 14:15:09,314] Trial 1 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 187, 'max_depth': 19}. Best is trial 0 with value: 0.9333333333333333.
[I 2025-09-10 14:15:09,694] Trial 2 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 164, 'max_depth': 3}. Best is trial 2 with value: 0.9666666666666667.
[I 2025-09-10 14:15:09,768] Trial 3 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 36, 'max_depth': 13}. Best is trial 2 with value: 0.9666666666666667.
[I 2025-09-10 14:15:09,967] Trial 4 finished with value: 0.9333333333333333 and parameters: {'n_estimators': 93, 'max_depth': 7}. Best is trial 2 with value: 0.9666666666666667.


step3
Best params found: {'n_estimators': 164, 'max_depth': 3}


Registered model 'iris_rf_model' already exists. Creating a new version of this model...


step4


2025/09/10 14:15:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: iris_rf_model, version 6
Created version '6' of model 'iris_rf_model'.


step5
🏃 View run legendary-frog-82 at: http://127.0.0.1:5000/#/experiments/204415308566175607/runs/645926febfd1428286939cefcd9a9f99
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/204415308566175607
✅ Model iris_rf_model version 6 logged with accuracy=0.9667


# MLflow Experiment page
![image.png](attachment:ff4ae6cd-8ed2-44c2-b515-232be7ae5266.png)
# MLflow Model page
![image.png](attachment:a9e81e50-ca57-42aa-b236-27e074bfa02b.png)