In [1]:
import joblib
import mlflow, mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Penguins dataset: four numeric body measurements are the features and the
# species label is the classification target.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
FEATURE_COLUMNS = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
TARGET_COLUMN = "species"

# Drop every row that is missing the target or any of the model features.
df = pd.read_csv(url).dropna(subset=[TARGET_COLUMN, *FEATURE_COLUMNS])
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# split is repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [2]:
# Two-step model pipeline: standardise the features, then fit a random forest.
# NOTE(review): tree ensembles are generally insensitive to feature scaling, so
# the scaler is probably redundant; it is kept so the fitted pipeline is unchanged.
scaling_step = ("scaler", StandardScaler())
classifier_step = ("clf", RandomForestClassifier(random_state=42))
pipe = Pipeline(steps=[scaling_step, classifier_step])

# Hyper-parameter grid for the "clf" step (2 x 3 x 2 = 12 combinations).
# Keys follow the "<step>__<param>" form so they are routed to the forest.
param_grid = dict(
    clf__n_estimators=[100, 300],
    clf__max_depth=[None, 5, 10],
    clf__min_samples_split=[2, 4],
)

In [3]:
# Group this tuning session under a dedicated MLflow experiment.
mlflow.set_experiment("ai-sprint-day5")

with mlflow.start_run() as run:
    # 5-fold cross-validated grid search over param_grid, scored on accuracy,
    # using all available cores.
    search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Score the best pipeline found by the search on the held-out test split.
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Record the winning hyper-parameters and both accuracy figures on the run.
    mlflow.log_params(search.best_params_)
    mlflow.log_metric("cv_best_score", search.best_score_)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Log the fitted pipeline and register it under "PenguinRF"
    # (creates a versioned entity that appears in the MLflow UI).
    mlflow.sklearn.log_model(
        best_model,
        "model",
        registered_model_name="PenguinRF",
        input_example=X_test.iloc[[0]],  # one-row DataFrame used as the example input
    )

# Console summary, printed once the run context has closed.
print("Best CV acc :", search.best_score_)
print("Test acc    :", test_accuracy)
print("Best params :", search.best_params_)


2025/10/26 12:43:09 INFO mlflow.tracking.fluent: Experiment with name 'ai-sprint-day5' does not exist. Creating a new experiment.


Best CV acc : 0.9744107744107744
Test acc    : 1.0
Best params : {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 300}


Successfully registered model 'PenguinRF'.
Created version '1' of model 'PenguinRF'.


In [4]:
# Persist the tuned pipeline (best_model from the grid search) to a local
# joblib file. The path is defined once so the dump target and the
# confirmation message cannot drift apart.
# NOTE: joblib files are pickle-based; only load this file from a trusted source.
MODEL_PATH = "penguin_rf_tuned.pkl"
joblib.dump(best_model, MODEL_PATH)
print(f"Tuned Random-Forest pipeline saved → {MODEL_PATH}")

Tuned Random-Forest pipeline saved → penguin_rf_tuned.pkl
