# MLflow and scikit-learn Demo


In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import mlflow
import mlflow.sklearn


## Data Loading: California Housing Dataset

We load the [California housing dataset](https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset) for regression.
It consists of $20{,}640$ samples, each described by the following $8$ numeric attributes; the targets are the median house values of the California districts.

- `MedInc`: median income in block group
- `HouseAge`: median house age in block group
- `AveRooms`: average number of rooms per household
- `AveBedrms`: average number of bedrooms per household
- `Population`: block group population
- `AveOccup`: average number of household members
- `Latitude`: block group latitude
- `Longitude`: block group longitude


In [None]:
# Fetch the California housing data (downloading it if it is not available
# locally) as a (features, targets) pair.
X, y = fetch_california_housing(
    return_X_y=True,
    download_if_missing=True,
)
# Display the dimensions of the features and of the targets.
(X.shape, y.shape)


In [None]:
# Hold out a quarter of the samples for testing; the fixed seed keeps the
# split reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
# Display the dimensions of the training features.
X_train.shape


## ML Training

### Model Training and Logging

Let's use ridge regression to predict house prices.
To do so, we need to find a good penalty parameter for the regularisation.
We keep track of several attempts and their mean squared errors using MLflow.

Please note that, for demonstration purposes, we ignore best practices such as cross-validation, feature selection and randomised parameter search.


In [None]:
def create_pipeline(penalty_parameter: float) -> Pipeline:
    """Build an unfitted pipeline: feature standardisation, then ridge regression.

    Args:
        penalty_parameter: Regularisation strength, passed to ``Ridge`` as
            ``alpha``.

    Returns:
        A ``Pipeline`` with the steps ``'scalar'`` (a ``StandardScaler``) and
        ``'model'`` (a ``Ridge`` regressor), in that order.
    """
    scaling_step = ('scalar', StandardScaler())
    regression_step = ('model', Ridge(alpha=penalty_parameter))
    return Pipeline(steps=[scaling_step, regression_step])


# Train one model per candidate penalty and record each attempt as its own
# MLflow run (parameter, test-set MSE and the fitted pipeline).
for penalty_parameter in [0.0, 0.25, 0.5, 0.75, 1.0]:
    with mlflow.start_run() as run:
        # build a pipeline with a ridge regression model
        model_pipeline = create_pipeline(penalty_parameter=penalty_parameter)
        model_pipeline.fit(X_train, y_train)

        # calculate the mean squared error using the test data
        # (`squared` is intentionally not passed: plain MSE is the default, and
        # the argument was deprecated in scikit-learn 1.4 and removed in 1.6)
        y_pred = model_pipeline.predict(X=X_test)
        mse = mean_squared_error(
            y_true=y_test, y_pred=y_pred, multioutput='uniform_average')

        # log parameters, metrics and the model
        mlflow.log_param(key="penalty_parameter", value=penalty_parameter)
        mlflow.log_metric(key="mean_squared_error", value=mse)
        mlflow.sklearn.log_model(
            sk_model=model_pipeline, artifact_path="house_model_ridge")

        # `run_id` replaces the deprecated `run_uuid` attribute
        print(
            f"Model saved in run {run.info.run_id}. MSE={mse}")


### Assessing the Runs in the MLflow Web-UI

We can inspect individual runs, together with their parameters and metrics, in MLflow's web-UI.
Just execute this cell and open the displayed URL in your web browser.
Interrupt this cell to continue with the notebook.


In [None]:
# Start the MLflow web-UI via a shell command; the cell keeps running until it
# is terminated, so stop it before continuing with the notebook.
!mlflow ui


### Loading a Model from a Previous Run

Let's reload a model to use it for another prediction with the test data.
First, set `run_id` in the next cell to the ID of one of the runs printed during training.


In [None]:
# ID of the MLflow run whose logged model should be loaded; the training loop
# prints one ID per run.
run_id = ""  # fill
if not run_id:
    # Fail early with a clear message: an empty ID would otherwise produce the
    # malformed model URI "runs://house_model_ridge".
    raise ValueError(
        "Set `run_id` to the ID of an MLflow run before loading the model.")
# The artifact path must match the one used when the model was logged.
loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/house_model_ridge")


In [None]:
# Predict on the held-out test data with the reloaded model.
y_pred = loaded_model.predict(X=X_test)
# Display the mean squared error of those predictions.
# (`squared` is intentionally not passed: plain MSE is the default, and the
# argument was deprecated in scikit-learn 1.4 and removed in 1.6)
mean_squared_error(y_true=y_test, y_pred=y_pred,
                   multioutput='uniform_average')
