In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# 0 - Download Data

In [1]:
import os
from typing import Tuple

import numpy as np
import opendatasets as od
import pandas as pd
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load variables from a local .env file into the process environment.
load_dotenv()
# Read the Kaggle credential; os.getenv returns None when the variable is unset.
KAGGLE_API = os.getenv("KAGGLE_API")
# Report only whether the credential is present: leaving the bare value as the
# last expression of the cell would store the secret in the notebook's output.
print(f"KAGGLE_API configured: {KAGGLE_API is not None}")

In [2]:
def download_data(force: bool = False) -> str:
    """
    Download the Abalone dataset from Kaggle and return the path to its CSV file.

    Parameters:
    ----------
    force : bool, optional
        Forwarded unchanged to ``opendatasets.download``. Default is False.

    Returns:
    -------
    str
        The relative path to ``abalone.csv`` inside the ``abalone-dataset`` folder.
    """
    dataset_url = "https://www.kaggle.com/datasets/rodolfomendes/abalone-dataset"
    od.download(dataset_url, force=force)
    return os.path.join("abalone-dataset", "abalone.csv")


def load_data(PATH: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters:
    ----------
    PATH : str
        Location of the CSV file to read.

    Returns:
    -------
    pd.DataFrame
        The content of the file as parsed by ``pd.read_csv`` with default options.
    """
    frame = pd.read_csv(PATH)
    return frame

# 1 - Create the Preprocessing Pipeline

In [None]:
import scipy


def get_preprocessing_pipeline(X: pd.DataFrame) -> ColumnTransformer:
    """
    Build an unfitted preprocessing transformer matching the columns of ``X``.

    Numeric columns are mean-imputed and standardised; object/category columns
    are imputed with the most frequent value and one-hot encoded (categories
    unseen during fitting are ignored at transform time).

    Parameters:
    ----------
    X : pd.DataFrame
        The feature DataFrame whose dtypes decide which columns go to which branch.

    Returns:
    -------
    ColumnTransformer
        The transformer, not yet fitted.
    """
    # Column groups are derived from the dtypes of the frame that was passed in.
    cat_features = list(X.select_dtypes(include=["object", "category"]).columns)
    num_features = list(X.select_dtypes(include=["number"]).columns)

    num_steps = [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
    cat_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=num_steps), num_features),
            ("cat", Pipeline(steps=cat_steps), cat_features),
        ]
    )


def compute_target(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Split a DataFrame into features and a shifted target.

    Parameters:
    ----------
    df : pd.DataFrame
        The full dataset, including the target column.
    target_column : str
        Name of the column to predict.

    Returns:
    -------
    Tuple[pd.DataFrame, pd.Series]
        ``df`` without ``target_column``, and ``target_column`` with 1.5 added
        to every value (presumably converting a ring count into an age).
    """
    features = df.drop(columns=target_column)
    target = df[target_column].add(1.5)
    return features, target


def split_data(
    X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split features and target into train and test subsets.

    Parameters:
    ----------
    X : pd.DataFrame
        The feature DataFrame.
    y : pd.Series
        The target variable.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size``. Default is 0.2.
    random_state : int, optional
        Random seed for reproducibility. Default is 42.

    Returns:
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]
        ``X_train, X_test, y_train, y_test`` in that order.
    """
    # Unpack so that a tuple, rather than the list sklearn produces, is returned.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

# 3 - Fit a Random Forest Regressor Model

In [None]:
from sklearn.ensemble import RandomForestRegressor


def fit_random_forest(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    n_estimators: int = 100,
    random_state: int = 42,
) -> RandomForestRegressor:
    """
    Train a Random Forest regressor on the given training set.

    Parameters:
    ----------
    X_train : pd.DataFrame
        Features used for training.
    y_train : pd.Series
        Target values aligned with ``X_train``.
    n_estimators : int, optional
        How many trees the forest contains. Default is 100.
    random_state : int, optional
        Seed passed to the estimator for reproducibility. Default is 42.

    Returns:
    -------
    RandomForestRegressor
        The estimator after fitting.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    # ``fit`` returns the estimator itself, so the fitted forest is handed back directly.
    return forest.fit(X_train, y_train)


def predict(model: RandomForestRegressor, X_new: pd.DataFrame) -> np.ndarray:
    """
    Make predictions using the fitted Random Forest regression model.

    Parameters:
    ----------
    model : RandomForestRegressor
        The fitted Random Forest regression model.
    X_new : pd.DataFrame
        The new feature data for which predictions are to be made.

    Returns:
    -------
    np.ndarray
        The predicted values for the new data, exactly as returned by
        ``model.predict`` (a NumPy array, not a pandas Series).
    """

    return model.predict(X_new)


def evaluate_rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    """
    Compute the Root Mean Squared Error (RMSE) between truth and predictions.

    Parameters:
    ----------
    y_true : pd.Series
        Observed target values.
    y_pred : pd.Series
        Values predicted by the model for the same rows.

    Returns:
    -------
    float
        Square root of the mean squared error.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# 4 - Run the model & get a first benchmark on test set

In [None]:
# Download the dataset and read the CSV into a DataFrame.
PATH = download_data(force=False)
df = load_data(PATH)

# Separate the features from the "Rings"-based target and build the (unfitted) preprocessor.
X, y = compute_target(df, target_column="Rings")
preprocessor = get_preprocessing_pipeline(X=X)

# Hold out 20% of the rows; the preprocessor is fitted on the training split only
# and then applied unchanged to the test split.
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Baseline model: 100 trees, fixed seed.
model = fit_random_forest(X_train_preprocessed, y_train, n_estimators=100, random_state=42)

# Score the baseline on the held-out split.
y_pred = predict(model, X_test_preprocessed)
rmse = evaluate_rmse(y_test, y_pred)
print(f"RMSE: {rmse}")

# 5 - Create MLflow experiments

In [None]:
import mlflow
from mlflow import MlflowClient
import pickle

# MLflow client reused by later cells to list experiments, look up model versions and download artifacts.
client = MlflowClient()

In [None]:
# Set the experiment name (a plain string: the original f-string had no placeholders)
mlflow_experiment_path = "/mlflow/random_forest_test_final"
mlflow.set_experiment(mlflow_experiment_path)
# CSV file read by each run
DATA_PATH = "abalone-dataset/abalone.csv"
# Forest sizes to compare, one run per value
N_ESTIMATORS = [100, 200, 500]
# Scratch directory for files written locally before being logged as artifacts
os.makedirs("temp", exist_ok=True)
# Create a method that will start a run for each N_ESTIMATORS hyperparameter
def create_experiment(n_estimator: int) -> None:
    """
    Train, evaluate and register one Random Forest inside a single MLflow run.

    The run logs the fitted preprocessor as the ``preprocessor.pkl`` artifact,
    the ``train_rmse`` and ``test_rmse`` metrics, the ``n_estimators`` parameter
    and the model itself, then registers the model as ``random_forest_model``.

    Parameters:
    ----------
    n_estimator : int
        The number of trees in the forest for this run.
    """
    with mlflow.start_run() as run:
        run_id = run.info.run_id

        # Set tags for the run
        mlflow.set_tag("Level", "Development")
        mlflow.set_tag("Team", "Data Science")

        # Load data (once, from the path configured for the experiments)
        df = load_data(DATA_PATH)

        X, y = compute_target(df, target_column="Rings")
        preprocessor = get_preprocessing_pipeline(X=X)

        X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=42)
        X_train_preprocessed = preprocessor.fit_transform(X_train)
        X_test_preprocessed = preprocessor.transform(X_test)

        # Persist the preprocessor only AFTER fitting: an artifact pickled before
        # fit_transform cannot be used to transform new data once it is reloaded.
        preprocessor_file = "temp/preprocessor.pkl"
        os.makedirs(os.path.dirname(preprocessor_file), exist_ok=True)
        with open(preprocessor_file, "wb") as f:
            pickle.dump(preprocessor, f)
        mlflow.log_artifact(preprocessor_file)

        model = fit_random_forest(X_train_preprocessed, y_train, n_estimators=n_estimator)

        # Evaluate model on train set
        y_pred_train = predict(model, X_train_preprocessed)
        rmse = evaluate_rmse(y_train, y_pred_train)
        mlflow.log_metric("train_rmse", rmse)

        # Evaluate model on test set
        y_pred_test = predict(model, X_test_preprocessed)
        rmse = evaluate_rmse(y_test, y_pred_test)
        mlflow.log_metric("test_rmse", rmse)

        # Log your model
        mlflow.sklearn.log_model(model, "models")

        # Log your model parameters
        mlflow.log_params({"n_estimators": n_estimator})

        # Register the model
        mlflow.register_model(f"runs:/{run_id}/models", "random_forest_model")

In [None]:
# Launch one tracked run per candidate forest size.
for trees in N_ESTIMATORS:
    create_experiment(n_estimator=trees)

In [None]:
# Query the experiments known to the client and display them as the cell result.
experiments = client.search_experiments()
experiments

In [None]:
# Start the MLflow UI on port 5002; --host 0.0.0.0 makes it listen on every network interface.
# NOTE(review): this presumably keeps the cell busy until it is interrupted — confirm.
!mlflow ui --host 0.0.0.0 --port 5002

|              | 100 Estimators | 200 Estimators | 500 Estimators |
|--------------|----------------|----------------|----------------|
| Test RMSE    | 2.256          | 2.241          | 2.231          |
| Train RMSE   | 0.793          | 0.792          | 0.789          |


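Increasing the number of estimators slightly improves test-set performance (test RMSE 2.256 → 2.231), while train RMSE stays much lower (≈ 0.79), which suggests the model overfits. From now on, we move model version 3 (`n_estimators=500`) into production.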

In [None]:
# save best model as a pickle file
import pickle

# Registry coordinates of the model selected for production.
# NOTE(review): a hard-coded version only designates the n_estimators=500 run if the
# registry held no earlier versions when the runs were launched — confirm before promoting.
model_name = "random_forest_model"
model_version = "3"

# Load the model from the Model Registry
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri=model_uri)

# Get the run ID from the model metadata
model_details = client.get_model_version(model_name, model_version)
run_id = model_details.run_id

# retrieve the preprocessor artifact from the run
preprocessor_path = client.download_artifacts(run_id, "preprocessor.pkl")
# Use a context manager so the file handle is closed as soon as the object is loaded.
with open(preprocessor_path, "rb") as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

# Save the model as a pickle file


def pickle_model(model, filepath: str) -> None:
    """
    Serialise an object to disk with pickle.

    Parameters:
    ----------
    model : object
        The object to serialise (a fitted model, a preprocessor, ...).
    filepath : str
        Destination file; it is opened in binary write mode.
    """
    with open(filepath, "wb") as sink:
        pickle.Pickler(sink).dump(model)


def load_model(filepath: str):
    """
    Load an object previously written with pickle.

    Parameters:
    ----------
    filepath : str
        Path of the pickle file to read.

    Returns:
    -------
    object
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(filepath, "rb") as f:
        model = pickle.load(f)
    # The original fell off the end here and always returned None.
    return model


# Destination of the production copy of the selected model.
model_path = "prod_model/random_forest_model.pkl"

# Make sure the target directory exists before anything is written into it.
prod_dir = os.path.dirname(model_path)
os.makedirs(prod_dir, exist_ok=True)

# Persist the model first, then its preprocessor, side by side.
for artifact, destination in (
    (model, model_path),
    (preprocessor, "prod_model/preprocessor.pkl"),
):
    pickle_model(artifact, destination)