In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Push only the code needed to run the experiments.

# 0 - Download Data

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
KAGGLE_API = os.getenv("KAGGLE_API")
# Display only whether the key is set: echoing the value itself would store
# the secret in the notebook's saved cell output.
bool(KAGGLE_API)

In [None]:
import opendatasets as od
import pandas

# Fetch the Abalone dataset from Kaggle.
# NOTE(review): presumably this creates ./abalone-dataset, which the next cell reads — confirm.
od.download("https://www.kaggle.com/datasets/rodolfomendes/abalone-dataset")

In [None]:
import pandas as pd

# Location of the CSV, relative to the notebook's working directory.
data_folder = "abalone-dataset"
data_path = f"{data_folder}/abalone.csv"

# Load the raw dataset; `df` is the input to the preprocessing step further down.
df = pd.read_csv(data_path)

In [None]:
# Display the loaded DataFrame for a quick visual check of columns and values.
df

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from typing import Tuple
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np


def preprocess_dataframe(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Preprocess the given pandas DataFrame.

    This function separates the features from the target column, imputes
    missing values, one-hot encodes categorical features, and standardizes
    numerical features.

    Parameters:
    ----------
    df : pd.DataFrame
        The input DataFrame containing features and the target variable.
    target_column : str
        The name of the target variable in the DataFrame.

    Returns:
    -------
    tuple
        A tuple containing:
        - X: pd.DataFrame of preprocessed features. It keeps the row index of
          ``df`` so that it stays aligned with ``y``; its columns are labelled
          with positional integers.
        - y: pd.Series of target variable (returned unchanged).

    Notes:
    -----
    The imputers, scaler and encoder are fitted on every row of ``df``.
    Only columns of dtype ``object``, ``category`` or numeric are passed to
    the transformers.
    """

    # separate features and target variable
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    numerical_cols = X.select_dtypes(include=["number"]).columns.tolist()

    # preprocessing pipeline for numerical features
    numerical_pipeline = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())])

    # preprocessing pipeline for categorical features
    categorical_pipeline = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))]
    )

    # sparse_threshold=0 forces a dense array: the one-hot encoder emits a
    # sparse matrix, and a sparse stacked result could not be wrapped in a
    # regular DataFrame below.
    preprocessor = ColumnTransformer(
        transformers=[("num", numerical_pipeline, numerical_cols), ("cat", categorical_pipeline, categorical_cols)],
        sparse_threshold=0,
    )

    # fit and transform the features
    X_processed = preprocessor.fit_transform(X)

    # Reuse the original row index so X and y remain aligned even when `df`
    # does not have a default 0..n-1 index (e.g. after filtering rows).
    return pd.DataFrame(X_processed, index=X.index), y


def split_data(
    X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Split the features and target variable into training and testing sets.

    This function divides the dataset into training and testing subsets based on the specified test size.

    Parameters:
    ----------
    X : pd.DataFrame
        The feature DataFrame to be split.
    y : pd.Series
        The target variable to be split.
    test_size : float, optional
        Proportion of the dataset to include in the test split. Default is 0.2 (20%).
    random_state : int, optional
        Random seed for reproducibility. Default is 42.

    Returns:
    -------
    tuple
        A tuple containing:
        - X_train: pd.DataFrame of training features.
        - X_test: pd.DataFrame of testing features.
        - y_train: pd.Series of training target variable.
        - y_test: pd.Series of testing target variable.
    """

    # Unpack explicitly so the function returns a tuple, as documented.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test


from sklearn.ensemble import RandomForestRegressor


def fit_random_forest(
    X_train: pd.DataFrame, y_train: pd.Series, n_estimators: int = 100, random_state: int = 42
) -> RandomForestRegressor:
    """
    Train a Random Forest regressor on the given training set.

    Parameters:
    ----------
    X_train : pd.DataFrame
        Feature matrix used for training.
    y_train : pd.Series
        Target values matching the rows of ``X_train``.
    n_estimators : int, optional
        How many trees the forest contains. Default is 100.
    random_state : int, optional
        Seed that makes the training reproducible. Default is 42.

    Returns:
    -------
    RandomForestRegressor
        The regressor after fitting.
    """

    forest = RandomForestRegressor(random_state=random_state, n_estimators=n_estimators)

    # fit() returns the estimator itself, so the fitted forest can be returned directly.
    return forest.fit(X_train, y_train)


def predict(model: RandomForestRegressor, X_new: pd.DataFrame) -> np.ndarray:
    """
    Make predictions using the fitted Random Forest regression model.

    Parameters:
    ----------
    model : RandomForestRegressor
        The fitted Random Forest regression model.
    X_new : pd.DataFrame
        The new feature data for which predictions are to be made.

    Returns:
    -------
    np.ndarray
        The predicted values for the new data, one per row of ``X_new`` and in
        the same row order. The result is a plain NumPy array and does not
        carry the index of ``X_new``.
    """

    return model.predict(X_new)


def evaluate_rmse(y_true: pd.Series, y_pred: pd.Series) -> float:
    """
    Compute the Root Mean Squared Error (RMSE) between targets and predictions.

    Parameters:
    ----------
    y_true : pd.Series
        Ground-truth values of the target variable.
    y_pred : pd.Series
        Values predicted by the model.

    Returns:
    -------
    float
        The RMSE, i.e. the square root of the mean squared error.
    """

    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# End-to-end run: preprocess, split, fit, predict, and score on the held-out split.
# NOTE(review): preprocess_dataframe fits its imputers, scaler and encoder on the
# full DataFrame before split_data runs, so the test rows contribute to the
# preprocessing statistics.
X, y = preprocess_dataframe(df, target_column="Rings")
X_train, X_test, y_train, y_test = split_data(X=X, y=y)

model = fit_random_forest(X_train, y_train)
y_pred = predict(model, X_test)

# RMSE is computed on the test split only.
rmse_value = evaluate_rmse(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse_value}")

In [None]:
import mlflow
from mlflow import MlflowClient

# Create an MLflow client with no explicit arguments.
# NOTE(review): presumably this falls back to MLflow's default tracking location — confirm.
client = MlflowClient()