In [1]:
import os
import mlflow
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from typing import List

# I. Load Data

In [2]:
def load_data(root_path: str, data_name: str):
    """Read a CSV file into a DataFrame.

    Args:
        root_path: Directory that contains the file.
        data_name: File name (with extension) inside ``root_path``.

    Returns:
        The parsed contents of the CSV as a ``pandas.DataFrame``.
    """
    csv_path = os.path.join(root_path, data_name)
    frame = pd.read_csv(csv_path)
    return frame


# Location of the dataset; the relative path is resolved against the
# kernel's current working directory.
data_root = "../data"
training_data_name = "abalone.csv"

train_df = load_data(root_path=data_root, data_name=training_data_name)
train_df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


# II. Preprocess Data

In [3]:
# Columns that hold string categories and must be turned into integers
# before they can be fed to the regression pipeline.
CATEGORICAL_COLS = ["Sex"]


def encode_categorical_cols(
    df: pd.DataFrame, categorical_cols: List[str] = None
) -> pd.DataFrame:
    """Label-encode the categorical columns of ``df`` and return a new frame.

    The input frame is left untouched: encoding is applied to a copy, so
    re-running a cell that calls this function always starts from the same
    raw data.

    Args:
        df: Frame containing the columns to encode.
        categorical_cols: Names of the columns to encode. Defaults to
            ``CATEGORICAL_COLS`` when ``None``.

    Returns:
        A copy of ``df`` in which every listed column has been replaced by
        the integer codes produced by ``LabelEncoder.fit_transform``.
    """
    if categorical_cols is None:
        categorical_cols = CATEGORICAL_COLS

    # Copy first so the caller's DataFrame is never mutated in place.
    encoded = df.copy()
    for col in categorical_cols:
        # One encoder per column keeps each column's fitted classes separate.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded


def extract_x_y(
    df: pd.DataFrame, categorical_cols: List[str] = None
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Encode categoricals, separate features from target, and split the data.

    Args:
        df: Frame that contains the feature columns and the ``"Rings"``
            target column.
        categorical_cols: Columns to label-encode; forwarded unchanged to
            ``encode_categorical_cols`` (``None`` selects its default).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with a
        fixed ``random_state`` of 1024, so the split is reproducible.
    """
    # Forward the caller's column choice instead of silently ignoring it.
    df = encode_categorical_cols(df, categorical_cols)

    # NOTE(review): only "Rings" is dropped, so every other column — including
    # a persisted index column such as "Unnamed: 0" — is kept as a feature.
    # Confirm that this is intended.
    X, y = df.drop("Rings", axis=1), df["Rings"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1024
    )

    return X_train, X_test, y_train, y_test

In [4]:
# Hand over a copy so the raw frame displayed above stays exactly as loaded,
# whatever the preprocessing does to its input.
X_train, X_test, y_train, y_test = extract_x_y(train_df.copy())

# III. Train Model

In [5]:
def train_model(X_train: np.ndarray, y_train: np.ndarray):
    """Fit a scaling + linear-regression pipeline on the training data.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The fitted ``Pipeline`` with the steps ``"scaler"``
        (``StandardScaler``) and ``"regressor"`` (``LinearRegression``).
    """
    steps = [
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
    # ``fit`` returns the pipeline itself, so it can be returned directly.
    fitted_pipeline = Pipeline(steps).fit(X_train, y_train)
    return fitted_pipeline

In [6]:
# Fit the pipeline on the training split produced above.
model = train_model(X_train=X_train, y_train=y_train)

# IV. Evaluate Model

In [7]:
def predict_age(input_data: np.ndarray, pipeline: Pipeline):
    """Run a fitted pipeline on ``input_data`` and return its predictions.

    Args:
        input_data: Feature matrix to predict on.
        pipeline: Fitted pipeline whose ``predict`` method is called.

    Returns:
        Whatever ``pipeline.predict(input_data)`` returns.
    """
    predictions = pipeline.predict(input_data)
    return predictions


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Score predictions against the ground truth with RMSE.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.

    Returns:
        The root mean squared error between ``y_true`` and ``y_pred``.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    return rmse

In [8]:
# RMSE on the training split: predict on the data the model was fitted on,
# then compare against the known targets.
train_prediction = predict_age(input_data=X_train, pipeline=model)
train_rmse = evaluate_model(y_true=y_train, y_pred=train_prediction)
train_rmse

2.2359269797432693

In [9]:
# RMSE on the held-out split, which the model did not see during fitting.
test_prediction = predict_age(input_data=X_test, pipeline=model)
test_rmse = evaluate_model(y_true=y_test, y_pred=test_prediction)
test_rmse

2.1383990429352444

# V. Log Model and Metrics to MLflow

In [10]:
# Set the experiment name
# NOTE(review): "abandon-1" looks like a typo for "abalone-1". It is left
# unchanged so new runs keep landing in the existing experiment — confirm
# before renaming.
mlflow.set_experiment("abandon-1")

# Single source of truth for where the model is stored inside the run, so the
# logging call and the registry URI below cannot drift apart.
MODEL_ARTIFACT_PATH = "linear_regression_pipeline"

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    tags = {
        "model_type": "linear_regression",
        "developer": "Jazminemei",
        "experiment_type": "baseline",
        "dataset": "abalone",
        "message": "happy international developer's day!",
    }
    mlflow.set_tags(tags)

    # Load data
    data_root_path = "../data"
    data_name = "abalone.csv"
    train_df = load_data(data_root_path, data_name)

    # Extract X and y
    X_train, X_test, y_train, y_test = extract_x_y(train_df)

    # Train model
    ppl = train_model(X_train, y_train)

    # Evaluate the pipeline trained in THIS run (`ppl`), not a model left over
    # in the kernel from an earlier cell, so the logged metrics describe the
    # logged model.
    train_prediction = predict_age(X_train, ppl)
    train_rmse = evaluate_model(y_train, train_prediction)

    # Evaluate model on test set
    test_prediction = predict_age(X_test, ppl)
    test_rmse = evaluate_model(y_test, test_prediction)

    # Log your model
    mlflow.sklearn.log_model(ppl, MODEL_ARTIFACT_PATH)

    # The values are RMSE (lower is better), so name the metrics accordingly.
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

    # Register your model in the MLflow model registry. The URI must point at
    # the artifact path the model was actually logged under.
    model_uri = f"runs:/{run_id}/{MODEL_ARTIFACT_PATH}"
    mlflow.register_model(model_uri, "AbaloneLR")

Registered model 'AbaloneLR' already exists. Creating a new version of this model...
2024/10/24 12:35:00 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AbaloneLR, version 4
Created version '4' of model 'AbaloneLR'.
