In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [45]:
import sys
import os

# Locate the project's `config` folder, assuming the kernel was started in the
# notebook directory and `config` is a sibling of that directory.
notebook_dir = os.getcwd()
config_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir, 'config'))
print(config_dir)

# Make modules inside `config` importable; the membership check keeps
# re-running this cell from adding duplicate entries to sys.path.
if config_dir not in sys.path:
    sys.path.append(config_dir)

/Users/laixinyu/xhec-mlops-project-student-expo/config


In [46]:
#%load_ext autoreload
#%autoreload 
import pandas as pd
pd.set_option('display.max_columns', 500)
# pd.options.plotting.backend = "plotly"
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics



#ROOT_PATH = Path(__file__).parent.parent
#OBJECTS_PATH = ROOT_PATH / "src/web_service/local_objects"
#TARGET = "rings"

In [23]:
# Load the raw abalone dataset. The relative path is resolved against the
# kernel's working directory (assumed to be the notebook folder).
df = pd.read_csv("../data/abalone.csv")


# EDA

In [25]:
# Schema overview: column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [26]:
# Preview the first rows to sanity-check the parsed values.
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Modeling

## Preprocessing

In [31]:
# Normalise column names to snake_case (e.g. "Whole weight" -> "whole_weight").
# NOTE: this renames the columns of `df` in place.
df.columns = [col.replace(" ", "_").lower() for col in df.columns]

# One-hot encode the object-dtype (categorical) columns. With
# handle_unknown='ignore', categories unseen at fit time encode as all zeros.
# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/test split; fit it on the training split only if the fitted encoder
# is meant to be reused on new data.
encoder = OneHotEncoder(handle_unknown='ignore')
df_cat = encoder.fit_transform(df.select_dtypes(include='object')).toarray()
df_num = df.select_dtypes(include='number')

# Build the one-hot frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex would misalign rows (and introduce NaNs)
# whenever `df` does not carry the default 0..n-1 index.
df_onehot = pd.DataFrame(
    df_cat,
    # Category labels of the first (here: only) categorical column.
    columns=encoder.categories_[0].tolist(),
    index=df.index,
)
data = pd.concat([df_num, df_onehot], axis=1)

# Features are every column except the target; the target is the ring count.
X = data.drop('rings', axis=1)
y = data['rings']

## Train/test split

In [33]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training

In [35]:
# Fit a linear regression on the training split.
# TODO: track this run with MLflow (parameters, metrics, model), as asked for
# in the guidelines at the top of this notebook.
model = LinearRegression()
model.fit(X_train, y_train)

## Evaluation

In [47]:
# Predict the target (`rings`) for the held-out test features.
y_pred = model.predict(X_test)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Computes ``mean(|y_true - y_pred| / |y_true|)``. The result is a ratio
    (0.16 means 16%), not a percentage.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        The mean of the per-sample relative absolute errors.

    Notes
    -----
    The denominator is clamped to machine epsilon so that zero targets do not
    produce NaN/inf: a perfectly predicted zero contributes 0, while a
    mispredicted zero contributes a very large but finite value. For non-zero
    targets the result is unchanged by the clamp.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Guard against division by zero when a target value is exactly 0.
    eps = np.finfo(np.float64).eps
    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator)

# Compute every test-set metric first, then print them in one pass.
mse = metrics.mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
report = [
    ('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred)),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', np.sqrt(mse)),
    ('Mean Absolute Percentage Error (MAPE):', mape),
]
for label, value in report:
    print(label, value)

Mean Absolute Error: 1.5923295454545454
Mean Squared Error: 4.883941942424865
Root Mean Squared Error: 2.2099642400783015
Mean Absolute Percentage Error (MAPE): 0.16111571278085793
