# Optimizing White Abalone Age Prediction
The white abalone (Haliotis sorenseni), is an endangered marine mollusk found along the Pacific coast of North America. The purpose of this experiment is to train multiple Machine Learning algorithms to predict ring count. From the abalone.csv file, three feature varibles and one Target variable will be utlized. Random Forest Regressor, Linear Regression, and XGBoost models will be used. 

In [12]:
# Install and Import required packages and libraries. 
%pip install pandas matplotlib seaborn scikit-learn numpy xgboost optuna
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

In [14]:
# Load the Abalone Dataset, display Column information, and then provide the first five rows.
abalone_df = pd.read_csv('abalone.csv')
abalone_df.info()
abalone_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 277.4+ KB


Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Preprocessing Pipeline
Predictor features (length, diameter, and height) for modeling are expanded upon using polynomial combinations (only interaction terms) and then scaled.

In [15]:
# Predictor features are selected out of the nine avaible features.
features = ['length', 'diameter', 'height']
target = 'rings'

# Create polynominal combinations.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(abalone_df[features])

# Scale and transform polynominal combinations.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

y = abalone_df[target]

# Split the Data

In [16]:
# Split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Random Forest Regressor

## Hyperparameter Tuning and Optuna

In [17]:
# Hyperparameter tuning for Random Forest using Optuna.
def rf_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    score = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2').mean()
    return score

# Optuna creates and estimates the best parameters over twenty trials.
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(rf_objective, n_trials=20)
rf_best_params = study_rf.best_params

[I 2025-03-20 15:06:14,795] A new study created in memory with name: no-name-89572b0e-3e4b-41f0-94dc-bb2dc7410693
[I 2025-03-20 15:06:42,927] Trial 0 finished with value: 0.3692589290861622 and parameters: {'n_estimators': 240, 'max_depth': 8}. Best is trial 0 with value: 0.3692589290861622.
[I 2025-03-20 15:07:03,828] Trial 1 finished with value: 0.36880519818739044 and parameters: {'n_estimators': 182, 'max_depth': 8}. Best is trial 0 with value: 0.3692589290861622.
[I 2025-03-20 15:07:53,516] Trial 2 finished with value: 0.2675455687285755 and parameters: {'n_estimators': 276, 'max_depth': 17}. Best is trial 0 with value: 0.3692589290861622.
[I 2025-03-20 15:08:16,125] Trial 3 finished with value: 0.2609383159259998 and parameters: {'n_estimators': 124, 'max_depth': 18}. Best is trial 0 with value: 0.3692589290861622.
[I 2025-03-20 15:08:40,943] Trial 4 finished with value: 0.3576829169161101 and parameters: {'n_estimators': 195, 'max_depth': 9}. Best is trial 0 with value: 0.369258

## Train, Fit, Predict, Score

In [18]:
# Train Optimized Random Forest model.
rf = RandomForestRegressor(**rf_best_params, random_state=42)

# Fit to traning data set.
rf.fit(X_train, y_train)

# Calculate predictions on test data set.
y_pred_rf = rf.predict(X_test)

# Score and calculate errors.
rf_r2 = r2_score(y_test, y_pred_rf)
rf_rmse = root_mean_squared_error(y_test, y_pred_rf)
rf_mae = mean_absolute_error(y_test, y_pred_rf)

# XgBoost

## Hyperparameter Tuning and Optuna

In [19]:
# Hyperparameter tuning for XGBoost using Optuna.
def xgb_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    xgb = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42)
    score = cross_val_score(xgb, X_train, y_train, cv=5, scoring='r2').mean()
    return score

# Optuna creates and estimates the best parameters over twenty trials.
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(xgb_objective, n_trials=20)
xgb_best_params = study_xgb.best_params

[I 2025-03-20 15:12:46,767] A new study created in memory with name: no-name-d587c44f-6d99-45ef-86ca-9bce7c22b6c5
[I 2025-03-20 15:12:49,312] Trial 0 finished with value: 0.2947280764579773 and parameters: {'n_estimators': 213, 'learning_rate': 0.24570384162131917, 'max_depth': 3}. Best is trial 0 with value: 0.2947280764579773.
[I 2025-03-20 15:13:14,378] Trial 1 finished with value: 0.04222288131713867 and parameters: {'n_estimators': 120, 'learning_rate': 0.28031723104656975, 'max_depth': 19}. Best is trial 0 with value: 0.2947280764579773.
[I 2025-03-20 15:13:17,849] Trial 2 finished with value: 0.17820729017257692 and parameters: {'n_estimators': 217, 'learning_rate': 0.2814914348527951, 'max_depth': 5}. Best is trial 0 with value: 0.2947280764579773.
[I 2025-03-20 15:13:37,511] Trial 3 finished with value: 0.08764967918395997 and parameters: {'n_estimators': 216, 'learning_rate': 0.2618701221794845, 'max_depth': 12}. Best is trial 0 with value: 0.2947280764579773.
[I 2025-03-20 1

## Train, Fit, Predict, Score

In [20]:
# Train optimized XGBoost model.
xgb = XGBRegressor(**xgb_best_params, random_state=42)

# Fit to traning data set.
xgb.fit(X_train, y_train)

# Calculate predictions on test data set.
y_pred_xgb = xgb.predict(X_test)

# Score and calculate errors.
xgb_r2 = r2_score(y_test, y_pred_xgb)
xgb_rmse = root_mean_squared_error(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)

# Linear Regression

## Train, Fit, Predict, Score

In [21]:
# Train optimized Linear Regression model with polynomial features.
lr = LinearRegression()

# Fit to training data set.
lr.fit(X_train, y_train)

# Calculate prediction on test data set.
y_pred_lr = lr.predict(X_test)

# Score and calculate errors.
lr_r2 = r2_score(y_test, y_pred_lr)
lr_rmse = root_mean_squared_error(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)

# Results

In [22]:
# Output results in a table. 
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'R² Score': [lr_r2, rf_r2, xgb_r2],
    'RMSE': [lr_rmse, rf_rmse, xgb_rmse],
    'MAE': [lr_mae, rf_mae, xgb_mae]
})
print(results)

               Model  R² Score      RMSE       MAE
0  Linear Regression  0.377189  2.596549  1.863254
1      Random Forest  0.372607  2.606081  1.860613
2            XGBoost  0.380125  2.590419  1.846835
