In [None]:
import pickle
import random

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error,
    mean_absolute_percentage_error, mean_squared_log_error
)
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
# 0) Reproducibility
# A single seed value feeds every random number source used in this notebook.
random_seed = 42
random.seed(random_seed)                    # Python's built-in RNG
np.random.seed(random_seed)                 # NumPy's legacy global RNG
rng = np.random.default_rng(random_seed)    # standalone Generator, passed to fmin as rstate

In [None]:
# 1) Data load
# NOTE: change the path to your actual CSV

Data = pd.read_csv(
    'C:/Users/hangang/Desktop/Group 2_data_tra_uvvis.csv',
    encoding='utf-8',
)
# The 'Chl-a' column is the regression target; every other column is a feature.
y = Data['Chl-a']
X = Data.drop(columns='Chl-a')

In [None]:
# 2) Train/Test Split
# Hold out 30% of the rows for testing; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_seed,
)

In [None]:
# 3) Metrics Helper
def calculate_metrics(y_true, y_pred):
    """Compute the regression metrics reported by this notebook.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, rmse, mae, mse, mape, rmsle)`` in that order. ``rmsle`` is
        ``nan`` when scikit-learn rejects the values for the log error
        (for example, negative targets or predictions).
    """
    r2 = r2_score(y_true, y_pred)
    # MSE is computed once; RMSE is derived from it instead of a second pass.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    try:
        rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError:
        # Shape or type problems would normally have raised in the metrics
        # above, so a ValueError here means the values are outside the domain
        # of the log error. Report NaN for this one metric rather than losing
        # the other five.
        rmsle = float('nan')
    return r2, rmse, mae, mse, mape, rmsle

In [None]:
# 4) Search Space
# Candidate feature scalers; hp.choice selects one of these instances per trial.
scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

# Random-forest hyperparameters are sampled with hp.quniform, each described
# below as (low, high, step).
space = {
    'scaler': hp.choice('scaler', scalers),
    **{
        label: hp.quniform(label, low, high, step)
        for label, (low, high, step) in (
            ('n_estimators', (50, 100, 2)),
            ('max_depth', (3, 20, 1)),
            ('min_samples_split', (2, 10, 1)),
            ('min_samples_leaf', (1, 10, 1)),
        )
    },
}

In [None]:
# 5) Objective for TPE
# (KFold CV with the scaler fitted inside each fold)

def objective(params):
    """Hyperopt objective: mean 3-fold CV MSE of a scaler + random forest.

    Parameters
    ----------
    params : dict
        One sample drawn from ``space``: a scaler instance under ``'scaler'``
        and numeric values under ``'n_estimators'``, ``'max_depth'``,
        ``'min_samples_split'`` and ``'min_samples_leaf'``. The numeric values
        are cast to ``int`` before being handed to the forest.

    Returns
    -------
    dict
        ``{'loss': <mean cross-validated MSE>, 'status': STATUS_OK}``.
    """
    model = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        random_state=random_seed,
        n_jobs=-1,
    )

    # Scaler and forest are cross-validated as ONE estimator, so the scaler is
    # re-fitted on each fold's training part only. Fitting it on all of X_train
    # before splitting would leak validation-fold statistics into the score.
    # clone() leaves the scaler instance held by the search space untouched,
    # so trials do not share fitted state.
    pipeline = make_pipeline(clone(params['scaler']), model)

    kf = KFold(n_splits=3, shuffle=True, random_state=random_seed)
    cv_scores = cross_val_score(
        pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error'
    )
    # The scorer returns negated MSE (higher is better); flip the sign so
    # hyperopt minimises a positive loss.
    mean_mse = -float(np.mean(cv_scores))
    return {'loss': mean_mse, 'status': STATUS_OK}

In [None]:
# 6) Hyperparameter Optimization

# Run 200 TPE evaluations of the objective; `trials` keeps the full history.
trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
    rstate=rng,
)

# Map fmin's raw result back onto the search space to recover actual values
# (for the hp.choice entry, the chosen scaler object).
best_params = space_eval(space, best)
best_scaler = best_params['scaler']

# The sampled forest settings are cast to int before use.
(
    best_n_estimators,
    best_max_depth,
    best_min_samples_split,
    best_min_samples_leaf,
) = (
    int(best_params[key])
    for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
)

In [None]:
# 7) Train Best Model on Train Set

# Fit the selected scaler on the training features only, then apply that same
# fitted transform to both partitions.
best_scaler.fit(X_train)
X_train_scaled = best_scaler.transform(X_train)
X_test_scaled = best_scaler.transform(X_test)

# Rebuild the forest with the winning hyperparameters and the shared seed.
best_model = RandomForestRegressor(
    random_state=random_seed,
    n_jobs=-1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
)
best_model.fit(X_train_scaled, y_train)

In [None]:
# 8) Evaluate

# Predict on both partitions with the fitted forest.
y_pred_train, y_pred_test = (
    best_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# calculate_metrics returns (r2, rmse, mae, mse, mape, rmsle).
(r2_train, rmse_train, mae_train,
 mse_train, mape_train, rmsle_train) = calculate_metrics(y_train, y_pred_train)
(r2_test, rmse_test, mae_test,
 mse_test, mape_test, rmsle_test) = calculate_metrics(y_test, y_pred_test)

# Report the selected configuration.
print(
    f"\n[Best Scaler] {type(best_scaler).__name__}",
    "[Best Hyperparameters]",
    f"  n_estimators      : {best_n_estimators}",
    f"  max_depth         : {best_max_depth}",
    f"  min_samples_split : {best_min_samples_split}",
    f"  min_samples_leaf  : {best_min_samples_leaf}",
    sep="\n",
)

# Report train and test scores one after the other for comparison.
print(
    "\n[Train Metrics]",
    f"  R2: {r2_train:.4f} | RMSE: {rmse_train:.4f} | MAE: {mae_train:.4f} | MSE: {mse_train:.4f} | MAPE: {mape_train:.4f} | RMSLE: {rmsle_train:.4f}",
    sep="\n",
)
print(
    "\n[Test Metrics]",
    f"  R2: {r2_test:.4f}  | RMSE: {rmse_test:.4f}  | MAE: {mae_test:.4f}  | MSE: {mse_test:.4f}  | MAPE: {mape_test:.4f}  | RMSLE: {rmsle_test:.4f}",
    sep="\n",
)

In [None]:
# 9) Save Model (Change path)

model_save_path = r"C:/Users/hangang/Desktop/best_RF_model_tra_uvvis.pkl"

# One pickle holds the model, the scaler, the feature column names, the seed
# and the tuned hyperparameters.
with open(model_save_path, 'wb') as f:
    pickle.dump(
        dict(
            model=best_model,
            scaler=best_scaler,
            features=X.columns.tolist(),
            random_seed=random_seed,
            best_params=dict(
                n_estimators=best_n_estimators,
                max_depth=best_max_depth,
                min_samples_split=best_min_samples_split,
                min_samples_leaf=best_min_samples_leaf,
            ),
        ),
        f,
    )

print(f"\nModel saved to: {model_save_path}")