In [1]:
import warnings, logging, argparse
import pickle, yaml

import wod_predictor
from wod_predictor.data_loader import DataLoader
from wod_predictor.splitter import DataSplitter
from wod_predictor.preprocessor import DataPreprocessor 
from wod_predictor.modeling import RandomForestModel
from wod_predictor.hyperparam_search.tuner import HyperparamTuner, ParamRange
from wod_predictor.hyperparam_search import helpers
from wod_predictor.hyperparam_search.objectives import model_only_objective

from copy import deepcopy
from sklearn.metrics import mean_absolute_error

In [2]:
# Notebook-wide setup: configure root logging at INFO level and ignore all warnings
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings(action='ignore')

# Load the config file

In [3]:
config_path = 'configs/random_forest_model_only.yml'

# Decode explicitly as UTF-8 so the parsed config does not depend on the
# platform's default locale encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Experiment with the default settings defined in the config file

In [4]:
# Load the train/validation/test splits and fit the preprocessor on the training split
train, val, test = helpers.load_train_val_test_data()
preprocessor = helpers.fit_preprocessor(train, preprocessor_args=config["preprocessing"])

# Run every split through the same fitted preprocessor (train, then val, then test)
train_processed, val_processed, test_processed = (
    preprocessor.transform(split) for split in (train, val, test)
)

# Store the processed train/val splits on the config, which is later passed to the tuner
config['data'] = dict(train_data=train_processed, val_data=val_processed)

Converted height to height in imperial units
Converted weight to weight in imperial units
Converted height to height in imperial units
Converted weight to weight in imperial units
Converted height to height in imperial units
Converted weight to weight in imperial units


In [5]:
# Baseline: fit the model with the untuned arguments taken from the config file
model_name = config['model']['name']
model = helpers.fit_model(
    model_name,
    train_processed['X'], train_processed['y'],
    init_args=config['model']['init_args'],
    fit_args=config['model']['fit_args']
)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the values are unchanged, but the correct order keeps this safe if the
# metric is ever swapped for an asymmetric one.
val_preds = model.predict(val_processed['X'])
val_mae = mean_absolute_error(val_processed['y'], val_preds)
print(f'Default model val MAE: {val_mae}')
test_preds = model.predict(test_processed['X'])
test_mae = mean_absolute_error(test_processed['y'], test_preds)
print(f'Default model test MAE: {test_mae}')

Default model val MAE: 0.6052235072696969
Default model test MAE: 0.5975699190558466


# Perform the hyperparameter tuning experiment

In [6]:
def start_study(config, objective_fn):
    """Run a hyperparameter search described by ``config``.

    Args:
        config: Experiment configuration. ``config['tune_params']`` maps each
            parameter name to the keyword arguments of its ``ParamRange``, and
            ``config['study']['tuner_args']['n_trials']`` gives the number of
            trials. The whole mapping is also handed to the tuner as its
            base config.
        objective_fn: Objective passed through to ``HyperparamTuner``.

    Returns:
        A ``(tuner, final_config)`` pair, where ``final_config`` is whatever
        ``tuner.optimize()`` returns.
    """
    search_space = {
        param_name: ParamRange(**range_kwargs)
        for param_name, range_kwargs in config['tune_params'].items()
    }
    trial_budget = config['study']['tuner_args']['n_trials']

    tuner = HyperparamTuner(
        objective_fn=objective_fn,
        param_ranges=search_space,
        base_config=config,
        n_trials=trial_budget,
    )
    return tuner, tuner.optimize()

In [7]:
# Run the search; the trial count is read from config['study']['tuner_args'] inside start_study
tuner, final_config = start_study(config, model_only_objective)

# Refit on the training split with the arguments from the tuner's final config.
# NOTE: this rebinds `model`, replacing the default-settings model fitted earlier.
model = helpers.fit_model(
    model_name,
    train_processed['X'],
    train_processed['y'],
    init_args=final_config['model']['init_args'],
    fit_args=final_config['model']['fit_args'],
)

INFO:wod_predictor.hyperparam_search.tuner:Start hyperparameter optimization for 20 trials
[I 2024-12-03 14:28:36,345] A new study created in memory with name: no-name-73237a10-ca08-41ad-89e7-cac43ceb5f66
[I 2024-12-03 14:31:14,512] Trial 0 finished with value: 0.5842928553942847 and parameters: {'n_estimators': 193, 'max_depth': 29, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.5842928553942847.
[I 2024-12-03 14:31:46,143] Trial 1 finished with value: 0.5811097780431514 and parameters: {'n_estimators': 86, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.5811097780431514.
[I 2024-12-03 14:35:35,853] Trial 2 finished with value: 0.5822721873950194 and parameters: {'n_estimators': 305, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.5811097780431514.
[I 2024-12-03 14:38:28,440] Trial 3 finished with value: 0.5734311908622921 and parameters: {'n_estimators': 418, 

In [11]:
# Evaluate the refitted (tuned) model on the validation and test splits.
# scikit-learn metrics take (y_true, y_pred) in that order; MAE is symmetric,
# so the reported values are unchanged.
val_preds = model.predict(val_processed['X'])
val_mae = mean_absolute_error(val_processed['y'], val_preds)
print(f'Best model val MAE: {val_mae}')
test_preds = model.predict(test_processed['X'])
new_test_mae = mean_absolute_error(test_processed['y'], test_preds)
print(f'Best model test MAE: {new_test_mae}')

Best model val MAE: 0.5649113263930734
Best model test MAE: 0.5705556526158794


In [13]:
def percentage_change(before, after):
    """Return the symmetric relative difference between two values.

    The difference ``before - after`` is divided by the mean of the two
    values, ``(before + after) / 2``, not by ``before`` alone. The result is a
    fraction (multiply by 100 for a percentage). For positive inputs it is
    positive when ``after`` is smaller than ``before``.

    Args:
        before: Reference value (e.g. the metric before a change).
        after: Value to compare against ``before``.

    Returns:
        ``(before - after) * 2 / (before + after)``.

    Raises:
        ValueError: If either value is 0, or if the two values sum to 0
            (which would otherwise surface as a ``ZeroDivisionError``).
    """
    if before == 0 or after == 0:
        raise ValueError("Percentage change is undefined for value 0.")
    total = before + after
    if total == 0:
        # Opposite values such as 1 and -1 have a zero mean, so the ratio is undefined.
        raise ValueError("Percentage change is undefined when the values sum to 0.")
    return (before - after) * 2 / total

In [14]:
# percentage_change is positive when the tuned test MAE is lower than the default one.
# Keep the sign so a worse tuned model is not reported as an improvement.
test_mae_change = 100 * percentage_change(test_mae, new_test_mae)
outcome = "improvement" if test_mae_change >= 0 else "regression"
print(f"We have {abs(test_mae_change):.3f}% {outcome} after tuning hypereparamters.")

We have 4.625% improvement after tuning hypereparamters.


In [9]:
# Save the results (optional)
import os

# Single source of truth for the output location: the directory that is
# created is the same one handed to the tuner.
save_path = 'modeling/kenneth/results/random_forest'
os.makedirs(save_path, exist_ok=True)
tuner.save_best_state(save_path)