In [1]:
import warnings
from pathlib import Path

import wod_predictor
from wod_predictor.data_loader import DataLoader
from wod_predictor.modeling import RandomForestModel
from wod_predictor.models.hyperparam_search import HyperparamTuner, ParamRange
from wod_predictor.preprocessor import DataPreprocessor
from wod_predictor.splitter import DataSplitter

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Locate the "Data" directory that sits next to the `wod_predictor` package
# directory. Only the final path component is swapped: a blanket str.replace
# would also rewrite any parent directory that happens to be named
# "wod_predictor" (e.g. a checkout at .../wod_predictor/wod_predictor).
data_path = str(Path(wod_predictor.__path__[0]).parent / "Data")
loader = DataLoader(
    root_path=data_path,
    objects=["open_results", "descriptions", "benchmark_stats", "athlete_info"],
)
data = loader.load()

In [4]:
# Split the loaded data into train and test portions.
# NOTE(review): sample / test_ratio / test_filter presumably control the sample
# size, the held-out fraction and which workouts (names matching "23.*") go to
# the test set — confirm against DataSplitter.
splitter = DataSplitter(
    sample=20000,
    test_ratio=0.2,
    test_filter="23.*",
)
train_data, test_data = splitter.split(data)

In [5]:
# Central configuration: one 'preprocessing' section per data source (passed to
# DataPreprocessor) and a 'model' section (forwarded as keyword arguments to
# RandomForestModel). Each source gets its own scale_args dict, so changing one
# never affects the other.
all_configs = dict(
    preprocessing=dict(
        open_results=dict(
            scale_up=True,
            scale_args=dict(method="general", scaler_name="StandardScaler"),
        ),
        benchmark_stats=dict(
            remove_outliers=True,
            missing_method="zero",
            drop_missing_threshold=0.9,
            scale_args=dict(method="general", scaler_name="StandardScaler"),
        ),
        athlete_info=dict(),
    ),
    model=dict(
        n_estimators=10,  # Number of trees
    ),
)

In [6]:
# Build the preprocessor from the 'preprocessing' section and fit it on the
# training split only, so nothing is learned from the test split.
preprocessor = DataPreprocessor(
    config=all_configs["preprocessing"],
)
preprocessor.fit(data=train_data)

In [7]:
# Apply the already-fitted preprocessor to both splits (train first, then test).
# The results are indexed with 'X', 'y' and 'meta_data' further down.
train_preprocessed = preprocessor.transform(
    data=train_data,
)
test_preprocessed = preprocessor.transform(
    data=test_data,
)

Converted height to height in imperial units
Converted weight to weight in imperial units
Converted height to height in imperial units
Converted weight to weight in imperial units


In [8]:
# Baseline model: hyperparameters come straight from all_configs['model'].
rf_modeler = RandomForestModel(config={}, **all_configs["model"])
rf_modeler.fit(
    X=train_preprocessed["X"],
    y=train_preprocessed["y"],
)

In [9]:
# Predict on the test split, then report the results. The return value of
# predict() is discarded here.
# NOTE(review): show_results presumably reuses predictions stored on the model
# by predict() — confirm in RandomForestModel.
rf_modeler.predict(X=test_preprocessed["X"])
rf_modeler.show_results(
    y_test=test_preprocessed["y"],
    meta_data=test_preprocessed["meta_data"],
)

Mean Absolute Error: 0.52
Mean Absolute Percentage Error: 2.06


Unnamed: 0,y_test_mean,y_pred_mean,error_mean,error_percentage
23.1,185.562799,185.717542,7.917966,4.267001
23.1_scaled,235.666667,229.124071,26.852917,11.394448
23.2A,132.39885,132.353004,11.435362,8.637055
23.2B,207.339335,207.982797,18.574693,8.958596
23.3,25.174512,26.043498,7.291517,28.963888
23.3_scaled,10.894695,11.103557,2.061102,18.918399


## Hyperparameter tuning (Model Only)

In [10]:
# Attach the preprocessed splits to the shared config so the tuning objective
# can read them from config['data'].
# NOTE(review): the test split doubles as the validation set here, so the tuned
# hyperparameters are selected on the same data used for the final evaluation;
# consider carving a separate validation split out of the training data.
all_configs["data"] = dict(
    train_data=train_preprocessed,
    val_data=test_preprocessed,
)

In [11]:
from time import time
from copy import deepcopy
from sklearn.metrics import mean_absolute_error

def custom_objective(config):
    """Train a random forest with the given settings and score it on validation data.

    Args:
        config: Nested configuration dict. ``config['model']`` holds the keyword
            arguments forwarded to ``RandomForestModel``; ``config['data']``
            holds ``'train_data'`` and ``'val_data'``, each indexable by ``'X'``
            and ``'y'``.

    Returns:
        The mean absolute error of the model's predictions on the validation
        data (the metric the tuner optimizes).
    """
    # Copy only the small hyperparameter dict so the caller's config cannot be
    # altered through it. The datasets are read in place: deep-copying the full
    # train/validation data on every trial costs time and memory for nothing.
    # This assumes RandomForestModel.fit/predict do not modify their inputs,
    # which the baseline fit/predict cells of this notebook already rely on.
    model_params = deepcopy(config['model'])
    train_split = config['data']['train_data']
    val_split = config['data']['val_data']

    model = RandomForestModel(config={}, **model_params)
    model.fit(X=train_split['X'], y=train_split['y'])

    val_pred = model.predict(X=val_split['X'])
    return mean_absolute_error(val_split['y'], val_pred)

In [12]:
# Baseline: evaluate the objective with the untuned configuration
# (n_estimators=10) to get a reference score for the tuning trials.
custom_objective(all_configs)

0.5222696687181619

In [15]:
# Search space for the tuner: every entry is an integer hyperparameter stored
# under all_configs['model'][<name>], so the path is derived from the name.
# NOTE(review): log=True presumably requests log-scale sampling for
# n_estimators — confirm in ParamRange.
param_ranges = {
    name: ParamRange(param_type="int", path=["model", name], **bounds)
    for name, bounds in {
        "n_estimators": dict(low=10, high=500, log=True),
        "max_depth": dict(low=3, high=30),
        "min_samples_split": dict(low=2, high=20),
        "min_samples_leaf": dict(low=1, high=10),
    }.items()
}

In [16]:
# Run the search: 20 trials of custom_objective, starting from all_configs and
# varying the parameters in param_ranges. This cell is slow — in the recorded
# log the first trial alone took roughly two minutes.
tuner = HyperparamTuner(
    base_config=all_configs,
    param_ranges=param_ranges,
    objective_fn=custom_objective,
    n_trials=20,
)
final_config = tuner.optimize()

INFO:wod_predictor.models.hyperparam_search:Start hyperparameter optimization for 20 trials
[I 2024-11-09 10:22:27,480] A new study created in memory with name: no-name-6ec35b12-80db-4828-a5b6-8b077b84c532
[I 2024-11-09 10:24:16,309] Trial 0 finished with value: 0.4903056197324229 and parameters: {'n_estimators': 42, 'max_depth': 29, 'min_samples_split': 15, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.4903056197324229.
INFO:wod_predictor.models.hyperparam_search:Trial 0 finished with metric = 0.4903056197324229
[I 2024-11-09 10:24:32,122] Trial 1 finished with value: 0.5234392820636259 and parameters: {'n_estimators': 18, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.4903056197324229.
[I 2024-11-09 10:28:19,745] Trial 2 finished with value: 0.48725147192338364 and parameters: {'n_estimators': 103, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 2 with value: 0.48725147192338364.
[I 2024-11-09 10:3

In [None]:
# Show the model section of the configuration returned by the tuner.
# NOTE(review): the saved output below lists keys (max_features, bootstrap,
# ccp_alpha) that are not in the current param_ranges and the cell has no
# execution count — it looks stale; re-run after the tuning cell finishes.
final_config['model']

{'n_estimators': 10,
 'max_depth': 30,
 'min_samples_split': 17,
 'min_samples_leaf': 3,
 'max_features': None,
 'bootstrap': True,
 'ccp_alpha': 0.1955370866274525}