In [1]:
from sklearn.model_selection import cross_val_score, train_test_split

from fowt_ml.datasets import convert_mat_to_df
from fowt_ml.ensemble import EnsembleModel

# Fit a Random Forest Estimator

Load the dataset and add the (constant) wind speed as an extra feature column:

In [2]:
# Relative path to the experiment's MATLAB (.mat) file.
data_path = "../../data/exp699_032024_TUDelft/exp699.mat"

# Convert the file content into a table with named columns.
# NOTE(review): "exp699" is presumably the name of the record to read inside
# the .mat file - confirm against fowt_ml.datasets.convert_mat_to_df.
data = convert_mat_to_df(data_path, "exp699")

# Add the wind speed as a constant-valued column so that it can be selected
# as a feature. NOTE(review): the unit of this value is not stated here - confirm.
data["wind_speed"] = 4.0

Define targets and predictors, then split them into train and test data:


In [3]:

# Columns the model has to predict. Every name has the form
# "<signal>[<component index>]"; two signals have 3 components and two have 6,
# giving 18 target columns in total.
target_labels = [
    *(f"acc_tb_meas3[{idx}]" for idx in range(3)),
    *(f"acc_tt_meas3[{idx}]" for idx in range(3)),
    *(f"force_aero_est6[{idx}]" for idx in range(6)),
    *(f"force_tt_meas6[{idx}]" for idx in range(6)),
]

# Columns used as model inputs: the six "pos_act6" components followed by the
# "spd_rot_act" and "wind_speed" columns.
predictor_labels = [
    *(f"pos_act6[{idx}]" for idx in range(6)),
    "spd_rot_act",
    "wind_speed",
]

# Select the predictor and target columns from the full table.
X = data[predictor_labels]
Y = data[target_labels]

# Hold out 25% of the rows for testing; the fixed random_state makes the split
# reproducible. train_test_split shuffles the rows by default, so shuffle=True
# below only spells out what already happens.
# NOTE(review): if the rows are consecutive time samples, a shuffled split puts
# neighbouring samples in both sets - confirm this is the intended evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=123,
    shuffle=True,
)

Let's instantiate a random forest model, setting a few of its hyperparameters:

In [4]:
# Random forest wrapped in the project's EnsembleModel.
# NOTE(review): the keyword arguments look like scikit-learn random-forest
# hyperparameters and are presumably forwarded to the underlying estimator -
# confirm in fowt_ml.ensemble.
model = EnsembleModel(
    estimator="RandomForest",
    max_depth=9,
    bootstrap=True,
    max_samples=10_000,
    n_estimators=50,
)

We estimate the model performance with cross-validation (CV). By default, scikit-learn runs k-fold CV with `k=5`:

In [5]:
%%time
scores = cross_val_score(model.estimator, X_train, Y_train, scoring="neg_root_mean_squared_error")
len(scores), scores.mean(), scores.std()

CPU times: user 22.2 s, sys: 529 ms, total: 22.7 s
Wall time: 23.5 s


(5, -3.4828061390322005, 0.005965294435464364)

We do the same but using out-of-bag samples to estimate the generalization score (which should be cheaper):

In [6]:
%%time
score = model.oob_score(X_train, Y_train, scoring="neg_root_mean_squared_error")
score



CPU times: user 7.63 s, sys: 646 ms, total: 8.28 s
Wall time: 8.6 s


-3.480819620430942

Finally, we train the model on the full training dataset and get one or more scores on the test set:

In [7]:
# Train on the training split and report each requested metric on the test
# split; the result is displayed as the cell output.
model.calculate_score(
    X_train,
    Y_train,
    X_test,
    Y_test,
    scoring=["neg_root_mean_squared_error", "r2"],
)

{'neg_root_mean_squared_error': -3.48854898913557, 'r2': 0.12065774081363125}

Wrap the CV and OOB scoring in a function so that we can compare random forests with extremely randomized trees:

In [8]:
def run_cross_val_score(estimator, X, y, scoring, **kwargs):
    """Print the cross-validation and out-of-bag scores of an ensemble model.

    A new ``EnsembleModel`` is built from ``estimator`` and ``kwargs``. Its
    ``estimator`` attribute is scored with ``cross_val_score`` and the model's
    own ``oob_score`` method is evaluated on the same data.

    Args:
        estimator: Estimator identifier passed to ``EnsembleModel``
            (e.g. ``"RandomForest"`` or ``"ExtraTrees"``).
        X: Predictor data.
        y: Target data.
        scoring: Scoring name handed to both scoring calls.
        **kwargs: Extra keyword arguments forwarded to ``EnsembleModel``.

    Returns:
        None. The mean CV score and the OOB score are printed on one line.
    """
    ensemble = EnsembleModel(estimator, **kwargs)
    fold_scores = cross_val_score(ensemble.estimator, X, y, scoring=scoring)
    oob_estimate = ensemble.oob_score(X, y, scoring=scoring)
    # Only the average over the folds is reported, next to the OOB estimate.
    mean_cv = fold_scores.mean()
    print(f"CV score: {mean_cv} ; OOB score: {oob_estimate}")

In [10]:
# Hyperparameters shared by both ensemble types.
params = dict(max_depth=9, max_samples=10_000, bootstrap=True, n_estimators=50)

# Score the random forest first, then the extremely randomized trees; each
# call prints one "CV score ... ; OOB score ..." line.
for estimator_name in ("RandomForest", "ExtraTrees"):
    run_cross_val_score(estimator_name, X_train, Y_train, "neg_root_mean_squared_error", **params)



CV score: -3.481252853449478 ; OOB score: -3.48101482544245




CV score: -3.5334720930453885 ; OOB score: -3.5350612318601287
