In [1]:
from sklearn.model_selection import train_test_split

from fowt_ml.datasets import convert_mat_to_df
from fowt_ml.ensemble import EnsembleModel

# Fit a Random Forest Estimator

Load the dataset and add the wind speed as an extra (constant) feature column:

In [2]:
# Path of the MATLAB file holding the experiment, relative to this notebook.
data_path = "../../data/exp699_032024_TUDelft/exp699.mat"

# Convert the "exp699" entry of the .mat file into a table of columns.
data = convert_mat_to_df(data_path, "exp699")

# Add the wind speed as a constant-valued column so it can be used as a feature.
# NOTE(review): units of the value 4.0 are not stated here — confirm.
data["wind_speed"] = 4.0

Define the target and predictor columns, then split the data into training (75%) and test (25%) sets:


In [3]:

# Columns the model has to predict: every indexed component of each channel,
# listed channel by channel (3-component channels first, then 6-component ones).
target_labels = [
    f"{channel}[{component}]"
    for channel, n_components in (
        ("acc_tb_meas3", 3),
        ("acc_tt_meas3", 3),
        ("force_aero_est6", 6),
        ("force_tt_meas6", 6),
    )
    for component in range(n_components)
]

# Columns used as model inputs: the six `pos_act6` components followed by two
# scalar columns.
predictor_labels = [f"pos_act6[{component}]" for component in range(6)] + [
    "spd_rot_act",
    "wind_speed",
]

# Select the predictor (X) and target (Y) columns from the loaded table.
X, Y = data[predictor_labels], data[target_labels]

# Hold out 25% of the rows as a test set; the fixed seed makes the split
# reproducible. `shuffle=True` is scikit-learn's default and is only spelled
# out here to make the behaviour visible.
# TODO(review): should the data be shuffled here? If the rows are time-ordered
# samples, a shuffled split may be optimistic — confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=123,
    shuffle=True,
)

Let's instantiate a random forest model and set a few of its hyperparameters:

In [4]:
# Random-forest ensemble with explicit hyperparameters, one per line so each
# can be tuned independently.
# NOTE(review): the keyword names match scikit-learn's random-forest
# parameters; presumably EnsembleModel forwards them unchanged — confirm.
model = EnsembleModel(
    estimator="RandomForest",
    max_depth=9,
    max_samples=10_000,
    n_estimators=50,
)

We estimate the model's performance with cross-validation (CV); by default this is k-fold CV with `k=5`. The scores are negated RMSE values, so scores closer to zero are better:

In [5]:
%%time
scores = model.cross_val_score(X_train, Y_train, scoring="neg_root_mean_squared_error")
len(scores), scores.mean(), scores.std()

CPU times: user 22.9 s, sys: 564 ms, total: 23.5 s
Wall time: 24.3 s


(5, -3.4812604563320386, 0.005142800724868689)

We can then train the model on the full training set and compute the final score on the held-out test set:

In [6]:
# Fit on the complete training split (no CV folds held out this time).
model.fit(X_train, Y_train)

# Evaluate once on the held-out test split, using the same metric as in the
# cross-validation step so the two numbers are comparable.
model.score(
    X_test,
    Y_test,
    scoring="neg_root_mean_squared_error",
)


-3.487296951081186