In [16]:
from fowt_ml.pipeline import Pipeline

In [17]:
# Relative path to the example pipeline configuration (.yml); it is resolved
# against the kernel's working directory when the pipeline is created.
example_config_file = "../../src/example_config.yml"

In [18]:
# Build the pipeline from the config file; later cells read and modify the
# settings through `my_pipeline.config`.
my_pipeline = Pipeline(example_config_file)

In [19]:
# Point the "exp699" data entry at the local .mat file. This overwrites the
# value held in `my_pipeline.config` in place; the relative path depends on the
# kernel's working directory.
my_pipeline.config["data"]["exp699"]["mat_file"] = "../../../data/example/exp699.mat"

In [20]:
# Load experiment "exp699" through the pipeline and keep the result in `df`.
df = my_pipeline.get_data("exp699")
# Print a summary of the frame (index range, column count, dtypes, memory usage).
df.info()
# Last expression of the cell: renders the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327702 entries, 0 to 327701
Columns: 125 entries, time to wind_speed
dtypes: float64(125)
memory usage: 312.5 MB


Unnamed: 0,time,acc_calc6[0],acc_calc6[1],acc_calc6[2],acc_calc6[3],acc_calc6[4],acc_calc6[5],acc_tb_meas3[0],acc_tb_meas3[1],acc_tb_meas3[2],...,spd_rot_cmd,time_wave_precalc,tq_mot_act,trig_Hex_in,trig_PIV_in,trig_PIV_out,wtm2_spd_rot_act,wtm2_spd_tor_cmd,wtm2_tq_mot_act,wind_speed
0,0.0,-1.580219,0.135577,0.070342,-0.757244,-2.295318,-0.619591,-1.032135,0.21481,9.635009,...,474.0,29.406,-17.928282,0.999695,-0.000488,0.0,-470.274464,-474.0,40.714927,4.0
1,0.001,-2.035639,0.70547,0.066672,-4.331129,-5.214741,-0.16552,-2.268602,-0.45881,7.425536,...,474.0,29.407,-29.986389,0.999969,-0.000336,0.0,-471.326827,-474.0,24.651387,4.0
2,0.001999,-2.1144,-0.785931,-0.044466,5.023345,-5.750528,-0.520829,-1.588994,-1.485706,6.823769,...,474.0,29.408,-14.466384,0.999695,-0.00061,0.0,-499.477522,-474.0,38.891995,4.0
3,0.003,-2.179736,0.08617,0.448392,-0.444052,-6.337631,0.078678,0.279178,-1.889878,6.000455,...,474.0,29.409,-22.535783,0.999084,-0.000214,0.0,-470.800646,-474.0,25.997681,4.0
4,0.004,-1.792539,-0.31519,0.25357,2.065519,-3.999853,0.206758,-1.349485,-1.129436,8.111131,...,474.0,29.41,-14.566729,0.999695,-9.2e-05,0.0,-471.589917,-474.0,29.576647,4.0


In [21]:
# Show the "ml_setup" section of the loaded configuration (target and predictor
# column names plus training options). Displayed as the cell's last expression.
my_pipeline.config["ml_setup"]

{'target': ['acc_tb_meas3[0]',
  'acc_tb_meas3[1]',
  'acc_tb_meas3[2]',
  'acc_tt_meas3[0]',
  'acc_tt_meas3[1]',
  'acc_tt_meas3[2]',
  'force_aero_est6[0]',
  'force_aero_est6[1]',
  'force_aero_est6[2]',
  'force_aero_est6[3]',
  'force_aero_est6[4]',
  'force_aero_est6[5]',
  'force_tt_meas6[0]',
  'force_tt_meas6[1]',
  'force_tt_meas6[2]',
  'force_tt_meas6[3]',
  'force_tt_meas6[4]',
  'force_tt_meas6[5]'],
 'predictors': ['pos_act6[0]',
  'pos_act6[1]',
  'pos_act6[2]',
  'pos_act6[3]',
  'pos_act6[4]',
  'pos_act6[5]',
  'spd_rot_act',
  'wind_speed'],
 'save_grid_scores': True,
 'save_best_model': True,
 'n_jobs': 2,
 'use_gpu': False,
 'train_size': 0.7,
 'models': ['en', 'lar', 'llar', 'lasso', 'lr', 'ridge', 'omp', 'ransac'],
 'metrics_sort': 'R2',
 'system_log': './logs.log'}

In [22]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Lars, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

In [23]:
def train_and_log_model(model, model_name, batch_size=None):
    """Fit ``model``, score it on the held-out split and record the run in MLflow.

    The data is not passed in: the function reads the notebook-level globals
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so the cell creating
    the train/test split must have been executed before calling it.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Logged as the ``model_name`` parameter and used as the name under
        which the fitted model is logged.
    batch_size : int, optional
        When given, only the first ``batch_size`` training rows are used for
        fitting. ``None`` (the default) trains on the full training split.

    Returns
    -------
    The value of ``root_mean_squared_error(y_test, y_pred)``, which is also
    logged as the ``rmse`` metric.
    """

    def _hyperparam(param_name):
        # Prefer the attribute on the model itself; otherwise fall back to the
        # wrapped ``estimator`` of a meta-estimator (e.g. MultiOutputRegressor),
        # whose own attributes do not include the base learner's settings.
        for candidate in (model, getattr(model, "estimator", None)):
            if hasattr(candidate, param_name):
                return getattr(candidate, param_name)
        return None

    with mlflow.start_run():
        X_batch, y_batch = X_train, y_train
        if batch_size is not None:
            X_batch, y_batch = X_train[:batch_size], y_train[:batch_size]

        # Train the model
        model.fit(X_batch, y_batch)

        # Predict the target
        y_pred = model.predict(X_test)

        # Log model parameters; a value of None means neither the model nor
        # its wrapped estimator exposes that hyperparameter.
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_estimators", _hyperparam("n_estimators"))
        mlflow.log_param("max_depth", _hyperparam("max_depth"))
        # HistGradientBoostingRegressor expresses its number of boosting
        # rounds as max_iter, so record it alongside n_estimators.
        mlflow.log_param("max_iter", _hyperparam("max_iter"))

        # Calculate and log the performance metrics
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)

        # Log the model itself
        mlflow.sklearn.log_model(model, model_name)

        return rmse

In [31]:
# Predictor and target column names are taken from the pipeline configuration.
predictors_labels = my_pipeline.config["ml_setup"]["predictors"]
target_labels = my_pipeline.config["ml_setup"]["target"]
X_data = df[predictors_labels]
Y_data = df[target_labels]

# Test RMSE per model name; filled in by the training loop.
model_performances = {}

# Base learner for the boosted model. Only max_iter and max_depth are set; every
# other hyperparameter (loss, learning_rate, max_leaf_nodes, min_samples_leaf,
# l2_regularization, max_features, max_bins, warm_start, early_stopping,
# scoring, validation_fraction, n_iter_no_change, tol) keeps its default.
# For HistGBR the role of n_estimators is played by max_iter.
#
# Tuning notes:
# - Francesco used the following for RF:
#   EnsembleModel(estimator="RandomForest", max_depth=9, max_samples=10_000, n_estimators=50)
# - Other potentially valuable arguments: max_leaf_nodes (instead of max_depth),
#   learning_rate <= 0.1 (interacts strongly with n_estimators; set n_estimators
#   "large enough"), subsample (eg 0.5), max_features (eg 2 or 3).
gbr = HistGradientBoostingRegressor(max_iter=50, max_depth=9)

# Gradient boosting does not natively support multi-target output, so it is
# wrapped in sklearn.multioutput.MultiOutputRegressor(estimator, *, n_jobs=None).
models = [
    (MultiOutputRegressor(gbr, n_jobs=18), "GradientBoostingRegressor"),
    (Lars(), "LeastAngleRegression"),
    (Lasso(), "LassoRegression"),
    (Ridge(), "RidgeRegression"),
]

In [32]:
# Unshuffled split with test_size=0.75: only a quarter of the rows end up in the
# training set, the remaining three quarters are held out for testing.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm against the scikit-learn docs.
# NOTE(review): config["ml_setup"]["train_size"] is 0.7 but is not used here — confirm which split is intended.
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.75, shuffle=False, random_state=123) # TODO set test_size to 0.25

In [33]:
# Sanity check: (rows, columns) of the training predictors and training targets,
# one shape per line.
print(X_train.shape, y_train.shape, sep="\n")

(81925, 8)
(81925, 18)


In [35]:
import datetime

In [36]:
%%time
# Train, log models, and compare performance.
# Each (estimator, name) pair from `models` is fitted and logged through
# train_and_log_model; its test RMSE is stored in `model_performances` under
# the model name and the per-model wall-clock duration is printed.
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
# Summary of all RMSE values collected above.
print(model_performances)



runtime GradientBoostingRegressor: 0:00:09.487316




runtime LeastAngleRegression: 0:00:03.596855




runtime LassoRegression: 0:00:03.621780




runtime RidgeRegression: 0:00:03.487690
{'GradientBoostingRegressor': 3.6287607149205225, 'LeastAngleRegression': 3.5914625711937265, 'LassoRegression': 3.733306142704239, 'RidgeRegression': 3.5914586161568707}
CPU times: user 4.91 s, sys: 293 ms, total: 5.2 s
Wall time: 20.2 s


In [48]:
%%timeit

model = MultiOutputRegressor(
    HistGradientBoostingRegressor(
        # loss='squared_error',
        # learning_rate=0.1,
        max_iter=50,
        # max_leaf_nodes=31,
        max_depth=9,
        # min_samples_leaf=20,
        # l2_regularization=0.0,
        # max_features=3.0,           # ?
        # max_bins=255,
        # warm_start=False,
        # early_stopping='auto',
        # scoring='loss',
        # validation_fraction=0.1,
        # n_iter_no_change=10,
        # tol=1e-07,
    ),
    n_jobs=1
)
name = "GradientBoostingRegressor"

rmse = train_and_log_model(model, name)
print(rmse)



3.627843637426277




3.629074429163094




3.6277775420779244




3.6268734098960738




3.6262581453004876




3.62643473384315




3.6284647309358493




3.6278193471500426
14.3 s ± 937 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
%%timeit

model = MultiOutputRegressor(
    HistGradientBoostingRegressor(
        # loss='squared_error',
        # learning_rate=0.1,
        max_iter=50,
        # max_leaf_nodes=31,
        max_depth=9,
        # min_samples_leaf=20,
        # l2_regularization=0.0,
        # max_features=3.0,           # ?
        # max_bins=255,
        # warm_start=False,
        # early_stopping='auto',
        # scoring='loss',
        # validation_fraction=0.1,
        # n_iter_no_change=10,
        # tol=1e-07,
    ),
    n_jobs=18
)
name = "GradientBoostingRegressor"

rmse = train_and_log_model(model, name)
print(rmse)



3.6289860869231316




3.6263484958202588




3.624900808635564




3.6298826504232533




3.623949765540734




3.6293084466944134




3.625807129146841




3.6266515230196115
11.8 s ± 1.19 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
%%timeit

model = MultiOutputRegressor(
    HistGradientBoostingRegressor(
        # loss='squared_error',
        # learning_rate=0.1,
        max_iter=50,
        # max_leaf_nodes=31,
        max_depth=9,
        # min_samples_leaf=20,
        # l2_regularization=0.0,
        # max_features=3.0,           # ?
        # max_bins=255,
        # warm_start=False,
        # early_stopping='auto',
        # scoring='loss',
        # validation_fraction=0.1,
        # n_iter_no_change=10,
        # tol=1e-07,
    ),
    n_jobs=-1
)
name = "GradientBoostingRegressor"

rmse = train_and_log_model(model, name)
print(rmse)



3.6261840330739568




3.627059526181602




3.625578832128361




3.628246587106668




3.6282208771967834




3.624757427113047




3.629724578519978




3.625310498052794
12.6 s ± 1.41 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit

model = MultiOutputRegressor(
    HistGradientBoostingRegressor(
        # loss="squared_error",
        # quantile=None,
        learning_rate=0.12,         # changed from default
        max_iter=20,                # changed from default
        # max_leaf_nodes=31,
        max_depth=3,                # changed from default
        # min_samples_leaf=20,
        l2_regularization=0.0,
        # max_features=1.0,
        max_bins=255,
        # warm_start=False,
        # early_stopping="auto",
        # scoring="loss",
        # validation_fraction=0.1,
        # n_iter_no_change=10,
        # tol=1e-7,
    ),
    n_jobs=18
)
# Best so far:
# model = MultiOutputRegressor(
#     HistGradientBoostingRegressor(
#         # loss="squared_error",
#         # quantile=None,
#         learning_rate=0.12,         # changed from default
#         max_iter=20,                # changed from default
#         # max_leaf_nodes=31,
#         max_depth=3,                # changed from default
#         # min_samples_leaf=20,
#         l2_regularization=0.0,
#         # max_features=1.0,
#         max_bins=255,
#         # warm_start=False,
#         # early_stopping="auto",
#         # scoring="loss",
#         # validation_fraction=0.1,
#         # n_iter_no_change=10,
#         # tol=1e-7,
#     ),
#     n_jobs=18
# )
# rmse ~= 3.592
name = "GradientBoostingRegressor"

rmse = train_and_log_model(model, name)
print(rmse)



3.5925059567096502




3.59259151296365




3.5931670760210985




3.592742220482834




3.591676974245366




3.59266392118389




3.593001503058995




3.592849810092114
7.42 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [71]:
# Predictor and target column names are taken from the pipeline configuration.
predictors_labels = my_pipeline.config["ml_setup"]["predictors"]
target_labels = my_pipeline.config["ml_setup"]["target"]
X_data = df[predictors_labels]
Y_data = df[target_labels]

# Start from an empty RMSE table for this comparison round.
model_performances = {}

# Tuned base learner: shallow trees, few boosting rounds, slightly raised
# learning rate. For HistGBR the role of n_estimators is played by max_iter.
#
# Tuning notes:
# - Francesco used the following for RF:
#   EnsembleModel(estimator="RandomForest", max_depth=9, max_samples=10_000, n_estimators=50)
# - Other potentially valuable arguments: max_leaf_nodes (instead of max_depth),
#   learning_rate <= 0.1 (interacts strongly with n_estimators; set n_estimators
#   "large enough"), subsample (eg 0.5), max_features (eg 2 or 3).
gbr = HistGradientBoostingRegressor(learning_rate=0.12, max_iter=20, max_depth=3)

# Gradient boosting does not natively support multi-target output, so it is
# wrapped in sklearn.multioutput.MultiOutputRegressor(estimator, *, n_jobs=None).
models = [
    (MultiOutputRegressor(gbr, n_jobs=18), "GradientBoostingRegressor"),
    (Lars(), "LeastAngleRegression"),
    (Lasso(), "LassoRegression"),
    (Ridge(), "RidgeRegression"),
]

In [73]:
%%time
# Train, log models, and compare performance (second round, using the `models`
# list currently defined in the notebook).
# Each (estimator, name) pair is fitted and logged through train_and_log_model;
# its test RMSE is stored in `model_performances` under the model name and the
# per-model wall-clock duration is printed.
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
# Summary of all RMSE values collected above.
print(model_performances)



runtime GradientBoostingRegressor: 0:00:09.814222




runtime LeastAngleRegression: 0:00:06.542736




runtime LassoRegression: 0:00:06.277909




runtime RidgeRegression: 0:00:06.288213
{'GradientBoostingRegressor': 3.592517164078976, 'LeastAngleRegression': 3.5914625711937265, 'LassoRegression': 3.733306142704239, 'RidgeRegression': 3.5914586161568707}
CPU times: user 6.07 s, sys: 394 ms, total: 6.46 s
Wall time: 28.9 s
