In [1]:
from fowt_ml.pipeline import Pipeline

In [2]:
example_config_file = "../../src/example_config.yml"

In [3]:
my_pipeline = Pipeline(example_config_file)

In [4]:
# set correct path for mat file
my_pipeline.config["data"]["exp699"]["mat_file"] = "../../../data/example/exp699.mat"

In [5]:
# get the data
df = my_pipeline.get_data("exp699")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327702 entries, 0 to 327701
Columns: 125 entries, time to wind_speed
dtypes: float64(125)
memory usage: 312.5 MB


Unnamed: 0,time,acc_calc6[0],acc_calc6[1],acc_calc6[2],acc_calc6[3],acc_calc6[4],acc_calc6[5],acc_tb_meas3[0],acc_tb_meas3[1],acc_tb_meas3[2],...,spd_rot_cmd,time_wave_precalc,tq_mot_act,trig_Hex_in,trig_PIV_in,trig_PIV_out,wtm2_spd_rot_act,wtm2_spd_tor_cmd,wtm2_tq_mot_act,wind_speed
0,0.0,-1.580219,0.135577,0.070342,-0.757244,-2.295318,-0.619591,-1.032135,0.21481,9.635009,...,474.0,29.406,-17.928282,0.999695,-0.000488,0.0,-470.274464,-474.0,40.714927,4.0
1,0.001,-2.035639,0.70547,0.066672,-4.331129,-5.214741,-0.16552,-2.268602,-0.45881,7.425536,...,474.0,29.407,-29.986389,0.999969,-0.000336,0.0,-471.326827,-474.0,24.651387,4.0
2,0.001999,-2.1144,-0.785931,-0.044466,5.023345,-5.750528,-0.520829,-1.588994,-1.485706,6.823769,...,474.0,29.408,-14.466384,0.999695,-0.00061,0.0,-499.477522,-474.0,38.891995,4.0
3,0.003,-2.179736,0.08617,0.448392,-0.444052,-6.337631,0.078678,0.279178,-1.889878,6.000455,...,474.0,29.409,-22.535783,0.999084,-0.000214,0.0,-470.800646,-474.0,25.997681,4.0
4,0.004,-1.792539,-0.31519,0.25357,2.065519,-3.999853,0.206758,-1.349485,-1.129436,8.111131,...,474.0,29.41,-14.566729,0.999695,-9.2e-05,0.0,-471.589917,-474.0,29.576647,4.0


In [6]:
# inspect ML setup
my_pipeline.config["ml_setup"]

{'target': ['acc_tb_meas3[0]',
  'acc_tb_meas3[1]',
  'acc_tb_meas3[2]',
  'acc_tt_meas3[0]',
  'acc_tt_meas3[1]',
  'acc_tt_meas3[2]',
  'force_aero_est6[0]',
  'force_aero_est6[1]',
  'force_aero_est6[2]',
  'force_aero_est6[3]',
  'force_aero_est6[4]',
  'force_aero_est6[5]',
  'force_tt_meas6[0]',
  'force_tt_meas6[1]',
  'force_tt_meas6[2]',
  'force_tt_meas6[3]',
  'force_tt_meas6[4]',
  'force_tt_meas6[5]'],
 'predictors': ['pos_act6[0]',
  'pos_act6[1]',
  'pos_act6[2]',
  'pos_act6[3]',
  'pos_act6[4]',
  'pos_act6[5]',
  'spd_rot_act',
  'wind_speed'],
 'save_grid_scores': True,
 'save_best_model': True,
 'n_jobs': 2,
 'use_gpu': False,
 'train_size': 0.7,
 'models': ['en', 'lar', 'llar', 'lasso', 'lr', 'ridge', 'omp', 'ransac'],
 'metrics_sort': 'R2',
 'system_log': './logs.log'}

In [7]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Lars, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

In [8]:
def train_and_log_model(model, model_name, batch_size=None):
    with mlflow.start_run():
        X_batch, y_batch = X_train, y_train
        if batch_size != None:
            X_batch, y_batch = X_train[:batch_size], y_train[:batch_size]

        # Train the model
        model.fit(X_batch, y_batch)
        
        # Predict the target
        y_pred = model.predict(X_test)
        
        # Log model parameters and metrics
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("n_estimators", model.n_estimators if hasattr(model, 'n_estimators') else None)
        mlflow.log_param("max_depth", model.max_depth if hasattr(model, 'max_depth') else None)
        
        # Calculate and log the performance metrics
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)
        
        # Log the model itself
        mlflow.sklearn.log_model(model, model_name)
        
        return rmse

In [9]:
gbr = HistGradientBoostingRegressor(
    learning_rate=0.12,
    max_iter=20,
    max_depth=3,
)

models = [
    (MultiOutputRegressor(gbr, n_jobs=18), "GradientBoostingRegressor"),
    (Lars(), "LeastAngleRegression"),
    (Lasso(), "LassoRegression"),
    (Ridge(), "RidgeRegression")
]

# Dictionary to store model performances
model_performances = {}

predictors_labels = my_pipeline.config["ml_setup"]["predictors"]
target_labels = my_pipeline.config["ml_setup"]["target"]

X_data = df[predictors_labels]
Y_data = df[target_labels]

# Notes on sklearn gradient boosting regressor

sklearn comes with two variants of gradient boosting regressors: GradientBoostingRegressor and HistGradientBoostingRegressor. The former is better for smaller sample sizes (<10k), the latter better for larger sample sizes.

sklearn gradient boosting regressors do not natively support multi-target learning. The MultiOutputRegressor can be used to solve this naively.

sklearn gradient boosting is slower than least angle, lasso, or ridge regression. However, it can achieve the same or slightly better performance by tweaking the regressor parameters. The above parameters gave good results on the example set.

The multi-output regressor also has a parameter to tweak: the number of jobs. For the example data 18 jobs (1 per target) outperformed both 1 job and -1 (as many as possible).

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.75, shuffle=False, random_state=123) # TODO set test_size to 0.25

In [11]:
print(X_train.shape)
print(y_train.shape)

(81925, 8)
(81925, 18)


In [12]:
import datetime

In [13]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GradientBoostingRegressor: 0:00:16.975876




runtime LeastAngleRegression: 0:00:03.728180




runtime LassoRegression: 0:00:03.668163




runtime RidgeRegression: 0:00:03.736193
{'GradientBoostingRegressor': 3.592288147012628, 'LeastAngleRegression': 3.5914625711937265, 'LassoRegression': 3.733306142704239, 'RidgeRegression': 3.5914586161568707}
CPU times: user 6.29 s, sys: 1.05 s, total: 7.34 s
Wall time: 28.1 s


In [26]:
# from pprint import pprint

# import matplotlib as mpl
from xgboost import XGBRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.xgboost

#from utils import fetch_logged_data

In [27]:
xgbr = XGBRegressor(
    learning_rate=0.12,
    n_estimators=20,
    max_depth=3,
    n_jobs=18
)

xmodels = [
    (xgbr, "GradientBoostingRegressor"),
    (Lars(), "LeastAngleRegression"),
    (Lasso(), "LassoRegression"),
    (Ridge(), "RidgeRegression")
]

# Dictionary to store model performances
xmodel_performances = {}

In [25]:
%%time
# Train, log models, and compare performance
for model, name in xmodels:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name)
    xmodel_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(xmodel_performances)

ValueError: feature_names must be string, and may not contain [, ] or <