In [2]:
from fowt_ml.pipeline import Pipeline

In [3]:
# Relative path to the example YAML configuration that is passed to Pipeline in the next cell
example_config_file = "../../src/example_config.yml"

In [4]:
# Build the pipeline object from the example configuration file
my_pipeline = Pipeline(example_config_file)

In [5]:
# Set the correct path for the mat file: override the "mat_file" entry of
# experiment "exp699" in the loaded config
# (presumably resolved relative to the notebook's working directory — TODO confirm)
my_pipeline.config["data"]["exp699"]["mat_file"] = "../../../data/example/exp699.mat"

In [6]:
# Get the data of experiment "exp699" as a DataFrame, print a compact summary
# (size, dtypes, memory usage) and display the first rows
df = my_pipeline.get_data("exp699")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327702 entries, 0 to 327701
Columns: 125 entries, time to wind_speed
dtypes: float64(125)
memory usage: 312.5 MB


Unnamed: 0,time,acc_calc6[0],acc_calc6[1],acc_calc6[2],acc_calc6[3],acc_calc6[4],acc_calc6[5],acc_tb_meas3[0],acc_tb_meas3[1],acc_tb_meas3[2],...,spd_rot_cmd,time_wave_precalc,tq_mot_act,trig_Hex_in,trig_PIV_in,trig_PIV_out,wtm2_spd_rot_act,wtm2_spd_tor_cmd,wtm2_tq_mot_act,wind_speed
0,0.0,-1.580219,0.135577,0.070342,-0.757244,-2.295318,-0.619591,-1.032135,0.21481,9.635009,...,474.0,29.406,-17.928282,0.999695,-0.000488,0.0,-470.274464,-474.0,40.714927,4.0
1,0.001,-2.035639,0.70547,0.066672,-4.331129,-5.214741,-0.16552,-2.268602,-0.45881,7.425536,...,474.0,29.407,-29.986389,0.999969,-0.000336,0.0,-471.326827,-474.0,24.651387,4.0
2,0.001999,-2.1144,-0.785931,-0.044466,5.023345,-5.750528,-0.520829,-1.588994,-1.485706,6.823769,...,474.0,29.408,-14.466384,0.999695,-0.00061,0.0,-499.477522,-474.0,38.891995,4.0
3,0.003,-2.179736,0.08617,0.448392,-0.444052,-6.337631,0.078678,0.279178,-1.889878,6.000455,...,474.0,29.409,-22.535783,0.999084,-0.000214,0.0,-470.800646,-474.0,25.997681,4.0
4,0.004,-1.792539,-0.31519,0.25357,2.065519,-3.999853,0.206758,-1.349485,-1.129436,8.111131,...,474.0,29.41,-14.566729,0.999695,-9.2e-05,0.0,-471.589917,-474.0,29.576647,4.0


In [7]:
# Inspect the ML setup section of the config: target and predictor column
# names, candidate model identifiers, train size and saving/logging options
my_pipeline.config["ml_setup"]

{'target': ['acc_tb_meas3[0]',
  'acc_tb_meas3[1]',
  'acc_tb_meas3[2]',
  'acc_tt_meas3[0]',
  'acc_tt_meas3[1]',
  'acc_tt_meas3[2]',
  'force_aero_est6[0]',
  'force_aero_est6[1]',
  'force_aero_est6[2]',
  'force_aero_est6[3]',
  'force_aero_est6[4]',
  'force_aero_est6[5]',
  'force_tt_meas6[0]',
  'force_tt_meas6[1]',
  'force_tt_meas6[2]',
  'force_tt_meas6[3]',
  'force_tt_meas6[4]',
  'force_tt_meas6[5]'],
 'predictors': ['pos_act6[0]',
  'pos_act6[1]',
  'pos_act6[2]',
  'pos_act6[3]',
  'pos_act6[4]',
  'pos_act6[5]',
  'spd_rot_act',
  'wind_speed'],
 'save_grid_scores': True,
 'save_best_model': True,
 'n_jobs': 2,
 'use_gpu': False,
 'train_size': 0.7,
 'models': ['en', 'lar', 'llar', 'lasso', 'lr', 'ridge', 'omp', 'ransac'],
 'metrics_sort': 'R2',
 'system_log': './logs.log'}

In [8]:
import mlflow
import mlflow.sklearn
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lars, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

In [9]:
def train_and_log_model(model, model_name, batch_size=None):
    """Fit ``model``, score it on the test split and log the run to MLflow.

    The data is read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``; they must be defined before this
    function is called.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label used as the MLflow run name, as the ``model_name`` parameter
        and as the name under which the fitted model is logged.
    batch_size : int, optional
        When given, only the first ``batch_size`` rows of the training split
        are used for fitting. ``None`` (default) fits on the whole split.

    Returns
    -------
    float
        Root mean squared error of the predictions on the test split.
    """
    # Name the run after the model so individual runs can be told apart.
    with mlflow.start_run(run_name=model_name):
        X_batch, y_batch = X_train, y_train
        if batch_size is not None:
            X_batch, y_batch = X_train[:batch_size], y_train[:batch_size]

        # Train on the (possibly truncated) training data, predict on the full test split
        model.fit(X_batch, y_batch)
        y_pred = model.predict(X_test)

        # Log model parameters; attributes a model does not have are logged as None
        mlflow.log_param("model_name", model_name)
        # The training subset size is what distinguishes otherwise identical runs
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("n_estimators", getattr(model, "n_estimators", None))
        mlflow.log_param("max_depth", getattr(model, "max_depth", None))

        # Calculate and log the performance metric
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)

        # Log the fitted model itself
        mlflow.sklearn.log_model(model, model_name)

        return rmse

In [10]:
import pandas as pd

def pca(df, variance=0.95):
    """Project ``df`` onto its leading principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data to decompose (rows are observations).
    variance : float, default 0.95
        Fraction of the total variance to retain; must be in (0, 1).

    Returns
    -------
    pandas.DataFrame
        One column per retained component (integer column labels), carrying
        the same row index as ``df``.
    """
    # With the full SVD solver a float n_components in (0, 1) is the
    # fraction of variance the retained components must explain.
    reducer = PCA(svd_solver="full", n_components=variance)
    reducer.fit(df)
    # Keep the caller's row index so the components stay aligned with other
    # frames built from the same rows; inputs without an index fall back to
    # the default RangeIndex, as before.
    return pd.DataFrame(reducer.transform(df), index=getattr(df, "index", None))

In [11]:
# Predictor and target column names come from the pipeline's ML setup
ml_setup = my_pipeline.config["ml_setup"]
predictors_labels = ml_setup["predictors"]
target_labels = ml_setup["target"]

# Split the loaded frame into predictor and target frames
X_data = df[predictors_labels]
Y_data = df[target_labels]

# Candidate regressors, each paired with the name used when logging results
models = [
    (GaussianProcessRegressor(), "GaussianProcessRegressor"),
    (Lars(), "LeastAngleRegression"),
    (Lasso(), "LassoRegression"),
    (Ridge(), "RidgeRegression"),
]

# RMSE per model name, filled in by the training cells
model_performances = {}

In [14]:
# Ordered split (shuffle=False): the first 25% of the rows form the training
# set and the remaining 75% the test set. random_state has no effect while
# shuffling is disabled.
# TODO set test_size to 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.75,
    shuffle=False,
    random_state=123,
)

In [24]:
%%time
# Train, log models, and compare performance
for model, name in models:
    rmse = train_and_log_model(model, name)
    model_performances[name] = rmse
print(model_performances)

MemoryError: Unable to allocate 50.0 GiB for an array with shape (81925, 81925) and data type float64

Notes on using GaussianProcessRegressor.

1. Using it naively on the example data tries to create an NxN matrix, where N is the number of observations (81925). This matrix is 50 GiB and does not fit in memory.
    The reason is likely that Gaussian process regression builds a dense covariance (kernel) matrix with one entry for every pair of training observations, so memory use grows quadratically with N (81925 × 81925 × 8 bytes ≈ 50 GiB).
2. One suggestion was to set the copy_X_train argument to False. However, this does not stop that matrix from being allocated and giving a MemoryError.
3. Another suggestion was to work in batches. Unfortunately, GaussianProcessRegressor does not support online learning.
    https://stackoverflow.com/questions/61025487/gaussian-process-regression-incremental-learning has an example of retraining GPR; this may require tweaking the stopping condition between training sessions.
    https://github.com/Bigpig4396/Incremental-Gaussian-Process-Regression-IGPR has an implementation of incremental GPR. However, this may be an implementation only for specific kernels; the code mentions the squared exponential kernel.

In [15]:
import datetime

In [16]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 500)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:00:35.222997




runtime LeastAngleRegression: 0:00:04.654092




runtime LassoRegression: 0:00:04.776032




runtime RidgeRegression: 0:00:04.924166
{'GaussianProcessRegressor': 440.77206120259626, 'LeastAngleRegression': 87.73582019678052, 'LassoRegression': 10.168372461292385, 'RidgeRegression': 15.372045858602714}
CPU times: user 20.3 s, sys: 23.7 s, total: 44 s
Wall time: 49.6 s


In [17]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 600)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:00:33.401200




runtime LeastAngleRegression: 0:00:04.151744




runtime LassoRegression: 0:00:04.512727




runtime RidgeRegression: 0:00:04.463954
{'GaussianProcessRegressor': 576.1136473231086, 'LeastAngleRegression': 86.73288668038944, 'LassoRegression': 9.074332656829165, 'RidgeRegression': 10.863722169466321}
CPU times: user 27.9 s, sys: 21.8 s, total: 49.7 s
Wall time: 46.5 s


In [18]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 700)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:00:38.461666




runtime LeastAngleRegression: 0:00:06.167855




runtime LassoRegression: 0:00:04.303331




runtime RidgeRegression: 0:00:04.145896
{'GaussianProcessRegressor': 648.4373856966664, 'LeastAngleRegression': 49.9732929145126, 'LassoRegression': 9.11254608623855, 'RidgeRegression': 12.007957335690193}
CPU times: user 29 s, sys: 24.7 s, total: 53.7 s
Wall time: 53.1 s


In [20]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 800)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:01:16.418097




runtime LeastAngleRegression: 0:00:04.905509




runtime LassoRegression: 0:00:05.171165




runtime RidgeRegression: 0:00:04.784623
{'GaussianProcessRegressor': 1124.102375066762, 'LeastAngleRegression': 74.59235570347981, 'LassoRegression': 8.167064522319412, 'RidgeRegression': 12.287142024965288}
CPU times: user 40.5 s, sys: 31.6 s, total: 1min 12s
Wall time: 1min 31s


In [21]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 900)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:00:47.915317




runtime LeastAngleRegression: 0:00:05.124493




runtime LassoRegression: 0:00:05.408109




runtime RidgeRegression: 0:00:05.234294
{'GaussianProcessRegressor': 1198.0376752865473, 'LeastAngleRegression': 11.012585531817273, 'LassoRegression': 7.442159867472665, 'RidgeRegression': 8.002792319619367}
CPU times: user 40.1 s, sys: 27.9 s, total: 1min 8s
Wall time: 1min 3s


In [19]:
%%time
# Train, log models, and compare performance
for model, name in models:
    start_time = datetime.datetime.now()
    rmse = train_and_log_model(model, name, 1000)
    model_performances[name] = rmse
    end_time = datetime.datetime.now()
    print(f"runtime {name}: {end_time-start_time}")
print(model_performances)



runtime GaussianProcessRegressor: 0:01:24.299852




runtime LeastAngleRegression: 0:00:04.445656




runtime LassoRegression: 0:00:04.325984




runtime RidgeRegression: 0:00:04.338389
{'GaussianProcessRegressor': 1200.2397125891844, 'LeastAngleRegression': 9.17764643205609, 'LassoRegression': 6.123109267875933, 'RidgeRegression': 7.699702437844231}
CPU times: user 55.6 s, sys: 35.9 s, total: 1min 31s
Wall time: 1min 37s


Notes on batch training in the context of GPR.

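GPR can be trained on smaller subsets of the training data, which avoids the MemoryError.
However, the processing time ramps up very steeply, going from around 30 seconds at 500 observations to around 90 seconds at 1000 observations.
What's worse: the RMS errors on the test set are atrocious compared to those of the other regressors, and they grow as more training observations are used (from about 441 at 500 observations to about 1200 at 1000 observations).
Note that these runs use `GaussianProcessRegressor()` with its default settings (no explicit kernel, no target normalization), so this comparison may understate what a configured GPR could achieve.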