# Paper: Towards JITAI -



In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
model_path = os.path.abspath(os.path.join(notebook_dir, '..', 'model_pipeline'))

sys.path.append(parent_dir)
sys.path.append(src_path)
sys.path.append(model_path)

import glob
import pickle
from IPython.display import Markdown
from server_config import datapath, preprocessed_path, preprocessed_path_freezed, redcap_path

import pandas as pd
import numpy as np
import datetime as dt
from scipy.stats import entropy

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
# Enable experimental features in scikit-learn
from sklearn.experimental import enable_iterative_imputer  # ✅ Must be imported first
from sklearn.impute import IterativeImputer  # ✅ Now you can import it


import ML_config
import ML_pipeline
import run_ML_pipeline

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


2025-02-05 18:10:42.234573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738775442.253708 2042500 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738775442.259837 2042500 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-05 18:10:42.278936: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#backup_path = preprocessed_path + "backup_data_passive.feather"
#df_backup = pd.read_feather(backup_path)

with open(preprocessed_path_freezed + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path_freezed + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path_freezed + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

with open(preprocessed_path_freezed + '/redcap_data.pkl', 'rb') as file:
    df_redcap = pickle.load(file)

with open(preprocessed_path_freezed + '/map_ema_passive.pkl', 'rb') as file:
    df_ema_passive = pickle.load(file)

### Configurations

In [3]:
# EMA
assessment_phase = [0] #1,2
min_num_daily = 4
min_days_data = 7

### 3. Compare Included vs. not Included Participantants

In [4]:
df_ema_content_cust = df_ema_content.customer.unique().tolist()

In [5]:
df_redcap_original = df_redcap.dropna(subset = ["age", "customer"] )
df_redcap_original = df_redcap_original[df_redcap_original.customer.isin(df_ema_content_cust)]
df_redcap_original = df_redcap_original.drop_duplicates(subset="customer")


In [6]:
# Create a set of included customer IDs
included_customers = set(df_ema_passive['customer'])

# Add a new column to df_redcap_original indicating inclusion
df_redcap_original['Included'] = df_redcap_original['customer'].isin(included_customers)

# Define the two groups
df_redcap_original['Group'] = df_redcap_original['Included'].map({True: 'Included', False: 'Not Included'})

# Verify the counts
print(f"Subjects included in the analysis (n={df_redcap_original['Group'].value_counts().get('Included', 0)})")
print(f"Subjects not included in the analysis (n={df_redcap_original['Group'].value_counts().get('Not Included', 0)})")


Subjects included in the analysis (n=158)
Subjects not included in the analysis (n=143)


In [7]:
from tableone import TableOne
# Define your variables
# Replace the variable names with those present in your DataFrame

# Demographic variables
age = 'age'  # Continuous
employable = 'employability_description_simple'  # Categorical
smartphone_type = 'ema_smartphone_description'  # Categorical
psychotropic_med = 'psychotropic_description'
diagnosis = 'scid_cv_description'
previous_treatment = 'prior_treatment_description_simple'
somatic = 'somatic_description'



# List of all variables to include in the table
columns = [age, employable, smartphone_type, previous_treatment, psychotropic_med, diagnosis, somatic]

# Define categorical variables
categorical = [employable, smartphone_type, previous_treatment, psychotropic_med, diagnosis, somatic]

# Define grouping variable
group_var = 'Included'


In [8]:
# Create the TableOne object
table1 = TableOne(
    df_redcap_original,
    columns=columns,
    categorical=categorical,
    groupby=group_var,
    pval=True,
    nonnormal=[],  # Add variables that are non-normally distributed if any
    missing=False  # Whether to include missing data
)

# Print the table
print(table1.tabulate(tablefmt="fancy_grid"))
table1.to_csv('sample_overview.csv')


╒═══════════════════════════════════════════╤═══════════════════════════════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                                           │                                   │ Overall     │ False       │ True        │ P-Value   │
╞═══════════════════════════════════════════╪═══════════════════════════════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                                         │                                   │ 301         │ 143         │ 158         │           │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ age, mean (SD)                            │                                   │ 33.1 (10.9) │ 33.7 (11.5) │ 32.5 (10.2) │ 0.329     │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ employability_description_simple, n (%)   │ no

## Manual Missing data handling

In [9]:
# also impute activity features 

#### GPS

In [10]:
# Create a mask for rows where missing_GPS equals 'Steps<=625'
mask = df_ema_passive['missing_GPS'] == 'Steps<=625'

# For these rows, set the selected columns to 0
cols_set_zero = ['n_GPS', 'total_distance_km', 'time_in_transition_minutes']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

# For these rows, set the selected columns to 120
cols_set_120 = ['time_stationary_minutes']
for col in cols_set_120:
    df_ema_passive.loc[mask, col] = 120

mask = df_ema_passive['missing_GPS_home'] == 'Steps<=625'

# For these rows, set the selected columns to 120
cols_set_120 = ['at_home_minute']
for col in cols_set_120:
    df_ema_passive.loc[mask, col] = 120


#### Steps

In [11]:
# Create a mask for rows where missing_GPS equals 'Steps>625'
mask = df_ema_passive['missing_steps'] == 'step_zero'

# For these rows, set the selected columns to 0
cols_set_zero = ['n_steps']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

#### Physical Activity

In [12]:
# Create a mask for rows where missing_GPS equals 'Steps>625'
mask = df_ema_passive['missing_pa'] == 'pa_zero'

# For these rows, set the selected columns to 0
cols_set_zero = ['activity_102_minutes', 'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes', 'activity_106_minutes', 
                 'activity_107_minutes']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

### Feature Encoding

- prior treatment: ordinal encoding
- age: min-max scaling
- somatic, employability, psychotropic: 

In [13]:
# Define which columns are which
binary_features = ['somatic_description', 'psychotropic_description', 'employability_description_simple', 'smartphone_type', 'weekend']
categorical_features = ['weekday', 'prior_treatment_description_simple', 'quest_create_hour', 'season', 'time_of_day']
numeric_features = ['age','hr_mean', 'hr_min', 'hr_max', 'hr_std', 'hr_zone_resting', 'hr_zone_moderate','hr_zone_vigorous', 'n_steps', 
       'n_GPS', 'total_distance_km', 'at_home_minute',
       'time_in_transition_minutes', 'time_stationary_minutes', 'activity_102_minutes',
       'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes',
       'activity_106_minutes', 'activity_107_minutes',
       'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours'] 

person_static_features = ['customer', 'age', 'somatic_description', 'psychotropic_description', 'employability_description_simple', 'smartphone_type', 'weekend']


In [14]:
df_ema_passive[numeric_features] = df_ema_passive[numeric_features].replace(-1, np.nan)

In [15]:
from scipy.stats import skewtest,normaltest

skewed_features = []
for col in numeric_features:
    valid_data = df_ema_passive[col].dropna()

    # skewtest requires sample size > 7 for reliable results
    stat, p_val = skewtest(valid_data)
    if p_val < 0.05:
        skewed_features.append(col)  # append this feature as skewed


### Model Pipeline

In [16]:
df_ema_passive["intercept"] = 1

In [17]:
df_ema_pipeline = df_ema_passive[['customer', 'unique_day_id', 
       'quest_create_hour', 'weekday', 'weekend', 'season', 'time_of_day',
       'n_quest', 'mean_na', 'sensor_block_end', 'age', 
       'ema_smartphone', 'psychotropic', 'somatic_problems','employability_description_simple',
       'prior_treatment_description_simple',
       'hr_mean', 'hr_min', 'hr_max', 'hr_std', 
       'hr_zone_resting', 'hr_zone_moderate',
       'hr_zone_vigorous', 'n_steps',  'n_GPS', 'total_distance_km', 'at_home_minute',
       'time_in_transition_minutes', 'time_stationary_minutes',
       'activity_102_minutes',
       'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes',
       'activity_106_minutes', 'activity_107_minutes',
       'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours', 'intercept'
      ]]

In [18]:
df_ema_passive['customer'] = df_ema_passive['customer'].astype('string')
df_ema_passive.to_csv("short_term_data.csv")

In [19]:
for col in numeric_features:
    series = df_ema_pipeline[col]
    n_nan = series.isna().sum()
    col_min = series.min()
    col_max = series.max()
    col_mean = series.mean()
    col_std = series.std()
    print(f"{col}: NaNs={n_nan}, min={col_min}, max={col_max}, mean={col_mean}, std={col_std}")


age: NaNs=0, min=18.0, max=66.0, mean=32.22805878054721, std=9.986262010627621
hr_mean: NaNs=362, min=40.583333333333336, max=176.0, mean=76.21038364205769, std=14.42543554669274
hr_min: NaNs=362, min=35.0, max=176.0, mean=61.30263488080301, std=11.62405021207671
hr_max: NaNs=362, min=42.0, max=211.0, mean=96.8409870347135, std=27.542694611533477
hr_std: NaNs=362, min=0.0, max=56.0, mean=10.792436474020727, std=7.9558771372936725
hr_zone_resting: NaNs=362, min=0.0, max=32.400000000000006, mean=1.3624361661787256, std=2.2104999178041633
hr_zone_moderate: NaNs=362, min=0.0, max=114.00951666666391, mean=5.550823878433013, std=3.4077147994985237
hr_zone_vigorous: NaNs=362, min=0.0, max=78.6040666666656, mean=1.0310447901854172, std=3.6420108564005598
n_steps: NaNs=198, min=0.0, max=10078.0, mean=842.1961382952388, std=1105.3486861640363
n_GPS: NaNs=880, min=0.0, max=1897.0, mean=45.22707003584856, std=113.742096612119
total_distance_km: NaNs=880, min=0.0, max=180.0840241629681, mean=2.4437

In [20]:
df_ema_pipeline["n_quest_sum"] = (df_ema_pipeline.groupby('customer')['unique_day_id'].transform('nunique')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ema_pipeline["n_quest_sum"] = (df_ema_pipeline.groupby('customer')['unique_day_id'].transform('nunique')


In [21]:
df_ema_strat = df_ema_pipeline[["customer", "n_quest_sum"]].drop_duplicates()
df_ema_strat['n_quest_stratify'] = pd.qcut(df_ema_strat["n_quest_sum"], q=4, labels=False, duplicates="drop")


In [22]:
df_ema_pipeline= pd.merge(df_ema_pipeline,df_ema_strat, on=["customer", "n_quest_sum"], how="left")

In [23]:
from ML_pipeline import MLpipeline
from ML_config import Config

my_config = Config()
pipeline = MLpipeline(my_config)

pipeline.set_data(df_ema_pipeline)
pipeline.outer_user_split()
pipeline.inner_time_split()

# (1) Time-based runs
results_timebased, results_holdout = pipeline.run(my_config.ANALYSIS["neg_affect_regression"]["MODEL_PIPEGRIDS"])


INFO     [ML_pipeline.py:180] [ML_Pipeline] Configuration Loaded:
INFO     [ML_pipeline.py:181]   Impute Strategy: knn
INFO     [ML_pipeline.py:182]   Scaler Strategy: minmax
INFO     [ML_pipeline.py:183]   Holdout Ratio: 0.1
INFO     [ML_pipeline.py:184]   Time Ratio: 0.8
INFO     [ML_pipeline.py:185]   Number of Jobs: 1
INFO     [ML_pipeline.py:186]   Number of Folds (Inner CV): 5
INFO     [ML_pipeline.py:187]   CV Method: forwardchaining (expected 'forwardchaining' here)
INFO     [ML_pipeline.py:195] [set_data] DataFrame with 12317 rows loaded in pipeline.
INFO     [ML_pipeline.py:266] [outer_user_split] Holdout Users: 15/158 | Holdout Strategy: first_20 | Holdout Size: 224 rows.
INFO     [ML_pipeline.py:292] [inner_time_split] Inner train size: 8880, test size: 2290.
INFO     [ML_pipeline.py:497] [run] Starting the ML pipeline run method.
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: Global_Intercept


features: ['hr_min', 'sunshine_duration', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [Global_Intercept] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143
INFO     [ML_pipeline.py:550] [Global_Intercept] GridSearchCV completed.
INFO     [ML_pipeline.py:551] [Global_Intercept] Best Parameters: {}
INFO     [ML_pipeline.py:552] [Global_Intercept] Best CV Score (mae): -0.883
INFO     [ML_pipeline.py:587] [Global_Intercept] Inner Test Scores: {'r2': -0.009145076570804545, 'mae': 0.9735838472205831, 'rmse': np.float64(1.1884263092171299)}
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: PerUser_Intercept


features: ['hr_min', 'sunshine_duration', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'customer', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [PerUser_Intercept] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143


✅ Fit completed. Detected 143 unique users.
✅ Fit completed. Detected 143 unique users.
✅ Fit completed. Detected 143 unique users.
✅ Fit completed. Detected 143 unique users.


INFO     [ML_pipeline.py:550] [PerUser_Intercept] GridSearchCV completed.
INFO     [ML_pipeline.py:551] [PerUser_Intercept] Best Parameters: {}
INFO     [ML_pipeline.py:552] [PerUser_Intercept] Best CV Score (mae): -0.600


✅ Fit completed. Detected 143 unique users.


INFO     [ML_pipeline.py:587] [PerUser_Intercept] Inner Test Scores: {'r2': 0.5216969745545884, 'mae': 0.6124634527085445, 'rmse': np.float64(0.8181765326969993)}
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: FFNN_with_Embeddings_PerUserscaled


features: ['hr_min', 'sunshine_duration', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'customer', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [FFNN_with_Embeddings_PerUserscaled] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143
ERROR    [ML_pipeline.py:554] [ERROR] GridSearchCV failed for pipeline FFNN_with_Embeddings_PerUserscaled: Invalid parameter embedding_dim for estimator KerasRegressor.
This issue can likely be resolved by setting this parameter in the KerasRegressor constructor:
`KerasRegressor(embedding_dim=32)`
Check the list of available parameters with `estimator.get_params().keys()`
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: LR_without_PS_PerUserscaled


features: ['hr_min', 'sunshine_duration', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'customer', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [LR_without_PS_PerUserscaled] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143
INFO     [ML_pipeline.py:550] [LR_without_PS_PerUserscaled] GridSearchCV completed.
INFO     [ML_pipeline.py:551] [LR_without_PS_PerUserscaled] Best Parameters: {'model_TTR__regressor__fit_intercept': True}
INFO     [ML_pipeline.py:552] [LR_without_PS_PerUserscaled] Best CV Score (mae): -0.600
INFO     [ML_pipeline.py:587] [LR_without_PS_PerUserscaled] Inner Test Scores: {'r2': 0.5343649648732491, 'mae': 0.6025885561940711, 'rmse': np.float64(0.8072690079014381)}
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: LR_with_PS_PerUserscaled


features: ['prior_treatment_description_simple', 'hr_min', 'sunshine_duration', 'age', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'customer', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'psychotropic', 'somatic_problems', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'employability_description_simple', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'ema_smartphone', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [LR_with_PS_PerUserscaled] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143
INFO     [ML_pipeline.py:550] [LR_with_PS_PerUserscaled] GridSearchCV completed.
INFO     [ML_pipeline.py:551] [LR_with_PS_PerUserscaled] Best Parameters: {'model_TTR__regressor__fit_intercept': True}
INFO     [ML_pipeline.py:552] [LR_with_PS_PerUserscaled] Best CV Score (mae): -0.600
INFO     [ML_pipeline.py:587] [LR_with_PS_PerUserscaled] Inner Test Scores: {'r2': 0.5336727509906767, 'mae': 0.6030270643234882, 'rmse': np.float64(0.8078688288416678)}
INFO     [ML_pipeline.py:521] 
[run] Starting pipeline: RF_without_PS_PerUserscaled


features: ['hr_min', 'sunshine_duration', 'precipitation_hours', 'weekday', 'time_in_transition_minutes', 'customer', 'activity_103_minutes', 'n_steps', 'activity_107_minutes', 'hr_zone_vigorous', 'hr_max', 'hr_zone_moderate', 'time_stationary_minutes', 'total_distance_km', 'activity_105_minutes', 'apparent_temperature_mean', 'at_home_minute', 'hr_std', 'n_GPS', 'activity_102_minutes', 'weekend', 'hr_mean', 'activity_104_minutes', 'quest_create_hour', 'activity_106_minutes', 'time_of_day', 'season', 'hr_zone_resting']


INFO     [ML_pipeline.py:548] [RF_without_PS_PerUserscaled] Fitting GridSearchCV.
INFO     [ML_pipeline.py:94] [PerUserForwardChainingCV] Unique users found: 143


KeyboardInterrupt: 

In [None]:
import pandas as pd

def format_results_as_dataframe(results_timebased, holdout_results):
    """
    Formats the results into a structured DataFrame with columns:
    - pipeline_name
    - best_cv_score
    - r2, mae, rmse (inner test scores)
    - holdout_r2, holdout_mae, holdout_rmse (holdout test scores)
    - hyperparameters (only relevant hyperparameters per model)

    Parameters:
    ----------
    results_timebased : list
        List of dictionaries containing results from ML pipelines.
    holdout_results : list
        List of dictionaries containing holdout evaluation results.

    Returns:
    -------
    pd.DataFrame
        A DataFrame displaying structured results.
    """

    formatted_results = []

    # Create a mapping of pipeline_name -> holdout_scores for easy lookup
    holdout_scores_map = {res["pipeline_name"]: res["holdout_scores"] for res in holdout_results}

    for result in results_timebased:
        pipeline_name = result["pipeline_name"]
        best_cv_score = result["best_cv_score"]
        
        # Extract inner test scores
        inner_test_scores = result["inner_test_scores"]
        r2 = inner_test_scores.get("r2", None)
        mae = inner_test_scores.get("mae", None)
        rmse = inner_test_scores.get("rmse", None)
        
        # Extract holdout scores if available
        holdout_scores = holdout_scores_map.get(pipeline_name, {})
        holdout_r2 = holdout_scores.get("r2", None)
        holdout_mae = holdout_scores.get("mae", None)
        holdout_rmse = holdout_scores.get("rmse", None)

        # Extract relevant hyperparameters dynamically
        best_estimator = result["best_estimator"]
        hyperparameters = {}

        if hasattr(best_estimator, 'get_params'):
            params = best_estimator.get_params()
            
            # Select relevant hyperparameters based on known model prefixes
            param_keys = [
                "model_LR__fit_intercept",
                "model_LRPS__fit_intercept",
                "model_TTR__regressor__n_estimators",
                "model_TTR__regressor__max_depth",
                "model_TTR__regressor__min_samples_split",
                "model_TTR__regressor__max_features",
                "model_MERF__regressor__max_iterations",
                "model_MERF__regressor__rf__n_estimators",
                "model_TTR__regressor__hidden_layer_sizes",
                "model_TTR__regressor__alpha"
            ]

            # Extract only relevant hyperparameters for the current model
            for key in param_keys:
                if key in params:
                    hyperparameters[key] = params[key]

        formatted_results.append({
            "pipeline_name": pipeline_name,
            "best_cv_score": best_cv_score,
            "r2": r2,
            "mae": mae,
            "rmse": rmse,
            "holdout_r2": holdout_r2,
            "holdout_mae": holdout_mae,
            "holdout_rmse": holdout_rmse,
            "hyperparameters": hyperparameters  # Store extracted hyperparameters
        })

    df_results = pd.DataFrame(formatted_results)
    return df_results


In [None]:
df_results = format_results_as_dataframe(results_timebased, results_holdout)


In [None]:
df_results