# Paper: Towards JITAI -



In [1]:
import os
import sys
import regex as re
# Make the project's sibling packages importable from this notebook.
# Assumes the kernel's working directory is the notebooks directory, so that
# '..' is the project root and '../src', '../model_pipeline' hold the code.
notebook_dir = os.getcwd()  # current working directory
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
src_path = os.path.join(parent_dir, 'src')
model_path = os.path.join(parent_dir, 'model_pipeline')

# Append each directory only once, so re-running this cell does not keep
# stacking duplicate entries onto sys.path.
for extra_path in (parent_dir, src_path, model_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import glob
import pickle
from IPython.display import Markdown
from server_config import datapath, preprocessed_path, preprocessed_path_freezed, redcap_path

import pandas as pd
import numpy as np
import datetime as dt
from scipy.stats import entropy

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

import ML_config
import ML_pipeline
import run_ML_pipeline

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


In [2]:
#backup_path = preprocessed_path + "backup_data_passive.feather"
#df_backup = pd.read_feather(backup_path)

def _load_frozen_pickle(filename):
    """Unpickle one file from the frozen preprocessing directory.

    Parameters
    ----------
    filename : str
        File name relative to ``preprocessed_path_freezed``.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were written by this project's own preprocessing step.
    with open(os.path.join(preprocessed_path_freezed, filename), 'rb') as file:
        return pickle.load(file)


# One frozen input per file; the variable names are the ones the rest of the
# notebook expects.
df_ema_framework = _load_frozen_pickle('ema_data.pkl')
df_ema_content = _load_frozen_pickle('ema_content.pkl')
df_monitoring = _load_frozen_pickle('monitoring_data.pkl')
df_redcap = _load_frozen_pickle('redcap_data.pkl')
df_ema_passive = _load_frozen_pickle('map_ema_passive.pkl')

### Configurations

In [3]:
# EMA settings.
# NOTE(review): none of these three values is referenced in the visible part
# of this notebook; they look like inclusion thresholds (minimum days of data,
# minimum daily questionnaires) — confirm where they are consumed.
min_days_data = 7
min_num_daily = 4
assessment_phase = [0]  # the original note lists 1 and 2 as the other values

### 3. Compare Included vs. Not Included Participants

In [4]:
# Distinct customer IDs that appear in the EMA content data, as a plain list.
df_ema_content_cust = df_ema_content['customer'].unique().tolist()

In [5]:
# Baseline (REDCap) rows for participants who have EMA content:
#   1. drop rows without an age or a customer ID,
#   2. keep only customers present in the EMA content data,
#   3. keep the first row per customer.
# The trailing .copy() yields an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning or touch df_redcap.
df_redcap_original = (
    df_redcap
    .dropna(subset=["age", "customer"])
    .loc[lambda frame: frame["customer"].isin(df_ema_content_cust)]
    .drop_duplicates(subset="customer")
    .copy()
)


In [6]:
# Customers that made it into the mapped EMA/passive dataset count as included.
included_customers = {*df_ema_passive['customer']}

# Boolean inclusion flag plus a human-readable label for each participant.
group_labels = {True: 'Included', False: 'Not Included'}
df_redcap_original['Included'] = df_redcap_original['customer'].isin(included_customers)
df_redcap_original['Group'] = df_redcap_original['Included'].map(group_labels)

# Report the size of both groups (0 if a group is empty).
group_sizes = df_redcap_original['Group'].value_counts()
print(f"Subjects included in the analysis (n={group_sizes.get('Included', 0)})")
print(f"Subjects not included in the analysis (n={group_sizes.get('Not Included', 0)})")


Subjects included in the analysis (n=159)
Subjects not included in the analysis (n=142)


In [7]:
from tableone import TableOne
# Column names of df_redcap_original that go into the sample-overview table.

# Continuous variable
age = 'age'

# Categorical variables
employable = 'employability_description_simple'
smartphone_type = 'ema_smartphone_description'
previous_treatment = 'prior_treatment_description_simple'
psychotropic_med = 'psychotropic_description'
diagnosis = 'scid_cv_description'
somatic = 'somatic_description'

# Everything except age is treated as categorical.
categorical = [
    employable,
    smartphone_type,
    previous_treatment,
    psychotropic_med,
    diagnosis,
    somatic,
]

# Table rows: age first, then the categorical variables in the order above.
columns = [age, *categorical]

# Column the table is split by (boolean inclusion flag).
group_var = 'Included'


In [8]:
# Sample overview split by inclusion status, with p-values for group differences.
tableone_options = dict(
    columns=columns,
    categorical=categorical,
    groupby=group_var,
    pval=True,
    nonnormal=[],   # add variables here that are not normally distributed
    missing=False,  # whether to include missing data
)
table1 = TableOne(df_redcap_original, **tableone_options)

# Show the table in the notebook and write it next to the notebook as CSV.
print(table1.tabulate(tablefmt="fancy_grid"))
table1.to_csv('sample_overview.csv')


╒═══════════════════════════════════════════╤═══════════════════════════════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                                           │                                   │ Overall     │ False       │ True        │ P-Value   │
╞═══════════════════════════════════════════╪═══════════════════════════════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                                         │                                   │ 301         │ 142         │ 159         │           │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ age, mean (SD)                            │                                   │ 33.1 (10.9) │ 33.8 (11.6) │ 32.5 (10.2) │ 0.294     │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ employability_description_simple, n (%)   │ no

## Manual Missing data handling

In [9]:
# also impute activity features 

#### GPS

In [10]:
# Rows whose 'missing_GPS' flag is 'Steps<=625': GPS counts, distance and
# transition time are set to 0 and stationary time to 120.
# NOTE(review): 120 presumably is the full length of the sensor window in
# minutes — confirm against the preprocessing code.
low_step_rows = df_ema_passive['missing_GPS'].eq('Steps<=625')
df_ema_passive.loc[low_step_rows, ['n_GPS', 'total_distance_km', 'time_in_transition_minutes']] = 0
df_ema_passive.loc[low_step_rows, 'time_stationary_minutes'] = 120

# Same rule for the home-location flag: time at home is set to 120.
low_step_home_rows = df_ema_passive['missing_GPS_home'].eq('Steps<=625')
df_ema_passive.loc[low_step_home_rows, 'at_home_minute'] = 120


#### Steps

In [11]:
# Rows whose 'missing_steps' flag is 'step_zero' get an explicit step count of 0.
# NOTE(review): presumably the flag marks windows judged to contain no steps
# rather than missing sensor data — confirm in the preprocessing code.
mask = df_ema_passive['missing_steps'] == 'step_zero'
df_ema_passive.loc[mask, 'n_steps'] = 0

#### Physical Activity

In [12]:
# Rows whose 'missing_pa' flag is 'pa_zero' get 0 minutes for every activity type.
mask = df_ema_passive['missing_pa'] == 'pa_zero'

# activity_102_minutes ... activity_107_minutes
activity_minute_cols = [f'activity_{code}_minutes' for code in range(102, 108)]
df_ema_passive.loc[mask, activity_minute_cols] = 0

### Feature Encoding

- prior treatment: ordinal encoding
- age: min-max scaling
- somatic, employability, psychotropic: binary features (see `binary_features` below)

In [13]:
# Feature groups by type.
# NOTE(review): 'smartphone_type', 'somatic_description' and
# 'psychotropic_description' are not among the columns selected into
# df_ema_pipeline (which uses 'ema_smartphone', 'somatic_problems',
# 'psychotropic') — confirm which names are the intended ones.
binary_features = [
    'somatic_description',
    'psychotropic_description',
    'employability_description_simple',
    'smartphone_type',
    'weekend',
]

categorical_features = [
    'weekday',
    'prior_treatment_description_simple',
    'quest_create_hour',
    'season',
    'time_of_day',
]

numeric_features = [
    'age',
    # heart rate
    'hr_mean', 'hr_min', 'hr_max', 'hr_std',
    'hr_zone_resting', 'hr_zone_moderate', 'hr_zone_vigorous',
    # steps
    'n_steps',
    # GPS / mobility
    'n_GPS', 'total_distance_km', 'at_home_minute',
    'time_in_transition_minutes', 'time_stationary_minutes',
    # activity types
    'activity_102_minutes', 'activity_103_minutes', 'activity_104_minutes',
    'activity_105_minutes', 'activity_106_minutes', 'activity_107_minutes',
    # weather
    'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours',
]

# Participant ID and age followed by all binary features.
# NOTE(review): this includes 'weekend', which varies within a person — verify.
person_static_features = ['customer', 'age', *binary_features]


In [14]:
# Turn every -1 in the numeric feature columns into NaN.
# NOTE(review): this also hits columns where -1 could be a genuine value
# (e.g. 'apparent_temperature_mean') — confirm -1 is a sentinel everywhere.
df_ema_passive[numeric_features] = df_ema_passive[numeric_features].replace(
    to_replace=-1, value=np.nan
)

In [15]:
from scipy.stats import skewtest,normaltest

# Collect the numeric features whose distribution is significantly skewed
# according to scipy's skewness test (p < SKEW_ALPHA), ignoring missing values.
MIN_SKEWTEST_SAMPLES = 8  # skewtest raises ValueError for fewer observations
SKEW_ALPHA = 0.05

skewed_features = []
for col in numeric_features:
    valid_data = df_ema_passive[col].dropna()

    # Too few observations to run the test: leave the feature unflagged
    # instead of aborting the whole cell.
    if len(valid_data) < MIN_SKEWTEST_SAMPLES:
        continue

    _, p_val = skewtest(valid_data)
    if p_val < SKEW_ALPHA:
        skewed_features.append(col)  # append this feature as skewed


### Model Pipeline

In [16]:
# Add a constant column of ones. The MERF run output further down lists
# 'intercept' among the model columns and prints an all-ones Z matrix, so this
# presumably serves as the random-intercept design column — TODO confirm in
# ML_pipeline.
df_ema_passive["intercept"] = 1

In [17]:
# Columns handed to the ML pipeline, in the order the pipeline frame should have.
pipeline_columns = [
    # identifiers and questionnaire context
    'customer', 'unique_day_id',
    'quest_create_hour', 'weekday', 'weekend', 'season', 'time_of_day',
    'n_quest', 'mean_na', 'sensor_block_end',
    # person-level characteristics
    'age', 'ema_smartphone', 'psychotropic', 'somatic_problems',
    'employability_description_simple', 'prior_treatment_description_simple',
    # heart rate
    'hr_mean', 'hr_min', 'hr_max', 'hr_std',
    'hr_zone_resting', 'hr_zone_moderate', 'hr_zone_vigorous',
    # steps and GPS / mobility
    'n_steps', 'n_GPS', 'total_distance_km', 'at_home_minute',
    'time_in_transition_minutes', 'time_stationary_minutes',
    # activity types
    'activity_102_minutes', 'activity_103_minutes', 'activity_104_minutes',
    'activity_105_minutes', 'activity_106_minutes', 'activity_107_minutes',
    # weather
    'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours',
    # constant column of ones
    'intercept',
]

# List-based selection returns a new frame containing only these columns.
df_ema_pipeline = df_ema_passive[pipeline_columns]

In [18]:
# Cast the participant ID to pandas' 'string' dtype.
# NOTE(review): df_ema_pipeline was already sliced from df_ema_passive in the
# preceding cell (a list-based column selection returns a new DataFrame), so
# this cast does not reach df_ema_pipeline, the frame passed to the pipeline.
# Confirm whether the cast is meant to happen before that slice.
df_ema_passive['customer'] = df_ema_passive['customer'].astype('string')

In [19]:
# Sanity check of the pipeline frame: missing count and basic statistics for
# every numeric feature, one line per feature.
for feature in numeric_features:
    values = df_ema_pipeline[feature]
    summary = (
        f"{feature}: NaNs={values.isna().sum()}, min={values.min()}, max={values.max()}, "
        f"mean={values.mean()}, std={values.std()}"
    )
    print(summary)


age: NaNs=0, min=18.0, max=66.0, mean=32.19508064516129, std=9.968403587324959
hr_mean: NaNs=362, min=40.583333333333336, max=176.0, mean=76.239656761943, std=14.409070791182929
hr_min: NaNs=362, min=35.0, max=176.0, mean=61.32472171457053, std=11.615648572159467
hr_max: NaNs=362, min=42.0, max=211.0, mean=96.86168798803789, std=27.51912747308003
hr_std: NaNs=362, min=0.0, max=56.0, mean=10.79009904911434, std=7.947545603716206
hr_zone_resting: NaNs=362, min=0.0, max=32.400000000000006, mean=1.355257697845711, std=2.2051439499461534
hr_zone_moderate: NaNs=362, min=0.0, max=114.00951666666391, mean=5.5548609846596895, std=3.4002753571366977
hr_zone_vigorous: NaNs=362, min=0.0, max=78.6040666666656, mean=1.030957596776873, std=3.6330888532741623
n_steps: NaNs=198, min=0.0, max=10078.0, mean=842.7129159154237, std=1104.26053441668
n_GPS: NaNs=892, min=0.0, max=1897.0, mean=45.141640597844976, std=113.94481470554737
total_distance_km: NaNs=892, min=0.0, max=180.0840241629681, mean=2.444647

In [20]:
from ML_pipeline import MLpipeline
from ML_config import Config

# Build the pipeline from its configuration and hand it the modelling frame.
my_config = Config()
pipeline = MLpipeline(my_config)
pipeline.set_data(df_ema_pipeline)

# Create the splits before running: first the user-level holdout, then the
# time-based split within the remaining users (per the log lines these calls emit).
pipeline.outer_user_split()
pipeline.inner_time_split()

# (1) Time-based runs over all model grids of the negative-affect regression.
# NOTE: this cell is long-running (grid search over several models).
model_grids = my_config.ANALYSIS["neg_affect_regression"]["MODEL_PIPEGRIDS"]
results_timebased = pipeline.run(model_grids)

# Inspect or store results_timebased
print("Time-based results:")
for result in results_timebased:
    print(result)

# (2) User-based holdout evaluation (currently disabled)
#results_holdout = pipeline.evaluate_holdout_all(results_timebased)

#print("Holdout results:")
#for r in results_holdout:
#    print(r)


INFO     [ML_pipeline.py:183] [ML_Pipeline] Configuration Loaded:
INFO     [ML_pipeline.py:184]   Impute Strategy: knn
INFO     [ML_pipeline.py:185]   Scaler Strategy: minmax
INFO     [ML_pipeline.py:186]   Holdout Ratio: 0.1
INFO     [ML_pipeline.py:187]   Time Ratio: 0.8
INFO     [ML_pipeline.py:188]   Number of Jobs: 1
INFO     [ML_pipeline.py:189]   Number of Folds (Inner CV): 5
INFO     [ML_pipeline.py:190]   CV Method: forwardchaining (expected 'forwardchaining' here)
INFO     [ML_pipeline.py:198] [set_data] DataFrame with 12400 rows loaded in pipeline.
INFO     [ML_pipeline.py:217] [outer_user_split] Held out 15/159 users; holdout size: 1178 rows.
INFO     [ML_pipeline.py:240] [inner_time_split] Inner train size: 8923, test size: 2299.
INFO     [ML_pipeline.py:387] [run] Starting the ML pipeline run method.
INFO     [ML_pipeline.py:411] 
[run] Starting pipeline: LR_without_PS
INFO     [ML_pipeline.py:435] [LR_without_PS] Fitting GridSearchCV.
INFO     [ML_pipeline.py:97] [PerUse

['hr_zone_vigorous', 'activity_102_minutes', 'sunshine_duration', 'hr_zone_resting', 'time_stationary_minutes', 'hr_zone_moderate', 'activity_105_minutes', 'hr_min', 'total_distance_km', 'hr_max', 'hr_mean', 'activity_103_minutes', 'activity_107_minutes', 'weekend', 'quest_create_hour', 'time_of_day', 'hr_std', 'activity_106_minutes', 'n_steps', 'weekday', 'activity_104_minutes', 'apparent_temperature_mean', 'season', 'time_in_transition_minutes', 'at_home_minute', 'precipitation_hours', 'n_GPS']
(8923, 27)


INFO     [ML_pipeline.py:437] [LR_without_PS] GridSearchCV completed.
INFO     [ML_pipeline.py:438] [LR_without_PS] Best Parameters: {'model_LR__regressor__fit_intercept': True}
INFO     [ML_pipeline.py:439] [LR_without_PS] Best CV Score (mae): -0.882
INFO     [ML_pipeline.py:460] [LR_without_PS] Inner Test Scores: {'r2': 0.002324765855860167, 'mae': 0.9358885630228598, 'rmse': np.float64(1.1528625447818188)}
INFO     [ML_pipeline.py:411] 
[run] Starting pipeline: LR_with_PS
INFO     [ML_pipeline.py:435] [LR_with_PS] Fitting GridSearchCV.
INFO     [ML_pipeline.py:97] [PerUserForwardChainingCV] Unique users found: 144


['age', 'hr_zone_vigorous', 'prior_treatment_description_simple', 'activity_102_minutes', 'sunshine_duration', 'psychotropic', 'hr_zone_resting', 'time_stationary_minutes', 'hr_zone_moderate', 'activity_105_minutes', 'hr_min', 'total_distance_km', 'somatic_problems', 'hr_max', 'hr_mean', 'activity_103_minutes', 'activity_107_minutes', 'weekend', 'quest_create_hour', 'time_of_day', 'hr_std', 'activity_106_minutes', 'n_steps', 'ema_smartphone', 'weekday', 'activity_104_minutes', 'apparent_temperature_mean', 'season', 'employability_description_simple', 'time_in_transition_minutes', 'at_home_minute', 'precipitation_hours', 'n_GPS']
(8923, 33)


INFO     [ML_pipeline.py:437] [LR_with_PS] GridSearchCV completed.
INFO     [ML_pipeline.py:438] [LR_with_PS] Best Parameters: {'model_LRPS__regressor__fit_intercept': True}
INFO     [ML_pipeline.py:439] [LR_with_PS] Best CV Score (mae): -0.868
INFO     [ML_pipeline.py:460] [LR_with_PS] Inner Test Scores: {'r2': 0.033718345674271344, 'mae': 0.9251133567724216, 'rmse': np.float64(1.1345791568272114)}
INFO     [ML_pipeline.py:411] 
[run] Starting pipeline: MERF_without_PS
INFO     [ML_pipeline.py:435] [MERF_without_PS] Fitting GridSearchCV.
INFO     [ML_pipeline.py:97] [PerUserForwardChainingCV] Unique users found: 144


['hr_zone_vigorous', 'activity_102_minutes', 'sunshine_duration', 'hr_zone_resting', 'time_stationary_minutes', 'hr_zone_moderate', 'activity_105_minutes', 'hr_min', 'total_distance_km', 'hr_max', 'hr_mean', 'activity_103_minutes', 'activity_107_minutes', 'weekend', 'quest_create_hour', 'time_of_day', 'hr_std', 'activity_106_minutes', 'n_steps', 'weekday', 'activity_104_minutes', 'apparent_temperature_mean', 'season', 'time_in_transition_minutes', 'at_home_minute', 'precipitation_hours', 'n_GPS', 'customer', 'intercept']
(8923, 29)
✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -2701.3509117818007 at iteration 1.
INFO     [merf.py:307] Training GLL is -2966.753284217057 at iteration 2.
INFO     [merf.py:307] Training GLL is -2892.503871274043 at iteration 3.
INFO     [merf.py:307] Training GLL is -2906.2166369535707 at iteration 4.
INFO     [merf.py:321] Gll -2906.2166369535707 less than threshold 0.0047407942356487995, stopping early ...


✅ Final X_fixed shape for prediction: (1815, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -5426.910007091989 at iteration 1.
INFO     [merf.py:307] Training GLL is -5725.7520432992105 at iteration 2.
INFO     [merf.py:307] Training GLL is -5721.6238390002645 at iteration 3.
INFO     [merf.py:321] Gll -5721.6238390002645 less than threshold 0.0007209890103042792, stopping early ...


✅ Final X_fixed shape for prediction: (1784, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -8131.262738835824 at iteration 1.
INFO     [merf.py:307] Training GLL is -8472.587260940996 at iteration 2.
INFO     [merf.py:307] Training GLL is -8452.220132869617 at iteration 3.
INFO     [merf.py:321] Gll -8452.220132869617 less than threshold 0.0024038853120194104, stopping early ...


✅ Final X_fixed shape for prediction: (1749, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -10567.445599847599 at iteration 1.
INFO     [merf.py:307] Training GLL is -10925.727752306058 at iteration 2.
INFO     [merf.py:307] Training GLL is -11067.180513345374 at iteration 3.
INFO     [merf.py:307] Training GLL is -11146.203602550087 at iteration 4.
INFO     [merf.py:321] Gll -11146.203602550087 less than threshold 0.00714030905246579, stopping early ...


✅ Final X_fixed shape for prediction: (1729, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -2784.8958789075477 at iteration 1.
INFO     [merf.py:307] Training GLL is -3050.670089027264 at iteration 2.
INFO     [merf.py:307] Training GLL is -3035.426996785209 at iteration 3.
INFO     [merf.py:321] Gll -3035.426996785209 less than threshold 0.004996637393496491, stopping early ...


✅ Final X_fixed shape for prediction: (1815, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -5654.380541009228 at iteration 1.
INFO     [merf.py:307] Training GLL is -5907.358003158547 at iteration 2.
INFO     [merf.py:307] Training GLL is -5890.564325311312 at iteration 3.
INFO     [merf.py:321] Gll -5890.564325311312 less than threshold 0.002842840714623298, stopping early ...


✅ Final X_fixed shape for prediction: (1784, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


INFO     [merf.py:307] Training GLL is -8469.40829289856 at iteration 1.
INFO     [merf.py:307] Training GLL is -8694.039575041326 at iteration 2.
INFO     [merf.py:307] Training GLL is -8700.20862623776 at iteration 3.
INFO     [merf.py:321] Gll -8700.20862623776 less than threshold 0.0007095724770040634, stopping early ...


✅ Final X_fixed shape for prediction: (1749, 56)
🔍 Z dtype before MERF.predict(): float64
🔍 Sample Z values before MERF.predict():
[[1.]
 [1.]
 [1.]
 [1.]
 [1.]]




✅ Final Z dtype: float64
✅ Final y dtype: float64


KeyboardInterrupt: 