# Paper: Towards JITAI -



In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
model_path = os.path.abspath(os.path.join(notebook_dir, '..', 'model_pipeline'))

sys.path.append(parent_dir)
sys.path.append(src_path)
sys.path.append(model_path)

import glob
import pickle
from IPython.display import Markdown
from server_config import datapath, preprocessed_path, preprocessed_path_freezed, redcap_path

import pandas as pd
import numpy as np
import datetime as dt
from scipy.stats import entropy

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

import ML_config
import ML_pipeline
import run_ML_pipeline

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


In [2]:
#backup_path = preprocessed_path + "backup_data_passive.feather"
#df_backup = pd.read_feather(backup_path)

with open(preprocessed_path_freezed + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path_freezed + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path_freezed + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

with open(preprocessed_path_freezed + '/redcap_data.pkl', 'rb') as file:
    df_redcap = pickle.load(file)

with open(preprocessed_path_freezed + '/map_ema_passive.pkl', 'rb') as file:
    df_ema_passive = pickle.load(file)

### Configurations

In [3]:
# EMA
assessment_phase = [0] #1,2
min_num_daily = 4
min_days_data = 7

### 3. Compare Included vs. not Included Participantants

In [4]:
df_ema_content_cust = df_ema_content.customer.unique().tolist()

In [5]:
df_redcap_original = df_redcap.dropna(subset = ["age", "customer"] )
df_redcap_original = df_redcap_original[df_redcap_original.customer.isin(df_ema_content_cust)]
df_redcap_original = df_redcap_original.drop_duplicates(subset="customer")


In [6]:
# Create a set of included customer IDs
included_customers = set(df_ema_passive['customer'])

# Add a new column to df_redcap_original indicating inclusion
df_redcap_original['Included'] = df_redcap_original['customer'].isin(included_customers)

# Define the two groups
df_redcap_original['Group'] = df_redcap_original['Included'].map({True: 'Included', False: 'Not Included'})

# Verify the counts
print(f"Subjects included in the analysis (n={df_redcap_original['Group'].value_counts().get('Included', 0)})")
print(f"Subjects not included in the analysis (n={df_redcap_original['Group'].value_counts().get('Not Included', 0)})")


Subjects included in the analysis (n=176)
Subjects not included in the analysis (n=125)


In [7]:
from tableone import TableOne
# Define your variables
# Replace the variable names with those present in your DataFrame

# Demographic variables
age = 'age'  # Continuous
employable = 'employability_description_simple'  # Categorical
smartphone_type = 'ema_smartphone_description'  # Categorical
psychotropic_med = 'psychotropic_description'
diagnosis = 'scid_cv_description'
previous_treatment = 'prior_treatment_description_simple'
somatic = 'somatic_description'



# List of all variables to include in the table
columns = [age, employable, smartphone_type, previous_treatment, psychotropic_med, diagnosis, somatic]

# Define categorical variables
categorical = [employable, smartphone_type, previous_treatment, psychotropic_med, diagnosis, somatic]

# Define grouping variable
group_var = 'Included'


In [8]:
# Create the TableOne object
table1 = TableOne(
    df_redcap_original,
    columns=columns,
    categorical=categorical,
    groupby=group_var,
    pval=True,
    nonnormal=[],  # Add variables that are non-normally distributed if any
    missing=False  # Whether to include missing data
)

# Print the table
print(table1.tabulate(tablefmt="fancy_grid"))


╒═══════════════════════════════════════════╤═══════════════════════════════════╤═════════════╤═════════════╤═════════════╤═══════════╕
│                                           │                                   │ Overall     │ False       │ True        │ P-Value   │
╞═══════════════════════════════════════════╪═══════════════════════════════════╪═════════════╪═════════════╪═════════════╪═══════════╡
│ n                                         │                                   │ 301         │ 125         │ 176         │           │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ age, mean (SD)                            │                                   │ 33.1 (10.9) │ 33.3 (11.2) │ 32.9 (10.6) │ 0.798     │
├───────────────────────────────────────────┼───────────────────────────────────┼─────────────┼─────────────┼─────────────┼───────────┤
│ employability_description_simple, n (%)   │ no

## Manual Missing data handling

In [9]:
# also impute activity features 

#### GPS

In [10]:
# Create a mask for rows where missing_GPS equals 'Steps<=625'
mask = df_ema_passive['missing_GPS'] == 'Steps<=625'

# For these rows, set the selected columns to 0
cols_set_zero = ['n_GPS', 'total_distance_km', 'time_in_transition_minutes']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

# For these rows, set the selected columns to 120
cols_set_120 = ['time_stationary_minutes']
for col in cols_set_120:
    df_ema_passive.loc[mask, col] = 120

mask = df_ema_passive['missing_GPS_home'] == 'Steps<=625'

# For these rows, set the selected columns to 120
cols_set_120 = ['at_home_minute']
for col in cols_set_120:
    df_ema_passive.loc[mask, col] = 120


#### Steps

In [11]:
# Create a mask for rows where missing_GPS equals 'Steps>625'
mask = df_ema_passive['missing_steps'] == 'step_zero'

# For these rows, set the selected columns to 0
cols_set_zero = ['n_steps']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

#### Physical Activity

In [12]:
# Create a mask for rows where missing_GPS equals 'Steps>625'
mask = df_ema_passive['missing_pa'] == 'pa_zero'

# For these rows, set the selected columns to 0
cols_set_zero = ['activity_102_minutes', 'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes', 'activity_106_minutes', 
                 'activity_107_minutes']
for col in cols_set_zero:
    df_ema_passive.loc[mask, col] = 0

### Feature Encoding

- prior treatment: ordinal encoding
- age: min-max scaling
- somatic, employability, psychotropic: 

In [13]:
# Define which columns are which
binary_features = ['somatic_description', 'psychotropic_description', 'employability_description_simple', 'smartphone_type', 'weekend']
categorical_features = ['weekday', 'prior_treatment_description_simple', 'quest_create_hour', 'season', 'time_of_day']
numeric_features = ['age','hr_mean', 'hr_min', 'hr_max', 'hr_std', 'hr_zone_resting', 'hr_zone_moderate','hr_zone_vigorous', 'n_steps', 
       'n_GPS', 'total_distance_km', 'at_home_minute',
       'time_in_transition_minutes', 'time_stationary_minutes',
       'prop_time_moving', 'prop_time_stationary', 'activity_102_minutes',
       'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes',
       'activity_106_minutes', 'activity_107_minutes',
       'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours'] 

person_static_features = ['customer', 'age', 'somatic_description', 'psychotropic_description', 'employability_description_simple', 'smartphone_type', 'weekend']


In [14]:
df_ema_passive[numeric_features] = df_ema_passive[numeric_features].replace(-1, np.nan)

In [15]:
from scipy.stats import skewtest,normaltest

skewed_features = []
for col in numeric_features:
    valid_data = df_ema_passive[col].dropna()

    # skewtest requires sample size > 7 for reliable results
    stat, p_val = skewtest(valid_data)
    print(f"Feature: {col}")
    print(f"  Skewtest statistic={stat:.3f}, p-value={p_val:.3f}")

    if p_val < 0.05:
        skewed_features.append(col)  # append this feature as skewed


print("Skewed features:", skewed_features)

Feature: age
  Skewtest statistic=46.061, p-value=0.000
Feature: hr_mean
  Skewtest statistic=33.683, p-value=0.000
Feature: hr_min
  Skewtest statistic=36.286, p-value=0.000
Feature: hr_max
  Skewtest statistic=43.285, p-value=0.000
Feature: hr_std
  Skewtest statistic=48.983, p-value=0.000
Feature: hr_zone_resting
  Skewtest statistic=73.116, p-value=0.000
Feature: hr_zone_moderate
  Skewtest statistic=76.633, p-value=0.000
Feature: hr_zone_vigorous
  Skewtest statistic=120.213, p-value=0.000
Feature: n_steps
  Skewtest statistic=68.289, p-value=0.000
Feature: n_GPS
  Skewtest statistic=99.497, p-value=0.000
Feature: total_distance_km
  Skewtest statistic=117.025, p-value=0.000
Feature: at_home_minute
  Skewtest statistic=-31.149, p-value=0.000
Feature: time_in_transition_minutes
  Skewtest statistic=122.659, p-value=0.000
Feature: time_stationary_minutes
  Skewtest statistic=155.178, p-value=0.000
Feature: prop_time_moving
  Skewtest statistic=93.895, p-value=0.000
Feature: prop_tim

### Model Pipeline

In [16]:
df_ema_pipeline = df_ema_passive[['customer', 'unique_day_id', 
       'quest_create_hour', 'weekday', 'weekend', 'season', 'time_of_day',
       'n_quest', 'mean_na', 'sensor_block_end', 'age', 
       'ema_smartphone', 'psychotropic', 'somatic_problems','employability_description_simple',
       'prior_treatment_description_simple',
       'hr_mean', 'hr_min', 'hr_max', 'hr_std', 
       'hr_zone_resting', 'hr_zone_moderate',
       'hr_zone_vigorous', 'n_steps',  'n_GPS', 'total_distance_km', 'at_home_minute',
       'time_in_transition_minutes', 'time_stationary_minutes',
       'activity_102_minutes',
       'activity_103_minutes', 'activity_104_minutes', 'activity_105_minutes',
       'activity_106_minutes', 'activity_107_minutes',
       'apparent_temperature_mean', 'sunshine_duration', 'precipitation_hours',
      ]]

In [18]:
df_ema_passive.columns[["somatic_problems", ]]

Index(['customer', 'quest_create_day', 'unique_day_id', 'assess',
       'quest_create_hour', 'weekday', 'weekend', 'season', 'time_of_day',
       'n_quest', 'mean_na', 'sensor_block_end', 'sensor_block_start',
       'unique_blocks', 'for_id', 'ema_watch',
       'basic_documentation_sheet_timestamp', 'age', 'gender',
       'scid_cv_prim_cat', 'marital_status', 'partnership', 'graduation',
       'profession', 'ema_start_date', 'years_of_education', 'employability',
       'ses', 'ema_smartphone', 'ema_sleep', 'prior_treatment',
       'ema_special_event', 'psychotropic', 'somatic_problems',
       'gender_description', 'scid_cv_description',
       'marital_status_description', 'employability_description',
       'employability_description_simple',
       'prior_treatment_description_simple', 'graduation_description',
       'profession_description', 'prior_treatment_description',
       'ema_smartphone_description', 'ema_special_event_description',
       'age_description', 'somat

In [17]:
from ML_pipeline import MLpipeline
from ML_config import Config

my_config = Config()
pipeline = MLpipeline(my_config)

pipeline.set_data(df_ema_pipeline)
pipeline.outer_user_split()
pipeline.inner_time_split()

# (1) Time-based runs
results_timebased = pipeline.run(my_config.ANALYSIS["neg_affect_regression"]["MODEL_PIPEGRIDS"])

# Inspect or store results_timebased
print("Time-based results:")
for r in results_timebased:
    print(r)

# (2) User-based holdout evaluation
#results_holdout = pipeline.evaluate_holdout_all(results_timebased)

#print("Holdout results:")
#for r in results_holdout:
#    print(r)


[set_data] DataFrame with 13266 rows loaded in pipeline.
[outer_user_split] Held out 17/176 users; holdout size: 1141 rows.
[inner_time_split] Inner train size: 9637, test size: 2488.

[run] Starting pipeline: LR_without_PS


KeyError: "['somatic', 'smartphone_type'] not in index"