# Naturalistic EMA validation

Applying the findings from the 4-state correlation work (EMA x UPDRS) onto real-life EMA data.

Goals:
- analyse real-life variation of EMA values
    - inter-individual variation
    - intra-individual variation, daily fluctuations, differences between days

## 0. Import packages

- document versions for reproducibility

In [None]:
# import packages
import datetime as dt
import pandas as pd
import numpy as np
import os
import sys
import csv
import json
import importlib
from itertools import product, compress
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from scipy.signal import welch

In [None]:
# Print interpreter and core-library versions so runs can be reproduced.
print('Python sys', sys.version)
print('pandas', pd.__version__)
print('numpy', np.__version__)
# Disabled version prints; the names they reference are not imported in the
# import cell above.
# print('mne_bids', mne_bids.__version__)
# print('mne', mne.__version__)
# print('sci-py', scipy.__version__)
# print('sci-kit learn', sk.__version__)
# print('matplotlib', plt_version)

# Version log copied from earlier runs, kept as a bare string literal.
"""
Python sys 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
pandas 2.1.1
numpy 1.26.0

from 16.09

Python sys 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
pandas 2.3.2
numpy 2.3.3
"""

Import custom functions

In [None]:
import dbs_home

# from dbs_home repo
from dbs_home.load_raw.main_load_raw import loadSubject 
import dbs_home.utils.helpers as home_helpers
import dbs_home.utils.ema_utils as home_ema_utils
import dbs_home.plot_data.plot_compliance as plot_home_compl
import dbs_home.preprocessing.preparing_ema as home_ema_prep

In [None]:
# from current repo
from utils import load_utils, load_data, prep_data
from plotting import plot_help



## 1. Import Home-Data

Use pre-operative sessions

- use 9-point-converter
- use direct. inverter

Import EMA home data from raw files

In [None]:
# Recording moments that are selected, loaded and plotted in the cells below.
MOMENTS = ['pre-op', 'pre 3MFU', 'post 3MFU']

# Subject ids to skip entirely in the loading loops (example: ['hm25']).
sub_skip = [] # ['hm25',]  # skip full subject
# [subject, session] pairs to skip; the loading loops test `[sub, ses] in ses_skip`.
ses_skip = [['hm20', 'ses03'],]
# ses_skip = [['hm14', 'ses03']]

In [None]:
# Build {moment: {study_id: session}} for every recording moment in MOMENTS.
sessions_include = {moment: {} for moment in MOMENTS}

for rec_moment in MOMENTS:
    # session-selection table for this moment, indexed by study id
    sel_info = home_helpers.select_sessions(target_session=rec_moment)
    sel_info = sel_info.set_index(sel_info['study_id'])

    # rows of (study_id, Session) -> dict
    id_session_rows = sel_info[['study_id', 'Session']].values
    sel_sessions = {study_id: session for study_id, session in id_session_rows}

    sessions_include[rec_moment].update(sel_sessions)


In [None]:
# Show which moments were collected and the subject -> session mapping per moment.
print(sessions_include.keys())
print(sessions_include)

Load data

dev for EMA


In [None]:
# Reload the helper modules so edits made during development are picked up.
for module in (home_ema_utils, load_data, prep_data, home_ema_prep):
    importlib.reload(module)

# load all combined data

# SUBS_INCL = ['hm14']

# data[moment][sub] -> prepared EMA dataframe
data = {moment: {} for moment in MOMENTS}

for rec_moment, sub_sess in sessions_include.items():
    # rec_moment contains 'pre-op', or 'pre 3MFU', 'post 3MFU', etc
    for sub, ses in sub_sess.items():

        # skip excluded subjects and excluded subject/session pairs
        if sub in sub_skip or [sub, ses] in ses_skip:
            continue

        # load only the EMA part of the session
        ses_class = loadSubject(sub=sub, ses=ses, incl_EMA=True, incl_ACC=False)

        # load and prepare in one go; negative items are NOT inverted here
        temp_df = home_ema_prep.prepare_ema_df(
            home_ema_utils.load_ema_df(sub_ses_class=ses_class),
            ADD_MEANMOVE=True,
            INVERT_NEG_ITEMS=False,
        )
        ### TODO: CHECK WHY NOT ALL SUBS ARE INVERTED PROPERLY

        data[rec_moment][sub] = temp_df


## 2. Explore naturalistic EMAs

Preprocess EMA

- merge scores
- invert negative-items (higher = clinically better)
- mean-correct EMA
    - test different normalizations:
        - normalize with grand-mean per sub
        - normalize with session mean

In [None]:
importlib.reload(prep_data)

# every subject id that occurs in any moment (np.unique sorts and de-duplicates)
allsubs = np.unique([sub for moment_dfs in data.values() for sub in moment_dfs])

# corr_data[moment][sub] -> mean-corrected EMA dataframe
corr_data = {moment: {} for moment in MOMENTS}

for sub in allsubs:

    # merge the subject's sessions, then mean-correct over the merged frame
    merged_df = home_ema_prep.merge_sub_ema_df(datadict=data, sub=sub)
    subdf = home_ema_prep.mean_correct_ema_df(merged_df)

    # split and place back as one dataframe per moment
    for moment in MOMENTS:
        is_moment = subdf['moment'] == moment
        corr_data[moment][sub] = subdf[is_moment].reset_index(drop=True)



Visualize first EMA results

In [None]:
# Boxplots of selected EMA items: one axis per item, one box per subject,
# boxes grouped along x by recording moment.
PLOT_ITEMS = ['move_mean', 'walking', 'tremor']

# Toggle between mean-corrected values (corr_data) and absolute values (data).
PLOT_CORR = False
if PLOT_CORR: PLOT_DATADICT = corr_data
else: PLOT_DATADICT = data


# three rows, matching the three entries in PLOT_ITEMS
fig, axes = plt.subplots(3, 1, figsize=(9, 6))
# fname is only used by the (currently commented-out) savefig call below
fname = 'motorItems_abs_perSub_perSes_1104'
if PLOT_CORR: fname = fname.replace('abs', 'corr')

x_margin = 2  # horizontal space between two moment groups
bin_w = 0.5  # box width, also the spacing between boxes within a group

fsize=14

# x-offset of each moment's group; later moments are added during the first
# axis pass below. Seeded from `data`, which is built with the same MOMENTS
# keys as `corr_data`.
x_starts = {list(data.keys())[0]: 0}  # first moment starts at 0

subcolors = plot_help.get_sub_colors(PLOT_DATADICT)


for i_ax, col in enumerate(PLOT_ITEMS):

    for i_mom, moment in enumerate(PLOT_DATADICT.keys()):

        # `col` is rebound here, so the suffix is appended only for the first
        # moment of each item; otherwise it would accumulate per moment
        if PLOT_CORR and i_mom == 0: col = f'{col}_corr'

        # loop over sub-dfs within moment and add specific column values
        list_values = [tempdf[col].values for tempdf in PLOT_DATADICT[moment].values()]
        box_subs = list(PLOT_DATADICT[moment].keys())  # subs included in this boxplot
        # sort by sub id
        i_sort = np.argsort(box_subs)
        box_subs = [box_subs[i] for i in i_sort]
        list_values = [list_values[i] for i in i_sort]

        # drop NaN values in lists
        list_values = [[v for v in l if not np.isnan(v)] for l in list_values]

        # plot boxes for one moment
        bp = axes[i_ax].boxplot(list_values, widths=bin_w,
                           positions=x_starts[moment] + bin_w * np.arange(len(list_values)),
                           patch_artist=True,)
        # group offsets are computed once (first axis) and reused for the other axes
        if i_ax == 0:
            if moment != list(PLOT_DATADICT.keys())[-1]:
                x_starts[list(PLOT_DATADICT.keys())[i_mom + 1]] = x_starts[moment] + len(list_values) * bin_w + x_margin

        # Loop over boxes: colour each box by its subject
        for patch, patchsub in zip(bp['boxes'], box_subs):
            patch.set_facecolor(subcolors[patchsub])

    # pretty plot
    axes[i_ax].set_ylabel(f'{col}\n(EMA answer)', size=fsize,)

# pretty axes
for ax in axes:
    # y-range and ticks differ between absolute and mean-corrected values
    if not PLOT_CORR:
        ax.set_ylim(0, 10)
        ax.set_yticks(np.arange(1, 10, 2))
        ax.set_yticklabels(np.arange(1, 10, 2))
    else:
        ax.set_ylim(-5, 5)
        ax.set_yticks(np.arange(-4, 6, 2))
        ax.set_yticklabels(np.arange(-4, 6, 2))

    # one x tick per moment, placed at the first box of that moment's group
    ax.set_xticks(list(x_starts.values()))
    ax.set_xticklabels(list(x_starts.keys()))
    ax.spines[['right', 'top']].set_visible(False)
    ax.tick_params(axis='both', labelsize=fsize, size=fsize,)

    # faint horizontal guide lines
    ax.axhline(0, xmin=0, xmax=1,
               color='gray', alpha=.3, zorder=0,)
    if PLOT_CORR: ylines = [-4, -2, 2, 4,]
    else: ylines = [1, 3, 5, 7, 9]
    for yline in ylines:
        ax.axhline(yline, xmin=0, xmax=1, color='gray', alpha=.15, zorder=0,)

plt.tight_layout()

# plt.savefig(os.path.join(load_utils.get_onedrive_path('figures'),
#              'ema_naturalistic', fname),
#              dpi=300, facecolor='w',)

# NOTE(review): with savefig commented out, the figure is closed here without
# being saved or explicitly shown.
plt.close()

Check completion rates

In [None]:


# Print the mean of the 'Submission' column per subject and moment.
for rec_moment, moment_dfs in data.items():

    for sub, df in moment_dfs.items():

        # coerce in place: non-numeric entries become NaN and are ignored by mean()
        df['Submission'] = pd.to_numeric(df['Submission'], errors='coerce')
        rate = df['Submission'].mean()
        print(f"{sub} completion rate @ {rec_moment}: {rate:.0%}")



Check means and variances for movement items, tremor, and gait items
- split per sub
- split per ses

In [None]:
# NOTE(review): `temp_df` is whatever the last iteration of the EMA loading
# loop left behind; confirm that it exposes an `EMA_reports` attribute.
temp_df.EMA_reports

In [None]:
importlib.reload(home_helpers)

# NOTE(review): this rebinds `data`. The {moment: {sub: dataframe}} dict built
# by the EMA loading cell is replaced by a flat {sub: loaded session} dict, so
# cells that index data[moment][sub] no longer work after this cell has run.
data = {}

# Define pre-operative sessions
# (select_sessions is called without target_session here, i.e. with its default)
sel_info = home_helpers.select_sessions()
sel_info = sel_info.set_index(sel_info['study_id'],)
sel_sessions = {sub: ses for sub, ses in sel_info[['study_id', 'Session']].values}
print(sel_sessions)


# load the EMA part of every selected session
for sub, ses in sel_sessions.items():

    data[sub] = loadSubject(
        sub=sub,
        ses=ses,
        incl_EMA=True,
        incl_ACC=False,
    )



## 3. Explore naturalistic ACC

To do: add a loading option that reads ACC data only for the EMA windows and stores these selected windows separately, so that the full ACC data does not have to be loaded.

In [None]:
# `home_paths` is otherwise only imported in a later cell of this notebook;
# importing it here lets this cell run on its own.
import dbs_home.utils.finding_paths as home_paths

# Folder holding the stored feasibility numbers: <parent of home onedrive>/PROJECTS/home_feasibility
feas_data_path = os.path.join(
    os.path.dirname(home_paths.get_home_onedrive()),
    'PROJECTS', 'home_feasibility'
)
# Folder the feasibility figures are written to
feas_fig_path = os.path.join(
    home_paths.get_home_onedrive('figures'),
    'feasibility'
)

Load ACC data, create SVM and filtered data within the dataclass

In [None]:
# import naturalistic data via dbs_home repo
# Development example: one subject/session, restricted to two days.

sub_id = 'hm24'
ses_id = 'ses01'

# days passed to loadSubject as day_selection (date strings, YYYY-MM-DD)
dev_day_selection = ['2025-07-17', '2025-07-18']

# load EMA and ACC; step and ephys data are excluded
home_dat = loadSubject(
    sub=sub_id,
    ses=ses_id,
    incl_STEPS=False,
    incl_EPHYS=False,
    incl_EMA=True,
    incl_ACC=True,
    day_selection=dev_day_selection
)

Check available EMAs

In [None]:
# Plot EMA completion for the session loaded into `home_dat`.
plot_home_compl.plot_EMA_completion_perSession(home_dat)

Get Acc-Windows aligned to EMAs

In [None]:
from dbs_home.preprocessing import acc_preprocessing as acc_prep

In [None]:

# Collect, per watch day, the ACC windows aligned to that day's EMAs.
acc_ema_combis = {}

for str_day in home_dat.watch_days:
    print(str_day)  # progress: current day
    # the helper runs with its default parameters; check those in acc_prep
    acc_ema_combis[str_day] = acc_prep.get_day_EMA_AccWindows(
        subSesClass=home_dat,
        str_day=str_day,
    )
    


   

In [None]:
# Sanity check: draw every ACC window of one day in a single plot, with the
# matching EMA answer on a second y-axis.

fig, ax = plt.subplots(1, 1)
ax_ema = ax.twinx()

# all windows of the inspected day
day_windows = acc_ema_combis['2025-07-17']
window_triplets = zip(
    day_windows['ema'],
    day_windows['acc_times'],
    day_windows['acc_svm'],
)

for ema, win_times, win_svm in window_triplets:
    # ACC trace of this window
    ax.plot(win_times, win_svm)

    # "move well"-value (Q6), drawn at the first timestamp of the window
    ax_ema.scatter(win_times[0], ema['Q6'], color='gray', s=50, alpha=.5)

ax.set_ylabel('ACC SVM (vector-g)')
ax_ema.set_ylabel('EMA answer (Likert-scale)')

plt.show()

Get submovement indices

In [None]:
from sklearn.cluster import KMeans

In [None]:
# loop over each day
# NOTE(review): `moveProc` is not imported anywhere in the visible part of
# this notebook; this cell needs that import before it can run.
for day_idx, day in enumerate(home_dat.watch_days):
    
    # if day_idx > 0: continue  # ONLY EXECUTE ONE DAY FOR DEVELOPMENT
    
    print(f'\nStart day i-{day_idx}: {day}')
    print('load acc day')
    day_accClass = moveProc.get_accel_day(dat=home_dat, day_index=day_idx,)

   

    print('compute act index')
    # calculate act index
    ai_values, ai_times = moveProc.compute_activity_index(
        day_accClass.raw_triax_acc,
        day_accClass.timestamps
    )
    print('cluster ai')
    # cluster act indices
    # two clusters over the 1-D activity-index values (reshaped to one column)
    kmeans = KMeans(
        n_clusters=2, random_state=0, n_init="auto"
    ).fit(ai_values.reshape([-1, 1]))

    # NOTE(review): overwritten on every iteration, so only the last day's
    # labels remain after the loop.
    ai_binary_clusters = kmeans.labels_

In [None]:
importlib.reload(load_data)
importlib.reload(prep_data)

# Per selected session: load EMA + ACC, compute the collected ACC time per
# day, pickle it and save a bar plot.
#
# NOTE(review): this cell relies on names from later cells (`get_acc_feas`,
# `plot_feas_acc`, `pickle`) and on `load_home`, which is not imported in the
# visible part of this notebook (`loadSubject` itself is imported directly
# near the top).



# load all combined data

# SUBS_INCL = ['hm14']


# NOTE(review): both dicts are initialised but never filled in this cell; the
# only assignment into `emadata` is commented out below.
emadata = {m: {} for m in MOMENTS}
accdata = {m: {} for m in MOMENTS}

for rec_moment, sub_sess in sessions_include.items():

    for sub, ses in sub_sess.items():
        print(f'\n\n{"#" * 30}\nstart sub-{sub}: ses:{ses}\n{"#" * 30}\n\n')

        # to test acc loading
        # if not sub in SUBS_INCL: continue

        # skip excluded subjects and excluded [subject, session] pairs
        if sub in sub_skip: continue

        if [sub, ses] in ses_skip: continue

        ses_class = load_home.loadSubject(
            sub=sub,
            ses=ses,
            incl_EMA=True,
            incl_ACC=True,
        )
        # temp_df = ema_utils.load_ema_df(sub_ses_class=ses_class)
        # # prepare
        # temp_df = prep_data.prepare_ema_df(temp_df, ADD_MEANMOVE=True, INVERT_NEG_ITEMS=False,)
        ### TODO: CHECK WHY NOT ALL SUBS ARE INVERTED PROPERLY

        # emadata[rec_moment][sub] = temp_df

        ################
        # store and plot acc feasib acquisition

        # get feas/acquisition numbers
        sub_timesums = get_acc_feas(ses_class)

        # Save with highest protocol (fast & compact)
        fname = f"acc_seconds_{ses_class.sub}_{ses_class.ses}.pkl"

        with open(os.path.join(feas_data_path, fname), "wb") as f:
            pickle.dump(sub_timesums, f,
                        protocol=pickle.HIGHEST_PROTOCOL)

        plot_feas_acc(sub_timesums=sub_timesums,
                      sub=ses_class.sub, ses=ses_class.ses)


In [None]:
# Re-plot stored numbers. `feasload`, `SUB` and `SES` are assigned in the
# pickle-loading cell further down, which has to run first.
plot_feas_acc(sub_timesums=feasload, sub=SUB, ses=SES)

In [None]:
# NOTE(review): `load_watch` and `load_home` are not imported in the visible
# part of this notebook; only `dbs_home` is.
importlib.reload(load_watch)
importlib.reload(load_home)
importlib.reload(dbs_home)


# load one session with EMA and ACC, with proc_ACC switched off
temp = load_home.loadSubject(
            sub='hm23',
            ses='ses01',
            incl_EMA=True,
            incl_ACC=True,
            proc_ACC=False,
        )

In [None]:
import dbs_home.utils.finding_paths as home_paths

In [None]:
# Inspect the subject attribute of the session loaded into `temp`.
temp.sub

Explore feasibility and completion rates for ACC

In [None]:
def get_acc_feas(
    dataClass_acc,
    TIME_GAP_SEC = 1,
    ACC_SFREQ = 32,
):
    """Sum the continuously recorded accelerometer time per watch day.

    Each day's timestamps are thinned to one stamp every
    ``ACC_SFREQ * TIME_GAP_SEC`` samples. The interval between two
    neighbouring thinned stamps counts as recorded time, unless it is longer
    than ``TIME_GAP_SEC`` seconds; such an interval is treated as a gap and
    left out of the sum.

    Every interval is checked, including the one directly after a gap and the
    one ending at the last thinned stamp of the day.

    Parameters
    ----------
    dataClass_acc : object
        Must expose ``watch_days`` (iterable of day labels) and ``acc_times``
        (one sliceable sequence of timestamps per day, in the same order as
        ``watch_days``). Subtracting two timestamps must give a value that
        can be compared with and added to ``datetime.timedelta``.
    TIME_GAP_SEC : int
        Spacing of the thinned stamps in seconds and, at the same time, the
        longest interval that still counts as continuous recording.
    ACC_SFREQ : int
        Samples per second of the timestamps.

    Returns
    -------
    dict
        Day label -> summed recorded time. A day with fewer than two thinned
        stamps maps to ``datetime.timedelta(0)``.
    """
    step = ACC_SFREQ * TIME_GAP_SEC
    max_gap = dt.timedelta(seconds=TIME_GAP_SEC)

    sub_timesums = {}

    for i_day, day in enumerate(dataClass_acc.watch_days):
        print(f'\n\n{day}')

        # thinned stamps of this day, nominally one per TIME_GAP_SEC seconds
        stamps = list(dataClass_acc.acc_times[i_day][::step])

        time_sum = dt.timedelta(0)

        for prev_stamp, next_stamp in zip(stamps, stamps[1:]):
            interval = next_stamp - prev_stamp
            # NOTE(review): the comparison is strict and without tolerance; if
            # timestamps jitter slightly above the nominal spacing, such
            # intervals are dropped as gaps. Consider a small tolerance.
            if interval > max_gap:
                continue
            time_sum += interval

        # store day sum in sub dict
        sub_timesums[day] = time_sum

    return sub_timesums


In [None]:
import pickle

In [None]:
# Load the stored per-day ACC time sums of one subject/session.

SUB='hm23'
SES='ses01'

# same file-name pattern as used when the sums are pickled in the ACC loading loop
fname = f"acc_seconds_{SUB}_{SES}.pkl"

# pickle can execute code while loading: only open files written by this notebook
with open(os.path.join(feas_data_path, fname), "rb") as f:
    feasload = pickle.load(f)

In [None]:
# Inspect the loaded per-day time sums.
feasload

In [None]:
def plot_feas_acc(sub_timesums, sub, ses):
    """Save a bar chart of the collected ACC time per day, in hours.

    Parameters
    ----------
    sub_timesums : dict
        Day label -> timedelta-like value that provides ``total_seconds()``
        (``datetime.timedelta`` does).
    sub, ses
        Subject and session identifiers, used in the title and file name.

    Returns
    -------
    None
        The figure is written as ``ACC_collection_<sub>_<ses>`` into the
        module-level ``feas_fig_path`` and then closed.
    """
    fname = f'ACC_collection_{sub}_{ses}'

    day_labels = list(sub_timesums.keys())
    x_pos = np.arange(len(day_labels))
    # total_seconds() includes the days component; `.seconds` alone would
    # wrap around for sums of 24 hours or more
    day_hours = [t.total_seconds() / 3600 for t in sub_timesums.values()]

    fig, ax = plt.subplots(1, 1, figsize=(9, 3))

    ax.bar(x=x_pos, height=day_hours)

    # fix the tick positions first, then attach one label per position
    ax.set_xticks(x_pos)
    ax.set_xticklabels(day_labels, rotation=45)
    ax.set_ylabel('Day sum (hours)')
    ax.set_title(f'collected ACC-time: {sub}, {ses}')

    plt.tight_layout()

    plt.savefig(os.path.join(feas_fig_path, fname), dpi=300,
                facecolor='w')

    plt.close()


## 2. Preprocess data: explore and visualize

#### Get (mean-corrected) EMA and UPDRS values per symptom subtype

In [None]:
importlib.reload(load_data)
importlib.reload(prep_data)

# NOTE(review): `ema_dat` is not assigned anywhere in the visible part of this
# notebook; this cell and the following ones need it to be defined first.
predat = ema_dat['hm14'].copy()
# dat = prep_data.prepare_home_emas(predat)

In [None]:
# Inspect the columns available for one subject.
ema_dat['hm20'].columns

## 3. Explore / visualize data

In [None]:
# Path returned by load_utils for the 'emaval_fig' key.
figpath = load_utils.get_onedrive_path('emaval_fig')


In [None]:
# Inspect the keys available for one subject.
ema_dat['hm18'].keys()

In [None]:
# Completed EMAs per subject, relative to 6 expected EMAs per day over the
# days between onboarding and checkout.
# NOTE(review): `ema_dat` is not assigned in the visible part of this
# notebook; `sel_info` has to be indexed by study id.
for sub, dat in ema_dat.items():

    # the onboarding and checkout days themselves are excluded
    daystart = sel_info.loc[sub]['onboarding_date'] + dt.timedelta(days=1)
    dayend = sel_info.loc[sub]['checkout_date'] - dt.timedelta(days=1)

    # NOTE(review): this difference does not count `dayend` itself; confirm
    # whether the last full day should be included (+1).
    ndays = (dayend - daystart).days

    # avoid dividing by zero (or a negative day count) for very short sessions
    if ndays <= 0:
        print(f'{sub}: no full days between onboarding and checkout, completion skipped')
        continue

    compl_perc = dat.shape[0] / (ndays * 6)

    print(f'{sub}: over {ndays} days completed {np.round(compl_perc, 2)}')


# ema_dat['hm18']['dates']

## 4. Perform Statistics

In [None]:
import utils.stats as utilsstat
import statsmodels.formula.api as smf
