# Naturalistic EMA validation

Applying the findings from the 4-state correlation work (EMA x UPDRS) onto real-life EMA data.

Goals:
- analyse real-life variation of EMA values
    - inter-individual variation
    - intra-individual variation, daily fluctuations, differences between days

## 0. Import packages

- document versions for reproducibility

In [None]:
# import packages
import datetime as dt
import pandas as pd
import numpy as np
import os
import sys
import csv
import json
import importlib
from itertools import product, compress
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from scipy.signal import welch

In [None]:
# report interpreter and core-library versions, so runs can be reproduced
for pkg_label, pkg_version in (
    ('Python sys', sys.version),
    ('pandas', pd.__version__),
    ('numpy', np.__version__),
):
    print(pkg_label, pkg_version)
# print('mne_bids', mne_bids.__version__)
# print('mne', mne.__version__)
# print('sci-py', scipy.__version__)
# print('sci-kit learn', sk.__version__)
# print('matplotlib', plt_version)

# log of versions printed in earlier runs (kept as a plain string)
"""
Python sys 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
pandas 2.1.1
numpy 1.26.0

from 16.09

Python sys 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
pandas 2.3.2
numpy 2.3.3
"""

Import custom functions

In [None]:
# from dbs_home repo
from dbs_home.load_raw.main_load_raw import loadSubject 
import dbs_home.utils.helpers as home_helpers
import dbs_home.utils.ema_utils as home_ema_utils
import dbs_home.plot_data.plot_compliance as plot_home_compl
import dbs_home.preprocessing.preparing_ema as home_ema_prep

In [None]:
# from current repo
from utils import load_utils, load_data, prep_data
from plotting import plot_help



## 1. Import Home-Data

Use pre-operative sessions

- use 9-point-converter
- use direct. inverter

Import EMA home data from raw files

In [None]:
# recording moments that are loaded and compared in this notebook
MOMENTS = [
    'pre-op',
    'pre 3MFU',
    'post 3MFU',
]

# subjects that are skipped completely, e.g. ['hm25',]
sub_skip = []
# single [subject, session] pairs that are skipped
ses_skip = [
    ['hm20', 'ses03'],
]
# ses_skip = [['hm14', 'ses03']]

In [None]:
# per recording moment: {study_id: session} of the sessions to load
sessions_include = {}

for rec_moment in MOMENTS:

    # table of selected sessions for this moment (has 'study_id' and 'Session' columns)
    sel_info = home_helpers.select_sessions(target_session=rec_moment)
    sel_info = sel_info.set_index(sel_info['study_id'],)
    sel_sessions = {
        row[0]: row[1]
        for row in sel_info[['study_id', 'Session']].values
    }

    # store a copy of the mapping under the current moment
    sessions_include[rec_moment] = dict(sel_sessions)


In [None]:
# show the included moments first, then the full subject-session selection
for shown in (sessions_include.keys(), sessions_include):
    print(shown)

Load data

dev for EMA


In [None]:
# re-import helper modules, so code edits are picked up without a kernel restart
for module in (home_ema_utils, load_data, prep_data, home_ema_prep):
    importlib.reload(module)

# load all combined data

# SUBS_INCL = ['hm14']


# per recording moment: {subject: prepared EMA dataframe}
data = {m: {} for m in MOMENTS}

for rec_moment, sub_sess in sessions_include.items():
    # rec_moment contains 'pre-op', or 'pre 3MFU', 'post 3MFU', etc
    for sub, ses in sub_sess.items():

        # leave out fully skipped subjects and single skipped sessions
        skip_subject = sub in sub_skip
        skip_session = [sub, ses] in ses_skip
        if skip_subject or skip_session:
            continue

        # load the session with EMA data only (no accelerometer)
        ses_class = loadSubject(
            sub=sub,
            ses=ses,
            incl_EMA=True,
            incl_ACC=False,
        )
        # load and prepare the EMA dataframe of the session
        temp_df = home_ema_prep.prepare_ema_df(
            home_ema_utils.load_ema_df(sub_ses_class=ses_class),
            ADD_MEANMOVE=True,
            INVERT_NEG_ITEMS=False,
        )
        ### TODO: CHECK WHY NOT ALL SUBS ARE INVERTED PROPERLY

        data[rec_moment][sub] = temp_df


## 2. Explore naturalistic EMAs

Preprocess EMA

- merge scores
- invert negative-items (higher = clinically better)
- mean-correct EMA
    - test different normalizations:
        - normalize with grand-mean per sub
        - normalize with session mean

In [None]:
importlib.reload(prep_data)

# every subject that has data in at least one moment (unique, sorted)
allsubs = []
for moment_dfs in data.values():
    allsubs.extend(moment_dfs.keys())
allsubs = np.unique(allsubs)


# same layout as `data`, filled with mean-corrected dataframes
corr_data = {m: {} for m in MOMENTS}

for sub in allsubs:

    # merge all moments of one subject, then mean-correct the merged frame
    subdf = home_ema_prep.mean_correct_ema_df(
        home_ema_prep.merge_sub_ema_df(datadict=data, sub=sub)
    )

    # split again and place back as one dataframe per moment
    for moment in MOMENTS:
        in_moment = subdf['moment'] == moment
        corr_data[moment][sub] = subdf[in_moment].reset_index(drop=True)



Visualize first EMA results full group

In [None]:
PLOT_ITEMS = ['move_mean', 'walking', 'tremor']

# plot mean-corrected values (True) or absolute EMA answers (False)
PLOT_CORR = False
PLOT_DATADICT = corr_data if PLOT_CORR else data


fig, axes = plt.subplots(3, 1, figsize=(9, 6))
fname = 'motorItems_abs_perSub_perSes_1104'
if PLOT_CORR:
    fname = fname.replace('abs', 'corr')

x_margin = 2  # horizontal gap between the box groups of two moments
bin_w = 0.5  # width of one box

fsize = 14

x_starts = {list(data.keys())[0]: 0}  # first moment starts at 0

subcolors = plot_help.get_sub_colors(PLOT_DATADICT)

plot_moments = list(PLOT_DATADICT.keys())


for i_ax, col in enumerate(PLOT_ITEMS):

    for i_mom, moment in enumerate(plot_moments):

        # switch once per item to the mean-corrected column name
        if PLOT_CORR and i_mom == 0:
            col = f'{col}_corr'

        moment_dfs = PLOT_DATADICT[moment]
        # subs included in this boxplot, sorted by sub id
        box_subs = sorted(moment_dfs.keys())
        # values of the current column per subject, without NaNs
        list_values = [
            [v for v in moment_dfs[s][col].values if not np.isnan(v)]
            for s in box_subs
        ]

        # plot boxes for one moment
        box_positions = x_starts[moment] + bin_w * np.arange(len(list_values))
        bp = axes[i_ax].boxplot(list_values, widths=bin_w,
                                positions=box_positions,
                                patch_artist=True,)

        # define the x-start of the next moment once, on the first axis
        if i_ax == 0 and moment != plot_moments[-1]:
            next_start = x_starts[moment] + len(list_values) * bin_w + x_margin
            x_starts[plot_moments[i_mom + 1]] = next_start

        # color every box with the color of its subject
        for patch, patchsub in zip(bp['boxes'], box_subs):
            patch.set_facecolor(subcolors[patchsub])

    # pretty plot
    axes[i_ax].set_ylabel(f'{col}\n(EMA answer)', size=fsize,)

# axis ranges, ticks and helper lines depend on the plotted value type
if PLOT_CORR:
    y_lim = (-5, 5)
    y_ticks = np.arange(-4, 6, 2)
    ylines = [-4, -2, 2, 4,]
else:
    y_lim = (0, 10)
    y_ticks = np.arange(1, 10, 2)
    ylines = [1, 3, 5, 7, 9]

# pretty axes
for ax in axes:
    ax.set_ylim(*y_lim)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticks)

    # one x-tick per moment, placed at the start of its box group
    ax.set_xticks(list(x_starts.values()))
    ax.set_xticklabels(list(x_starts.keys()))
    ax.spines[['right', 'top']].set_visible(False)
    ax.tick_params(axis='both', labelsize=fsize, size=fsize,)

    # zero line plus lighter horizontal helper lines
    ax.axhline(0, xmin=0, xmax=1,
               color='gray', alpha=.3, zorder=0,)
    for yline in ylines:
        ax.axhline(yline, xmin=0, xmax=1, color='gray', alpha=.15, zorder=0,)

plt.tight_layout()

# plt.savefig(os.path.join(load_utils.get_onedrive_path('figures'),
#              'ema_naturalistic', fname),
#              dpi=300, facecolor='w',)

plt.close()

Check completion rates

In [None]:


for rec_moment, moment_dfs in data.items():

    for sub, df in moment_dfs.items():

        # convert in place; entries that are not numeric become NaN
        df['Submission'] = pd.to_numeric(df['Submission'], errors='coerce')
        # mean of the numeric Submission column is printed as the rate
        rate = df['Submission'].mean()
        print(f"{sub} completion rate @ {rec_moment}: {rate:.0%}")



Sub-analysis paper: EMA pre-post vs UPDRS

In [None]:
from statsmodels.regression.mixed_linear_model import MixedLM


In [None]:
# EMA question that is compared between the two periods
Q_SEL = 'general movement'

PER1 = 'pre-op'
PER2 = 'post 3MFU'

# only subjects with data in both periods are included
pt_pre = list(data[PER1].keys())
pt_sel = [p for p in list(data[PER2].keys()) if p in pt_pre]

# long-format table: one row per completed EMA answer
stat_df = {'sub': [], 'period': [], 'ema': []}

# subject id -> integer code that is used in the 'sub' column
pt_coding = {}

for i_pt, pt in enumerate(pt_sel):  # add index for pt coding
    pt_coding[pt] = i_pt
    for i_period, period in enumerate([PER1, PER2]):
        values = data[period][pt]
        # only take completed emas; to_numeric instead of astype(int), because
        # the Submission column can hold NaN (e.g. after errors='coerce') and
        # astype(int) raises on NaN
        completed = pd.to_numeric(values['Submission'], errors='coerce') == 1
        # take selected question; answers that are not numeric become NaN
        values = pd.to_numeric(values.loc[completed, Q_SEL], errors='coerce')
        values = values.dropna()  # excl nan values
        stat_df['ema'].extend(values)
        stat_df['sub'].extend([i_pt] * len(values))
        stat_df['period'].extend([i_period] * len(values))

stat_df = pd.DataFrame(stat_df)

In [None]:
for pt in np.unique(stat_df['sub']):

    # rows of the current subject code
    temp_df = stat_df[stat_df['sub'] == pt]
    # EMA values per period code (0 = first period, 1 = second period)
    values = [temp_df.loc[temp_df['period'] == p, 'ema'] for p in (0, 1)]
    plt.boxplot(values)

    # figure is closed directly, without being shown
    plt.close()

In [None]:
### stats

# Linear mixed model: EMA ~ intercept + period, random intercept per subject.
# The EMA score is the dependent variable (endog) and the period code is the
# predictor. MixedLM does not add a constant by itself when arrays are passed,
# so the intercept column is added explicitly.
lm_exog = np.column_stack([
    np.ones(len(stat_df)),  # intercept
    np.array(stat_df['period']),  # period code (0 = PER1, 1 = PER2)
])

# define model
lm_model = MixedLM(
    endog=np.array(stat_df['ema']),  # dependent variable
    exog=lm_exog,  # independent variables (intercept, period)
    groups=np.array(stat_df['sub']),  # subjects
    exog_re=None,  # (None)  defaults to a random intercept for each group
)
# run and fit model
lm_results = lm_model.fit()

# extract results for the period effect (index 0 is the intercept)
fixeff_cf = lm_results.fe_params[1]
pval = lm_results.pvalues[1]


In [None]:
# print the fixed-effect estimates and the p-values of the fitted model
print(lm_results.fe_params, lm_results.pvalues)

# print the full summary table (printing the results object only shows its repr)
print(lm_results.summary())

Check means and variances for movement items, tremor, and gait items
- split per sub
- split per ses

In [None]:
# NOTE(review): `temp_df` is whatever the last executed cell assigned to it
# (it is set in the EMA loading loop and again in the per-subject boxplot loop);
# confirm that it still has an `EMA_reports` attribute at this point
temp_df.EMA_reports

In [None]:
importlib.reload(home_helpers)

# NOTE: from here on `data` is keyed by subject and holds loaded session
# objects; it replaces the per-moment dict of dataframes built earlier
data = {}

# Define pre-operative sessions
sel_info = home_helpers.select_sessions()
sel_info = sel_info.set_index(sel_info['study_id'],)
sel_sessions = {
    sub: ses
    for sub, ses in sel_info[['study_id', 'Session']].values
}
print(sel_sessions)


# load every selected session with EMA data, without accelerometer data
for sub, ses in sel_sessions.items():

    data[sub] = loadSubject(
        ses=ses,
        sub=sub,
        incl_ACC=False,
        incl_EMA=True,
    )



## 3. Explore naturalistic ACC and Extract Features

include loading option for ACC only for EMA windows, store these selected windows separately, to prevent loading of full acc data

In [None]:
# data folder of the feasibility project, next to the onedrive folder
feas_data_path = os.path.join(
    os.path.dirname(load_utils.get_onedrive_path()),
    'PROJECTS',
    'home_feasibility',
)
# figure folder for feasibility plots
feas_fig_path = os.path.join(load_utils.get_onedrive_path('figures'), 'feasibility')

# figure folder for the naturalistic EMA validation
ntrl_fig_path = load_utils.get_onedrive_path('emaval_fig')

Load ACC data, create SVM and filtered data within the dataclass

In [None]:
from dbs_home.preprocessing import acc_preprocessing as acc_prep
from dbs_home.preprocessing import get_submovements


In [None]:
# import naturalistic data via dbs_home repo

# LID
sub_id, ses_id = 'hm24', 'ses01'

# # tremor check for Anna
# sub_id = 'hm22'
# ses_id = 'ses01'

### test days for hm24-ses01  # dyskinesia
# dev_day_selection = ['2025-07-17', '2025-07-18']
# dev_day_selection = [f'2025-07-{d}' for d in np.arange(17, 31)]
dev_day_selection = []

### test days for hm20-ses01  # tremor
# dev_day_selection = [
#     '2025-06-13', '2025-06-14',
#     '2025-06-15', '2025-06-16'
# ]

# load EMA and accelerometer data of the session; steps and ephys are left out
home_dat = loadSubject(
    sub=sub_id,
    ses=ses_id,
    day_selection=dev_day_selection,
    incl_EMA=True,
    incl_ACC=True,
    incl_STEPS=False,
    incl_EPHYS=False,
)

Check available EMAs

In [None]:
# plot the EMA completion of the loaded session (home_dat)
plot_home_compl.plot_EMA_completion_perSession(home_dat)

Get Acc-Windows aligned to EMAs

In [None]:
from dataclasses import dataclass, field

@dataclass(init=True,)
class windowData:
    sub: str
    ses: str
    day: str | None = None
    acc_times: np.ndarray | None = None
    acc_triax: np.ndarray | None = None
    acc_svm: np.ndarray | None = None
    sfreq: int | None = None
    ema: dict = field(default_factory=dict)
    day: str | None = None

    def __post_init__(self):

        print(f'created windowData class for {self.sub}, {self.ses};'
              f'starttime {self.acc_times[0]}')
        if type(self.day) == str: print(f'belonging to day {self.day}')

        if self.sfreq == None:
            # extract sfreq if not given
            time_df = np.diff(self.acc_times[:5])[0]
            self.sfreq = int(dt.timedelta(seconds=1) / time_df)

        


In [None]:
def get_submove_day_timestamps(
    day, sub, ses, SM_MIN_DUR=0, SM_MAX_DUR=60,
    SUBMOVE_version='v1',
):
    """
    Load the submovement times of one day and select them on duration.

    Arguments:
    - day, sub, ses: day, subject id and session id; passed on to
        get_submovements.load_submovements()
    - SM_MIN_DUR, SM_MAX_DUR: duration limits in seconds, only submovements
        with min < duration < max are kept (both limits are exclusive)
    - SUBMOVE_version: version of the stored submovements to load

    Returns:
    - sm_day_starts, sm_day_ends: arrays with datetime objects for the start
        and end of every selected submovement; two empty arrays when the day
        has no submovements
    """
    time_format = "%Y-%m-%dT%H:%M:%S.%f"

    sm_day_times = get_submovements.load_submovements(
        sub_id=sub, ses_id=ses, day=day,
        ONLY_TIMES=True, SUBMOVE_version=SUBMOVE_version,
    )

    # get submovement start and ends (from json-dict);
    # assumes the first value per submovement is the start and the second
    # value is the end - TODO confirm against the stored json
    sm_times = [list(s.values()) for s in sm_day_times['submovements']]

    # days without submovements would break the column indexing below
    if len(sm_times) == 0:
        return np.array([], dtype=object), np.array([], dtype=object)

    # get arrays with datetime objects for starts and ends
    sm_day_starts = np.array(
        [dt.datetime.strptime(times[0], time_format) for times in sm_times]
    )
    sm_day_ends = np.array(
        [dt.datetime.strptime(times[1], time_format) for times in sm_times]
    )

    # select on duration
    sm_durations = sm_day_ends - sm_day_starts
    sel_submoves = np.logical_and(
        sm_durations > dt.timedelta(seconds=SM_MIN_DUR),
        sm_durations < dt.timedelta(seconds=SM_MAX_DUR),
    )

    return sm_day_starts[sel_submoves], sm_day_ends[sel_submoves]

In [None]:
def get_window_submoveMask(win_times, sm_dt_starts, sm_dt_ends,):
    """
    get boolean array that is True for submovement-positive samples
    during EMA-window-matched acc data

    arguments:
    - win_times: array with the timestamps of the acc-window samples
    - sm_dt_starts, sm_dt_ends: arrays with start and end times of the
        submovements of the day, comparable with win_times

    returns:
    - boolean array with the length of win_times, True for samples that lie
        strictly between the start and the end of a submovement in the window
    - boolean array with the length of sm_dt_starts, True for day-submoves
        that lie completely within the window (start after the first and end
        before the last window sample)
    """
    # explicit boolean mask; zeros_like(win_times) would take over the
    # dtype of the timestamps instead
    win_submove_bool = np.zeros(len(win_times), dtype=bool)

    # empty window: no samples and no submovements can be selected
    if len(win_times) == 0:
        return win_submove_bool, np.zeros(len(sm_dt_starts), dtype=bool)

    # get start and end time of acc-ema window
    win_start, win_end = win_times[0], win_times[-1]

    # compare and select starts and ends within acc-ema-window
    submoves_in_win_mask = np.logical_and(
        sm_dt_starts > win_start,
        sm_dt_ends < win_end
    )
    win_sm_starts = sm_dt_starts[submoves_in_win_mask]
    win_sm_ends = sm_dt_ends[submoves_in_win_mask]

    # mark the window-samples that are within submoves
    for t1, t2 in zip(win_sm_starts, win_sm_ends):
        in_submove = np.logical_and(win_times > t1, win_times < t2)
        win_submove_bool[in_submove] = True

    return win_submove_bool, submoves_in_win_mask

In [None]:
from itertools import compress
from scipy.stats import variation

In [None]:
# take one single submovement for inspection
# NOTE(review): `sm_win_data` is assigned inside the feature-extraction loop
# further down in this notebook, so that cell has to be executed first
tempwin = sm_win_data[5]

# show which attributes are stored on the submovement object
print(vars(tempwin).keys())

In [None]:
# thoughts: velocity is negative?
# pc's exclude, many of the variance within the acc signal

# one figure per submovement, for the first 30 submovements of the window
for i_sm, tempwin in enumerate(sm_win_data[:30]):

    sm_times = tempwin.timestamps

    # principal components as thick, transparent lines
    for att in ('pc1', 'pc2'):
        plt.plot(sm_times, getattr(tempwin, att), label=att,
                 lw=3, alpha=.5,)

    # vector magnitude, and dashed velocity traces labelled x, y, z
    plt.plot(sm_times, tempwin.svm, label='svm',)
    velo_labels = [f'velo_{a}' for a in 'xyz']
    plt.plot(sm_times, tempwin.velocity.T, label=velo_labels, ls='--',)

    plt.title(f'submovement # {i_sm}')
    plt.legend()

    plt.show()

In [None]:
from sklearn.decomposition import PCA


In [None]:
# microsecs = np.array([t.microsecond for t in tempwin.timestamps]) * 1e-6
# microsecs = np.array([t.microsecond for t in tempwin.timestamps])

# transposed velocity, used as (samples x features) input for the PCA
velo = tempwin.velocity.T
plt.plot(velo)

# project the velocity on its first two principal components
pca = PCA(n_components=2)
projected = pca.fit_transform(velo)  # Shape: (N, 2)

pc1, pc2 = projected[:, 0], projected[:, 1]  # primary, secondary direction

for pc in (pc1, pc2):
    plt.plot(pc, alpha=.3, lw=5,)

plt.show()

In [None]:
# same PCA check as before, now on the absolute velocity
abs_velo = np.abs(tempwin.velocity.T)
plt.plot(abs_velo)

pca = PCA(n_components=2)
projected = pca.fit_transform(abs_velo)  # Shape: (N, 2)

pc1, pc2 = projected[:, 0], projected[:, 1]  # primary, secondary direction

for pc in (pc1, pc2):
    plt.plot(pc, alpha=.3, lw=5,)

plt.show()

In [None]:
import dbs_home.preprocessing.submovement_processing as submove_proc
import dbs_home.load_raw.load_watch_raw as load_watch

In [None]:
importlib.reload(load_watch)

# heart rate of one fixed development day of the loaded session
hr_day_data = load_watch.get_source_heartrate_day(
    date='2025-08-13',
    sub=home_dat.sub,
    ses=home_dat.ses,
)

In [None]:
# first and last timestamp of the current acc-window
t1, t2 = windat.acc_times[0], windat.acc_times[-1]

# heart-rate rows strictly between window start and window end
hr_sel = (hr_day_data['timestamp'] > t1) & (hr_day_data['timestamp'] < t2)

hr_win = hr_day_data[hr_sel].reset_index(drop=True)



hr_win

In [None]:
fig, ax = plt.subplots(1, 1)

# second y-axis for the heart rate
ax2 = ax.twinx()

ax.plot(windat.acc_times, windat.acc_svm)

# heart rate of the window first, then of the full day, on the same axis
for hr_df in (hr_win, hr_day_data):
    ax2.plot(hr_df['timestamp'], hr_df[' HeartRate'], color='orangered',)


plt.show()

In [None]:
def plot_submove_check(
    FIGDIR, FIGNAME, SAVE_PLOT, SHOW_PLOT,
    windat, win_submove_bool, ema_win, hr_win,
    str_day, i_win, SUBMOVE_version,
):
    """
    Plot one EMA-matched acc-window to check the detected submovements.

    The left axis shows the vector magnitude (windat.acc_svm) and the
    submovement mask as 0/1 dots, the right axis shows the heart rate.
    The title holds day, window number, subject, session, submovement
    version, and the EMA answers "Q7" and "Q8".

    Arguments:
    - FIGDIR, FIGNAME: folder and file name used when the figure is saved
    - SAVE_PLOT: save the figure
    - SHOW_PLOT: show the figure, otherwise it is closed
    - windat: window object with acc_times, acc_svm, sub and ses
    - win_submove_bool: boolean mask per acc sample
    - ema_win: EMA answers of the window, indexed with "Q7" and "Q8"
    - hr_win: heart-rate table with 'timestamp' and ' HeartRate' columns
    - str_day, i_win, SUBMOVE_version: only used in the title
    """
    FONTSIZE = 12
    hr_color = 'orangered'

    fig, ax = plt.subplots(1, 1)
    ax2 = ax.twinx()

    # right axis: heart rate
    ax2.plot(hr_win['timestamp'], hr_win[' HeartRate'], color=hr_color,)
    ax2.set_ylim(-10, 130)
    ax2.set_ylabel('Heartrate (bpm)', size=FONTSIZE, color=hr_color)

    # left axis: vector magnitude and submovement mask
    ax.plot(windat.acc_times, windat.acc_svm, label='svm', alpha=.5,)
    ax.scatter(windat.acc_times, win_submove_bool.astype(int),
               label='submove-boolean', s=50, alpha=.3, color='orange',)
    ax.set_ylim(-.5, 5)
    ax.set_ylabel('ACC-vector (squared-magn.)',
                  size=FONTSIZE, color='blue')
    ax.legend(loc='upper right')

    title_window = (
        f'{str_day}: EMA-window # {i_win} ({windat.sub}, {windat.ses}, submove-{SUBMOVE_version})'
    )
    title_ema = f'\n EMA: tremor: {ema_win["Q7"]}, dyskinesia: {ema_win["Q8"]}'
    ax.set_title(title_window + title_ema)

    for axx in (ax, ax2):
        axx.tick_params(axis='both', size=FONTSIZE,
                        labelsize=FONTSIZE,)
    plt.tight_layout()

    if SAVE_PLOT:
        plt.savefig(os.path.join(FIGDIR, FIGNAME), facecolor='w', dpi=150,)

    # either show the figure, or close it directly
    if not SHOW_PLOT:
        plt.close()
        return

    plt.show()

In [None]:
import utils.acc_features as acc_fts
import utils.feat_extraction as ft_extr

In [None]:
# EMA item name -> question code in the EMA answers
EMA_CODING = dict(tremor='Q7', LID='Q8')

In [None]:
importlib.reload(get_submovements)
importlib.reload(submove_proc)
importlib.reload(ft_extr)


# Two approaches that exclude each other:
# - SELECT_SUBMOVES: store windows, acc-samples selected with submovement times
# - EXTRACT_FT_FROM_SMs: extract features directly from stored submovement data
# (both False stores the full, unselected windows)
SELECT_SUBMOVES = False
EXTRACT_FT_FROM_SMs = True
if SELECT_SUBMOVES and EXTRACT_FT_FROM_SMs:
    raise ValueError(
        'CHOSE ONE OF TWO APROACHES, data OR times from submoves'
    )
# calculate acc-features per single submovement and summarize them per
# window (mean and coef. of variation), instead of on the merged data
ACC_FEATS_on_SINGLE_MOVES = True

SUBMOVE_version = 'v3'
ACC_SFREQ = 32
ACC_MIN_PER_EMA = 15

SM_MIN_DUR = .5  # sec
SM_MAX_DUR = 600  # sec

# minimal part of the expected window samples that has to be present
MIN_ACC_PRESENT = 0.5
WIN_SAMPLES = (ACC_MIN_PER_EMA * 60 * ACC_SFREQ)

# plotting settings
SAVE_PLOT = False
SHOW_PLOT = False
# folder is defined with the loaded session (home_dat); `windat` only exists
# after the first window has been created in the loop below
FIGDIR = os.path.join(
    home_helpers.finding_paths.get_home_onedrive('figures'),
    'acc_processing', 'submovement_checks',
    f'submove_{SUBMOVE_version}', home_dat.sub, home_dat.ses
)
os.makedirs(FIGDIR, exist_ok=True)

### ft extraction params
FEATS_INCL = [
    'rms_acc', 'svm_coefvar', 'svm_sd', 'iqr_acc',
    'lowpass_rms', 'jerk_magn', 'pow_4_7_ratio',
    'pow_8_12_ratio', 'pow_4_12', 'pow_1_3',
    'hr_mean', 'hr_std', 'hr_coefvar',
    'sm_count', 'sm_duration_mean',
    'sm_duration_std', 'sm_duration_coefvar'
]
# if features are directly extracted: one list with window values per feature
if EXTRACT_FT_FROM_SMs:
    FEAT_STORE = {f: [] for f in FEATS_INCL}
    if ACC_FEATS_on_SINGLE_MOVES:  # replace ACC keys with _mean-cfvar keys
        for f in list(FEAT_STORE.keys()):
            if 'sm_' in f or 'hr_' in f: continue  # convert only svm-acc features
            for metric in ['SMmean', 'SMcfvar']:
                FEAT_STORE[f'{f}_{metric}'] = []
            del FEAT_STORE[f]
# if features are not directly extracted: list to store window data
else:
    all_windows = []

Y_STORE = {'tremor': [], 'LID': []}


for i_day, str_day in enumerate(home_dat.watch_days):
    # define current day
    print(f"\n\n##### START day: {str_day}")
    # get dict for current day, needed in both methods of ft extraction (sm or not sm)
    day_dict_lists = acc_prep.get_day_EMA_AccWindows(
        subSesClass=home_dat, str_day=str_day,
    )

    if EXTRACT_FT_FROM_SMs:
        # extract feats from sm-data directly
        sm_day_data = get_submovements.load_submovements(
            sub_id=home_dat.sub, ses_id=home_dat.ses, day=str_day,
            ONLY_TIMES=False,
            SUBMOVE_version=SUBMOVE_version,
        )
        # select submoves on durations
        # NOTE(review): this selection has to keep the same submovements, in
        # the same order, as the time-based selection below, because the
        # window mask of the times is applied on sm_day_data - confirm
        sm_day_data = [s for s in sm_day_data
                       if SM_MIN_DUR < s.duration < SM_MAX_DUR]

    if EXTRACT_FT_FROM_SMs or SELECT_SUBMOVES:
        # get sm times for selection within window
        (sm_day_starts,
         sm_day_ends) = get_submove_day_timestamps(
            str_day, home_dat.sub, home_dat.ses,
            SM_MIN_DUR=SM_MIN_DUR, SM_MAX_DUR=SM_MAX_DUR,
            SUBMOVE_version=SUBMOVE_version,
        )

    # load heartrate for full day
    hr_day_data = load_watch.get_source_heartrate_day(
        sub=home_dat.sub, ses=home_dat.ses, date=str_day,
    )

    # all lists in the day-dict are indexed per window
    n_day_windows = len(list(day_dict_lists.values())[0])

    for i_win in range(n_day_windows):

        print(f"\n\n\n######## START day-win # {i_win} / {n_day_windows}")

        # create class with processed acc-data and with ema-dict per completed ema

        # skip incomplete acc data
        n_win_samples = len(day_dict_lists['acc_times'][i_win])
        if n_win_samples < (WIN_SAMPLES * MIN_ACC_PRESENT):
            print(f"skip WIN, not enough acc-data "
                  f"({n_win_samples / (60 * ACC_SFREQ)} minutes)")
            continue

        # check for missing EMA data, and skip emas with missings
        if any(day_dict_lists['ema'][i_win].values == ''):
            print('skip WIN, missing EMA')
            continue

        # get window data
        windat = windowData(
            sub=home_dat.sub,
            ses=home_dat.ses,
            day=str_day,
            acc_times=day_dict_lists['acc_times'][i_win],
            acc_triax=day_dict_lists['acc_filt'][i_win],
            acc_svm=day_dict_lists['acc_svm'][i_win],
            ema=day_dict_lists['ema'][i_win],
        )

        # select heartrate for current window
        t1 = windat.acc_times[0]
        t2 = windat.acc_times[-1]
        hr_sel = np.logical_and(hr_day_data['timestamp'] > t1,
                                hr_day_data['timestamp'] < t2)
        hr_win = hr_day_data[hr_sel].reset_index(drop=True)

        # store window data for later ft-extraction
        if not EXTRACT_FT_FROM_SMs:
            if SELECT_SUBMOVES:
                # get mask for submove-pos samples in window, and mask for
                # the day-submoves that are within the window
                win_submove_bool, submoves_in_win_bool = get_window_submoveMask(
                    windat.acc_times, sm_day_starts, sm_day_ends,
                )
                # print(f'\nsubmove-positive window selection is '
                #       f'{round(sum(win_submove_bool) / len(win_submove_bool) * 100)}%')
                # change timeseries-attributes within window class
                for att in ['acc_times', 'acc_svm', 'acc_triax']:
                    full_series = getattr(windat, att)
                    setattr(windat, att, full_series[win_submove_bool])

                # store durations of single selected submoves within window;
                # the day-arrays are indexed with the submove mask, the
                # sample mask has the length of the acc-window
                sm_durations = (sm_day_ends[submoves_in_win_bool]
                                - sm_day_starts[submoves_in_win_bool])
                setattr(windat, 'sm_durations', sm_durations)

            all_windows.append(windat)

        # EXTRACT FEATURES directly, without substoring
        else:
            # EXTRACT FEATS FROM SUBMOVES DIRECTLY, no sub storing
            ema_win = day_dict_lists['ema'][i_win]

            # get mask for submove-pos samples in window
            win_submove_bool, submoves_in_win_bool = get_window_submoveMask(
                windat.acc_times, sm_day_starts, sm_day_ends,
            )

            # print(f'\nsubmove-positive window selection is '
            #       f'{round(sum(win_submove_bool)/len(win_submove_bool)*100)}%')

            # only submoves from day, are within current window
            sm_win_data = list(compress(sm_day_data, submoves_in_win_bool))

            # print(f'n-submovements in window: {len(sm_win_data)}')
            # win_sm_mean = np.mean([s.duration for s in sm_win_data]).round(2)
            # win_sm_var = np.var([s.duration for s in sm_win_data]).round(2)
            # win_sm_cfvar = variation([s.duration for s in sm_win_data]).round(2)
            # print(f'durations: mean {win_sm_mean}, var {win_sm_var}, coef-var {win_sm_cfvar}')

            # check correctness of submovements by plotting window ACC
            if SAVE_PLOT or SHOW_PLOT:
                FIGNAME = f'submoveCheck_{SUBMOVE_version}_{windat.sub}_{windat.ses}_{str_day}_ema{i_win}'
                plot_submove_check(
                    FIGDIR=FIGDIR, FIGNAME=FIGNAME, SAVE_PLOT=SAVE_PLOT,
                    SHOW_PLOT=SHOW_PLOT, SUBMOVE_version=SUBMOVE_version,
                    windat=windat, win_submove_bool=win_submove_bool,
                    ema_win=ema_win, str_day=str_day,i_win=i_win, hr_win=hr_win,
                )

            # TODO: EXTRACT FEATURES HERE FROM SUBMOVE data

            ### acc-features from full window


            ### acc-features from sub-movement-data

            # get one array with svm data of all submovements from window

            ### SM_MERGE
            merged_sm_svm = np.array([value for sm in sm_win_data for value in sm.svm])
            # heart-rate samples of exactly 0 are replaced with NaN, and the
            # cleaned series is the one that is passed on
            # NOTE(review): the hr-features have to be NaN-aware for this -
            # confirm in ft_extr.SubmoveData2Feat
            hr_raw = hr_win[' HeartRate'].values.astype(float)
            hr = np.where(hr_raw == 0, np.nan, hr_raw)

            sm_ft_class = ft_extr.SubmoveData2Feat(
                acc_svm=merged_sm_svm,
                hr=hr,
                sm_durations=[s.duration for s in sm_win_data],
            )

            # extract all feats that are defined in FEATS_INCL on MERGED SM data
            for ft in FEATS_INCL:
                is_acc_feat = 'sm_' not in ft and 'hr_' not in ft
                if ACC_FEATS_on_SINGLE_MOVES and is_acc_feat:
                    # do not add merged acc feats if they are calculated on single moves
                    continue
                value = getattr(sm_ft_class, f'run_{ft}')()  # extra brackets () for executing function
                FEAT_STORE[ft].append(value)

            ### SM SINGLES
            if ACC_FEATS_on_SINGLE_MOVES:
                for ft in [f for f in FEATS_INCL if ('sm_' not in f and 'hr_' not in f)]:
                    # skip heartrate and sm-duration features here
                    ft_win_list = []  # store ft-values per sm within window
                    if len(sm_win_data) == 0:
                        # windows without submovements get one single zero
                        ft_win_list.append(0)
                    else:
                        for sm in sm_win_data:
                            single_sm_class = ft_extr.SubmoveData2Feat(acc_svm=sm.svm,)
                            value = getattr(single_sm_class, f'run_{ft}')()
                            ft_win_list.append(value)  # add value per submovement
                    # add summarized scores per window
                    FEAT_STORE[f'{ft}_SMmean'].append(np.nanmean(ft_win_list))
                    FEAT_STORE[f'{ft}_SMcfvar'].append(variation([v for v in ft_win_list if not np.isnan(v)]))


            # TODO: try for cnn -> interpolate single sm-features into
            # n=100 (zB) mask, 0-padding for n-sm <100 windows

            ### extract EMA values per window
            for EMA_ITEM in list(Y_STORE.keys()):
                Y_STORE[EMA_ITEM].append(ema_win[EMA_CODING[EMA_ITEM]])
            




In [None]:
for module in (acc_fts, ft_extr):
    importlib.reload(module)

# run every included feature on the last created feature-class and print it
for ft in FEATS_INCL:

    run_feature = getattr(sm_ft_class, f'run_{ft}')
    value = run_feature()

    print(f'{ft}: {value}')



in case later necessary:

In [None]:
def round_datetime(t, T_RES_Sec = .1):
    """
    Round the sub-second part of a datetime to a given time resolution.

    Arguments:
    - t: datetime object
    - T_RES_Sec: time resolution in seconds (default 0.1 sec); has to divide
        one second in equal steps, e.g. 0.5, 0.1, 0.01, 1

    Returns:
    - datetime rounded to the nearest multiple of T_RES_Sec, halves are
        rounded up; a full second is added when the microseconds round up
        to the next second

    Raises:
    - ValueError if T_RES_Sec does not divide one second in equal steps
    """
    # integer microseconds prevent float truncation during rounding
    step_us = int(round(T_RES_Sec * 1e6))
    if step_us <= 0 or 1_000_000 % step_us != 0:
        raise ValueError(
            f'T_RES_Sec has to divide one second in equal steps, got {T_RES_Sec}'
        )

    micro = (t.microsecond + step_us // 2) // step_us * step_us

    # add full second if rounding goes to 1e6 microseconds
    if micro >= 1_000_000:
        return t.replace(microsecond=0) + dt.timedelta(seconds=1)

    # else replace rounded microseconds
    return t.replace(microsecond=micro)

## 4. Evaluate extracted Features

- Hssayeni et al, Scientific Reports 2021
    - strongest wrist-features: angular velocity, standard deviation, power of secondary frequency, power of 1–4 Hz band, and Shannon Entropy (r = 0.82  - r = 0.75)

- from svm: classic features
- include cross-corr between pc1 and pc2


Evaluate Features per window
- distinction between extraction over merged-svm or per-submovement is done in extraction code above

In [None]:
def normalize_values(values, ZSCORE=True, LOG=True, return_kept_idx=False,):
    """
    Z-score and/or log-transform a 1d series of values.

    Arguments:
    - values: 1d array-like with numeric values
    - ZSCORE: subtract the mean and divide by the std (both NaN-ignoring)
    - LOG: take the natural log, performed after the z-scoring;
        NaNs and zeros are removed before the log is taken
    - return_kept_idx: also return the indices of the values that are kept

    Returns:
    - values: array with the normalized values
    - kept_idx (only if return_kept_idx): indices within the input of the
        returned values; all indices if LOG is False

    NOTE(review): z-scored values are negative for every value below the
    mean, and np.log() returns NaN for these; confirm whether the log should
    be taken before the z-scoring instead.
    """
    values = np.asarray(values, dtype=float)
    # defined for all settings, so return_kept_idx also works without LOG
    kept_idx = np.arange(len(values))

    # zscore
    if ZSCORE:
        values = (values - np.nanmean(values)) / np.nanstd(values)

    # log transform
    if LOG:
        # remove zeros and nans
        keep = np.logical_and(~np.isnan(values), values != 0.0)
        kept_idx = kept_idx[keep]
        values = np.log(values[keep])

    if return_kept_idx:
        return values, kept_idx

    return values

In [None]:
# array with one row of window-values per feature
X = np.array(list(FEAT_STORE.values()))
# get array with length n-windows
y = np.array(Y_STORE['LID']).astype(float)

# for i_win, (X_win, y_win) in enumerate(zip(X.T, y)):
#     print(X_win.shape, y_win)



# one boxplot-figure per feature: feature values split on EMA score
for i_ft, (ft_X, ft_name) in enumerate(zip(X, FEAT_STORE)):

    # start every feature with the full series of EMA scores
    y = np.array(Y_STORE['LID']).astype(float)

    # print(ft_X.shape, ft_name, len(y))

    # normalize, and keep only the EMA scores of the remaining windows
    ft_X, kept_idx = normalize_values(
        ft_X, ZSCORE=True, LOG=True, return_kept_idx=True,
    )
    print(len(kept_idx), len(y))
    y = y[kept_idx]

    fig, ax = plt.subplots(1,1, figsize=(8, 3))

    # one box per possible EMA score (1 - 9)
    box_values = {}
    for score in np.arange(1, 10):
        y_value = str(score)
        matching_values = ft_X[y == float(y_value)]
        # get rid of NaNs
        matching_values = matching_values[~np.isnan(matching_values)]
        box_values[y_value] = list(matching_values)

    ax.boxplot(box_values.values())

    ax.set_xticklabels(list(box_values.keys()))

    ax.set_ylabel(ft_name)
    ax.set_title(ft_name)
    ax.set_xlabel('EMA Dyskinesia severity (Likert points)')


    plt.show()

In [None]:
# for c, m in zip(FEAT_STORE['sm_count'], FEAT_STORE['sm_duration_mean']):
#     print(f"# {c},\t\t{round(c * m / 6, 1)} segments,\t\tmean: {round(m, 1)} s")

Check feature distributions
- pre and post z-scoring

In [None]:
# array with one row of window-values per feature
X = np.array(list(FEAT_STORE.values()))
# get array with length n-windows
y = np.array(Y_STORE['LID']).astype(float)

# one figure per feature: raw values (left) vs normalized values (right)
for ftname, ftvalues in FEAT_STORE.items():
    fig, axes = plt.subplots(1, 2, figsize=(8, 3))
    ax_raw, ax_norm = axes[0], axes[1]

    ax_raw.hist(ftvalues)
    ax_raw.set_ylabel('count (n)')
    ax_raw.set_xlabel('raw values')

    # z-score and take log values
    ftvalues = normalize_values(ftvalues, ZSCORE=True, LOG=True)

    ax_norm.hist(ftvalues)
    ax_norm.set_ylabel('count (n)')
    ax_norm.set_xlabel('log. z-scored values')

    plt.suptitle(ftname)
    plt.tight_layout()

    plt.show()

### Predict EMAs

In [None]:
from sklearn.model_selection import StratifiedKFold


from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.metrics import (
    accuracy_score, r2_score,
    roc_auc_score, balanced_accuracy_score
)
from scipy.stats import f as stats_f
from sklearn.feature_selection import f_regression, r_regression


In [None]:
def get_f_stat(y_pred, y_true, n_feats):
    """Return the overall F-statistic of a prediction and its p-value.

    The statistic compares the variance explained by the prediction
    (regression mean square) with the residual variance (error mean square).

    Parameters
    ----------
    y_pred : array-like
        Predicted values. Flattened to 1-D before use.
    y_true : array-like
        Observed values, same number of elements as ``y_pred``. Flattened to
        1-D before use, so column vectors of shape (n, 1) are accepted.
    n_feats : int
        Number of features used by the model (regression degrees of freedom).

    Returns
    -------
    F : float
        F-statistic; NaN if either degrees-of-freedom term is not positive.
    f_pval : float
        Upper-tail probability of ``F`` under the F(n_feats, n - n_feats - 1)
        distribution.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_true`` do not contain the same number of values.
    """
    # Flatten both inputs: mixing (n,) and (n, 1) arrays would otherwise
    # broadcast to an (n, n) matrix and silently produce a wrong statistic.
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    y_true = np.asarray(y_true, dtype=float).ravel()
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f'y_pred and y_true differ in size: {y_pred.size} vs {y_true.size}'
        )

    # Sums of squares
    y_mean = np.mean(y_true)
    SSR = np.sum((y_pred - y_mean) ** 2)   # Regression
    SSE = np.sum((y_true - y_pred) ** 2)   # Error

    # Degrees of freedom
    df_reg = n_feats
    df_err = len(y_true) - (n_feats + 1)  # +1 for the intercept term

    # Mean squares (NaN when the degrees of freedom are not positive)
    MSR = SSR / df_reg if df_reg > 0 else np.nan
    MSE = SSE / df_err if df_err > 0 else np.nan
    F = MSR / MSE

    # Survival function instead of 1 - cdf: keeps precision for tiny p-values
    f_pval = stats_f.sf(F, df_reg, df_err)

    return F, f_pval

cross-validation n=1 concept

Remove NaNs: drop the heart-rate features, then drop every window that still contains a NaN

In [None]:
# Feature matrix: stacked FEAT_STORE values, transposed so that each
# FEAT_STORE entry becomes one column
X = np.array(list(FEAT_STORE.values())).T
# LID scores as a float column vector
y = np.array(Y_STORE['LID']).astype(float).reshape(-1, 1)

print(X.shape, y.shape)

# Leave out every feature whose name contains 'hr_' (excluded because of NaNs)
hr_sel = ['hr_' in ft_key for ft_key in FEAT_STORE]
X = X[:, np.invert(np.array(hr_sel))]

print(X.shape, y.shape)

# TODO: find out where the remaining NaNs originate during feature creation
# Keep only the rows of X (and matching entries of y) without any NaN
nonnan_sel = ~np.isnan(X).any(axis=1)
X, y = X[nonnan_sel, :], y[nonnan_sel]

print(X.shape, y.shape)


In [None]:


def _report_prediction_stats(y_hat, y_obs):
    """Print and return F-test and Pearson statistics of y_hat against y_obs.

    Both arguments are 1-D arrays of equal length. Returns the raw results of
    ``f_regression`` and ``pearsonr`` as a tuple ``(f_result, r_result)``.
    """
    f_result = f_regression(y_hat.reshape(-1, 1), y_obs)
    r_result = pearsonr(y_hat, y_obs)

    print(f'\tF-stat (skl): {round(f_result[0][0], 1)}, '
          f'p = {round(f_result[1][0], 5)}')
    print(f'\tPearson-R (skl): {round(r_result[0], 1)}, '
          f'p = {round(r_result[1], 5)}\n\n')

    return f_result, r_result


# The estimator, f_regression and pearsonr all expect 1-D targets; passing
# the (n, 1) column vector triggers conversion warnings and only works with
# SciPy versions whose pearsonr accepts 2-D input. Work on a flat view.
y_flat = np.ravel(y)

# Stratified 4-fold cross-validation with a fixed seed for reproducibility
skf = StratifiedKFold(n_splits=4, random_state=27, shuffle=True,)

# out-of-fold predictions, filled fold by fold
y_pred_total = np.zeros_like(y_flat)

for i_fold, (train_idx, test_idx) in enumerate(skf.split(X, y_flat)):

    clf = LDA()
    clf.fit(X[train_idx], y_flat[train_idx])

    y_pred = clf.predict(X[test_idx])
    y_true = y_flat[test_idx]

    y_pred_total[test_idx] = y_pred

    print(f'Fold # {i_fold}:')
    f_regr_skl, pearson_r = _report_prediction_stats(y_pred, y_true)

# Scatter of out-of-fold predictions vs. true scores; uniform jitter
# separates points that share the same integer score.
JIT_SIZE=.3
x_jitter = np.random.uniform(low=-JIT_SIZE, high=JIT_SIZE, size=len(y_flat))
y_jitter = np.random.uniform(low=-JIT_SIZE, high=JIT_SIZE, size=len(y_flat))
plt.scatter(y_pred_total + x_jitter,
            y_flat + y_jitter,
            s=30, alpha=.3,)
plt.show()

# Statistics over all out-of-fold predictions together
print(f'TOTALLL CV')
f_regr_skl, pearson_r = _report_prediction_stats(y_pred_total, y_flat)

proof of concept:
- 1 sec segments
- mean as averaging method
- LDA for EMA prediction

In [None]:
# Candidate estimators, selected via CLSF:
#   'scale': linear regression on the scores
#   'lda':   linear discriminant analysis with the scores as classes
#   'bin':   logistic regression on binarised scores (score >= 4)
models = {'scale': LinearRegression(),
          'lda': LDA(),
          'bin': LogisticRegression()}


CLSF = 'lda'

# Observed scores; binarised once up front so that training and evaluation
# labels always agree.
y_true = y_arr.astype(int)

# MAKE BINARY
if CLSF == 'bin':
    y_true = y_true >= 4

# Float buffer: an integer buffer would truncate continuous predictions
# (regression output, class probabilities) on assignment.
y_pred = np.zeros(y_true.shape, dtype=float)

# Run prediction

# leave one day out CV
for day in np.unique(daycode_arr):

    TEST_SEL = daycode_arr == day
    X_train = X_arr[~TEST_SEL, :]
    y_train = y_true[~TEST_SEL]

    # test cohort
    X_test = X_arr[TEST_SEL, :]

    # the same estimator instance is refitted for every held-out day
    model = models[CLSF]
    model.fit(X_train, y_train)

    if CLSF == 'bin':
        # AUROC needs a continuous score: probability of the positive class
        y_pred[TEST_SEL] = model.predict_proba(X_test)[:, 1]
    else:
        y_pred[TEST_SEL] = model.predict(X_test)


# Membership test instead of `CLSF == 'scale' or 'lda'`, which was always
# true and made the 'bin' branch unreachable.
if CLSF in ('scale', 'lda'):
    # round predictions to full numbers
    y_pred = np.round(y_pred).astype(int)

    pred_F, pred_F_p = get_f_stat(y_pred=y_pred, y_true=y_true, n_feats=X_arr.shape[1])
    pred_corrcoef, prs_p = pearsonr(y_true, y_pred)
    sk_f, sk_f_p = f_regression(y_pred.reshape(-1, 1), y_true)

    acc = accuracy_score(y_true, y_pred)
    R2 = r2_score(y_true=y_true, y_pred=y_pred,)

    print(f'({CLSF}) accuracy: {np.round(acc, 2)} (test sample: n={len(y_true)})')
    print(f'({CLSF}) R2: {np.round(R2, 2)} (test sample: n={len(y_true)})')
    print(f'({CLSF}) Corr-Coeff: {np.round(pred_corrcoef, 2)}, p={np.round(prs_p, 5)} (test sample: n={len(y_true)})')
    print(f'({CLSF}) F-stat: {np.round(pred_F, 2)}, p={pred_F_p}')
    print(f'({CLSF}) F-stat (sklearn): {np.round(sk_f, 2)}, p={np.round(sk_f_p, 5)}')


elif CLSF == 'bin':
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    print(f'({CLSF}) AUROC: {np.round(auc, 2)} (test sample: n={len(y_true)})')


In [None]:
# --- Figure 1: true and predicted scores plotted over the sample index ---
fig, ax = plt.subplots(1, 1, figsize=(8, 3))

# (data, line style) per trace; the wide true trace is drawn first so the
# thinner predicted trace stays visible on top of it
score_traces = (
    (y_true, dict(color='orange', lw=5, alpha=.5, label='true')),
    (y_pred, dict(color='purple', lw=2, alpha=.8, label='predicted')),
)
for trace_values, trace_style in score_traces:
    ax.plot(trace_values, **trace_style)

FONT_SIZE = 14
ax.set_xlabel('Samples (n)', size=FONT_SIZE)
ax.set_ylabel('EMA LID value', size=FONT_SIZE)
ax.legend(frameon=False, fontsize=FONT_SIZE, loc='upper right')

ax.tick_params(axis='both', size=FONT_SIZE, labelsize=FONT_SIZE,)
ax.spines[['right', 'top']].set_visible(False)

plt.tight_layout()

# Uncomment to store the figure:
# plt.savefig(os.path.join(load_utils.get_onedrive_path('figures'),
#              'proof_kin_pred', 'LID_pred_hm24_ses01_24nov'),
#              dpi=300, facecolor='w',)

plt.show()

# --- Figure 2: scatter of true vs. predicted scores ---
fig, ax = plt.subplots(1, 1, figsize=(4,4))

# uniform jitter in [-0.2, 0.2] spreads out points with identical scores
true_jitter = np.random.uniform(-.2, .2, len(y_true))
pred_jitter = np.random.uniform(-.2, .2, len(y_pred))

ax.scatter(y_true + true_jitter, y_pred + pred_jitter,)
ax.set_xlabel('true scores')
ax.set_ylabel('predicted scores')

plt.show()

Permutation test

In [None]:
# Permutation test: repeat the leave-one-day-out prediction N_PERMS times
# with shuffled training labels and collect the resulting F and Pearson-R
# statistics in perm_mets.
N_PERMS = 1000

# Placeholder estimator; inside the loop `model` is rebound to models[CLSF]
model = LDA()

perm_mets ={'F': [], 'R': []}

y_true = y_arr.astype(int)
day_codes = np.unique(daycode_arr)

# fixed seed so the label shuffles are reproducible
np.random.seed(27)

for i_perm in range(N_PERMS):

    # predictions of this permutation, filled one held-out day at a time
    y_perm_pred = np.full(len(y_arr), np.nan)

    for day in day_codes:

        TEST_SEL = daycode_arr == day
        X_train, X_test = X_arr[~TEST_SEL, :], X_arr[TEST_SEL, :]

        # boolean indexing returns a copy, so shuffling the training labels
        # in place leaves y_true untouched
        y_train = y_true[~TEST_SEL]
        np.random.shuffle(y_train)

        model = models[CLSF]
        model.fit(X_train, y_train)

        y_perm_pred[TEST_SEL] = model.predict(X_test)

    # round predictions to full numbers
    y_perm_pred = np.round(y_perm_pred)

    F, f_pvalue = get_f_stat(y_pred=y_perm_pred, y_true=y_true,
                             n_feats=X_test.shape[1])
    prs_stat, _ = pearsonr(y_perm_pred, y_true)

    perm_mets['F'].append(F)
    perm_mets['R'].append(prs_stat)




In [None]:
# Compare the observed prediction metrics with their permutation
# distributions: one histogram per metric, plus a one-sided p-value.
fig, axes = plt.subplots(1, 2, figsize=(8, 3))

pred_mets = {'F': pred_F, 'R': pred_corrcoef}

for i_ax, metr in enumerate(perm_mets):

    # Drop non-finite permutation statistics (e.g. Pearson-R is NaN when a
    # permuted prediction is constant); they would otherwise turn the
    # percentile into NaN.
    perm_vals = np.asarray(perm_mets[metr], dtype=float)
    perm_vals = perm_vals[np.isfinite(perm_vals)]

    if perm_vals.size:
        axes[i_ax].hist(perm_vals, color='gray', alpha=.5,)
        axes[i_ax].axvline(np.percentile(perm_vals, 95),
                           color='orange', alpha=.8, lw=3,
                           label='permuted\nalpha 0.05',)

    axes[i_ax].axvline(pred_mets[metr],
                       color='purple', alpha=.5, lw=1,
                       label='prediction',)

    # One-sided permutation p-value. Ties count as at least as extreme, and
    # adding 1 to numerator and denominator counts the observed value
    # itself, so the p-value can never be exactly 0.
    n_extreme = np.sum(perm_vals >= pred_mets[metr])
    p_calc = (n_extreme + 1) / (len(perm_vals) + 1)
    print(f'metric {metr}: p = {np.round(p_calc, 3)}')

    axes[i_ax].set_xlabel(f'{metr} score', size=14,)

    axes[i_ax].set_ylabel('count (n)', size=14)

    axes[i_ax].tick_params(axis='both', size=14, labelsize=14,)
    axes[i_ax].spines[['right', 'top']].set_visible(False)

axes[1].legend(frameon=False, fontsize=14,
               bbox_to_anchor=(.95, .5), loc='center left')

plt.tight_layout()

# Uncomment to store the figure:
# plt.savefig(os.path.join(load_utils.get_onedrive_path('figures'),
#              'proof_kin_pred', 'LID_pred_hm24_ses01_24nov_perms'),
#              dpi=300, facecolor='w',)

plt.show()