# In-clinic data to validate EMA with UPDRS

## 0. Import packages

- document versions for reproducibility

In [None]:
# import packages
import pandas as pd
import numpy as np
import os
import sys
import csv
import json
import importlib
from itertools import product, compress
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from scipy.signal import welch

In [None]:
# Record interpreter and package versions for reproducibility.
# scipy and matplotlib are imported here as top-level packages only to read
# their version strings (the notebook otherwise imports submodules of them).
import matplotlib
import scipy

print('Python sys', sys.version)
print('pandas', pd.__version__)
print('numpy', np.__version__)
print('scipy', scipy.__version__)
print('matplotlib', matplotlib.__version__)

# Presumably the output of an earlier run of this cell, kept for reference
# (it predates the scipy/matplotlib lines above):
"""
Python sys 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
pandas 2.1.1
numpy 1.26.0
"""

In [None]:
from utils import load_utils, load_data, prep_data
# from PerceiveImport.classes import main_class

In [None]:
# FOR DEBUGGING
# Re-import the local helper modules so that edits made to their source are
# used by the cells below.
importlib.reload(load_data)
importlib.reload(load_utils)

## 1. Import Data

### 1.1 Import EMA and UPDRS

In [None]:
# # SINGLE CONDITION
# CONDITION = 'm0s0'

# ema_df, updrs_df = load_data.get_EMA_UPDRS_data(condition=CONDITION)


In [None]:
importlib.reload(load_data)
importlib.reload(load_utils)


# Study IDs to exclude because their data is still missing,
# e.g. ['ema31', 'ema32', 'ema33', 'ema34'].
excl_ids = []

# Load the EMA and UPDRS tables for all 4 conditions, keyed by condition.
EMA, UPDRS = {}, {}

for COND in ['m0s0', 'm0s1', 'm1s0', 'm1s1']:
    ema_temp, updrs_temp = load_data.get_EMA_UPDRS_data(
        condition=COND, CONVERT_SCORES=True,
    )
    EMA[COND] = ema_temp
    UPDRS[COND] = updrs_temp

    # Remove excluded IDs with a boolean mask on 'study_id'. This drops every
    # matching row and does not depend on the dataframe having a default
    # 0..n-1 index (dropping by the position returned from np.where does).
    for label, store in (('EMA', EMA), ('UPDRS', UPDRS)):
        excl_sel = store[COND]['study_id'].isin(excl_ids)
        if excl_sel.any():
            dropped = store[COND].loc[excl_sel, 'study_id'].tolist()
            store[COND] = store[COND].loc[~excl_sel].reset_index(drop=True)
            print(f'drop {dropped} in {label} ({COND})')
        

## 2. Preprocess data

#### Get (mean-corrected) EMA and UPDRS values per symptom subtype

In [None]:
# Reload the helper modules so that recent edits to them are used.
importlib.reload(load_data)

importlib.reload(prep_data)


# Build one summary dataframe from the per-condition EMA and UPDRS tables.
# MEAN_CORR=True presumably requests the mean-corrected scores named in the
# section heading -- confirm in prep_data.get_sum_df.
sumdf = prep_data.get_sum_df(EMA_dict=EMA, UPDRS_dict=UPDRS,
                             MEAN_CORR=True,)

# sumdf

Split into training and test cohorts

In [None]:
# Divide the summary dataframe into a training and a test cohort.
train_subs, test_subs = prep_data.get_train_test_split(sumdf)

# Collect the index labels per cohort, keeping the row order of sumdf.
train_rows = [sub for sub in sumdf.index if sub in train_subs]
test_rows = [sub for sub in sumdf.index if sub in test_subs]

traindf = sumdf.loc[train_rows]
testdf = sumdf.loc[test_rows]

print(traindf.shape, testdf.shape)

## Explore EMA x UPDRS correlations

In [None]:
from scipy.stats import ttest_rel, pearsonr

In [None]:
def scatter_EMA_UPDRS(
    ax, dat_df,
    EMA_subscore = 'brady',
    UPDRS_subscore = 'brady',
    show_updrs_improve=True,
):
    """Scatter pooled EMA against UPDRS sub-scores and report Pearson's R.

    Values are pooled over the four conditions ('m0s0', 'm0s1', 'm1s0',
    'm1s1'). Per condition, rows in which either score is missing are
    dropped, so the EMA and UPDRS values always stay paired.

    Parameters
    ----------
    ax : object with the matplotlib Axes plotting methods used below
        (scatter, axhline, axvline, set_title, set_xlabel, set_ylabel).
    dat_df : pandas.DataFrame
        Must contain the columns 'EMA_SUM_<EMA_subscore>_<cond>' and
        'UPDRS_SUM_<UPDRS_subscore>_<cond>' for each of the four conditions.
    EMA_subscore, UPDRS_subscore : str
        Sub-score names used to build the column names and labels.
    show_updrs_improve : bool
        If True, the UPDRS values are multiplied by -1 before plotting and
        the x-axis is labelled as improvement.

    Returns
    -------
    ax : the axes object that was passed in, with the plot drawn on it.

    Notes
    -----
    With fewer than two paired observations no correlation can be computed;
    R and p are then shown as nan in the title instead of raising.
    """
    ema_parts, updrs_parts = [], []

    for COND in ['m0s0', 'm0s1', 'm1s0', 'm1s1']:

        ema_v = dat_df[f'EMA_SUM_{EMA_subscore}_{COND}']
        updrs_v = dat_df[f'UPDRS_SUM_{UPDRS_subscore}_{COND}']

        # keep only rows where BOTH scores are present (pairwise deletion)
        paired = np.logical_and(pd.notna(ema_v), pd.notna(updrs_v))
        ema_parts.append(np.asarray(ema_v[paired], dtype=float))
        updrs_parts.append(np.asarray(updrs_v[paired], dtype=float))

    ema_values = np.concatenate(ema_parts)
    updrs_values = np.concatenate(updrs_parts)

    # plot UPDRS clinical IMPROVEMENT
    if show_updrs_improve:
        updrs_values = updrs_values * -1
        ax.set_xlabel(f'UPDRS-improvement {UPDRS_subscore}\n(high: less symptoms)')

    else:
        ax.set_xlabel(f'UPDRS {UPDRS_subscore}\n(low: less symptoms)')

    ax.scatter(updrs_values, ema_values)
    ax.axhline(y=0, c='gray', alpha=0.3)
    ax.axvline(x=0, c='gray', alpha=0.3)

    # NaNs were already removed pairwise above; filtering x and y separately
    # at this point could only misalign the pairs.
    if len(ema_values) < 2:
        R, pval = np.nan, np.nan
    else:
        R, pval = pearsonr(updrs_values, ema_values)

    ax.set_title(f'{EMA_subscore}  R: {np.round(R, 2)}, p={np.round(pval, 5)}')
    ax.set_ylabel(f'EMA {EMA_subscore}\n(high: less symptoms)')

    return ax

In [None]:
# Base folder for saved figures, looked up via load_utils with the key
# 'emaval_fig'.
figpath = load_utils.get_onedrive_path('emaval_fig')


In [None]:
figname = 'motor_corr_meanCorrvalues_brady9'

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# One panel per motor sub-score: EMA vs. UPDRS improvement, training data.
# scatter_EMA_UPDRS draws on the axes it is given, so its return value
# (the same axes object) does not need to be stored.
for ax, subscore in zip(axes, ['brady', 'tremor', 'gait']):

    scatter_EMA_UPDRS(
        ax=ax, dat_df=traindf,
        EMA_subscore=subscore,
        UPDRS_subscore=subscore,
        show_updrs_improve=True,
    )

plt.tight_layout()

# savefig does not create missing folders, so make sure the target exists.
fig_dir = os.path.join(figpath, 'train_data', 'ema_updrs_corr')
os.makedirs(fig_dir, exist_ok=True)

plt.savefig(os.path.join(fig_dir, figname), dpi=300,
            facecolor='w',)

plt.show()

Stats ema x updrs

In [None]:
# Show which columns are available in the training dataframe.
print(traindf.columns)

In [None]:
# Refresh prep_data, then derive the dataframe that the mixed-model cells
# below operate on (built from the training cohort).
importlib.reload(prep_data)

lmm_df = prep_data.get_lmm_df(traindf)

# Sanity check: dimensions, column names and dtypes.
for summary in (lmm_df.shape, lmm_df.columns, lmm_df.dtypes):
    print(summary)
    print()

In [None]:
import utils.stats as utilsstat
import statsmodels.formula.api as smf


In [None]:
importlib.reload(utilsstat)

# set target motor symptom
motor_target = 'brady'

# Candidate fixed-effect terms (right-hand side of the model formula):
# either the EMA score of the target symptom only, or all three motor EMA
# scores; both variants include the non-motor EMA score.
lmm_fix = {
    'single_motor': f"EMA_SUM_{motor_target} + EMA_SUM_nonmotor",
    'all_motor': (
        "EMA_SUM_brady + EMA_SUM_tremor + "
        "EMA_SUM_gait + EMA_SUM_nonmotor"
    )
}

FIX_EFF = 'all_motor'

# Random intercepts only (one group per subject id)
model = smf.mixedlm(
    f"UPDRS_SUM_{motor_target} ~ {lmm_fix[FIX_EFF]}",
    lmm_df,
    groups=lmm_df["subid"],
    # re_formula=f"~EMA_SUM_{motor_target}",  # for random slopes for EMA motor
)
result = model.fit()
print(result.summary())

## calculate explained variances
R2_marg, R2_cond = utilsstat.calc_expl_variances(fitted_model=result)

# the trailing space after the comma keeps the two parts of the message apart
print(f"for {motor_target}: R2_marginal {np.round(R2_marg, 3)}, "
      f"R2_conditional: {np.round(R2_cond, 3)}")



Show individual differences in EMA-point vs UPDRS-change

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(12, 4),
                         sharey=True, )

for i_trg, target in enumerate(['brady', 'tremor', 'gait']):

    # per-subject slope of a straight-line fit of UPDRS on EMA
    # (UPDRS points per EMA point)
    id_coefs = []

    for subid in np.unique(lmm_df['subid']):

        sub_sel = lmm_df['subid'] == subid
        x = lmm_df.loc[sub_sel, f'EMA_SUM_{target}']
        y = lmm_df.loc[sub_sel, f'UPDRS_SUM_{target}']

        # fit only on observations where both scores are available
        paired = x.notna() & y.notna()
        x, y = x[paired], y[paired]

        # a slope needs at least two points
        if len(x) < 2:
            print(f'skip {subid} ({target}): fewer than 2 paired observations')
            continue

        try:
            coef = np.polyfit(x, y, 1)[0]
        except (np.linalg.LinAlgError, ValueError, TypeError):
            coef = np.nan

        if not np.isfinite(coef):
            # Fit failed. Keep the original fallback (slope 0 when EMA and
            # UPDRS values are identical); otherwise skip the subject rather
            # than re-using the slope of the previous subject.
            if (x == y).all():
                coef = 0
            else:
                print(f'skip {subid} ({target}): linear fit failed')
                continue

        id_coefs.append(coef)

    axes[i_trg].hist(id_coefs)
    axes[i_trg].set_xlabel('delta UPDRS point / EMA point')
    axes[i_trg].set_ylabel(f'Observations for {target} (n subjects)')

plt.show()

### LFP analysis (not included)

To do:
- double-check that selecting the "rest" task does not exclude data
- include the stim-amplitude data rows to double-check s0 vs s1



In [None]:
# Reload load_utils so that recent edits to the module are used.
importlib.reload(load_utils)

In [None]:

# Table of study ids: indexed by EMA id, with the Percept subject ('prc_id')
# and session ('prc_ses') read per row below.
ids = load_data.get_ids()

# Conditions to skip per EMA id: either a list of condition names or the
# string 'all' to skip the id completely.
SKIP_LFPs = {
    'ema03': ['m0s1'],
    'ema07': ['m1s0', 'm1s1'],  # no m1 done: always ['m1s0', 'm1s1']
    'ema09': ['m1s0', 'm1s1'],  # no m1 done: always ['m1s0', 'm1s1']
    'ema10': ['m1s0', 'm1s1'],  # no m1 done: always ['m1s0', 'm1s1']
    'ema12': ['m1s0', 'm1s1'],  # no m1 done: always ['m1s0', 'm1s1']
    'ema14': 'all',  # no m1 done, m0s1 not found in motherfolder
    # 'ema14': ['m1s0', 'm1s1', 'm0s1'],  # ONLY m0s0; EXCLUDE?!
    'ema15': ['m1s0', 'm1s1'],  # no m1 done: always ['m1s0', 'm1s1']
    'ema16': ['m1s0', 'm1s1']  # ONLY m0s0; EXCLUDE?!
    # 'ema16': 'all'  # no m1 done: always ['m1s0', 'm1s1']
}

# Loaded signals, keyed by '<ema_id>_<condition>'.
lfp_data = {}

# Visit every (id, condition) combination.
for ema_id, COND in product(ids.index,
                            ['m0s0', 'm0s1', 'm1s0', 'm1s1']):
    if ema_id in SKIP_LFPs.keys():
        if COND in SKIP_LFPs[ema_id] or SKIP_LFPs[ema_id] == 'all':
            print(f'\n#### SKIP {ema_id} {COND}, not percept ready ####\n')
            continue

    sub = ids.loc[ema_id]['prc_id']
    ses = ids.loc[ema_id]['prc_ses']

    print(f'\nGET LFP {ema_id}, {sub}, {ses}, {COND}')

    # load session that corresponds to current selection
    ### TODO: 'REST' is hardcoded currently, check for issues with task like rest&tap
    # NOTE(review): main_class is not imported anywhere in the visible part of
    # this notebook (the line `from PerceiveImport.classes import main_class`
    # near the top is commented out), so this call raises NameError unless
    # that import is re-enabled.
    sub_data = main_class.PerceiveData(
        sub = sub, 
        incl_modalities=['streaming'],
        incl_session = [ses],
        incl_condition =[COND,],
        incl_task = ["rest"],
        import_json=False, # for additionally loading the corresponding JSON files as source files, set to True
        warn_for_metaNaNs=True, # True will give you a warning with rows from the metadata table with NaNs. Make sure you have filled out all columns of the file you want to load.
        allow_NaNs_in_metadata=True,
    )

    # navigate: streaming -> session -> condition -> rest -> run1
    dat = getattr(sub_data.streaming, ses)
    # only include first two data rows (left and right STN signal)
    dat = getattr(dat, COND).rest.run1.data.get_data()[:2, :]
    ### TODO: include stimulation amplitude data streams to double check whether s0 vs s1 is correct
    lfp_data[f'{ema_id}_{COND}'] = dat

In [None]:
# Earlier alternative: a local 'figures/lfp_preprocess' folder next to the
# working directory.
# FIG_PATH = os.path.join(os.path.dirname(os.getcwd()), 'figures', 'lfp_preprocess')
# Base folder for figures; it is read as a global by
# plot_single_lfp_preprocess when saving.
FIG_PATH = load_utils.get_onedrive_path('emaval_fig')
# Print the path together with a check that it exists on this machine.
print(f'CHECK FIG_PATH: {FIG_PATH}, exists? -> {os.path.exists(FIG_PATH)}')

In [None]:
def plot_single_lfp_preprocess(
    DAT,
    SUB = 'emaXX',
    COND = 'm0s0',
    N_STD_OUTLIER = 3,
    LOWPASS = 2,
    HIGHPASS = 45,
    SFREQ=250,
    SHOWPLOTS=False,
    SAVEPLOTS=True,
):
    """Plot the time series and power spectrum of one LFP recording.

    Creates a 2x2 figure: the top row shows the signal before and after
    outlier removal, the bottom row the Welch power spectrum of the cleaned
    and filtered signal; one column per hemisphere (left, right).

    Relies on the notebook globals `ids` (id table with a 'prc_id' column),
    `prep_data` and, when saving, `FIG_PATH`.

    Parameters
    ----------
    DAT : mapping
        Recordings keyed by '<SUB>_<COND>'; the first two entries of a
        recording are treated as left and right signal (assumed to be 1-D
        sample arrays -- confirm against the loader).
    SUB, COND : str
        Subject id and condition selecting the recording.
    N_STD_OUTLIER : float
        Samples further than this many standard deviations from zero are
        dropped (the signal is not mean-centred first).
    LOWPASS, HIGHPASS : float
        Passed as `low` and `high` to prep_data.lfp_filter (presumably the
        lower and upper band edge in Hz -- confirm in prep_data).
    SFREQ : float
        Sampling rate in Hz; used for the epoch selection, the time axis
        and the spectrum.
    SHOWPLOTS : bool
        Show the figure if True, otherwise close it.
    SAVEPLOTS : bool
        Save the figure as 'PSD_check_<SUB>_<COND>' in the folder
        FIG_PATH/lfp_preprocess (created if missing).
    """
    lfp_times = prep_data.get_lfp_times()
    prc_id = ids.loc[SUB]['prc_id']

    fig, axes = plt.subplots(2, 2)
    for i, (arr, side) in enumerate(
        zip(DAT[f'{SUB}_{COND}'][:2], ['left', 'right'])
    ):
        arr = arr.copy()  # do not overwrite original dict data

        # restrict to the rest epoch if task timings are known for this id
        if prc_id in lfp_times:
            t_start, t_end = lfp_times[prc_id][COND]['rest']
            # seconds -> samples; slice indices must be integers
            i_start = int(round(t_start * SFREQ))
            i_end = int(round(t_end * SFREQ))
            arr = arr[i_start:i_end]

        ### plot raw signal (not yet filtered at this point)
        axes[0, i].plot(arr, color='blue', alpha=.3, label='raw filtered',)

        ### handle outliers
        threshold = N_STD_OUTLIER * np.std(arr)
        sel = np.logical_or(arr > threshold, arr < -threshold)
        # arr[sel] = np.nan  # replace outliers with NaNs
        arr = arr[~sel]  # drop outliers (this shortens the time axis)

        ### plot resulting arr
        axes[0, i].plot(arr, color='blue', label='cleaned',)
        axes[0, i].set_title(f'{SUB} {COND} {side} STN', weight='bold')
        axes[0, i].set_ylabel(f'{side}-STN activity (yVolt)')
        # one tick per minute of remaining samples
        xticks = np.arange(0, len(arr), SFREQ * 60)
        axes[0, i].set_xticks(xticks)
        axes[0, i].set_xticklabels(np.arange(len(xticks)))
        axes[0, i].set_xlabel('Time (minutes)')
        axes[0, i].set_ylim(-50, 50)
        # axes[0, i].legend(loc='upper right', frameon=False,)  # legend

        ### plot PSD
        arr = prep_data.lfp_filter(signal=arr, low=LOWPASS, high=HIGHPASS,)
        f, psx = welch(arr, fs=SFREQ,)
        axes[1, i].plot(f, psx)
        axes[1, i].set_ylabel(f'{side}-STN Power (a.u.)')
        axes[1, i].set_xlim(0, 45)
        axes[1, i].set_xlabel('Freq (Hz)')

    plt.tight_layout()

    if SAVEPLOTS:
        # savefig does not create missing folders
        save_dir = os.path.join(FIG_PATH, 'lfp_preprocess')
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(os.path.join(save_dir, f'PSD_check_{SUB}_{COND}'),
                    facecolor='w', dpi=150,)
    if SHOWPLOTS:
        plt.show()
    else:
        plt.close()

CHECK missing LFP sessions

check motherfolder:
- ema16, sub105: too many runs? UPDRS tasks? 3 rest m0s0, 2 rest m0s1?
- ema14: only m0s0, leave out only one state

In [None]:
# EMA ids for which at least one condition is present in lfp_data
# (keys have the form '<ema_id>_<condition>').
lfp_done = np.unique([k.split('_')[0] for k in lfp_data.keys()])

# ids from the id table without any loaded LFP data
lfp_todo = [s for s in ids.index if s not in lfp_done]

print(lfp_todo)



In [None]:
# Try to load every condition for the ids that have no LFP data yet.
# sub_data is overwritten on each iteration and not used further in this
# cell, so only the messages printed during loading are of interest.
# NOTE(review): main_class is not imported anywhere in the visible part of
# this notebook (the line `from PerceiveImport.classes import main_class`
# near the top is commented out), so this cell raises NameError unless that
# import is re-enabled.
for sub in lfp_todo:

    print(f'\n{sub}  -> sub-{ids.loc[sub]["prc_id"]} @ {ids.loc[sub]["prc_ses"]}')
    for COND in ['m0s0', 'm0s1', 'm1s0', 'm1s1']:
        print(f'\t{COND}')
        sub_data = main_class.PerceiveData(
            sub = ids.loc[sub]['prc_id'],
            incl_modalities=['streaming'],
            incl_session = [ids.loc[sub]['prc_ses']],
            incl_condition =[COND,],
            incl_task = ["rest"],
            import_json=False, # for additionally loading the corresponding JSON files as source files, set to True
            warn_for_metaNaNs=True, # True will give you a warning with rows from the metadata table with NaNs. Make sure you have filled out all columns of the file you want to load.
            allow_NaNs_in_metadata=True,
        )

#### Select relevant ephys epochs based on task timings

In [None]:
# Task timings (indexed below as lfp_times[prc_id][condition]['rest']) and
# the id table that maps EMA ids to Percept ids.
lfp_times = prep_data.get_lfp_times()
ids = load_data.get_ids()


In [None]:
# Example: plot the rest epoch of the first channel for one subject/condition.
Fs = 250  # sampling rate used to convert seconds to samples
sub = 'ema01'
con = 'm0s0'
lfp_sub = ids.loc[sub]['prc_id']

rest_times = lfp_times[lfp_sub][con]['rest']
# Slice indices must be integers, so convert explicitly in case the task
# times are stored as floats.
rest_samples = [int(round(rest_times[0] * Fs)),
                int(round(rest_times[1] * Fs))]

plt.plot(lfp_data[f'{sub}_{con}'][0][rest_samples[0]:rest_samples[1]])

### TODO:
# check that selecting by seconds works for all available data
# correct 'rest' tasks if troublesome, i.e. rest&tap
# check s0 and s1 versus stim-amplitude time series
# plot individual PSDs
# calculate beta-powers X UPDRS correlations
# draft whether, and if so how, to include movement parts

#### Plot and save spectral preprocessing

In [None]:
# Subjects with at least one recording in lfp_data
# (keys have the form '<ema_id>_<condition>').
lfp_subs = np.unique([key.split('_')[0] for key in lfp_data])

# lfp_subs = ['ema01', 'ema08']

# Create and save the preprocessing figure for every available recording.
for SUB, COND in product(lfp_subs, ['m0s0', 'm0s1', 'm1s0', 'm1s1']):

    print(f'\n### {SUB}, {COND}')

    if f'{SUB}_{COND}' in lfp_data:
        plot_single_lfp_preprocess(
            DAT=lfp_data,
            SUB=SUB,
            COND=COND,
            N_STD_OUTLIER=6,
            SHOWPLOTS=False,
            SAVEPLOTS=True,
        )
    else:
        print(f'...skip {SUB}, {COND}')
