## Prediction with Movement dependency


Notebook to train and test prediction models that use movement-dependent and/or movement-independent data and features.

In [1]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import csv
from dataclasses import dataclass, field, fields
from itertools import compress, product
import pandas as pd
import numpy as np
import sklearn as sk
from scipy import signal, stats

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
def get_project_path_in_notebook(
    subfolder: str = '',
):
    """
    Finds path of projectfolder from Notebook.
    Start running this once to correctly find
    other modules/functions.

    Walks upwards from the current working directory
    until a path ending in 'dyskinesia_neurophys' is found.

    Arguments:
        - subfolder: accepted for backward compatibility,
            currently ignored; the project folder itself
            is always returned

    Returns:
        - path (str): path of the project folder

    Raises:
        - FileNotFoundError: if neither the working directory
            nor any of its parents ends with
            'dyskinesia_neurophys'
    """
    project_name = 'dyskinesia_neurophys'
    start_path = os.getcwd()
    path = start_path

    while not path.endswith(project_name):
        parent = os.path.dirname(path)
        # dirname() of a filesystem root returns the root itself,
        # stop there instead of looping forever
        if parent == path:
            raise FileNotFoundError(
                f'"{project_name}" not found in any parent '
                f'folder of {start_path}'
            )
        path = parent

    return path

In [3]:
# project folder; its 'code' subfolder is used below as working directory for the own imports
projectpath = get_project_path_in_notebook()


Import own functions

In [70]:
os.chdir(os.path.join(projectpath, 'code'))

# own utility functions
import utils.utils_fileManagement as utilsFiles

# own data exploration functions
import lfpecog_preproc.preproc_import_scores_annotations as importClin
import lfpecog_analysis.ft_processing_helpers as ftProc
import lfpecog_analysis.psd_lid_stats as lidStats
import lfpecog_analysis.ft_processing_helpers as ftProc
import lfpecog_features.feats_spectral_helpers as specHelp
import lfpecog_analysis.get_acc_task_derivs as getAccTask

import lfpecog_predict.prepare_predict_arrays as predArrays
import lfpecog_features.extract_ssd_features as ssdFeats

from lfpecog_plotting.plotHelpers import get_colors
import lfpecog_plotting.plotHelpers as pltHelp
import lfpecog_plotting.plot_FreqCorr as plotFtCorrs
import lfpecog_plotting.plot_SSD_feat_descriptives as plot_ssd_descr

check versions

In [5]:
from  matplotlib import __version__ as plt_version

# report interpreter and package versions for documentation and reproducibility
for pkg_label, pkg_version in (
    ('Python sys', sys.version),
    ('pandas', pd.__version__),
    ('numpy', np.__version__),
    ('matplotlib', plt_version),
):
    print(pkg_label, pkg_version)
# versions noted down by the author:
# Python sys 3.9.0 (default, Nov 15 2020, 08:30:55) [MSC v.1916 64 bit (AMD64)]
# pandas 1.4.4
# numpy 1.23.3
# matplotlib 3.5.3

Python sys 3.9.0 (default, Nov 15 2020, 08:30:55) [MSC v.1916 64 bit (AMD64)]
pandas 1.4.4
numpy 1.23.3
matplotlib 3.5.3


### Import data

Imports SSD band envelopes together with the parallel CDRS scores, timestamps, task labels, and movement coding.

In [7]:
# SET VERSIONS

# feature version; determines which settings are loaded
FT_VERSION = 'v8'
SETTINGS = utilsFiles.load_ft_ext_cfg(FT_VERSION=FT_VERSION)

# subjects with SSD data for this data- and feature-version
SUBS = utilsFiles.get_avail_ssd_subs(
    FT_VERSION=FT_VERSION, DATA_VERSION=SETTINGS["DATA_VERSION"],
)

print(f'n = {len(SUBS)} subjects available')

n = 21 subjects available


## FIG: Spectral Scatterplot: Movement vs. Dyskinesia dependence

Load STN data (n=21)

In [21]:
# settings for the STN-only selection (ECoG not included)
FT_VERSION = 'v8'
INCL_CORE_CDRS = True
CATEG_CDRS = False
MILD_CDRS = 4
SEV_CDRS = 8
INCL_ECOG = False

# keyword arguments shared by the feature- and the ACC-class
stn_shared_kwargs = dict(
    CDRS_RATER='Patricia',
    INCL_ECOG=INCL_ECOG,
    CATEGORICAL_CDRS=CATEG_CDRS,
    CORR_TARGET='CDRS',
    cutMild=MILD_CDRS,
    cutSevere=SEV_CDRS,
    TO_CALC_CORR=False,
)

# spectral features, without ACC RMS
FeatLid_STN = ftProc.FeatLidClass(
    FT_VERSION=FT_VERSION, INCL_ACC_RMS=False, **stn_shared_kwargs
)

# ACC RMS is loaded from feature version v6
Acc_STN = ftProc.FeatLidClass(
    FT_VERSION='v6', INCL_ACC_RMS=True, **stn_shared_kwargs
)

SUBS: n=21 (['017', '020', '021', '101', '102', '107', '023', '014', '103', '016', '019', '009', '008', '013', '110', '108', '022', '010', '109', '105', '012'])
load 017
load 020
...deleted 278 rows (uni-LID ipsi-lat to ECoG)
load 021
load 101
load 102
load 107
load 023
load 014
...deleted 145 rows (uni-LID ipsi-lat to ECoG)
load 103
load 016
...deleted 35 rows (uni-LID ipsi-lat to ECoG)
load 019
load 009
load 008
load 013
...deleted 41 rows (uni-LID ipsi-lat to ECoG)
load 110
load 108
load 022
...deleted 140 rows (uni-LID ipsi-lat to ECoG)
load 010
...deleted 554 rows (uni-LID ipsi-lat to ECoG)
load 109
load 105
load 012
...deleted 31 rows (uni-LID ipsi-lat to ECoG)
SUBS: n=21 (['017', '020', '021', '101', '102', '107', '023', '014', '103', '016', '019', '009', '008', '013', '110', '108', '022', '010', '109', '105', '012'])
load 017
load 020
...deleted 278 rows (uni-LID ipsi-lat to ECoG)
load 021
load 101
load 102
load 107
load 023
load 014
...deleted 145 rows (uni-LID ipsi-lat to ECo

Load ECoG data (n=13)

In [9]:
# settings for the selection including ECoG
FT_VERSION = 'v8'
INCL_CORE_CDRS = True
CATEG_CDRS = False
MILD_CDRS = 4
SEV_CDRS = 8
INCL_ECOG = True

# keyword arguments shared by the feature- and the ACC-class
ecog_shared_kwargs = dict(
    CDRS_RATER='Patricia',
    INCL_ECOG=INCL_ECOG,
    CATEGORICAL_CDRS=CATEG_CDRS,
    CORR_TARGET='CDRS',
    cutMild=MILD_CDRS,
    cutSevere=SEV_CDRS,
    TO_CALC_CORR=False,
)

# spectral features, without ACC RMS
FeatLid_ECOG = ftProc.FeatLidClass(
    FT_VERSION=FT_VERSION, INCL_ACC_RMS=False, **ecog_shared_kwargs
)

# get ACC from v6
Acc_ECOG = ftProc.FeatLidClass(
    FT_VERSION='v6', INCL_ACC_RMS=True, **ecog_shared_kwargs
)

SUBS: n=21 (['017', '020', '021', '101', '102', '107', '023', '014', '103', '016', '019', '009', '008', '013', '110', '108', '022', '010', '109', '105', '012'])
load 017
load 020
...deleted 278 rows (uni-LID ipsi-lat to ECoG)
load 021
load 101
101 skipped due to ECoG-inclusion
load 102
102 skipped due to ECoG-inclusion
load 107
107 skipped due to ECoG-inclusion
load 023
load 014
...deleted 145 rows (uni-LID ipsi-lat to ECoG)
load 103
103 skipped due to ECoG-inclusion
load 016
...deleted 35 rows (uni-LID ipsi-lat to ECoG)
load 019
load 009
load 008
load 013
...deleted 41 rows (uni-LID ipsi-lat to ECoG)
load 110
110 skipped due to ECoG-inclusion
load 108
108 skipped due to ECoG-inclusion
load 022
...deleted 140 rows (uni-LID ipsi-lat to ECoG)
load 010
...deleted 554 rows (uni-LID ipsi-lat to ECoG)
load 109
109 skipped due to ECoG-inclusion
load 105
105 skipped due to ECoG-inclusion
load 012
...deleted 31 rows (uni-LID ipsi-lat to ECoG)
SUBS: n=21 (['017', '020', '021', '101', '102', '107

Get task data per minute

In [None]:
# task data per minute for the available subjects; passed on as `task_minutes` to the scatter plot below
# NOTE(review): LOAD_JSON=True presumably loads previously stored results instead of recalculating — confirm in getAccTask
TASK_MINS = getAccTask.get_task_minutes(LOAD_JSON=True, SUBS=SUBS)

In [76]:
import lfpecog_plotting.plot_Spectrals_vs_LID as plotSpecLid

Plot Scatterplot

In [77]:
importlib.reload(plotSpecLid)

POW_or_COH = 'POW'
SRC = 'lfp'  # ecog or lfp
EXCL_FREE = False
gamma_peak = 'peak'  # peak / mean

# select the feature- and ACC-class belonging to the source; an unknown
# SRC raises instead of silently plotting classes that are still in
# memory from an earlier run of this cell
if SRC == 'lfp':
    FeatClass, AccClass = FeatLid_STN, Acc_STN
elif SRC == 'ecog':
    FeatClass, AccClass = FeatLid_ECOG, Acc_ECOG
else:
    raise ValueError(f'SRC has to be "lfp" or "ecog", got: {SRC!r}')

# figure name contains source, gamma-method and number of included subjects
FIG_NAME = (f'0000319_PowerScatter_{SRC.upper()}_LID_MOVE_g{gamma_peak}'
            f'_n{len(FeatClass.FEATS.keys())}')

if EXCL_FREE:
    FIG_NAME = 'woFREE_' + FIG_NAME

plotSpecLid.scatter_Feats_LID_MOVE(FeatClass=FeatClass,
                       AccClass=AccClass,
                       POW_or_COH=POW_or_COH,
                       SRC=SRC,
                       SAVE_FIG=True,
                       FIG_NAME=FIG_NAME,
                       gamma_mean_or_peakband=gamma_peak,
                       EXCL_FREE=EXCL_FREE,
                       task_minutes=TASK_MINS,
                       ZERO_SPACE=True,)

  axes[i_ft].set_xticklabels(xticklabels, size=fsize,)


saved plot 0000319_PowerScatter_LFP_LID_MOVE_gpeak_n21 in c:\Users\habetsj\Research\projects\dyskinesia_neurophys\figures\final_Q1_2024\feat_scatter_LID_MOVE!


## FIG) Simple biomarker versus Dyskinesia onset


The biomarker is calculated from the envelope arrays returned by predArrays.get_move_selected_env_arrays().

In [None]:
importlib.reload(predArrays)

# movement-selected envelope arrays per subject
# contain: freq-bands, CDRS, timestamps, tasks, mov-coding
DATA = {}
# names of the freq-bands belonging to the arrays, per subject
env_fbands = {}

for sub in SUBS:
    arrays_and_bands = predArrays.get_move_selected_env_arrays(
        LOAD_SAVE=True, sub=sub,
    )
    DATA[sub], env_fbands[sub] = arrays_and_bands

Prepare data

In [None]:
# LFP sources analysed per subject
LFP_SOURCES = ['lfp_left', 'lfp_right']

# Row indices of the data arrays are derived from the band names of one
# example subject (first LFP source). This no longer depends on a loop
# variable `sub` left over from an earlier cell.
# NOTE(review): assumes the band order is identical for all subjects and
# sources — TODO confirm
ex_sub = list(env_fbands.keys())[0]
band_names = list(env_fbands[ex_sub][LFP_SOURCES[0]])

i_theta = band_names.index('theta')
i_beta = band_names.index('lo_beta')
i_gammaPeak = band_names.index('gammaPeak')
# rows following the freq-bands: CDRS, time, task
i_cdrs = len(band_names)
i_time = i_cdrs + 1
i_task = i_time + 1
# i_move is last

time_list = {'lid': [], 'nolid': []}
ratio_list = {'lid': [], 'nolid': []}

lid_onsets = []

for sub, src in product(DATA.keys(), LFP_SOURCES):

    # dyskinesia is present if any CDRS score is above zero (NaN scores
    # count as no dyskinesia, so the onset lookup below cannot fail)
    if np.any(DATA[sub][src][i_cdrs, :] > 0):
        subgroup = 'lid'
    else:
        print(f'sub {sub} had no LID')
        subgroup = 'nolid'

    print(f'...calc {sub, src}')

    # sort all rows chronologically
    idx_sort = np.argsort(DATA[sub][src][i_time, :])
    sort_arr = DATA[sub][src][:, idx_sort]

    theta = sort_arr[i_theta, :]
    beta = sort_arr[i_beta, :]
    gamma = sort_arr[i_gammaPeak, :]

    # NOTE(review): the original cell contained a loop meant to z-score
    # theta, beta and gamma against the samples with time < 5, but it only
    # re-assigned its loop variable and never changed the three arrays.
    # The loop is removed instead of repaired, to keep the biomarker
    # values identical; decide separately whether baseline-normalisation
    # is wanted.

    assert theta.shape == beta.shape, 'shapes unequal'

    if subgroup == 'lid':
        # express time relative to the first sample with dyskinesia
        i0_lid = np.where(sort_arr[i_cdrs, :] > 0)[0][0]
        t0_lid = sort_arr[i_time, i0_lid]
        lid_times = sort_arr[i_time, :] - t0_lid
        lid_onsets.append(t0_lid)
    else:
        lid_times = sort_arr[i_time, :]

    time_list[subgroup].append(lid_times)
    # biomarker: theta / beta ratio multiplied with gamma-peak,
    # z-scored over the full recording of the subject-source
    ratio = (theta / beta) * gamma
    ratio = (ratio - np.mean(ratio)) / np.std(ratio)
    ratio_list[subgroup].append(ratio)


In [None]:
# earliest and latest time point (relative to LID onset) in the LID group
lid_time_starts = [np.min(t_arr) for t_arr in time_list['lid']]
lid_time_ends = [np.max(t_arr) for t_arr in time_list['lid']]
min_sec = int(np.min(lid_time_starts))
max_sec = int(np.max(lid_time_ends))

print(f'mean LID onset: {round(np.mean(lid_onsets) / 60, 1)} minutes'
      f' (sd: {round(np.std(lid_onsets) / 60, 1)})')

# correct group without LID to comparable time offsets:
# subtract the mean LID onset of the LID group
mean_lid_onset = np.mean(lid_onsets)
nolid_new_times = [t_arr - mean_lid_onset for t_arr in time_list['nolid']]


In [None]:
# create overall array per X seconds
WIN_LEN = 10

## LID group
min_sec, max_sec = (
    int(np.min([np.min(l) for l in time_list['lid']])),
    int(np.max([np.max(l) for l in time_list['lid']]))
)
t_new = np.arange(min_sec, max_sec, WIN_LEN)

ratio_arr = np.array([[np.nan] * len(t_new)] * len(ratio_list['lid']))

for i_t, t0 in enumerate(t_new):

    for i_row, (sig_temp, t_temp) in enumerate(
        zip(ratio_list['lid'], time_list['lid'])
    ):
        # select idx for window
        win_sel = np.logical_and(t_temp > t0, t_temp < (t0 + WIN_LEN))
        # add mean ratio to correct idx
        ratio_arr[i_row, i_t] = np.mean(sig_temp[win_sel])


## NO-LID group

# correct group without LID to comparable time offsets
nolid_new_times = []
for t in time_list['nolid']:
    nolid_new_times.append(t - np.mean(lid_onsets))

ratio_arr_noLID = np.array([[np.nan] * len(t_new)] * len(ratio_list['nolid']))

# use same time frame
for i_t, t0 in enumerate(t_new):

    for i_row, (sig_temp, t_temp) in enumerate(
        zip(ratio_list['nolid'], nolid_new_times)
    ):
        # select idx for window
        win_sel = np.logical_and(t_temp > t0, t_temp < (t0 + WIN_LEN))
        # add mean ratio to correct idx
        ratio_arr_noLID[i_row, i_t] = np.mean(sig_temp[win_sel])
        

In [None]:
# reload to pick up changes made in the plotting module
importlib.reload(plotSpecLid)

# plot the windowed ratio-biomarker of the LID group against t_new
# (windows relative to LID onset); the figure is saved (SAVE_FIG=True)
plotSpecLid.plot_ratio_biomarker(
    ratio_arr=ratio_arr,
    t_new=t_new,
    Z_SCORE_RATIOS=False,
    MIN_SUBS=5,
    SMOOTH_WIN=10,
    SAVE_FIG=True,
)

# alternative, currently disabled: plot the group without LID by passing
# ratio_arr=ratio_arr_noLID (earlier call used Z_SCORE_RATIOS=False, MIN_SUBS=2)

## Old)

Extract Spectral Power and Variation in Feature windows

TODO:
- current gamma: only indiv peak, add sum gamma over 60 - 90

In [None]:
def indiv_zscoring_feats(X_arr, sub_arr):
    """
    Z-scores every feature (column) separately
    within every subject.

    Arguments:
        - X_arr: 2d array [n samples, n features]
        - sub_arr: array with one subject-id per
            row of X_arr

    Returns:
        - X_z: float array with the shape of X_arr
            containing the z-scored values; X_arr
            itself is not changed. Features that are
            constant within a subject result in NaN
            (division by a standard deviation of zero)
    """
    # work on a float copy: the input of the caller stays untouched
    # and z-scores are not truncated for integer input
    X_z = np.array(X_arr, dtype=float)
    sub_arr = np.asarray(sub_arr)

    for sub in np.unique(sub_arr):
        sub_sel = sub_arr == sub
        sub_values = X_z[sub_sel, :]
        # mean and sd per feature, over the samples of this subject
        m = np.mean(sub_values, axis=0)
        sd = np.std(sub_values, axis=0)
        X_z[sub_sel, :] = (sub_values - m) / sd

    return X_z

In [None]:
importlib.reload(ssdFeats)

X_arrs, y_arrs, sub_arrs = {}, {}, {}
# movement-code (0: INDEP, 1: DEPEND) per row, per movement selection
mov_code_parts = {}

for i_mov, MOV_SEL in enumerate(['INDEP', 'DEPEND']):
    # loading/ creating/ saving in ssdFeats script
    loaded_arrays = ssdFeats.get_moveSpec_predArrays(
        POWER_METHOD='ENV',
        LOAD_SOURCES=['STN',],
        MOV_SEL=MOV_SEL,
    )
    X_arrs[i_mov], y_arrs[i_mov], sub_arrs[i_mov], feat_names = loaded_arrays
    # add movement code, one value per row of the feature array
    n_rows = X_arrs[i_mov]['STN'].shape[0]
    mov_code_parts[i_mov] = np.array([i_mov] * n_rows)

# merging for zscoring together (INDEP rows first, DEPEND rows after)
stn_X = np.concatenate([X_arrs[0]['STN'], X_arrs[1]['STN']], axis=0)
stn_y = np.concatenate([y_arrs[0]['STN'], y_arrs[1]['STN']], axis=0)
stn_subids = np.concatenate([sub_arrs[0]['STN'], sub_arrs[1]['STN']], axis=0)

mov_dep_code = np.concatenate([mov_code_parts[0], mov_code_parts[1]], axis=0)

In [None]:
# calculate coefficients and pvalues
CDRS_categs = {0: 'none', 1: 'mild',
               2: 'moderate', 3: 'severe'}

X = stn_X.copy()
y = stn_y.copy()
sub_ids = stn_subids.copy()

stat_arr = {'INDEP': {'coef': [], 'pval': []},
            'DEPEND': {'coef': [], 'pval': []}}

X = indiv_zscoring_feats(X, sub_arr=sub_ids)

for i_ft, ft in enumerate(feat_names['STN']):
    

    for i_mov, MOV_SEL in enumerate(['INDEP', 'DEPEND']):
        mov_bool = mov_dep_code == i_mov
        # define X and groups for feat
        ft_temp = X[mov_bool, i_ft]
        y_temp = y[mov_bool]
        box_categs = [ft_temp[y_temp == cat]
                      for cat in CDRS_categs.keys()]
        # run LMM
        coeff, pval = lidStats.run_mixEff_wGroups(
            dep_var=ft_temp,
            indep_var=y_temp,
            groups=sub_ids[mov_bool],
            TO_ZSCORE=False,
        )
        # save in dict-lists
        stat_arr[MOV_SEL]['coef'].append(coeff)
        stat_arr[MOV_SEL]['pval'].append(pval)

        

In [None]:
# plot boxplots per feature

FIG_NAME = '0129env_boxplots_specPowerVar_vs_LIDcategs'


fig, axes = plt.subplots(len(feat_names['STN']), 2,
                         figsize=(12, 18),
                         sharex='col', sharey='row')

for i_ft, ft in enumerate(feat_names['STN']):
    
    for i_mov, MOV_SEL in enumerate(['INDEP', 'DEPEND']):
        # get boxplot data
        mov_bool = mov_dep_code == i_mov
        # define X and groups for feat
        ft_temp = X[mov_bool, i_ft]
        y_temp = y[mov_bool]
        box_categs = [ft_temp[y_temp == cat]
                      for cat in CDRS_categs.keys()]
        
        # stats are calculated before

        # plotting
        coeff = stat_arr[MOV_SEL]['coef'][i_ft]
        pval = stat_arr[MOV_SEL]['pval'][i_ft]
        axes[i_ft, i_mov].boxplot(box_categs)
        if pval < (.05 / len(feat_names['STN'])): w = 'bold'
        else: w='normal'
        axes[i_ft, i_mov].set_title(f'{ft}, mov-{MOV_SEL}\n'
                f'(coeff {round(coeff, 2)}, '
                f'p={round(pval, 5)})',
                weight=w,)
        axes[i_ft, i_mov].set_ylim(-3, 3)

        axes[i_ft, i_mov].set_xticks([1,2,3,4])
        axes[i_ft, i_mov].set_xticklabels(CDRS_categs.values())
        axes[i_ft, i_mov].set_ylabel('indiv. z-scored feature\n(a.u.)')
        axes[i_ft, i_mov].set_xlabel('LID (CDRS sum)')

plt.tight_layout()

plt.savefig(os.path.join(utilsFiles.get_project_path('figures'),
                            'feat_dysk_corrs',
                            'corr_boxplots',
                            FIG_NAME),
            dpi=300, facecolor='w',)

plt.close()
        

In [None]:
# plot HEATMAP

FIG_NAME = '0129cf_heatmap_specPowerVar_vs_LIDcategs'


fig, ax = plt.subplots(1, 1,
                         figsize=(12, 4),)
ALPHA = .05 / len(feat_names['STN'])
# 0 is not sign, 1 is sign
heat_arrs = {sig_label: np.array([
    [np.nan] * len(feat_names['STN'])
] * 2) for sig_label in [0, 1]}

for i_ft, ft in enumerate(feat_names['STN']):
    
    for i_mov, MOV_SEL in enumerate(['INDEP', 'DEPEND']):
        # fill arrays with stats calculated before
        coeff = stat_arr[MOV_SEL]['coef'][i_ft]
        pval = stat_arr[MOV_SEL]['pval'][i_ft]
        sig_lab = (pval < ALPHA).astype(int)
        heat_arrs[sig_lab][i_mov, i_ft] = coeff

# non-sign heatmap
vmin, vmax = -.25, .25
cmap = 'coolwarm'  # RdBu_r
nonsig_map = ax.imshow(heat_arrs[0], vmin=vmin, vmax=vmax,
                           cmap=cmap, )
# hatch = plt.pcolor(heat_arrs[0], vmin=vmin, vmax=vmax,
#                    hatch='//', cmap=cmap,
#                    edgecolor='w', )

# ax.imshow(X=heat_arrs[0], cmap='coolwarm',
        #   alpha=.6, vmin=-.3, vmax=.3,)
sig_map = ax.imshow(X=heat_arrs[1], cmap=cmap,  # RdBu_r
                    alpha=.9, vmin=vmin, vmax=vmax,)

for i_m, i_f in product(np.arange(heat_arrs[1].shape[0]),
                        np.arange(heat_arrs[1].shape[1])):
    if np.isnan(heat_arrs[1][i_m, i_f]): continue
    if abs(heat_arrs[1][i_m, i_f]) > .3: c='w'
    else: c = 'black'
    ax.text(i_f, i_m, s=round(heat_arrs[1][i_m, i_f], 2),
            color=c, horizontalalignment='center',
            verticalalignment='center', weight='bold',)

cbar = fig.colorbar(sig_map, pad=.01)
cbar.ax.set_ylabel('LMM coefficient (a.u.)')

ax.set_xticks(np.arange(len(feat_names['STN'])))
ax.set_xticklabels(feat_names['STN'],
                   rotation=75,)
ax.set_yticks([0, 1])
ax.set_yticklabels(['move-INDEPENDENT', 'move-DEPENDENT'],)

plt.tight_layout()

plt.savefig(os.path.join(utilsFiles.get_project_path('figures'),
                            'feat_dysk_corrs',
                            'corr_boxplots',
                            FIG_NAME),
            dpi=300, facecolor='w',)

plt.close()
        

In [None]:
# inspection only: shape of the heatmap array with significant coefficients (rows: movement selections, columns: features)
heat_arrs[1].shape