## Feature Extraction: Neurophysiology [dyskinesia project]

This notebook helps to extract features from preprocessed ECoG and LFP (STN electrodes) data within the ReTune-Dyskinesia project.

Data is required to be preprocessed, for example with the corresponding preprocess functions.

<b> Content </b>


<b> Periodic component analysis: </b> 
- Try Wavelet Decomposition vs Welch (tapered) Spectral Decomposition



<b> Aperiodic estimates </b>
Relevant literature:
- Periodic and a-periodic components relevance and interaction, different reasons (per + a-per) for signal changes observed within a specific bandwidth. Aperiodic component (complicated) vs exponent (1/f) (Donoghue, ..., Shestyuk & Voytek, Nature Neurosc 2020 : https://www.nature.com/articles/s41593-020-00744-x)
- cycle-by-cycle features: bycycle toolbox (Cole & Voytek, J of Neurophys 2019, https://journals.physiology.org/doi/full/10.1152/jn.00273.2019)

### 0. Loading packages and functions, defining paths



In [1]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import csv
from dataclasses import dataclass, field, fields
from collections import namedtuple
from typing import Any
from itertools import compress
from pathlib import Path
import pandas as pd
import numpy as np
import sklearn as sk
from scipy.stats import pearsonr, mannwhitneyu

import matplotlib.pyplot as plt

from scipy import signal, stats
from array import array
import datetime as dt
# #mne
# import mne_bids
import mne


In [2]:
# Report interpreter and package versions, for documentation and reproducibility.
# (mne_bids, mne and sci-py versions are not printed by this cell.)
versions_to_report = (
    ('Python sys', sys.version),
    ('pandas', pd.__version__),
    ('numpy', np.__version__),
    ('sci-kit learn', sk.__version__),
)
for package_label, package_version in versions_to_report:
    print(package_label, package_version)
## Versions noted in FEB 2022:
# Python sys 3.9.7 (default, Sep 16 2021, 08:50:36) 
# [Clang 10.0.0 ]
# pandas 1.3.4
# numpy 1.20.3
# mne_bids 0.9
# mne 0.24.1
# sci-py 1.7.1
# sci-kit learn 1.0.1

Python sys 3.9.0 (default, Nov 15 2020, 08:30:55) [MSC v.1916 64 bit (AMD64)]
pandas 1.4.4
numpy 1.23.3
sci-kit learn 1.1.3


In [3]:
def get_project_path_in_notebook(
    subfolder: str = '',
):
    """
    Finds the path of the project folder from within a Notebook.

    Walks upwards from the current working directory until a
    path ending on 'dyskinesia_neurophys' is found. Run this
    once at the start to correctly find other modules/functions.

    Input:
        - subfolder: currently not used, the project folder
            itself is always returned

    Returns:
        - path (str): path of the project folder

    Raises:
        - FileNotFoundError: if neither the working directory
            nor any of its parents ends on 'dyskinesia_neurophys'
    """
    project_name = 'dyskinesia_neurophys'
    start_path = os.getcwd()
    path = start_path

    while not path.endswith(project_name):

        parent = os.path.dirname(path)
        # dirname() of the filesystem root returns the root itself;
        # without this check the loop would never terminate
        if parent == path:
            raise FileNotFoundError(
                f'No "{project_name}" folder found in or above {start_path}'
            )
        path = parent

    return path

In [4]:
# locate the project root, then derive the local storage directories from it
projectpath = get_project_path_in_notebook()
codepath, figpath, datapath = (
    os.path.join(projectpath, folder_name)
    for folder_name in ('code', 'figures', 'data')
)

In [5]:
# make the project's code folder the working directory before the
# project-local imports below (presumably so these packages resolve
# from the notebook; confirm against the project setup)
os.chdir(codepath)
# own utility functions
import utils.utils_fileManagement as utilsFiles
import utils.utils_windowing as utilsWindows
# own data preprocessing functions
import lfpecog_preproc.preproc_data_management as dataMng
import lfpecog_preproc.preproc_filters as fltrs
# own data exploration functions
import lfpecog_features.feats_read_proc_data as read_data
import lfpecog_plotting.expl_plotting as expl_plot
import lfpecog_features.feats_spectral_baseline as specBase
import lfpecog_features.feats_spectral_features as spectral
import lfpecog_features.feats_spectral_helpers as specHelp


# importClin provides the clinical CDRS scores (get_cdrs_specific) and
# importResults the MIC results (get_mic_scores, peak-frequency helpers)
# used in the cells below
import lfpecog_preproc.preproc_import_scores_annotations as importClin
import lfpecog_analysis.import_ephys_results as importResults
import lfpecog_analysis.get_acc_task_derivs as accDerivs

# helper used to de-duplicate legend entries in the protocol sketch
from lfpecog_plotting.plotHelpers import remove_duplicate_legend

### 0. Sketch protocol

In [None]:
# start times (minutes relative to LDOPA intake) of the 5-minute task blocks
rest_starts = [-10, 0, 15, 25, 35, 50, 60]
tap_starts = [start + 5 for start in rest_starts]
free_starts = [10, 45]

fig, ax = plt.subplots(1, 1, figsize=(16, 3))

fs = 16

fill_y = (0, 2)
# per task: block start times and the styling of the filled patches,
# drawn in this order (rest, tap, free)
task_blocks = (
    (rest_starts, dict(color='darkblue', alpha=.2,
                       hatch='//', label='REST',)),
    (tap_starts, dict(facecolor='w', edgecolor='darkgreen', alpha=.8,
                      hatch='//', label='TAP (contra to ECoG)',)),
    (free_starts, dict(facecolor='w', edgecolor='purple', alpha=.8, lw=5,
                       hatch='X', label='FREE',)),
)
for block_starts, patch_style in task_blocks:
    for start in block_starts:
        ax.fill_betweenx(fill_y, x1=start, x2=start + 5, **patch_style)

# dashed marker at the moment of LDOPA intake
ax.vlines(
    0, ymin=0, ymax=4, color='gray', lw=3,
    ls='--', label='200 mg LDOPA LT')

ax.set_xlim(-12, 72)
ax.set_ylim(0, 4)

ax.set_xlabel('Time (minutes after LDOPA)', size=fs)

ax.set_xticks(np.arange(-10, 80, 10))
ax.set_yticks([])
ax.tick_params(size=fs, labelsize=fs)

# every patch carries a label, so keep only one legend entry per task
handles, labels = remove_duplicate_legend(ax.get_legend_handles_labels())
ax.legend(
    handles, labels, frameon=False, fontsize=fs,
    ncol=len(handles),)

for side in ['right', 'left', 'top']:
    ax.spines[side].set_visible(False)

fig.tight_layout()

fig.savefig(
    os.path.join(figpath, 'protocol', 'updated_protocol_Feb23'),
    dpi=150, facecolor='w',
)

plt.close(fig)

### 1. Load Merged Sub-Data

#### Load non-ephys pickle

In [6]:
# switch to the project's code folder before the project-local import below
os.chdir(codepath)

# file-management utilities of the project; get_project_path,
# load_class_pickle and correct_acc_class are used in the cells below
from utils.utils_fileManagement import (get_project_path,
                                        load_class_pickle,
                                        save_class_pickle,
                                        mergedData,
                                        correct_acc_class)


In [None]:
data_version = 'v3.1'
mins_recording = []

subs_to_plot = ['017',]

for sub in subs_to_plot:
    # locate the pickled non-ephys data class of this subject
    pickle_name = f'{sub}_mergedDataClass_{data_version}_noEphys.P'
    pickle_path = os.path.join(
        get_project_path('data'),
        'merged_sub_data', data_version,
        pickle_name
    )
    # load Acc-detected movement labels and apply the class correction
    acc = correct_acc_class(load_class_pickle(pickle_path))

    # recording duration in minutes: number of rows / sampling rate / 60
    n_samples = acc.data.shape[0]
    mins_recording.append(n_samples / acc.fs / 60)

In [None]:
# summarise the recording duration over all loaded subjects
mean_mins, sd_mins = np.mean(mins_recording), np.std(mins_recording)
print(f'Mean minutes of recording were {mean_mins} (+/- {sd_mins})')

#### Descr Stats on CDRS

In [None]:
# peak CDRS per subject: bilateral ('bi') and unilateral ('uni', the
# body side contralateral to the ECoG hemisphere)
lid_peak = {'bi': [], 'uni': []}
# number of coherence (MIC) windows per subject
n_coh_wins = []

for sub in subs_to_plot:

    _, scores = importClin.get_cdrs_specific(sub, side='both')
    lid_peak['bi'].append(np.max(scores))
    # side spelled as in the other cells of this notebook ('contralat ecog')
    _, scores = importClin.get_cdrs_specific(sub, side='contralat ecog')
    lid_peak['uni'].append(np.max(scores))

    mics = importResults.get_mic_scores(sub=sub, task='both')
    # NOTE(review): assumes get_mic_scores returns one row per
    # coherence window (time on the index) - confirm
    n_coh_wins.append(len(mics))


for key, peaks in lid_peak.items():

    print(f'Mean CDRS {key.upper()}-lat {np.mean(peaks)} (+/- {np.std(peaks)})')



#### plotting gamma-MICs against dopa-time

Plot features over time, with time 0 set to the start of LID

In [None]:
fig, axes = plt.subplots(2,1, figsize=(12, 12))
fs = 14
# gamma range (Hz) in which the individual peak frequency is searched
f_lo = 70
f_hi = 85

# minutes after LDOPA intake at which LID started, per subject
lid_starts = {
    '008': 8, '009': 25, '010': 28, '012': 1,
    '013': 22, '014': 17,
}  # '016' TODO

for ax_i, task in enumerate(['rest', 'tap']):

    task_ax = axes[ax_i]

    # only subjects with a known LID start can be aligned
    for sub, lid_start in lid_starts.items():

        mic_df = importResults.get_mic_scores(sub, task=task)
        # get peak gamma freq, searched within the range that is
        # reported on the y-axis label
        peak_i, peak_f = importResults.get_peakFreq_in_timeFreq(
            tf_values=mic_df.values,
            times=mic_df.index,
            freqs=mic_df.keys(),
            f_min=f_lo,
            f_max=f_hi,)
        # mean over the peak bin and its direct neighbours; the lower
        # bound is clipped at 0, a negative slice start would select
        # the wrong (or no) columns when the peak is the first bin
        gamma = np.mean(
            mic_df.values[:, max(peak_i - 1, 0):peak_i + 2], axis=1
        )
        # create time (minutes) with LID-start at 0
        times = np.around(mic_df.index / 60, 0) - lid_start

        # the legend reports the frequency the plotted trace is based on
        task_ax.plot(
            times, gamma, alpha=.5,
            label=f'sub-{sub} ({peak_f} Hz)', )

    task_ax.set_xlim(-30, 70)
    task_ax.set_ylabel('Gamma Maximised Imag. Coherence\n'
                f'(indiv. peak within {f_lo}-{f_hi} Hz)', fontsize=fs)
    task_ax.set_xlabel('Time vs LID-start (minutes)', fontsize=fs)

    # axhline/axvline take their extent as axes fractions (0-1), not
    # data coordinates; the defaults already span the full axis
    task_ax.axhline(0, color='gray', alpha=.5, ls='--',)
    task_ax.axvline(0, color='gray', alpha=.5, label='LID start')
    task_ax.legend(frameon=False, fontsize=fs, ncol=4)
    task_ax.set_title(task, size=fs + 4)

plt.close(fig)

Plot CDRS vs MIC per subject

In [None]:
"""
Plot: 
one row per subject
col-0: 4 boxplots (rest lid-, rest lid+, tap lid-, tap lid+)
col-1: scatter for rest
col-2: scatter for tap
"""

subs_to_plot = [
    '008', '009', '010', '012', 
    '013', '014', '016']
freqBand = 'mid-gamma'
save_plot = True
select_contraEcogTaps = True
figname='group_LID_vs_MIC_perSubject_RETUNE'
tasks = ['rest', 'tap']

# sets of freqBand ranges
freqRanges = {
    'mid-gamma': (70, 90),
    'beta': (12, 30),
    'low-beta': (12, 20),
    'high-beta': (20, 30),
    'alpha': (7, 10),
    'theta': (4, 7)
}
peak_range = freqRanges[freqBand]

# adjust filename to variables
figname += f'_{freqBand}'
if select_contraEcogTaps:
    figname += '_ecogSided'

# one standard matplotlib colour per subject
prop_cycle = plt.rcParams['axes.prop_cycle']
std_colors = prop_cycle.by_key()['color']
if len(subs_to_plot) > len(std_colors):
    raise ValueError('NOT ENOUGH COLORS SPECIFIED')

fs = 20

# squeeze=False keeps axes 2-dimensional, so axes[s, col] also works
# when only one subject is plotted
fig, axes = plt.subplots(
    len(subs_to_plot), 3,
    figsize=(18, len(subs_to_plot) * 4),
    squeeze=False,)

# seeded generator: the horizontal jitter of the scatter points is
# identical on every run
jitter_rng = np.random.default_rng(0)

for s, sub in enumerate(subs_to_plot):

    lid_t, lid_y = importClin.get_cdrs_specific(
        sub=sub, side='contralat ecog'
    )
    # plain arrays, so positional indexing below works for
    # pandas- as well as numpy-typed scores
    lid_t, lid_y = np.asarray(lid_t), np.asarray(lid_y)
    # get peak gamma freq, same within one subject
    mic_df = importResults.get_mic_scores(sub, task='both')
    peak_i, peak_f = importResults.get_peakFreq_in_timeFreq(
        tf_values=mic_df.values,
        times=mic_df.index,
        freqs=mic_df.keys(),
        f_min=peak_range[0],
        f_max=peak_range[1],
        )
    # peak bin plus direct neighbours; lower bound clipped at 0 because
    # a negative slice start would select the wrong (or no) columns
    peak_cols = slice(max(peak_i - 1, 0), peak_i + 2)
    gamma_y_dict, cdrs_x_dict = {}, {}

    for task in tasks:

        mic_df = importResults.get_mic_scores(
            sub, task=task, ecogSide_tapAdjust=select_contraEcogTaps,)
        gamma_y = np.mean(mic_df.values[:, peak_cols], axis=1)
        gamma_t = np.around(mic_df.index / 60, 1)

        # find nearest index LID score for gamma times
        closest_lid_idx = [np.argmin(abs(lid_t - t)) for t in gamma_t]
        # get nearest LID score for gamma times
        gamma_lid = lid_y[closest_lid_idx]
        # store gamma / cdrs values per task and per LID neg/pos
        no_lid, with_lid = gamma_lid == 0, gamma_lid > 0
        gamma_y_dict[f'{task}\nLID -'] = list(gamma_y[no_lid])
        gamma_y_dict[f'{task}\nLID +'] = list(gamma_y[with_lid])
        cdrs_x_dict[f'{task}\nLID -'] = list(gamma_lid[no_lid])
        cdrs_x_dict[f'{task}\nLID +'] = list(gamma_lid[with_lid])

    # increase y-limit if any value in one of the list is > 0.5
    if any(np.any(np.array(v) > .5) for v in gamma_y_dict.values()):
        yLim = (0, .8)
    else:
        yLim = (0, .5)

    # boxplot MIC: WITHOUT vs WITH LID (per task)
    axes[s, 0].boxplot(list(gamma_y_dict.values()))  # list of lists with gamma values
    axes[s, 0].set_ylabel('abs Max Imag Coh', fontsize=fs,)
    axes[s, 0].set_xticklabels(list(gamma_y_dict.keys()), fontsize=fs,)
    axes[s, 0].set_title(f'{sub} (+/- {peak_f} Hz)',
        fontsize=fs, fontweight='bold',)
    axes[s, 0].tick_params(labelsize=fs, axis='both', size=fs)
    axes[s, 0].set_ylim(yLim)


    # scatterplot MIC vs LID-severeness
    for i_t, task in enumerate(tasks):
        # merge all values per task into NEW lists, the lists stored
        # in the dicts stay untouched
        x = cdrs_x_dict[f'{task}\nLID -'] + cdrs_x_dict[f'{task}\nLID +']
        y = gamma_y_dict[f'{task}\nLID -'] + gamma_y_dict[f'{task}\nLID +']
        # horizontal jitter to separate points with identical CDRS scores
        jitter = jitter_rng.uniform(low=-.15, high=.15, size=len(x))
        axes[s, i_t + 1].scatter(
            np.array(x) + jitter, y, alpha=.5, color=std_colors[s],)
        axes[s, i_t + 1].set_ylim(yLim)
        axes[s, i_t + 1].set_xlim(-.5, 8)
        axes[s, i_t + 1].set_ylabel(
            'abs Max Imag Coh', fontsize=fs,)
        axes[s, i_t + 1].set_xlabel(
            'contralat. CDRS', fontsize=fs,)
        axes[s, i_t + 1].set_title(
            f'{sub}: {task.upper()}', fontsize=fs,)
        axes[s, i_t + 1].tick_params(labelsize=fs, axis='both', size=fs)

plt.tight_layout()

if save_plot:
    # data_version is defined in the data-loading cell above
    fig_dir = os.path.join(
        get_project_path('figures'), 'ft_exploration',
        data_version, 'mvc', 'group_level',
    )
    # savefig does not create missing folders itself
    os.makedirs(fig_dir, exist_ok=True)
    plt.savefig(
        os.path.join(fig_dir, figname),
        facecolor='w', dpi=300,
    )

plt.close(fig)

Plot CDRS vs MIC for GROUP

In [7]:
from lfpecog_plotting.plotHelpers import get_colors


In [271]:
def _nearest_cdrs_scores(target_times, cdrs_times, cdrs_scores):
    """
    Returns for every time in target_times the CDRS score whose
    timestamp (in cdrs_times) is closest; output is a numpy array.
    """
    # convert to plain arrays, so positional indexing is used for
    # pandas- as well as numpy-typed input
    cdrs_times = np.asarray(cdrs_times)
    cdrs_scores = np.asarray(cdrs_scores)
    nearest_idx = [int(np.argmin(np.abs(cdrs_times - t)))
                   for t in target_times]

    return cdrs_scores[nearest_idx]


def get_selected_mean_mic(
    sub, f_band, cdrs: str = 'binary',
    task= 'both', indiv_peak: bool = False,
    zscore_mic: bool = True,
):
    """
    Returns mean MIC values (unilateral), selected based
    on CDRS values for the CONTRALATERAL BODY SIDE to the
    ECoG hemisphere

    MIC scores are loaded via importResults.get_mic_scores()
    (index: time in seconds, column-names: freqs), and every
    MIC timepoint gets the CDRS score closest in time assigned.

    Input:
        - sub: subject code, e.g. '008'
        - f_band: string name of freq-band, has to be a key
            of the freqRanges defined below
        - cdrs: 'binary' returns no-LID- and LID-mean,
            'linear' returns CDRS scores and MIC values
            per timepoint
        - task: mic values of tasks to include, both is rest and tap
        - indiv_peak: if True, MIC is averaged over +/- 3 Hz around
            the individual peak freq within f_band; if False, over
            the whole f_band
        - zscore_mic: if True, MIC values are z-scored with the mean
            and std-dev of all selected MIC values (selected freqs;
            all timepoints except those with only ipsilateral LID)

    Returns:
        - cdrs == 'binary': tuple (mean MIC without any LID, mean MIC
            with LID contralateral to ECoG); means are NaN if no
            timepoints are present for a category
        - cdrs == 'linear': tuple (cdrs-scores rounded to 0.5, mean
            MIC values), both arrays sorted on ascending cdrs-score;
            timepoints with only ipsilateral LID are excluded

    Raises:
        - AssertionError: if f_band is not defined
        - ValueError: if cdrs is neither 'binary' nor 'linear'
    """
    freqRanges = {
        'mid-gamma': (60, 90),
        'beta': (12, 30),
        'low-beta': (12, 20),
        'high-beta': (20, 30),
        'theta': (4, 8),
        'alpha': (8, 12),
        'theta-alpha': (4, 12)
    }
    assert f_band in freqRanges.keys(), (f'f_band ({f_band}) not in {freqRanges}')
    # fail explicitly instead of silently returning None
    if cdrs not in ('binary', 'linear'):
        raise ValueError(
            f'cdrs ({cdrs}) should be "binary" or "linear"'
        )

    # get MIC-related timestamps (in minutes!!)
    mic_df = importResults.get_mic_scores(sub, task=task)
    mic_t = mic_df.index.values.astype(float) / 60
    mic_f = mic_df.keys().values.astype(float)
    # get timestamps and values of clinical CDRS LID assessment
    lid_t_ecog, lid_y_ecog = importClin.get_cdrs_specific(
        sub=sub, rater='Mean', side='contralat ecog',
    )
    lid_t_nonecog, lid_y_nonecog = importClin.get_cdrs_specific(
        sub=sub, rater='Mean', side='ipsilat ecog',
    )
    # find closest CDRS value for MIC values based on timestamps,
    # every body side is matched on its own timestamps
    ecog_lid_micvalues = _nearest_cdrs_scores(
        mic_t, lid_t_ecog, lid_y_ecog
    )
    nonecog_lid_micvalues = _nearest_cdrs_scores(
        mic_t, lid_t_nonecog, lid_y_nonecog
    )

    # no dyskinesia on either body side
    no_LID_bool = np.logical_and(ecog_lid_micvalues == 0,
                                nonecog_lid_micvalues == 0)
    # dyskinesia contralateral to ECoG
    LID_bool = ecog_lid_micvalues > 0
    # everything except timepoints with ONLY ipsilateral dyskinesia
    LID_values_bool = ~np.logical_and(ecog_lid_micvalues == 0,
                                    nonecog_lid_micvalues > 0)

    # sets of freqBand ranges
    f_range = freqRanges[f_band]
    if indiv_peak:
        # take narrow-band around individual peak in defined bandwidth
        _, indiv_peak_f = importResults.get_peakFreq_in_timeFreq(
            tf_values=mic_df.values, times=mic_t, freqs=mic_f,
            f_min=f_range[0], f_max=f_range[1],
        )
        f_sel = np.logical_and(mic_f >= indiv_peak_f - 3,
                               mic_f <= indiv_peak_f + 3)
    else:
        # take whole defined freq-bandwidth
        f_sel = np.logical_and(mic_f >= f_range[0],
                               mic_f <= f_range[1])

    if zscore_mic:
        # take mean and stddev for zscore over all MIC values
        # in selected range and in ECoG-selected cdrs-times
        z_values = mic_df.iloc[LID_values_bool, f_sel].values.ravel()
        z_M, z_sd = z_values.mean(), z_values.std()

    # return binary OFF ON
    if cdrs == 'binary':

        mic_sel_noLID = mic_df.iloc[no_LID_bool, f_sel]
        if zscore_mic: mic_sel_noLID = (mic_sel_noLID - z_M) / z_sd
        mic_noLID_grandmean = mic_sel_noLID.mean().mean()
        mic_sel_LID = mic_df.iloc[LID_bool, f_sel]
        if zscore_mic: mic_sel_LID = (mic_sel_LID - z_M) / z_sd
        mic_LID_grandmean = mic_sel_LID.mean().mean()

        return mic_noLID_grandmean, mic_LID_grandmean

    # cdrs == 'linear'
    # mean mic-values (over selected freqs) per timepoint
    mic_sel = mic_df.iloc[LID_values_bool, f_sel].mean(axis=1).values
    # unilat-cdrs per timepoint
    lid_sel = ecog_lid_micvalues[LID_values_bool].astype(np.float64)
    # round cdrs scores on 0.5
    lid_sel = np.around(lid_sel * 2) / 2
    # norm mic-scores
    if zscore_mic:
        mic_sel = (mic_sel - z_M) / z_sd

    # sort values from 0 to high
    sort_idx = np.argsort(lid_sel)
    lid_sel = lid_sel[sort_idx]
    mic_sel = mic_sel[sort_idx]

    return lid_sel, mic_sel


Create group figure of individual z-scored MIC-values vs linear-CDRS

In [299]:
subs = ['008', '009', '010', '012',
        '013', '014', '016', '017']
# fbands_incl = ['mid-gamma', 'theta-alpha']
fbands_incl = ['low-beta', 'high-beta']
Z_MICs = False
SAVE_FIG = False
PRINT_STATS = False
figname = 'group_beta-MIC_vs_CDRS_linear'
figname += '_ZMIC' if Z_MICs else '_rawMIC'
clrs = list(get_colors().values())
fsize = 18

# one panel per included freq-band; squeeze=False plus ravel() gives a
# 1d-array of axes for any number of bands (also for a single band)
fig, axes = plt.subplots(1, len(fbands_incl),
                         figsize=(9 * len(fbands_incl), 6),
                         squeeze=False,)
axes = axes.ravel()

for i_f, f_band in enumerate(fbands_incl):

    for i_s, sub in enumerate(subs):

        lid_sub, mic_sub = get_selected_mean_mic(sub=sub, f_band=f_band,
                                                cdrs='linear', indiv_peak=True,
                                                zscore_mic=Z_MICs)

        # correlation and trendline need more than one unique CDRS score
        if len(np.unique(lid_sub)) > 1:
            if PRINT_STATS:
                R, p_value = stats.pearsonr(lid_sub, mic_sub)
                print(f'{f_band}, sub-{sub}: R: {np.round(R, 2)},'
                      f' p = {np.round(p_value, 4)}\n')

            axes[i_f].scatter(lid_sub, mic_sub, color=clrs[i_s],
                    label=sub, alpha=.3, s=150,
            )

            # calculate equation for (linear) trendline
            trend_coefs = np.polyfit(lid_sub, mic_sub, 1)
            trendline = np.poly1d(trend_coefs)

            # add trendline to plot
            axes[i_f].plot(lid_sub, trendline(lid_sub), lw=3,
                    color=clrs[i_s], alpha=.9,)

        else:
            # only one CDRS score present: mark the median MIC at x = 0
            axes[i_f].scatter(0, np.median(mic_sub),
                        marker='x', s=300, label=sub,
                        color=clrs[i_s], alpha=.9,)

    axes[i_f].set_xlabel('clinical dyskinesia\n(CDRS, unilateral sum'
            ' contralat. to ECoG)', size=fsize,)

    ylabel='max. imag. coherence\n(indiv. peak)'
    if Z_MICs: ylabel = ylabel[:-1] + ', z-scored)'
    axes[i_f].set_ylabel(ylabel, size=fsize,)

    axes[i_f].set_title(f_band.upper(), weight='bold', size=fsize,)

    axes[i_f].tick_params(axis='both', labelsize=fsize,
                    size=fsize)

    for side in ['right', 'top']:
        getattr(axes[i_f].spines, side).set_visible(False)

# legend (one entry per subject) next to the last panel
axes[-1].legend(ncol=1, fontsize=fsize, frameon=True,
                     bbox_to_anchor=(1.02, .75),)

plt.tight_layout()

if SAVE_FIG: plt.savefig(os.path.join(figpath, 'ft_exploration',
                         'v3.1', 'mvc', 'ibags23', figname),
                        facecolor='w', dpi=300,)
plt.close(fig)

Create group boxplot figure of individual mean MIC-values without vs with dyskinesia (binary CDRS)

In [295]:
subs = ['008', '009', '010', '012',
        '013', '014', '016', '017']
fbands_incl = ['mid-gamma', 'theta-alpha',
               'low-beta', 'high-beta']
figname = 'group_MIC_vs_CDRS_binary_boxplot'
Z_MICs = False
SAVE_FIG = False
clrs = list(get_colors().values())
fsize = 14

fig, ax = plt.subplots(1, 1, figsize=(len(fbands_incl)*4, 4))

for i_f, f_band in enumerate(fbands_incl):

    # per freq-band: one mean MIC value per subject and per category
    LID_mic_group, noLID_mic_group = [], []

    for i_s, sub in enumerate(subs):

        noLID_mic_sub, LID_mic_sub = get_selected_mean_mic(sub=sub, f_band=f_band,
                                                cdrs='binary', indiv_peak=True,
                                                zscore_mic=Z_MICs)

        # means are NaN if a subject has no timepoints in a category;
        # `not` instead of `~`: bitwise-not is only a logical-not for
        # numpy booleans (~True == -2 for Python booleans)
        if not np.isnan(LID_mic_sub): LID_mic_group.append(LID_mic_sub)
        if not np.isnan(noLID_mic_sub): noLID_mic_group.append(noLID_mic_sub)

    # perform Mann-Whitney U test on the two groups
    # NOTE(review): the values are per-subject means, so a paired test
    # (e.g. Wilcoxon signed-rank) may be more appropriate - confirm
    if noLID_mic_group and LID_mic_group:
        _, p_value = stats.mannwhitneyu(noLID_mic_group, LID_mic_group)
        p_text = f'p={np.round(p_value, 3)}'
    else:
        # the test cannot be computed on an empty group
        p_text = 'p=n/a'

    # boxes are placed pairwise: no-LID at 3 * i, LID at 3 * i + 1
    bp_no = ax.boxplot([noLID_mic_group], positions=[i_f*3],
                       patch_artist=True, widths=.6,
                       labels=[f'{f_band}\nno-LID'],)
    bp_LID = ax.boxplot([LID_mic_group], positions=[i_f*3+1],
                        patch_artist=True, widths=.6,
                        labels=[f'{f_band}\nLID\n({p_text})'])

    # no-LID: hatched white box; LID: filled box; colour per freq-band
    bp_no['boxes'][0].set_facecolor('white')
    bp_no['boxes'][0].set_edgecolor(clrs[i_f])
    bp_no['boxes'][0].set_hatch('//')
    bp_no['boxes'][0].set_alpha(.5)
    bp_LID['boxes'][0].set_facecolor(clrs[i_f])
    bp_LID['boxes'][0].set_alpha(.5)


ylabel='max. imag. coherence\n(indiv. peak)'
if Z_MICs: ylabel = ylabel[:-1] + ', z-scored)'
ax.set_ylabel(ylabel, size=fsize,)

ax.tick_params(axis='both', labelsize=fsize,
                size=fsize)

for side in ['right', 'top']: getattr(ax.spines, side).set_visible(False)

plt.tight_layout()

if SAVE_FIG: plt.savefig(os.path.join(figpath, 'ft_exploration',
                         'v3.1', 'mvc', 'ibags23', figname),
            facecolor='w', dpi=150,)
plt.close(fig)