# Explore extracted Features based on SSD

### 0) Load packages and functions

In [1]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import csv
from dataclasses import dataclass, field, fields
from itertools import compress
import pandas as pd
import numpy as np

import sklearn as sk
from scipy import signal, stats

import matplotlib.pyplot as plt



In [2]:
def get_project_path_in_notebook(
    subfolder: str = '',
):
    """
    Finds the path of the project folder from within a Notebook.
    Run this once at the start to correctly find
    other modules/functions.

    Walks upwards from the current working directory until a
    path ending with 'dyskinesia_neurophys' is reached.

    Arguments:
        - subfolder: currently not used; kept so that
            existing calls keep working

    Returns:
        - path (str): path of the project folder

    Raises:
        - FileNotFoundError: if neither the current working
            directory nor any of its parents ends with
            'dyskinesia_neurophys'
    """
    project_name = 'dyskinesia_neurophys'
    start_path = os.getcwd()
    path = start_path

    while not path.endswith(project_name):

        parent = os.path.dirname(path)
        # dirname() of the filesystem root (or of an empty path)
        # returns its input, without this check the loop never ends
        if parent == path:
            raise FileNotFoundError(
                f'project folder "{project_name}" not found '
                f'in or above {start_path}'
            )
        path = parent

    return path

In [3]:
# define local storage directories, all relative to the project folder
projectpath = get_project_path_in_notebook()
codepath, figpath, datapath = (
    os.path.join(projectpath, folder)
    for folder in ('code', 'figures', 'data')
)
feat_path = os.path.join(projectpath, 'results', 'features')

In [4]:
# switch the working directory to the project's code folder;
# presumably needed so the project-local packages imported below
# can be resolved from the notebook - TODO confirm
os.chdir(codepath)
# own utility functions
import utils.utils_fileManagement as utilsFiles
import utils.utils_windowing as utilsWindows
from utils.utils_fileManagement import (get_project_path,
                                        load_class_pickle,
                                        save_class_pickle,
                                        mergedData,
                                        correct_acc_class)
# own data preprocessing functions
import lfpecog_preproc.preproc_data_management as dataMng
# own data exploration functions
import lfpecog_features.feats_read_proc_data as read_data
import lfpecog_plotting.expl_plotting as expl_plot



import lfpecog_preproc.preproc_import_scores_annotations as importClin
import lfpecog_analysis.import_ephys_results as importResults
import lfpecog_analysis.get_acc_derivs as accDerivs


from lfpecog_plotting.plotHelpers import remove_duplicate_legend

### 1) Define settings

In [47]:
# load the settings that were used for the feature extraction
SETTINGS = utilsFiles.load_ft_ext_cfg(cfg_fname='ftExtr_spectral_v1.json')

# folder with the SSD features for the configured
# data version, window length and window overlap
ssd_ft_path = os.path.join(
    feat_path,
    'SSD_feats',
    SETTINGS['DATA_VERSION'],
    'windows_{}s_{}overlap'.format(SETTINGS['WIN_LEN_sec'],
                                   SETTINGS['WIN_OVERLAP_part']),
)
IGNORE_PTS = ['010']  # subject codes to leave out

LID_SCORE_INCL = 1  # from this score, features are labeled into LID+ group

In [48]:
# get all available subs with features; the sub-code is taken as the
# second '_'-separated part of the file name (e.g. 'SSDfeats_014_...').
# Sorted to get the same subject order in every run (set-order is
# arbitrary), entries without '_' are skipped instead of raising an
# IndexError, and subjects in IGNORE_PTS are left out.
SUBS = sorted(
    {name.split('_')[1]
     for name in os.listdir(ssd_ft_path)
     if '_' in name}
    - set(IGNORE_PTS)
)

First try:
- only include ECoG and ipsilateral STN LFP
- exclude moments in which there was only dyskinesia on the body side ipsilateral to the ECoG (i.e. NOT CORRESPONDING TO the ECoG hemisphere)

### 1a) Load Clinical Scores

Select moments with dyskinesia on the WRONG BODY SIDE (ipsilateral to the ECoG) for removal later on

In [21]:
# example: read the clinical scores of one subject, once per rater
sub = '008'
scores_P = importClin.read_clinical_scores(sub=sub, rater='Patricia')
scores_J = importClin.read_clinical_scores(sub=sub, rater='Jeroen')

In [49]:
# Collect per subject the clinical scores (columns 'dopa_time' and
# 'CDRS_total') of rater 'Patricia' in SCORES.
# NOTE(review): ECOG_SIDES and REMOVE_TIMES are created here but stay
# empty, the only code filling them is in the disabled block below;
# cells indexing ECOG_SIDES[sub] or REMOVE_TIMES[sub] will raise a
# KeyError unless these are filled elsewhere.
SCORES = {}
ECOG_SIDES = {}
REMOVE_TIMES = {}  # remove moments with only 'WRONG SIDE' dyskinesia

for sub in SUBS:
        # --- disabled alternative: unilateral scores (contralateral to ECoG) ---
        # # GET UNILATERAL (CONTRA ECOG) CDRS SCORES
        # scores_temp = importClin.read_clinical_scores(sub=sub,
        #                                               rater='Patricia')
        #         # check if scores are present
        # if type(scores_temp) == type(None):
        #         print(f'None CDRS-scores loaded for sub {sub}')
        #         continue

        # # get ECoG-side
        # ecog_side = importClin.get_ecog_side(sub)
        # ECOG_SIDES[sub] = ecog_side
        # # define CDRS of body-side to include
        # if ecog_side == 'left': LID_side_incl = 'right'
        # elif ecog_side == 'right': LID_side_incl = 'left'
        
        # # identify minutes to remove bcs only Dyskinesia at none-ECoG side
        # REMOVE_TIMES[sub] = []
        # for i, t in enumerate(scores_temp['dopa_time']):
        #         if np.logical_and(scores_temp.iloc[i][f'CDRS_total_{LID_side_incl}'] < 1,
        #                           scores_temp.iloc[i][f'CDRS_total_{ecog_side}'] > 0):
        #                 REMOVE_TIMES[sub].append(t)

        # # include selected CDRS
        # SCORES[sub] = scores_temp[['dopa_time', f'CDRS_total_{LID_side_incl}']]
        

        # GET LID PRESENT OR NOT
        # NOTE(review): the disabled block above guarded against a None
        # result of read_clinical_scores(); here a None would fail on the
        # column selection - confirm whether None can still be returned
        temp = importClin.read_clinical_scores(sub=sub, rater='Patricia')
        SCORES[sub] = temp[['dopa_time', 'CDRS_total']]


In [50]:
# show for which subjects clinical scores were stored
SCORES.keys()

dict_keys(['012', '011', '016', '013', '014', '009', '008'])

### 1b) Load Features

Only include ECoG and ECoG-sided STN-LFP for now

In [130]:
def load_ssd_powers(sub, feat_path):
    """
    Load the spectral-feature tables of one subject and merge
    them column-wise into one DataFrame.

    Every file in feat_path whose name contains both sub and
    'spectralFeatures' is read as csv (first row as header,
    first column as index) for each data type ('lfp_left',
    'lfp_right', 'ecog_left', 'ecog_right') found in its name.
    Column names get the data type as prefix.

    Arguments:
        - sub (str): subject code, matched as part of the file name
        - feat_path (str): folder containing the feature files

    Returns:
        - df_out (pd.DataFrame or None): merged features,
            None if no matching file was found
    """
    data_types = ['lfp_left', 'lfp_right',
                  'ecog_left', 'ecog_right']
    # sorted: os.listdir() order is arbitrary, sorting keeps
    # the column order equal between runs
    sub_ft_files = sorted(f for f in os.listdir(feat_path) if
                          sub in f and 'spectralFeatures' in f)

    tables = []
    for f in sub_ft_files:
        for dType in data_types:
            if dType not in f: continue

            # os.path.join instead of a bare join(): the latter is only
            # imported in a later cell and fails on a fresh kernel
            temp = pd.read_csv(os.path.join(feat_path, f),
                               header=0, index_col=0)
            tables.append(temp.add_prefix(f'{dType}_'))

    if len(tables) == 0:
        return None
    if len(tables) == 1:
        return tables[0]

    # concatenate once instead of growing the DataFrame per file
    df_out = pd.concat(tables, ignore_index=False, axis=1,)

    return df_out

In [156]:
from collections import namedtuple

In [157]:
# small container with a `times` and a `values` field
localPAC = namedtuple('localPAC', ['times', 'values'])

In [170]:
def load_ssd_localPAC(
    sub, feat_path, pac_freqs):
    """
    Load the local PAC features of one subject.

    Considers all files in feat_path whose name contains both sub
    and 'localPAC'. Per data type ('lfp_left', 'lfp_right',
    'ecog_left', 'ecog_right') and per phase/amplitude pair, the
    file with 'times' in its name is read as comma-separated text,
    any other matching file is read with np.load().

    Arguments:
        - sub (str): subject code, matched as part of the file name
        - feat_path (str): folder containing the feature files
        - pac_freqs: iterable of (phase-band, amplitude-band) name
            pairs, matched in the file name as '<phase>_<amplitude>'

    Returns:
        - dict_out (dict): localPAC(times, values) tuples, stored
            under '<data type>_<phase>_<amplitude>'; combinations
            without both a times and a values file are skipped

    Raises:
        - AssertionError: if the number of times differs from the
            length of the last axis of the values
    """
    sub_ft_files = [f for f in os.listdir(feat_path) if
                    sub in f and 'localPAC' in f]
    dict_out = {}

    for dType in ['lfp_left', 'lfp_right',
                    'ecog_left', 'ecog_right']:
        
        dtype_files = [f for f in sub_ft_files if dType in f]
        if len(dtype_files) == 0: continue  # skip for i.e. not-existing ecog side 
        
        for pha_f, amp_f in pac_freqs:
            # start empty for every combination, so a missing file is
            # detected instead of raising a NameError further down
            times, dat = None, None
        
            for f in dtype_files:
                if f'{pha_f}_{amp_f}' not in f: continue

                # os.path.join instead of a bare join(): the latter is only
                # imported in a later cell and fails on a fresh kernel
                if 'times' in f: 
                    # atleast_1d: a file with a single time would load as 0-d
                    times = np.atleast_1d(
                        np.loadtxt(os.path.join(feat_path, f), delimiter=',')
                    )
                else:
                    # NOTE(review): allow_pickle=True can execute code stored
                    # in the file, only load files created by this project
                    dat = np.load(os.path.join(feat_path, f), allow_pickle=True)
            
            if times is None or dat is None:
                print(f'no complete local PAC files found for '
                      f'{dType}_{pha_f}_{amp_f} (sub {sub}), skipped')
                continue

            assert len(times) == dat.shape[-1], (
                f'loaded PACs times ({len(times)}) and data ({dat.shape})'
                f' dont match for {dType}_{pha_f}_{amp_f}'
            )
            dict_out[f'{dType}_{pha_f}_{amp_f}'] = localPAC(times, dat)

    return dict_out

In [174]:
from os.path import join, exists

@dataclass(init=True, repr=True)
class ssdFeatures():
    """
    Load the SSD-based features of several subjects.

    After initialisation, every subject in sub_list is available
    as attribute 'sub<code>' (e.g. sub009), holding a
    ssdFeats_perSubject instance.

    Arguments:
        - settings_json: file name of the feature extraction
            settings, loaded with utilsFiles.load_ft_ext_cfg()
        - sub_list: subject codes to load; if empty, all subjects
            found in the feature folder are taken
        - data_version, win_len_sec, win_overlap_part: define
            the feature folder
            <results>/features/SSD_feats/<data_version>/
            windows_<win_len_sec>s_<win_overlap_part>overlap
        - incl_powers, incl_localPAC, incl_coherence: feature
            types to load, passed on to ssdFeats_perSubject

    Raises:
        - AssertionError: if the feature folder does not exist
    """
    settings_json: str = 'ftExtr_spectral_v1.json'
    sub_list: list = field(default_factory=list)
    data_version: str = 'v3.0'
    # 'int or float' evaluates to plain int; float also covers ints
    win_len_sec: float = 10
    win_overlap_part: float = 0.0
    incl_powers: bool = True
    incl_localPAC: bool = True
    incl_coherence: bool = True

    def __post_init__(self,):
        # load feature extraction settings
        extract_settings = utilsFiles.load_ft_ext_cfg(cfg_fname=self.settings_json)

        # define feature path and check existence
        self.feat_path = join(get_project_path('results'),
                    'features',
                    'SSD_feats',
                    self.data_version,
                    f'windows_{self.win_len_sec}s_'
                    f'{self.win_overlap_part}overlap')
        assert exists(self.feat_path), f'feat_path ([{self.feat_path}]) does not exist'
        # take all available subjects from feature path if sub_list is not defined;
        # sorted to get the same subject order in every run (set-order is arbitrary)
        if len(self.sub_list) == 0:
            self.sub_list = sorted({name.split('_')[1]
                                    for name in os.listdir(self.feat_path)})

        # snapshot of the current settings: vars(self) itself is the live
        # attribute dict, which would also receive every 'sub<code>'
        # attribute set below and leak them into each subject's settings
        keywords = dict(vars(self))

        for sub in self.sub_list:
            print(f'\nload SSDd features for sub-{sub}')
            setattr(self,
                    f'sub{sub}',
                    ssdFeats_perSubject(sub=sub, feat_path=self.feat_path,
                                        settings=keywords,
                                        extract_settings=extract_settings),)
            


@dataclass(init=True, repr=True)
class ssdFeats_perSubject:
    """
    Load the SSD-based features of one subject.

    Arguments:
        - sub: subject code
        - feat_path: folder containing the feature files
        - settings: dict with booleans 'incl_powers',
            'incl_localPAC' and 'incl_coherence' defining which
            features are loaded; missing keys count as False
        - extract_settings: feature extraction settings, the
            local PAC frequency pairs are read from
            ['FEATS_INCL']['local_PAC_freqs']
        - verbose: print which features are being loaded

    Attributes set during initialisation:
        - powers: result of load_ssd_powers(), if included
        - localPAC: result of load_ssd_localPAC(), if included
    """
    sub: str = 'default'  # default given to prevent inheritance error
    feat_path: str = 'default'
    settings: dict = field(default_factory=dict)
    extract_settings: dict = field(default_factory=dict)
    verbose: bool = False

    def __post_init__(self,):
        # NOTE: always use self.sub here; a bare `sub` refers to whatever
        # notebook-global `sub` exists and loads the wrong subject

        if self.settings.get('incl_powers', False):
            if self.verbose: print(f'load POWERS - {self.sub}')
            self.powers = load_ssd_powers(self.sub, feat_path=self.feat_path)
        
        if self.settings.get('incl_localPAC', False):
            if self.verbose: print(f'load local PAC - {self.sub}')
            pac_freqs = self.extract_settings['FEATS_INCL']['local_PAC_freqs']
            self.localPAC = load_ssd_localPAC(self.sub, feat_path=self.feat_path,
                                              pac_freqs=pac_freqs)
        
        if self.settings.get('incl_coherence', False):
            print(f'TODO: load COHERENCES - {self.sub}')
            # self.powers = load_ssd_coherences(self.sub, feat_path=self.feat_path)

        # TODO: LOAD CDRSB SCORES
        


In [175]:
# load the features with the default settings; without a sub_list,
# all subjects found in the feature folder are loaded
fts = ssdFeatures()


load SSDd features for sub-012
TODO: load COHERENCES - 012

load SSDd features for sub-011
TODO: load COHERENCES - 011

load SSDd features for sub-016
TODO: load COHERENCES - 016

load SSDd features for sub-013
TODO: load COHERENCES - 013

load SSDd features for sub-014
TODO: load COHERENCES - 014

load SSDd features for sub-009
TODO: load COHERENCES - 009

load SSDd features for sub-008
TODO: load COHERENCES - 008


In [176]:
# example: shape of the loaded local PAC values for one subject and one
# '<data type>_<phase>_<amplitude>' key; the last axis matches the number
# of times (asserted during loading)
fts.sub009.localPAC['ecog_right_lo_beta_narrow_gamma'].values.shape

(8, 4, 333)

In [54]:
# SPECTRAL POWERS
sub = '014'
sub_ft_files = [f for f in os.listdir(ssd_ft_path) if sub in f]

ft_name = 'localPAC'
ft_name = 'spectralFeatures'
[f for f in sub_ft_files if ft_name in f]


['SSDfeats_014_ecog_right_local_spectralFeatures.csv',
 'SSDfeats_014_lfp_left_local_spectralFeatures.csv',
 'SSDfeats_014_lfp_right_local_spectralFeatures.csv']

In [None]:
# Load per subject the ECoG features and the STN-LFP features of the
# same side, prefix the column names with the source (ECOG_ / STN_),
# and store the column-wise merged table in FEATS.
# NOTE(review): ECOG_SIDES is created as an empty dict in the cell that
# loads the clinical scores and the code filling it is commented out
# there, so ECOG_SIDES[sub] raises a KeyError unless it is filled elsewhere.
# NOTE(review): the file names read here ('SSDfeatures_<sub>_...csv')
# differ from the feature files listed earlier in this notebook
# ('SSDfeats_<sub>_..._local_spectralFeatures.csv') - confirm they exist.
FEATS = {}
for sub in SUBS:
    ecog_side = ECOG_SIDES[sub]
    # load ECoG features
    ecog_fts = pd.read_csv(os.path.join(ssd_ft_path, f'SSDfeatures_{sub}_ecog_{ecog_side}.csv'),
                            index_col=0, header=0)
    # rename and add ECOG to ft-names
    rename_cols = {}
    for key in ecog_fts.keys(): rename_cols[key] = f'ECOG_{key}'
    ecog_fts = ecog_fts.rename(columns=rename_cols)
    
    # load STN-LFP features (same side as the ECoG)
    stn_fts = pd.read_csv(os.path.join(ssd_ft_path, f'SSDfeatures_{sub}_lfp_{ecog_side}.csv'),
                            index_col=0, header=0)
    # rename and add STN to ft-names
    rename_cols = {}
    for key in stn_fts.keys(): rename_cols[key] = f'STN_{key}'
    stn_fts = stn_fts.rename(columns=rename_cols)

    # merge on the index (window times), STN columns first
    merged_fts = pd.concat([stn_fts, ecog_fts], axis=1, ignore_index=False)
    merged_fts.index = merged_fts.index / 60  # convert to minutes to agree with CDRS score
    FEATS[sub] = merged_fts
    
    
    

### 1c) Prepare Features and Scores

Remove the feature windows that have to be excluded, and assign a CDRS score to each remaining feature window

In [None]:
# REMOVE ROWS DUE TO DYSKINESIA ONLY (!!) IN NONE-ECOG-SIDE
for sub in SUBS:
  ft_times = FEATS[sub].index
  score_times = SCORES[sub]['dopa_time']

  remove_ft_idx = []
  # select feature-rows which are closest to a CDRS-moments which should be excluded
  for ft_row, t in enumerate(ft_times):
      t_diffs = abs(score_times - t)
      i = np.argmin(t_diffs)

      if score_times[i] in REMOVE_TIMES[sub]:
        remove_ft_idx.append(ft_times[i])  
          
  FEATS[sub] = FEATS[sub].drop(remove_ft_idx, axis=0)
  print(f'removed {len(remove_ft_idx)} rows in sub-{sub}')

In [None]:
# DEFINE CDRS LABELS FOR FEATURE WINDOW TIMES
FT_LABELS = {}

for sub in SUBS:
    ft_times = FEATS[sub].index

    ft_scores = []

    for t in ft_times:
        t_diffs = abs(SCORES[sub]['dopa_time'] - t)
        i = np.argmin(t_diffs)
        ft_scores.append(SCORES[sub].iat[i, 1])  # take column 1, is CDRS score

    FT_LABELS[sub] = ft_scores

    assert FEATS[sub].shape[0] == len(FT_LABELS[sub]), (
        'Feature DataFrame and Ft-Labels must have same length'
    )
# no_LID_sel = np.array(ft_scores) == 0
# LID_sel = np.array(ft_scores) >= LID_SCORE_INCL


### 2) Explore

In [None]:
from itertools import product

In [None]:
import lfpecog_plotting.plotHelpers as pltHelp

In [None]:
# containers collecting the results of all subjects (one entry per subject)
X_total = []
y_total_binary = []
y_total_scale = []
sub_ids_total = []
ft_times_total = []

# binary label for windows which are neither no-LID nor LID
EXCL_CODE = 99

TO_PLOT = False

if TO_PLOT:
    fig, axes = plt.subplots(len(SUBS), 1, figsize=(12, 16))
    # for a single subject plt.subplots() returns one Axes instead of
    # an array; make sure axes can always be indexed per subject
    axes = np.atleast_1d(axes)
    fs = 16


# Per subject: create the y-labels, z-score every feature against the
# subject's no-LID windows, store everything in the *_total lists, and
# (if TO_PLOT) draw one row of boxplots with the z-scores during LID.
for i_s, sub in enumerate(SUBS):
    # create lists to store values for boxplotting
    bp_LID_values_list = []
    bp_noLID_values_list = []  # NOTE(review): never filled, confirm whether still needed
    bp_keys = []


    ### Create Y-labels based on CDRS (FT_LABELS)
    sub_labels = np.array(FT_LABELS[sub])
    no_LID_sel = sub_labels == 0
    LID_sel = sub_labels >= LID_SCORE_INCL

    # create binary y-labels: 0 is no LID, 1 is LID,
    # EXCL_CODE for scores in between (no-LID has priority)
    sub_y_bin = np.where(
        no_LID_sel, 0, np.where(LID_sel, 1, EXCL_CODE)
    ).tolist()
    # add full scaled y-labels
    sub_y_scale = FT_LABELS[sub]

    # append sub-codes to sub-id list
    sub_ids_total.append([sub] * FEATS[sub].shape[0])  # add subject code, as many times as there are feature rows

    # add subjects ft-times to list
    ft_times_total.append(FEATS[sub].index.values)

    ### Create X with standardised Feature-arrays
    # explicit float array: np.zeros_like() on the DataFrame takes over its
    # dtype, and an integer or object dtype cannot hold the z-scores correctly
    sub_X = np.zeros(FEATS[sub].shape, dtype=float)
    ft_values = FEATS[sub].values  # convert once instead of once per column

    for n_col, ft in enumerate(FEATS[sub].keys()):
        values = ft_values[:, n_col]
        # split values on Dyskinesia
        noLID_values = values[no_LID_sel]
        LID_values = values[LID_sel]
        
        # define mean and std of no-LID for Z-SCORE
        # NOTE(review): sd is 0 (or NaN) for constant (or absent) no-LID
        # values, the z-scores are inf/NaN in that case
        m = np.nanmean(noLID_values)
        sd = np.nanstd(noLID_values)
        # Z-SCORE values
        Z_LID_values = (LID_values - m) / sd
        Z_noLID_values = (noLID_values - m) / sd
        Z_ALL_values = (values - m) / sd

        # add feat and z-score values to lists for BOXPLOT (WITHOUT NaNs)
        bp_LID_values_list.append(list(Z_LID_values[~np.isnan(LID_values)]))
        bp_keys.append(ft)

        # store all feats for pred-exploration
        sub_X[:, n_col] = Z_ALL_values
    
    X_total.append(sub_X)
    y_total_binary.append(sub_y_bin)
    y_total_scale.append(sub_y_scale)

    if TO_PLOT:
        ##### PLOT BOXPLOT OF FEATURES ######
        ax = axes[i_s]
        box = ax.boxplot(bp_LID_values_list)
        plt.setp(box['fliers'], color='gray')
        # plt.setp(box['whiskers'], color='red')

        # horizontal reference lines over the full width; xmin/xmax of
        # axhline are fractions of the axes (0-1), not data coordinates
        ax.axhline(y=0, color='k', alpha=.3)
        for y_line in [-2, 2]: ax.axhline(y=y_line, color='r', alpha=.3)

        ax.set_ylim(-6, 6)
        ax.set_ylabel('z-scores\nvs no-LID (a.u.)', fontsize=fs)
        ax.set_title(f'Sub-{sub} (mean unilat. CDRS '
                     f'{round(np.mean(FT_LABELS[sub]), 2)})',
                     weight='bold', fontsize=fs)
        # exactly one label per box, also if the number of features
        # is not a multiple of three
        tick_names = ['mx', 'mn', 'cv']
        ax.set_xticklabels([tick_names[i_box % 3]
                            for i_box in range(len(bp_keys))],
                           fontsize=fs,)

        for side in ['top','right','bottom']:
            ax.spines[side].set_visible(False)

        ### fill colors
        colors = {
            'alpha': 'yellow',
            'lo_beta': 'lightblue',
            'hi_beta': 'darkblue',
            'midgamma': 'green'
        }
        hatches = {
            'STN': '',
            'ECoG': '//'
        }

        # x-ranges of the groups of three boxes
        x_fill_list = []
        for x1 in np.arange(.5, len(bp_keys) + .5, 3):
            x2 = x1 + 3
            x_fill_list.append([x1, x2])

        # zip stops at the shortest input: no IndexError
        # if there are fewer box-groups than source/band combinations
        for (x1, x2), (src, bw) in zip(x_fill_list,
                                       product(hatches.keys(), colors.keys())):
            # y spans the full y-limits; np.arange(-6, 6) stopped at 5
            ax.fill_betweenx(
                y=[-6, 6], x1=x1,
                x2=x2, color=colors[bw], hatch=hatches[src],
                label=f'{src} {bw}', alpha=.2, edgecolor='gray',)
# finish the figure: one shared legend below the last axes, and a title
if TO_PLOT:
    leg_content = plt.gca().get_legend_handles_labels()
    handles, labels = pltHelp.remove_duplicate_legend(leg_content)
    plt.legend(
        handles, labels,
        ncol=4,
        frameon=False,
        fancybox=False,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        prop={'weight': 'bold', 'size': fs},
    )

    plt.suptitle('Individual Feature values during Dyskinesia\n',
                 weight='bold', fontsize=fs + 4)
    plt.tight_layout()

    figname = 'LID_ssdFeatures_boxplots_indiv'
    # plt.savefig(os.path.join(figpath, 'ft_exploration', 'SSD', figname),
    #             dpi=300, facecolor='w',)
    plt.close()

# feature names of the last processed subject, in x-axis order
print(f'FEATURES X-AXIS: {bp_keys}')
