# Predict CDRS Based on SSD-Derived Spectral Features

### 0) Load packages and functions

In [None]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import csv
from dataclasses import dataclass, field, fields
from itertools import compress
import pandas as pd
import numpy as np
from itertools import product
import sklearn as sk
from scipy import signal, stats

import matplotlib.pyplot as plt



In [None]:
def get_project_path_in_notebook(
    subfolder: str = '',
):
    """
    Finds path of projectfolder from Notebook.
    Start running this once to correctly find
    other modules/functions.

    Walks upwards from the current working directory until
    a path ending in 'dyskinesia_neurophys' is reached.

    Input:
        - subfolder: not used by this function; kept so that
            existing calls passing it keep working

    Returns:
        - path: path of the project folder (str)

    Raises:
        - FileNotFoundError: if neither the working directory nor
            any of its parents ends in 'dyskinesia_neurophys'
    """
    project_name = 'dyskinesia_neurophys'
    start_path = os.getcwd()
    path = start_path

    while not path.endswith(project_name):
        parent = os.path.dirname(path)
        # os.path.dirname() returns its input unchanged once the
        # filesystem root is reached: stop instead of looping forever
        if parent == path:
            raise FileNotFoundError(
                f'no "{project_name}" folder found in or '
                f'above working directory: {start_path}'
            )
        path = parent

    return path

In [None]:
# locate the project folder once and derive the local storage folders from it
projectpath = get_project_path_in_notebook()
codepath, figpath, datapath = (
    os.path.join(projectpath, folder)
    for folder in ('code', 'figures', 'data')
)
# extracted features are stored below <project>/results/features
feat_path = os.path.join(projectpath, 'results', 'features')

In [None]:
# work from the code folder from here on; presumably this is what lets the
# project packages imported below resolve as top-level modules (TODO confirm)
os.chdir(codepath)
# own utility functions
import utils.utils_fileManagement as utilsFiles
import utils.utils_windowing as utilsWindows
from utils.utils_fileManagement import (get_project_path,
                                        load_class_pickle,
                                        save_class_pickle,
                                        mergedData,
                                        correct_acc_class)
# own data preprocessing functions
import lfpecog_preproc.preproc_data_management as dataMng
import lfpecog_preproc.preproc_filters as fltrs
# own data exploration functions
import lfpecog_features.feats_read_proc_data as read_data
import lfpecog_plotting.expl_plotting as expl_plot
import lfpecog_features.feats_spectral_baseline as specBase
import lfpecog_features.feats_spectral_features as spectral
import lfpecog_features.feats_spectral_helpers as specHelp


import lfpecog_preproc.preproc_import_scores_annotations as importClin
import lfpecog_analysis.ft_processing_helpers as ftProc
import lfpecog_analysis.import_ephys_results as importResults
import lfpecog_analysis.get_acc_task_derivs as accDerivs

import lfpecog_plotting.plotHelpers as pltHelp
from lfpecog_plotting.plotHelpers import remove_duplicate_legend

### 1) Define settings

In [None]:
# window settings; together they name the feature folder that is read
WIN_LEN_sec = 10
WIN_OVERLAP_part = 0.0
ssd_path = os.path.join(
    feat_path,
    'SSD_powers',
    f'windows_{WIN_LEN_sec}s_{WIN_OVERLAP_part}overlap',
)
# subject codes that are left out of the subject list
IGNORE_PTS = ['010']

# from this score, features are labeled into LID+ group
LID_SCORE_INCL = 1

In [None]:
# get all available subs with features: the subject code is the second
# '_'-separated part of every entry in the feature folder (entries
# without '_' cannot carry a subject code and are skipped).
# - sorted, so the subject order is identical on every run (a plain
#   set of strings has no guaranteed order)
# - ignored subjects are filtered out instead of list.remove()'d, so an
#   IGNORE_PTS code without feature files does not raise a ValueError
SUBS = sorted(
    {name.split('_')[1] for name in os.listdir(ssd_path) if '_' in name}
    - set(IGNORE_PTS)
)

- Only include ECoG and the ipsilateral STN-LFP.
- Exclude moments in which dyskinesia was present only on the body side ipsilateral to the ECoG (i.e. NOT corresponding to the ECoG hemisphere).

### 1a) Load Clinical Scores

Select moments with dyskinesia on the WRONG BODY SIDE (ipsilateral to the ECoG) for later removal.

In [None]:
# SCORES = {}
# ECOG_SIDES = {}
# REMOVE_TIMES = {}  # remove moments with only 'WRONG SIDE' dyskinesia

# for sub in SUBS:
#         # get CDRS
#         scores_temp = importClin.run_import_clinInfo(sub=sub)
#                 # check if scores are present
#         if type(scores_temp) == type(None):
#                 print(f'None CDRS-scores loaded for sub {sub}')
#                 continue

#         # get ECoG-side
#         ecog_side = importClin.get_ecog_side(sub)
#         ECOG_SIDES[sub] = ecog_side
#         # define CDRS of body-side to include
#         if ecog_side == 'left': LID_side_incl = 'right'
#         elif ecog_side == 'right': LID_side_incl = 'left'
        
#         # identify minutes to remove bcs only Dyskinesia at none-ECoG side
#         REMOVE_TIMES[sub] = []
#         for i, t in enumerate(scores_temp['dopa_time']):
#                 if np.logical_and(
#                         scores_temp.iloc[i][f'CDRS_total_{LID_side_incl}'] == 0,
#                         scores_temp.iloc[i][f'CDRS_total_{ecog_side}'] > 0
#                 ):
#                         REMOVE_TIMES[sub].append(t)

#         # include selected CDRS
#         SCORES[sub] = scores_temp[['dopa_time', f'CDRS_total_{LID_side_incl}']]
        



### 1b) Load Features

Only include ECoG and ECoG-sided STN-LFP for now

In [None]:
import lfpecog_analysis.load_SSD_features as load_ssdFts


In [None]:
def select_coh_feats(sub_fts, coh_sides = 'STN_ECOG'):
    """
    Collect the mean and maximum coherence per window, for
    every frequency band and coherence type.

    Input:
        - sub_fts: feature object of one subject, read as
            sub_fts.coherences.<coh_sides>.<band>.<coh_type>;
            every <coh_type> attribute has to support
            .mean(axis=1) and .max(axis=1) and return objects
            with .index and .values (e.g. a DataFrame with
            one row per window)
        - coh_sides: name of the coherence attribute to select

    Returns:
        - coh_values: DataFrame with one row per window and, per
            band and coherence type, the columns
            '{coh_type}_{band}_mn' and '{coh_type}_{band}_mx'
    """
    bands = ['alpha', 'lo_beta', 'hi_beta', 'narrow_gamma']
    coh_types = ['sq_coh', 'imag_coh']

    coh_side_fts = getattr(sub_fts.coherences, coh_sides)

    coh_frames = []
    for bw in bands:
        bw_fts = getattr(coh_side_fts, bw)

        for coh_type in coh_types:
            coh_data = getattr(bw_fts, coh_type)
            coh_means = coh_data.mean(axis=1)
            coh_maxs = coh_data.max(axis=1)
            # one column per statistic: a nested list [[means, maxs]]
            # is read as ONE row holding two arrays, which does
            # not match the number of windows in the index
            coh_frames.append(pd.DataFrame(
                index=coh_means.index,
                data={f'{coh_type}_{bw}_mn': coh_means.values,
                      f'{coh_type}_{bw}_mx': coh_maxs.values},
            ))

    coh_values = pd.concat(coh_frames, axis=1, ignore_index=False)

    # if necessary: convert to minutes to agree with CDRS score
    # (an empty index is left untouched, max() would fail on it)
    if len(coh_values.index) > 0 and max(coh_values.index) > 120:
        coh_values.index = coh_values.index / 60

    return coh_values


In [None]:
# NEW LOAD VIA FEATURE CLASS

# column-name parts that exclude a power feature
EXCL_POWER_TAGS = ('peak_freq', 'broad_gamma')

FEATS = {}
for sub in SUBS:
    # load all features and take the feature-object of this subject
    fts = load_ssdFts.ssdFeatures(sub_list=[sub],)
    sub_fts = getattr(fts, f'sub{sub}')

    # POWERS: keep ECoG and LFP columns of the ECoG-side only,
    # and drop the excluded feature types
    s = importClin.get_ecog_side(sub)
    side_tags = (f'ecog_{s}', f'lfp_{s}')
    col_sel = [
        any(tag in c for tag in side_tags)
        and not any(tag in c for tag in EXCL_POWER_TAGS)
        for c in list(sub_fts.powers.keys())
    ]
    power_fts = sub_fts.powers.iloc[:, col_sel]
    # if necessary: convert to minutes to agree with CDRS score
    if max(power_fts.index) > 120:
        power_fts.index = power_fts.index / 60
    print(f'\tsub-{sub}, POWER FEATS SHAPE INCLUDED: {power_fts.shape}')

    # COHERENCES between STN and ECoG
    coh_fts = select_coh_feats(sub_fts=sub_fts, coh_sides='STN_ECOG')
    print(f'\tsub-{sub}, COH FEATS SHAPE INCLUDED: {coh_fts.shape}')

    # powers and coherences side by side, aligned on the window-times
    merged_fts = pd.concat([power_fts, coh_fts], axis=1, ignore_index=False)
    print(f'\tsub-{sub}, MERGED FEATS SHAPE INCLUDED: {merged_fts.shape}')

    FEATS[sub] = merged_fts
    
    
    

### 1c) Load Scores and Select Feature-Windows to include

Exclude feature windows in which dyskinesia is present ONLY on the body side that does NOT correspond to the ECoG hemisphere.

Remove the excluded feature windows and assign the CDRS scores to the remaining windows.

In [None]:
# selection test
cdrs_rater = 'Mean'

FT_LABELS = {}

for sub in SUBS:
    # the helper gets the feature-times present for this subject and
    # returns a selection of windows to keep plus the CDRS per window
    select_bool, ecog_related_cdrs = ftProc.get_idx_discardNonEcogLid(
        sub=sub,
        ft_times=FEATS[sub].index,
        cdrs_rater=cdrs_rater,
    )
    n_rows_removed = sum(~select_bool)

    # keep only the selected windows, for features and for scores
    FEATS[sub] = FEATS[sub].iloc[select_bool]
    FT_LABELS[sub] = ecog_related_cdrs[select_bool]

    print(f'{sub}: rows delete: {n_rows_removed}, '
          f'shape post removal: {FEATS[sub].shape}')


### 2) Explore

In [None]:
def boxplot_zscored_LID_features(
    subs_list: list, X_total: list,
    y_total_binary: list, ft_names: list,
    TO_SAVE_FIG: bool = False
):
    """
    make boxplots per subject of z-scored
    features (only LID) used for prediction

    Input:
        - subs_list: list with all sub-string codes
        - X_total: list with all arrays of all features per subject
        - y_total_binary: list with corresponding binary LID-labels
            (only windows labeled 1 are plotted)
        - ft_names: list with the feature names; its length defines
            the shaded groups of three features, and the names are
            printed after plotting
        - TO_SAVE_FIG: if True, the figure is saved in
            figpath/ft_exploration/SSD

    Note: FT_LABELS and figpath are read from the notebook
    variables, they are not arguments.
    """
    fs = 16
    y_lim = (-6, 6)

    ### fill colors
    colors = {
        'alpha': 'yellow',
        'lo_beta': 'lightblue',
        'hi_beta': 'darkblue',
        'midgamma': 'green'
    }
    hatches = {
        'STN': '',
        'ECoG': '//'
    }
    # x-range of every group of three neighbouring boxes
    x_fill_list = [
        [x1, x1 + 3] for x1 in np.arange(.5, len(ft_names) + .5, 3)
    ]

    fig, axes = plt.subplots(len(subs_list), 1, figsize=(12, 16))
    # for a single subject plt.subplots() returns one bare Axes,
    # wrap it so that indexing works for any number of subjects
    axes = np.atleast_1d(axes)

    ##### PLOT BOXPLOT OF FEATURES ######
    for i_s, (sub_fts, sub_y_bin) in enumerate(
        zip(X_total, y_total_binary)
    ):
        sub = subs_list[i_s]
        ax = axes[i_s]
        # select only windows labeled as LID (1); comparing with 1 instead
        # of casting to bool keeps other codes (e.g. a code for
        # excluded windows) out of the LID-selection
        sub_LID_sel = np.array(sub_y_bin) == 1
        sub_LID_fts = sub_fts[sub_LID_sel, :]
        n_fts = sub_LID_fts.shape[1]
        # make lists for boxplot values (only LID-windows) without NaNs, per features
        bp_LID_values_list = [
            list(sub_LID_fts[~np.isnan(sub_LID_fts[:, i_ft]), i_ft])
            for i_ft in np.arange(n_fts)
        ]
        box = ax.boxplot(bp_LID_values_list)
        plt.setp(box['fliers'], color='gray')

        # reference lines at z = 0 and z = +/- 2, over the full axis width
        ax.axhline(y=0, color='k', alpha=.3)
        for y_line in [-2, 2]:
            ax.axhline(y=y_line, color='r', alpha=.3)

        ax.set_ylim(*y_lim)
        ax.set_ylabel('z-scores\nvs no-LID (a.u.)', fontsize=fs)
        ax.set_title(f'Sub-{sub} (mean unilat. CDRS '
                     f'{round(np.nanmean(FT_LABELS[sub]), 2)})',
                     weight='bold', fontsize=fs)
        # one label per box: repeat the pattern and cut it to the number
        # of boxes, so the label count always matches the tick count
        tick_labels = (['mx', 'mn', 'cv'] * (n_fts // 3 + 1))[:n_fts]
        ax.set_xticklabels(tick_labels, fontsize=fs,)

        for side in ['top','right','bottom']:
            ax.spines[side].set_visible(False)

        # shade every group of three features; zip() stops at the shorter
        # of both, so fewer groups than source/band combinations is fine
        for (x1, x2), (src, bw) in zip(
            x_fill_list, product(hatches.keys(), colors.keys())
        ):
            # both y-limits are given, to shade the full height of the axis
            ax.fill_betweenx(
                y=list(y_lim), x1=x1, x2=x2,
                color=colors[bw], hatch=hatches[src],
                label=f'{src} {bw}', alpha=.2, edgecolor='gray',)

    # one legend below the last axis, without repeated entries
    leg_content = plt.gca().get_legend_handles_labels()
    handles, labels = pltHelp.remove_duplicate_legend(leg_content)
    plt.legend(handles, labels, ncol=4, frameon=False,
            loc='upper center', bbox_to_anchor=(0.5, -0.2),fancybox=False,
            prop={'weight': 'bold', 'size': fs})

    plt.suptitle('Individual Feature values during Dyskinesia\n',
                 weight='bold', fontsize=fs+4)
    plt.tight_layout()

    if TO_SAVE_FIG:
        figname = 'LID_ssdFeatures_boxplots_indiv'
        plt.savefig(os.path.join(figpath, 'ft_exploration', 'SSD', figname),
                    dpi=300, facecolor='w',)
    plt.show()

    print(f'FEATURES X-AXIS: {ft_names}')


In [None]:
CDRS_THRESHOLD = .1

# lists collecting the values per subject, for the next process part
X_total = []
y_total_binary = []
y_total_scale = []
sub_ids_total = []
ft_times_total = []

EXCL_CODE = 99

TO_PLOT = False

for sub in SUBS:
    ### Y: labels based on CDRS (FT_LABELS)
    sub_y_scale = FT_LABELS[sub]  # full scaled y-labels
    sub_cdrs = np.array(sub_y_scale)
    no_LID_sel = sub_cdrs == 0
    LID_sel = sub_cdrs >= CDRS_THRESHOLD
    # binary y-labels: 0 is no LID, 1 is LID (from CDRS_THRESHOLD),
    # everything else gets the exclusion code
    sub_y_bin = [
        0 if is_noLID else (1 if is_LID else EXCL_CODE)
        for is_noLID, is_LID in zip(no_LID_sel, LID_sel)
    ]

    ### X: features standardised against the NO-LID windows
    sub_values = FEATS[sub].values
    sub_X = np.zeros_like((FEATS[sub]))
    ft_names = []

    for n_col, ft in enumerate(FEATS[sub].keys()):
        print(sub, ft)
        ft_names.append(ft)
        values = sub_values[:, n_col]
        # mean and std are taken from the NO-LID windows only
        noLID_values = values[no_LID_sel]
        m = np.nanmean(noLID_values)
        sd = np.nanstd(noLID_values)
        # all windows are z-scored and stored for pred-exploration
        sub_X[:, n_col] = (values - m) / sd

    ### add subject values to the total lists
    X_total.append(sub_X)
    y_total_binary.append(sub_y_bin)
    y_total_scale.append(sub_y_scale)
    # subject code once per feature row (for later identifying subjects)
    sub_ids_total.append([sub] * FEATS[sub].shape[0])
    # feature-times of the subject (for later plotting)
    ft_times_total.append(FEATS[sub].index.values)

if TO_PLOT:
    boxplot_zscored_LID_features(
        subs_list=SUBS,
        X_total=X_total,
        y_total_binary=y_total_binary,
        ft_names=ft_names,
        TO_SAVE_FIG=False,
    )

In [None]:
# merge all features and labels per Subject together
for i, (X_sub, y_sub) in enumerate(zip(X_total, y_total_binary)):
    # loop over list with arrays of feats and labels per subject

    if i == 0:
        X_all = X_sub.copy()
        y_all_binary = y_sub.copy()
        y_all_scale = list(y_total_scale[i].copy())
        sub_ids = list(sub_ids_total[i].copy())
        ft_times_all = list(ft_times_total[i].copy())

    else:
        X_all = np.concatenate([X_all, X_sub], axis=0)
        y_all_binary.extend(y_sub)
        y_all_scale.extend(y_total_scale[i])
        sub_ids.extend(sub_ids_total[i])
        ft_times_all.extend(ft_times_total[i])

y_all_binary = np.atleast_2d(y_all_binary).T
y_all_scale = np.atleast_2d(y_all_scale).T
sub_ids = np.atleast_2d(sub_ids).T
ft_times_all = np.atleast_2d(ft_times_all).T

# remove all Rows containing NaN Features
nan_row_sel = np.isnan(X_all).any(axis=1)
X_all = X_all[~nan_row_sel]
y_all_binary = y_all_binary[~nan_row_sel]
y_all_scale = y_all_scale[~nan_row_sel]
sub_ids = sub_ids[~nan_row_sel]
ft_times_all = ft_times_all[~nan_row_sel]

# remove all rows not belonging to defined two outcome classes
# (for example: if 0 is CDRS=0, and 1 is CDRS>=3, then CDRS scores 1 and 2 are excluded)
excl_score_sel = y_all_binary == EXCL_CODE

X_all = X_all[~excl_score_sel.ravel()]
y_all_binary = y_all_binary[~excl_score_sel]
y_all_scale = y_all_scale[~excl_score_sel]
sub_ids = sub_ids[~excl_score_sel]
ft_times_all = ft_times_all[~excl_score_sel]

# X_all contains n-windows, n-features
# y_all contains y-values (n-windows)
# sub_ids contains subject-codes corresponding to windows (n-windows)
print(X_all.shape, y_all_binary.shape, y_all_scale.shape,
      sub_ids.shape, ft_times_all.shape)

In [None]:
# covariance between the features (columns of X_all)
cov_matrix = np.cov(X_all, rowvar=False)

# scale cov matrix: divide every entry by the product of the standard
# deviations of its two features (i.e. the correlation between them)
variances = np.diag(cov_matrix)
std_devs = np.sqrt(variances)
scaled_cov_matrix = cov_matrix / (
    std_devs[:, np.newaxis] * std_devs[np.newaxis, :]
)

# show only strong relations: absolute values below .7 are set to zero
mask = np.abs(scaled_cov_matrix) < .7
scaled_cov_matrix = np.where(mask, 0, scaled_cov_matrix)

# Plot the covariance matrix
plt.imshow(
    scaled_cov_matrix,
    cmap='RdYlBu',
    vmin=-1,
    vmax=1,
)
plt.colorbar()
plt.title('Covariance Matrix')
plt.show()



### 3) Explore prediction

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# performance
from sklearn.metrics import (
    classification_report,
    confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay,
    auc, roc_curve, RocCurveDisplay
)

In [None]:
# copies of features and binary labels, and the subject-code per window
X, y = X_all.copy(), y_all_binary.copy()
groups = sub_ids.ravel()

# cross-validation method (alternative: StratifiedKFold)
cv_method = LeaveOneGroupOut
n_folds = 5

# classifier and randomisation settings
clf_method, random_state, random_perm = 'logreg', 42, False

verbose = False


In [None]:
import lfpecog_predict.predict_helpers as predHelpers

In [None]:
# reload the helpers to use their latest version
importlib.reload(predHelpers)

# REAL PREDICT: leave-one-group-out, with the subject-codes as groups
y_true_all, y_pred_all, y_pred_conf_all = predHelpers.perform_prediction(
    X=X_all.copy(),
    y=y_all_binary.copy(),
    groups=sub_ids.ravel(),
    cv_method=LeaveOneGroupOut,
    clf_method='logreg',
    perform_random_perm=False,
    n_perms=0,
    verbose=False,
)

In [None]:
# reload the helpers to use their latest version
importlib.reload(predHelpers)

# PERMUTATIONS: same data and settings as the real prediction, but with
# random permutations switched on and ROC-values returned
perm_tpr, perm_fpr = predHelpers.perform_prediction(
    X=X_all.copy(),
    y=y_all_binary.copy(),
    groups=sub_ids.ravel(),
    cv_method=LeaveOneGroupOut,
    clf_method='logreg',
    perform_random_perm=True,
    n_perms=100,
    perm_return_ROC=True,
    verbose=False,
)

In [None]:


auc_perms = []

fig, ax = plt.subplots(1,1, figsize=(6, 6))
fs = 14
# plot the ROC of every permutation, and collect its AUC
for x_p, y_p in zip(perm_fpr, perm_tpr):
    ax.plot(x_p, y_p, alpha=.2, lw=.5, c='k',)
    auc_perms.append(auc(x_p, y_p))

# 99th percentile of the permutation-AUCs
alpha01 = np.percentile(auc_perms, 99)
# ROC and AUC of the real prediction
fpr, tpr, _ = roc_curve(y_true_all, y_pred_conf_all,)
auc_score = round(auc(fpr, tpr), 2)
ax.plot(fpr, tpr, c='darkgreen', lw=2,
        label=f'Real Predicted (AUC: {auc_score})',
)
# single point, only plotted to get one legend entry for all permutations;
# the number of permutations is taken from the data instead of hard-coded,
# so the legend always agrees with the permutations shown
ax.plot(0, 0, c='k',
        label=(f'Permutations\n(n={len(auc_perms)}, '
               f'alpha 0.01: {round(alpha01, 3)})'))
ax.plot([0, 1], [0, 1], lw=3,  c='orange', label='Chance level (50/50)')

ax.set_xlabel('False Positive Rate', fontsize=fs, weight='bold',)
ax.set_ylabel('True Positive Rate', fontsize=fs, weight='bold',)
ax.set_title('Dyskinesia Prediction - Receiver Operator Curve'
            '\nLeave-One-Subject-Out Cross-Validation',
            fontsize=fs)

ax.legend(frameon=False, fontsize=fs)
plt.tick_params(axis='both', labelsize=fs)
plt.tight_layout()
# fname = f'Group_LID_PRED_MDS23'
# plt.savefig(os.path.join(figpath, 'prediction', fname),
#             facecolor='w', dpi=300,)

plt.show()


In [None]:
import lfpecog_plotting.plot_pred_standards as plotPred

In [None]:
# re-import the plotting module, so changes made to its code are used
importlib.reload(plotPred)

In [None]:
# Leave-One_subject-Out

# show metrics summary
print(
    classification_report(y_true_all, y_pred_all)
)

# show confusion matrix (not shown or saved with these settings)
cm_figname = ''
cm = confusion_matrix(y_true_all, y_pred_all)
plotPred.plot_confMatrix(
    cm,
    fig_path=figpath,
    fig_name=cm_figname,
    to_show=False,
    to_save=False,
)

# get values for Receiver Operator Curve
fpr, tpr, _ = roc_curve(
    y_true_all,
    y_pred_conf_all,
)
# roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()


Show the course of the predictions over time for each individual subject

In [None]:
# reload the helpers to use their latest version
importlib.reload(predHelpers)

# REAL PREDICTIONS returned per Subject (same data and settings
# as the group prediction, results requested as dict per subject)
preds_subs = predHelpers.perform_prediction(
    X=X_all.copy(),
    y=y_all_binary.copy(),
    groups=sub_ids.ravel(),
    cv_method=LeaveOneGroupOut,
    clf_method='logreg',
    perform_random_perm=False,
    n_perms=0,
    verbose=False,
    return_dict_per_sub=True,
)

In [None]:
from lfpecog_plotting.plotHelpers import get_colors


In [None]:
PLOT_PROBA = False
clrs = list(get_colors().values())

fig, axes = plt.subplots(len(SUBS), 1, figsize=(8, 12))
# for a single subject plt.subplots() returns one bare Axes, wrap it so
# that indexing and looping over axes work for any number of subjects
axes = np.atleast_1d(axes)
fs = 14
for i_s, sub in enumerate(SUBS):
    # legend content is collected per subject; the legend drawn
    # after the loop uses the content of the last subject
    handles, labels = [], []

    plot_preds = preds_subs[sub]['pred']
    if PLOT_PROBA: plot_probas = preds_subs[sub]['proba'][:, 1]
    # select the windows of this subject
    sub_sel = sub_ids == sub
    plot_cdrs = y_all_scale[sub_sel]  # get CDRS as full scale
    plot_fttimes = ft_times_all[sub_sel]
    assert len(plot_preds) == len(plot_cdrs), (
        '# predictions and # scores not equal'
    )

    # height of the filled areas: highest CDRS, or 1 without any dyskinesia
    ymax = max(plot_cdrs)
    if ymax == 0: ymax = 1
    
    # fill moments where LID was predicted
    axes[i_s].fill_between(plot_fttimes,
                           y1=-0, y2=ymax,
                           where=plot_preds == 1, alpha=.4,
                           color=clrs[1],
                           label='LID predicted')
    # fill moments where NO LID was predicted
    axes[i_s].fill_between(plot_fttimes,
                           y1=-0, y2=ymax,
                           color=clrs[4],
                           where=plot_preds == 0, alpha=.4,
                           label='No LID predicted')
    
    # plot probabilities of prediction
    if PLOT_PROBA:
        ax2 = axes[i_s].twinx()  # create second y-axis for probabilities
        ax2.plot(plot_fttimes, plot_probas, lw=.8, color='purple',
                alpha=.8, label='Predicted probability')
        ax2.set_ylim(0, 1)
        ax2.set_ylabel('Predicted\nprobability', fontsize=fs, weight='bold',)
        ax2.tick_params(axis='both', labelsize=fs, size=fs,)
        for side in ['top',]:
            ax2.spines[side].set_visible(False)
        hnd, lab = ax2.get_legend_handles_labels()
        handles.extend(list(hnd))
        labels.extend(list(lab))

    # plot CDRS as full scale
    axes[i_s].plot(plot_fttimes, plot_cdrs, lw=3, color='green',
                      label='Real CDRS (unilat.)')

    axes[i_s].set_title(f'sub-{sub}', weight='bold', fontsize=fs)
    axes[i_s].set_xlabel('Time (minutes vs L-Dopa intake)',
                         fontsize=fs, )
    axes[i_s].set_ylabel('Dyskinesia\n(CDRS)',
                         fontsize=fs, weight='bold',)
    hnd, lab = axes[i_s].get_legend_handles_labels()
    handles.extend(list(hnd))
    labels.extend(list(lab))

# one legend for the whole figure, placed above the first axis
axes[0].legend(handles, labels, frameon=False,
            loc='lower center', bbox_to_anchor=(.5, 1.2),
            fancybox=False, shadow=False,
            borderaxespad=1, ncol=3,
            prop={
                # 'weight': 'bold',
                'size': fs
            }
)

# plt.suptitle('Individual binary Dyskinesia-Predictions vs CDRS',
#             #  weight='bold',
#              fontsize=fs+4)

for ax in axes:
    ax.tick_params(axis='both', labelsize=fs, size=fs,)
    for side in ['top','right']:
        ax.spines[side].set_visible(False)
plt.tight_layout()

fname = f'Indiv_binLID_predict_vs_CDRSscale_{cdrs_rater}'
# plt.savefig(os.path.join(figpath, 'prediction', fname),
#             facecolor='w', dpi=300,)
plt.show()