## ReTap - UPDRS-Tapping Assessment - Predictions

This notebooks investigates optimal hand- and fingertapping algorithms as part of the 
ReTune-Dyskinesia project.



### 0. Loading packages and functions, defining paths



In [1]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import pandas as pd
import numpy as np
import sklearn as sk
import scipy
import matplotlib.pyplot as plt
# import matplotlib.gridspec as gridspec

import seaborn as sns
from scipy import signal
from scipy import stats
from array import array
import datetime as dt
from dataclasses import  dataclass, field
from itertools import compress
from typing import Any

In [2]:
# check some package versions for documentation and reproducability
print('Python sys', sys.version)
print('pandas', pd.__version__)
print('numpy', np.__version__)
# print('mne_bids', mne_bids.__version__)
# print('mne', mne.__version__)
print('sci-py', scipy.__version__)
print('sci-kit learn', sk.__version__)


## developed with:
# Python sys 3.9.7 (default, Sep 16 2021, 08:50:36) 
# [Clang 10.0.0 ]
# pandas 1.3.4
# numpy 1.20.3
# mne_bids 0.9
# mne 0.24.1
# sci-py 1.7.1
# sci-kit learn 1.0.1

## Currently (own env) since 31.08.22
# Python sys 3.9.12 (main, Jun  1 2022, 06:36:29) 
# [Clang 12.0.0 ]
# pandas 1.4.3
# numpy 1.21.5
# sci-py 1.7.3
# sci-kit learn 1.1.1

Python sys 3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]
pandas 1.4.4
numpy 1.23.3
sci-py 1.9.1
sci-kit learn 1.1.2


In [3]:
# own functions
from retap_utils import utils_dataManagement
import retap_utils.get_datasplit as get_split

import tap_predict.tap_pred_prepare as pred_prep
import tap_plotting.retap_plot_clusters as plot_cluster

## 1) Split development and hold-out-test data sets

- Development data is used to train and test the model using iterative cross-validation
- Hold-out test data is NOT USED at all during cross-validation, and will be used to test the trained model as an external validation

### 1a. Import extracted Features

In [4]:
### IMPORT CREATED CLASSES FROM FILES
from tap_extract_fts.main_featExtractionClass import FeatureSet, singleTrace

# define path with feature class
deriv_path = os.path.join(utils_dataManagement.get_local_proj_dir(), 'data', 'derivatives')

ftClass = utils_dataManagement.load_class_pickle(os.path.join(deriv_path, 'ftClass_ALL_20221214.P'))
ftClass10 = utils_dataManagement.load_class_pickle(os.path.join(deriv_path, 'ftClass_ALL_max10_20221214.P'))

In [5]:
vars(ftClass10.BER019_M0S0_L_1).keys()

dict_keys(['sub', 'state', 'side', 'rep', 'center', 'filepath', 'tap_score', 'goal_Fs', 'to_extract_feats', 'max_n_taps_incl', 'acc_sig', 'fs', 'impact_idx', 'fts'])

## 2) ML-dataset Preparation

In [6]:
# show all present features
vars(ftClass.BER055_M0S0_L_1.fts).keys()

dict_keys(['triax_arr', 'fs', 'impacts', 'tap_lists', 'max_n_taps_incl', 'updrsSubScore', 'total_nTaps', 'freq', 'tap_durations', 'intraTapInt', 'tapRMS', 'tapRMSnrm', 'impactRMS', 'raise_velocity', 'jerkiness_taps', 'jerkiness_trace', 'mean_tapRMS', 'coefVar_tapRMS', 'IQR_tapRMS', 'decr_tapRMS', 'slope_tapRMS', 'mean_tapRMSnrm', 'coefVar_tapRMSnrm', 'IQR_tapRMSnrm', 'decr_tapRMSnrm', 'slope_tapRMSnrm', 'mean_impactRMS', 'coefVar_impactRMS', 'IQR_impactRMS', 'decr_impactRMS', 'slope_impactRMS', 'mean_raise_velocity', 'coefVar_raise_velocity', 'IQR_raise_velocity', 'decr_raise_velocity', 'slope_raise_velocity', 'mean_intraTapInt', 'coefVar_intraTapInt', 'IQR_intraTapInt', 'decr_intraTapInt', 'slope_intraTapInt', 'mean_jerkiness_taps', 'coefVar_jerkiness_taps', 'IQR_jerkiness_taps', 'decr_jerkiness_taps', 'slope_jerkiness_taps'])

#### 2a. Including ALL features

In [16]:
importlib.reload(pred_prep)

traces, feats = pred_prep.select_traces_and_feats(
    ftClass,
    center='all',
    use_sel_fts=True,
)
X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)

### 2b. Ensemble method, start with clustering on intraTapInterval and overall tapping-frequency

Create X1 with selected input features (mean and coef of variation of intra-tap-interval) and
overall tapping frequency to find two clusters (y_clusters) dividing fast vs slow tappers. 

In [245]:
importlib.reload(pred_prep)
importlib.reload(get_split)
importlib.reload(plot_cluster)

# set variables for pre-clustering
n_clusters = 2
traces_excl = [
    'DUS006_M0S0_L_1',
]
ft_sel = [
    'mean_intraTapInt',
    'coefVar_intraTapInt',
    'freq'
]

# include only dev dataset, exclude holdout data
datasplit_subs = get_split.find_dev_holdout_split(
    feats=ftClass10, )

# create matrix to cluster with
X_1, y, X1_ids = pred_prep.create_X_y_vectors(
    ftClass=ftClass10,
    incl_feats=ft_sel,
    incl_traces=ftClass10.incl_traces,
    excl_traces=traces_excl,
    excl_subs=datasplit_subs['hout'],  # due to hold out data set
    to_zscore=True,
    return_ids=True,
)

## MASKING BCS TOO LOW NUMBERS
# UPDRS 4 -> 3 merge (4: n=3)
mask = y == 4
y[mask] = 3

# # UPDRS 0 -> 1 merge (0: n=40)
# mask = y == 0
# y[mask] = 1

# create cluster labels
y_clust, centr_clust, _ = plot_cluster.get_kMeans_clusters(
    X=X_1,
    n_clusters=n_clusters,
    use_pca=True,
    z_score=True,
)

# Define which cluster contains faster tappers
cluster_mean_ITIs = []

ft = 'mean_intraTapInt'
print(f'Mean {ft} (z-scored):')
for i_cls in np.unique(y_clust):

    i_ft = np.where([f == ft for f in ft_sel])[0][0]
    mean_iti_cluster = np.mean(X_1[y_clust == i_cls, i_ft])
    cluster_mean_ITIs.append(mean_iti_cluster)

    print(f'\tcluster {i_cls}: {mean_iti_cluster}')

fast_cluster_i = np.argmin(cluster_mean_ITIs)
if fast_cluster_i == 0: slow_cluster_i = 1
if fast_cluster_i == 1: slow_cluster_i = 0

print(f'Fast tappers are clustered in cluster index {fast_cluster_i}')
print(f'Slow tappers are clustered in cluster index {slow_cluster_i}')

SPLITTING DATA IN DEV AND HOLD-OUT
Original score distribution: {0: 40, 1: 154, 2: 122, 3: 57, 4: 3}
Original score %: {0: 10.6, 1: 41.0, 2: 32.4, 3: 15.2, 4: 0.8}
Accepted Split: random state 63

Resulting distributions in splitted data sets:

	dev data set (n = 285):
score 0: # 32 (11 %)
score 1: # 115 (40 %)
score 2: # 94 (33 %)
score 3: # 42 (15 %)
score 4: # 2 (1 %)
	hout data set (n = 91):
score 0: # 8 (9 %)
score 1: # 39 (43 %)
score 2: # 28 (31 %)
score 3: # 15 (16 %)
score 4: # 1 (1 %)
# of NaNs per feat: [0 0 0]




Mean mean_intraTapInt (z-scored):
	cluster 0: -0.357289281961114
	cluster 1: 1.4546777908416781
Fast tappers are clustered in cluster index 0
Slow tappers are clustered in cluster index 1


### Make X_2 input Matrix with more features for score-prediction per Cluster

In [241]:
importlib.reload(pred_prep)

incl_traces, ft_list = pred_prep.select_traces_and_feats(
    ftClass10, use_sel_fts=True, excl_traces=traces_excl,
)

feats_for_2nd_pred = [
    # 'freq',
    'coefVar_intraTapInt',
    # 'mean_intraTapInt',
    # 'slope_intraTapInt',
    'decr_intraTapInt',
    'mean_tapRMS',
    'coefVar_tapRMS',
    # 'slope_tapRMS',
    'decr_tapRMS',
    'mean_raise_velocity',
    'jerkiness_trace'
]

X_2, y, X2_ids = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=ftClass10.incl_traces,
    incl_feats=feats_for_2nd_pred,
    excl_traces=traces_excl,
    excl_subs=datasplit_subs['hout'],  # due to hold out data set
    to_norm=False,
    to_zscore=True,
    return_ids=True,
)
# Mask UPDRS 4 -> 3 merge (4: n=3)
mask = y == 4
y[mask] = 3

# of NaNs per feat: [0 0 0 0 0 0 0]


In [243]:
print(
    X_1.shape, X_2.shape, y.shape, y_clust.shape,
    len(feats_for_2nd_pred), X2_ids.shape
)

(284, 3) (284, 7) (284,) (284,) 7 (284,)


### Split input matrix X_2 in two generated clusters:
- split X and y in two groups based on clusters
- test default ML modeling on both groups

In [256]:
X_2fast = X_2[y_clust == fast_cluster_i]
y_2fast = y[y_clust == fast_cluster_i]
fast_cls_ids = X2_ids[y_clust == fast_cluster_i]

X_2slow = X_2[y_clust == slow_cluster_i]
y_2slow = y[y_clust == slow_cluster_i]
slow_cls_ids = X2_ids[y_clust == slow_cluster_i]

print(f'Fast X shape: {X_2fast.shape}, Slow X shape: {X_2slow.shape}')



Fast X shape: (228, 7), Slow X shape: (56, 7)


### Visualise features in specific clusters

In [238]:
# create lists for boxplots of features per subscore, per cluster

clst_X = X_2fast
clst_y = y_2fast

box_lists = {}
for f in range(clst_X.shape[1]):
    box_lists[f] = {}
    for i in range(4): box_lists[f][i] = []


for i in np.arange(clst_X.shape[0]):

    score = clst_y[i]

    for f in range(clst_X.shape[1]):

        box_lists[f][int(score)].append(clst_X[i, f])

# plot features within cluster, and decide on strategy
# pm: use pre-knowledge about clusters
# likelihood in faster cluster for 1-2 scores
# use probabilities and adapt the threshold for acceptance
# start finding border scores (e.g. 1 or 3)

for i_f, ft in enumerate(feats_for_2nd_pred):

    plot_lists = [box_lists[i_f][i] for i in range(4)]

    plt.boxplot(plot_lists)
    plt.title(ft)
    plt.xticks(range(1, len(plot_lists) + 1), labels=['0', '1', '2', '3+4'])
    plt.xlabel('UPDRS tap-score')
    plt.ylabel('Z-score (a.u.)')
    plt.close()

### Test different prediction models for second step in Fast Cluster

optimal thresholds (to prevent too large False Positive Values)
#### predicting the best tappers (0-1)
- .58 - .6 for LogReg
- .6 for svm linear kernel
- .6 for svm poly kernel

In [266]:
from retap_utils.plot_helpers import remove_duplicate_legend
from tap_predict import retap_cv_models as cv_models
from tap_plotting import plot_cv_folds as plot_folds

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.metrics import (
    confusion_matrix, roc_auc_score, roc_curve,
    accuracy_score, f1_score, precision_score,
    recall_score, plot_roc_curve, plot_confusion_matrix
)


from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay


In [None]:
# tn, fp, fn, tp = confusion_matrix(y_true,y_preds).ravel()
# # add outcomes to dedicated lists
# ['Accuracy'].append(accuracy_score(y_true,y_preds))
# ['AUROC'].append(roc_auc_score(y_true, y_probas[1]))
# ['F1_score'].append(f1_score(y_true,y_preds))
# ['Precision'].append(precision_score(y_true, y_preds)) # precision/PPV
# ['Recall'].append(recall_score(y_true, y_preds)) # sensitivity/recall/TPR
# ['FPR'].append(fp / (fp+tn)) # false-positive, false-alarm rate

In [290]:
importlib.reload(cv_models)
importlib.reload(plot_folds)
# CLassification Settings

score_to_predict = 1
clf_choice = 'logreg'
nFolds = 4
to_plot = False
plot_thresholds = [.55, .58, .65]
    
X_cv = X_2fast.copy()
y_cv = y_2fast.copy() <= score_to_predict
ids_cv = fast_cls_ids.copy()

roc_title = f'Identify UPDRS 0-1 ({clf_choice})'

chance = sum(y_cv) / len(y_cv)

print(f'shape of X_dev: {X_cv.shape}')
print(f'predicting UPDRS score {score_to_predict}')
print(f'total y true: {sum(y_cv)}')
print(f'Chance level: {chance}')
print()


y_pred_dict, y_proba_dict, y_true_dict, og_pred_idx = cv_models.get_cvFold_predictions_dicts(
    X_cv=X_cv, y_cv=y_cv,
    cv_method=StratifiedKFold,
    n_folds=nFolds,
    clf=clf_choice,
    # sv_kernel='linear',
)
if to_plot: 

    plot_folds.plot_ROC_AUC_confMatrices_for_folds(
        y_true_dict=y_true_dict,
        y_proba_dict=y_proba_dict,
        plot_thresholds=plot_thresholds,
        roc_title=roc_title,
        incl_mean_ROC=True,
    )



shape of X_dev: (228, 7)
predicting UPDRS score 1
total y true: 127
Chance level: 0.5570175438596491

LogisticRegression(random_state=27)
# of samples: train 171, test 57
# true labels 31: chance is 0.543859649122807
# of samples: train 171, test 57
# true labels 32: chance is 0.5614035087719298
# of samples: train 171, test 57
# true labels 32: chance is 0.5614035087719298
# of samples: train 171, test 57
# true labels 32: chance is 0.5614035087719298


### Extract best tappers (0-1 predicted) from fast-tappers and classify remaining part

In [283]:
proba_accept_thr = .58



In [292]:
clf_decision = np.zeros((y_cv.shape))

# loop over single probabilities in all folds
for fold_n in y_proba_dict:
    for i_proba, proba in enumerate(y_proba_dict[fold_n]):
        # set correct index to True (1) of proba > acceptance threshold
        if proba[1] > proba_accept_thr:
            # find corresponding index in full cv data
            og_idx = og_pred_idx[fold_n][i_proba]
            clf_decision[og_idx] = 1



In [299]:
ids_cv[clf_decision.astype(bool)]

array(['BER019_M0S0_L_2', 'BER019_M0S0_L_1', 'BER019_M0S0_R_1',
       'BER019_M0S0_R_2', 'BER019_M0S0_R_3', 'BER019_M0S1_L_1',
       'BER019_M0S1_R_2', 'BER019_M0S1_R_3', 'BER019_M0S1_R_1',
       'BER026_M0S0_L_2', 'BER026_M0S0_R_3', 'BER026_M0S0_R_1',
       'BER026_M0S0_R_2', 'BER026_M0S1_L_3', 'BER026_M0S1_L_2',
       'BER026_M0S1_L_1', 'BER026_M1S0_L_1', 'BER026_M1S0_L_3',
       'BER026_M1S0_L_2', 'BER026_M1S0_R_3', 'BER026_M1S0_R_2',
       'BER026_M1S0_R_1', 'BER026_M1S1_L_2', 'BER026_M1S1_L_3',
       'BER026_M1S1_L_1', 'BER026_M1S1_R_3', 'BER026_M1S1_R_1',
       'BER026_M1S1_R_2', 'BER033_M0S1_L_1', 'BER033_M0S1_L_2',
       'BER033_M0S1_R_3', 'BER033_M0S1_R_1', 'BER028_M0S1_R_2',
       'BER028_M1S0_L_1', 'BER028_M1S0_R_2', 'BER028_M1S0_R_3',
       'BER028_M1S0_R_1', 'BER028_M1S1_R_3', 'BER028_M1S1_R_2',
       'BER028_M1S1_R_1', 'BER024_M0S0_L_1', 'BER024_M0S0_L_2',
       'BER024_M0S0_R_2', 'BER024_M0S0_R_3', 'BER024_M0S0_R_1',
       'BER024_M0S1_L_1', 'BER024_M0S1_L

## 3) Clustering & Classifying

- Candidate vetors based on descriptives and concept
    - nTaps
    - freq
    - upVelo sum [std-dev + coefVar]
    - impact RMS [coefVar + stddev]
    - tapRMS and impactRMS [sum]
    - 
- include per run (array tap-features): sum, mean, stddev, trend_slope

- Cluster on UPDRS 4?

### 3a) Clustering

In [10]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# try: K-shape (sklearn), Laio 2014

In [71]:
importlib.reload(plot_cluster)

n_clusters=4
center_incl = 'all'
sel_feats = True


### VISUALISE AGAINST SUBS !! AND CONDITIONS


traces, feats = pred_prep.select_traces_and_feats(
    ftClass,
    center=center_incl,
    use_sel_fts=sel_feats,
)
X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)

figname = (
    f'retap_{n_clusters}clusters_'
    f'{center_incl}'
)
if sel_feats: figname += '_selFeats'
else: figname += '_allFeats'

plot_cluster.plot_cluster_kMeans(
    X=X, y=y,
    n_clusters=n_clusters,
    use_pca=True,
    random_state=27,
    figsave_name=figname,
    figsave_dir=os.path.join(
        utils_dataManagement.find_onedrive_path('figures'),
        'clustering',
    ),
    show=False,
)



#### MANOVA

- normality assumption violated (Shapiro test highly significant)
- for every a priori selected feature: present difference between sub-score-groups is a Kruskal-Wallis test (non-parametric One-Way ANOVA alternative)
- differences between two sub groups within a feature is a non-parametric test of two groups of quantitative values (likely varying lengths): Mann-Whitney-U
- in total: correct alpha for number of repeated measures on specific level

In [None]:
from scipy.stats import shapiro
for col in np.arange(X.shape[1]):
    print(feats[col], shapiro(X[:, col]))

In [90]:
from statsmodels.multivariate.manova import MANOVA

stat_data = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
manova_df = pd.DataFrame(
    data=stat_data,
    columns=feats + ['subscore'],
)
maov = MANOVA.from_formula(
    'nTaps + freq + mean_intraTapInt + coefVar_intraTapInt + IQR_jerkiness +'
    ' mean_raise_velocity + mean_tapRMSnrm ~ subscore ',
    # 'mean_jerkiness_smooth + IQR_jerkiness_smooth ~ subscore',
    data=manova_df,
)
print(maov.mv_test())

                  Multivariate linear model
                                                              
--------------------------------------------------------------
       Intercept        Value  Num DF  Den DF  F Value  Pr > F
--------------------------------------------------------------
          Wilks' lambda 0.1239 7.0000 365.0000 368.6918 0.0000
         Pillai's trace 0.8761 7.0000 365.0000 368.6918 0.0000
 Hotelling-Lawley trace 7.0708 7.0000 365.0000 368.6918 0.0000
    Roy's greatest root 7.0708 7.0000 365.0000 368.6918 0.0000
--------------------------------------------------------------
                                                              
--------------------------------------------------------------
         subscore        Value  Num DF  Den DF  F Value Pr > F
--------------------------------------------------------------
           Wilks' lambda 0.7803 7.0000 365.0000 14.6821 0.0000
          Pillai's trace 0.2197 7.0000 365.0000 14.6821 0.0000
  Hotelling

In [126]:
from scipy.stats import kruskal
importlib.reload(pred_prep)

mask_scores = True

traces, feats = pred_prep.select_traces_and_feats(
    ftClass,
    center=center_incl,
    use_sel_fts=sel_feats,
)
X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)
n_groups = 5
if mask_scores:
    # UPDRS 4 -> 3 merge
    mask = y == 4
    y[mask] = 3
    # UPDRS 0 -> 1 merge
    mask = y == 0
    y[mask] = 1

    n_groups = 3

stat_data = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
stat_df = pd.DataFrame(
    data=stat_data,
    columns=feats + ['subscore'],
)

stat_fts = [
    'freq', 'coefVar_intraTapInt', 'mean_jerkiness', 'coefVar_jerkiness',
    'mean_tapRMSnrm', 'coefVar_tapRMSnrm', 'slope_tapRMSnrm'
]
alpha = .05 / len(stat_fts)
for ft in stat_fts:
    tempft = stat_df[~np.isnan(stat_df[ft])]

    
    if mask_scores:
        groups = [
            tempft[ft][tempft['subscore'] == s].reset_index(drop=True)
            for s in np.arange(1, n_groups + 1)
        ]
        krusk_stat, krusk_p = kruskal(
            groups[0], groups[1], groups[2], 
        )
    else:
        groups = [
            tempft[ft][tempft['subscore'] == s].reset_index(drop=True)
            for s in np.arange(n_groups)
        ]
        krusk_stat, krusk_p = kruskal(
            groups[0], groups[1], groups[2], 
            groups[3], groups[4]
        )
    print(f'\n{ft}: \n\tGroup level sign. difference (Kruskal'
        f' Test): {krusk_p < alpha} (p = {np.round(krusk_p, 6)})\n')
    for g in np.arange(n_groups - 1):

        mnwu_rho, mnwu_p = mannwhitneyu(groups[g], groups[g + 1])
        print(f'\tupdrs {g} vs {g + 1} sign, (Mann-Whitney-U): '
            f'{mnwu_p < (alpha / (n_groups - 1))} (p = {np.round(mnwu_p, 6)})')



freq: 
	Group level sign. difference (Kruskal Test): True (p = 3.8e-05)

	updrs 0 vs 1 sign, (Spearman): False (p = 0.485159)
	updrs 1 vs 2 sign, (Spearman): True (p = 0.00025)

coefVar_intraTapInt: 
	Group level sign. difference (Kruskal Test): True (p = 0.0)

	updrs 0 vs 1 sign, (Spearman): False (p = 0.084536)
	updrs 1 vs 2 sign, (Spearman): True (p = 4.8e-05)

mean_jerkiness: 
	Group level sign. difference (Kruskal Test): False (p = 0.331604)

	updrs 0 vs 1 sign, (Spearman): False (p = 0.375883)
	updrs 1 vs 2 sign, (Spearman): False (p = 0.140139)

coefVar_jerkiness: 
	Group level sign. difference (Kruskal Test): False (p = 0.719329)

	updrs 0 vs 1 sign, (Spearman): False (p = 0.507749)
	updrs 1 vs 2 sign, (Spearman): False (p = 0.490786)

mean_tapRMSnrm: 
	Group level sign. difference (Kruskal Test): True (p = 3e-06)

	updrs 0 vs 1 sign, (Spearman): False (p = 0.017736)
	updrs 1 vs 2 sign, (Spearman): True (p = 0.00162)

coefVar_tapRMSnrm: 
	Group level sign. difference (Kruskal 

In [119]:
from scipy.stats import mannwhitneyu

In [None]:
X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)

# UPDRS 4 -> 3 merge
mask = y == 4
y[mask] = 3
# UPDRS 0 -> 1 merge
mask = y == 0
y[mask] = 1
### 3c. Classification

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix


#### Classifying UPDRS 0 - 1 - 2 - 3 - 4

In [37]:
traces, feats = pred_prep.select_traces_and_feats(
    ftClass,
    center='all',
    use_sel_fts=True,
)
X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)
print(X.shape)

# use random outcome labels (equal distribution over scores)
random_y = np.random.randint(0, 5, size=X.shape[0])
y = random_y
# use shuffled outcome labels (same distribution)
np.random.seed(27)
# np.random.shuffle(y)

# UPDRS 4 -> 3 merge
# mask = y == 4
# y[mask] = 3
# # UPDRS 0 -> 1 merge
# mask = y == 0
# y[mask] = 1

lda = LDA()
lda.fit(X, y)

#Define method to evaluate model
cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=10, random_state=1)

#evaluate model
scores = cross_val_score(lda, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores)) 
print(scores)

(373, 12)
0.19776827012025905
[0.17021277 0.14893617 0.21276596 0.21276596 0.19148936 0.15217391
 0.13043478 0.2173913  0.14893617 0.08510638 0.17021277 0.23404255
 0.17021277 0.30434783 0.23913043 0.2826087  0.14893617 0.25531915
 0.12765957 0.17021277 0.19148936 0.17391304 0.2173913  0.10869565
 0.27659574 0.12765957 0.21276596 0.17021277 0.29787234 0.26086957
 0.06521739 0.19565217 0.25531915 0.19148936 0.29787234 0.23404255
 0.19148936 0.17391304 0.17391304 0.13043478 0.17021277 0.23404255
 0.23404255 0.29787234 0.19148936 0.13043478 0.19565217 0.26086957
 0.14893617 0.21276596 0.31914894 0.25531915 0.10638298 0.15217391
 0.17391304 0.23913043 0.21276596 0.21276596 0.14893617 0.25531915
 0.27659574 0.15217391 0.17391304 0.13043478 0.21276596 0.19148936
 0.29787234 0.19148936 0.19148936 0.08695652 0.23913043 0.26086957
 0.19148936 0.23404255 0.27659574 0.14893617 0.10638298 0.19565217
 0.2173913  0.17391304]


In [18]:
# CLassification Settings

X, y = pred_prep.create_X_y_vectors(
    ftClass,
    incl_traces=traces,
    incl_feats=feats,
    to_norm=False,
)
# print(f'INCLUDED FEATURE SPACE: {X.shape}')

nFolds = 10

# UPDRS 4 -> 3 merge
mask = y == 4
y[mask] = 3
# UPDRS 0 -> 1 merge
mask = y == 0
y[mask] = 1


# Shuffle order
# allData = np.hstack((X, y))
# X_shf = allData[:, :14]
# y_shf = allData[:, 14]

# np.random.shuffle(allData)

skf = StratifiedKFold(n_splits=nFolds,)
skf.get_n_splits(X, y)
# clf = LinearSVC(
#         penalty='l2',
#     C=1.0,
#     multi_class='ovr',
#     max_iter=10000,
# )
clf = LogisticRegression(
    random_state=0, 
    solver='liblinear',
    multi_class='ovr',
)

shuffled_accs = []
np.random.seed(27)
# for r_state in np.random.randint(100, size=1000):

    # np.random.seed(r_state)
    # # y_shuffled = y.copy()
    # # np.random.shuffle(y_shuffled)
    # np.random.shuffle(y)

y_pred, y_true = {}, {}
all_accs = []
for F, (train_index, test_index) in enumerate(
    skf.split(X, y)
):
    # print(f'\nLinear Support Vector, fold #{F}')
    # print(f'\nLogistic Regression, fold #{F}')
    # print(
    #     f'\tn train: {train_index.shape}, '
    #     f'n test: {test_index.shape}')

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print('\tcount for y-labels in test set\n',
    #     np.array(np.unique(y_test, return_counts=True)).T)
    
    clf = clf
    clf.fit(X=X_train, y=y_train)
    acc = clf.score(X=X_test, y=y_test)
    # print(
    #     f'accuracy for Fold {F}: {acc}')
    all_accs.append(acc)
    
    # save predictions for posthoc analysis and conf matrix
    y_pred[F] = clf.predict(X=X_test)
    y_true[F] = y_test
    # print(multilabel_confusion_matrix(y_true[F], y_pred[F]))

        
print(f'Overall mean Accuracy: {np.mean(all_accs)}\n')
    # shuffled_accs.append(np.mean(all_accs))

# print('grand mean', np.mean(shuffled_accs))


Overall mean Accuracy: 0.534068278805121



In [132]:
print(f'Overall mean Accuracy: {np.mean(all_accs)}')


Overall mean Accuracy: 0.4047700754975978


#### Boolean Classifying (UPDRS 0 or 4 vs The Rest)

In [83]:
# CLassification Settings
nFolds = 4
score_to_predict = 0

# Shuffle order
X = Xdf.values
y_bool = y == score_to_predict

allData = np.hstack((X, y_bool))

np.random.seed(27)
np.random.shuffle(allData)

X_shf = allData[:, :14]
y_shf = allData[:, 14]

skf = StratifiedKFold(n_splits=nFolds,)
skf.get_n_splits(X_shf, y_shf)

y_pred, y_true = {}, {}
print(
    'Classification of UPDRS subscore '
    f'{score_to_predict} versus thre rest')
for F, (train_index, test_index) in enumerate(
    skf.split(X, y)
):
    print(f'\nLinear Support Vector, fold #{F}')

    X_train, X_test = X_shf[train_index], X_shf[test_index]
    y_train, y_test = y_shf[train_index], y_shf[test_index]

    clf = LinearSVC(penalty='l2', C=1.0,)
    clf.fit(X=X_train, y=y_train)
    print(f'Accuracy: {clf.score(X=X_test, y=y_test)}')
    # for own scoring
    y_pred[F] = clf.predict(X=X_test)
    y_true[F] = y_test
    print(classification_report(y_true[F], y_pred[F]))
    



Classification of UPDRS subscore 0 versus thre rest

Linear Support Vector, fold #0


ValueError: Unknown label type: 'continuous'