## ReTap - UPDRS-Tapping Assessment - Predictions

This notebooks investigates optimal hand- and fingertapping algorithms as part of the 
ReTune-Dyskinesia project.



### 0. Loading packages and functions, defining paths



In [1]:
# Importing Python and external packages
import os
import sys
import importlib
import json
import pandas as pd
import numpy as np
import sklearn as sk
import scipy
import matplotlib.pyplot as plt
# import matplotlib.gridspec as gridspec

import seaborn as sns
from scipy import signal
from scipy import stats
from array import array
import datetime as dt
import h5py
from dataclasses import  dataclass, field
from itertools import compress
from typing import Any

In [2]:
# check some package versions for documentation and reproducability
print('Python sys', sys.version)
print('pandas', pd.__version__)
print('numpy', np.__version__)
# print('mne_bids', mne_bids.__version__)
# print('mne', mne.__version__)
print('sci-py', scipy.__version__)
print('sci-kit learn', sk.__version__)


## developed with:
# Python sys 3.9.7 (default, Sep 16 2021, 08:50:36) 
# [Clang 10.0.0 ]
# pandas 1.3.4
# numpy 1.20.3
# mne_bids 0.9
# mne 0.24.1
# sci-py 1.7.1
# sci-kit learn 1.0.1

## Currently (own env) since 31.08.22
# Python sys 3.9.12 (main, Jun  1 2022, 06:36:29) 
# [Clang 12.0.0 ]
# pandas 1.4.3
# numpy 1.21.5
# sci-py 1.7.3
# sci-kit learn 1.1.1

Python sys 3.9.12 (main, Jun  1 2022, 06:36:29) 
[Clang 12.0.0 ]
pandas 1.4.3
numpy 1.21.5
sci-py 1.7.3
sci-kit learn 1.1.1


In [23]:
import run_finding_10sec_blocks as run_find_blocks

In [99]:
# own data preprocessing functions
import tap_load_data.updrsTapping_import as tap_import
import tap_load_data.tapping_preprocess as tap_preproc
import tap_load_data.tapping_find_blocks as find_blocks
import tap_load_data.tapping_impact_finder as tap_impact
import tap_load_data.tapping_time_detect as tap_times

import tapping_run as tap_run

# ft extraction
import tap_extract_fts.tapping_featureset as tap_fts_set
import tap_extract_fts.tapping_extract_features as tap_ft_extr
import tap_extract_fts.tapping_feat_calc as ft_calc

# own data exploration functions
import tap_extract_fts.tapping_feat_boxplots as fts_boxplot

# own helper functions
from utils import utils_dataManagement, tmsi_poly5reader, utils_preprocessing

#### EXTRACT FEATURES

In [16]:
# check which sessions present

block_acc.keys()

dict_keys(['007_L_Off', '007_L_On', '007_R_Off', '007_R_On', '014_L_Off', '014_L_On', '014_R_Off', '014_R_On', '015_L_Off', '015_L_On', '015_R_Off', '015_R_On', '013_L_Off', '013_L_On', '013_R_Off', '013_R_On'])

In [202]:
importlib.reload(tap_ft_extr)
importlib.reload(tap_ft_extr)
importlib.reload(tap_fts_set)


# Read in priorly Block-Selection
block_indx_json = os.path.join(
    data_dir,
    'prep_jsons',
    'manual_block_selection_stimAmpRange.json'
)
 
with open(block_indx_json) as f:
    block_indx = json.load(f, )


FEATS = {}
fs=250
for run in block_acc.keys():
    print(f'start {run}')

    # to select subjects/runs to extract
    splits = run.split('_')
    # if splits[0] != '007': continue
    # if splits[2] != 'Off': continue

    for S, stim in enumerate(subscores[run].keys()):

        for b, block in enumerate([1, 2, 3]):

            tapscore = subscores[run][stim][b]
            block_n = block_indx[run][stim][b]
            tap_acc = block_acc[run][block_n]

            tap_i, imp_i, acc_temp = tap_run.run_updrs_tapping(
                acc_arr=tap_acc, fs=fs, already_preprocd=True,
            )
            
            if len(tap_i) < 1:
                # print(f'No taps detected for {run}_Stim{S}_block {block}')  # for OnOff
                print(f'No taps detected for {run} {stim}_block {block}')  # for Range
                # continue

            FEATS[f'{run}_{stim}_b{block}'] = tap_ft_extr.tapFeatures(
                triax_arr=acc_temp,
                fs=fs,
                impacts=imp_i,
                tapDict=tap_i,  # result of continTap
                updrsSubScore=tapscore,
            )         





start 007_L_Off
No taps detected for 007_L_Off 2mA_block 1
start 007_L_On
start 007_R_Off
No taps detected for 007_R_Off 05mA_block 1
No taps detected for 007_R_Off 05mA_block 3
No taps detected for 007_R_Off 1mA_block 1
No taps detected for 007_R_Off 1mA_block 2
No taps detected for 007_R_Off 1mA_block 3
No taps detected for 007_R_Off 15mA_block 1
No taps detected for 007_R_Off 15mA_block 3
No taps detected for 007_R_Off 2mA_block 1
No taps detected for 007_R_Off 2mA_block 2
No taps detected for 007_R_Off 2mA_block 3
start 007_R_On
start 014_L_Off
start 014_L_On
start 014_R_Off
start 014_R_On
start 015_L_Off
start 015_L_On
start 015_R_Off
start 015_R_On
start 013_L_Off
start 013_L_On
start 013_R_Off
start 013_R_On


# 3. Clustering & Classifying

- Candidate vetors based on descriptives and concept
    - nTaps
    - freq
    - upVelo sum [std-dev + coefVar]
    - impact RMS [coefVar + stddev]
    - tapRMS and impactRMS [sum]
    - 
- include per run (array tap-features): sum, mean, stddev, trend_slope

- Cluster on UPDRS 4?

### 3a. ML-Vector Preparation

In [201]:
importlib.reload(tap_fts_set)

FEATS['007_L_Off_1mA_b1'].impactRMS_svm

tap_fts_set.amplitudeDecrement(
    [
        FEATS['007_L_Off_1mA_b1'].tapRMS_svm,
        FEATS['007_L_Off_1mA_b1'].impactRMS_svm,
        FEATS['007_L_Off_1mA_b1'].upVelo_svm
    ]
)

10

In [187]:
valid_samples

['007_L_Off_0mA_b1',
 '007_L_Off_0mA_b2',
 '007_L_Off_0mA_b3',
 '007_L_Off_05mA_b1',
 '007_L_Off_05mA_b2',
 '007_L_Off_05mA_b3',
 '007_L_Off_1mA_b1',
 '007_L_Off_1mA_b2',
 '007_L_Off_1mA_b3',
 '007_L_Off_15mA_b1',
 '007_L_Off_15mA_b2',
 '007_L_Off_15mA_b3',
 '007_L_Off_2mA_b2',
 '007_L_Off_2mA_b3',
 '007_L_Off_25mA_b1',
 '007_L_Off_25mA_b2',
 '007_L_Off_25mA_b3',
 '007_L_On_0mA_b1',
 '007_L_On_0mA_b2',
 '007_L_On_0mA_b3',
 '007_L_On_05mA_b1',
 '007_L_On_05mA_b2',
 '007_L_On_05mA_b3',
 '007_L_On_1mA_b1',
 '007_L_On_1mA_b2',
 '007_L_On_1mA_b3',
 '007_L_On_15mA_b1',
 '007_L_On_15mA_b2',
 '007_L_On_15mA_b3',
 '007_L_On_2mA_b1',
 '007_L_On_2mA_b2',
 '007_L_On_2mA_b3',
 '007_L_On_25mA_b1',
 '007_L_On_25mA_b2',
 '007_L_On_25mA_b3',
 '007_L_On_3mA_b1',
 '007_L_On_3mA_b2',
 '007_L_On_3mA_b3',
 '007_R_Off_0mA_b1',
 '007_R_Off_0mA_b2',
 '007_R_Off_0mA_b3',
 '007_R_Off_05mA_b2',
 '007_R_Off_15mA_b2',
 '007_R_On_0mA_b1',
 '007_R_On_0mA_b2',
 '007_R_On_0mA_b3',
 '007_R_On_05mA_b1',
 '007_R_On_05mA_b

In [186]:
importlib.reload(ft_calc)

ft_axis = 'svm'
single_ft_names = [
    'nTaps',
    'freq',
    'ampDecrement'
]
arr_ft_names = [
    f'tapRMS_{ft_axis}',
    f'upVelo_{ft_axis}',
    f'impactRMS_{ft_axis}',
    'dirChange_taps',
]
ft_aggr_to_add = [
    'mean', 'coefVar', 'variance', 'sum', 'trend_slope'
]


valid_sel = [
    len(FEATS[s].tapDict) > 0 for s in FEATS.keys()
]
valid_samples = list(compress(FEATS.keys(), valid_sel))
n_samples = len(valid_samples)

Xdf = pd.DataFrame(
    data=np.zeros((
        n_samples,
        len(single_ft_names) + (
            len(arr_ft_names) * len(ft_aggr_to_add)
        )
    )), columns=[single_ft_names + [
        f'{f}_{m}' for f in arr_ft_names for m in ft_aggr_to_add
    ]]
)
y = ft_calc.nan_array([Xdf.shape[0], 1])
for s in range(Xdf.shape[0]):
    y[s] = getattr(
        FEATS[valid_samples[s]],
        'updrsSubScore'
    )

for ft in single_ft_names:
    for s, sam in enumerate(valid_samples):
        try:
            ft_value = getattr(FEATS[sam], ft)
        except AttributeError:
            print(sam)

        Xdf.iloc[s][ft] = ft_value


max_nTaps = np.max([FEATS[k].nTaps for k in valid_samples])

arr_feats = ft_calc.nan_array([
    len(arr_ft_names),
    max_nTaps,
    n_samples
])

for f, ft in enumerate(arr_ft_names):

    for s, sam in enumerate(valid_samples):

        ft_values = getattr(FEATS[sam], ft)
        arr_feats[f, :len(ft_values), s] = ft_values

# Normalise vector per array-feature over all samples
for ft_row in range(len(arr_ft_names)):

    vec_max = np.nanmax(arr_feats[ft_row, :, :])
    arr_feats[ft_row, :, :] = arr_feats[ft_row, :, :] / vec_max

# Normalise single feature over all samples
for ft in single_ft_names:

    fmax = np.max(Xdf[ft])
    Xdf[ft] = Xdf[ft] / fmax

# Aggregate array features
for ft, ft_name in enumerate(arr_ft_names):

    for m in ft_aggr_to_add:

        for s in np.arange(Xdf.shape[0]):
            
            value = ft_calc.aggregate_arr_fts(
                method=m, arr=arr_feats[ft, :, s]
            )
            
            Xdf.iloc[s][f'{ft_name}_{m}'] = value

### assert np.sum(np.isnan(Xdf.values)) == 0
X = Xdf.values

assert np.isnan(X).any() == False, print(
    'X array contains missing values:\n',
    np.isnan(Xdf).any()
)

  return reduction(axis=axis, out=out, **passkwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xdf.iloc[s][f'{ft_name}_{m}'] = value


artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added


  slope = ssxym / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0 added
artificial 0

AssertionError: None

### 3b. Clustering

In [175]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# DBSCAN
# Laio

In [176]:
### Cluster Settings
## test with full features and PCA1-2;
## test first PCA cluster and full-fts classification within clusters
n_clusters = 4


X = Xdf.values
pca = PCA(2)
# X = pca.fit_transform(X)

kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=27
)
y_clust_labels = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_

y_cluster0 = y[y_clust_labels == 0]
y_cluster1 = y[y_clust_labels == 1]

score_cols = {
    0: 'green',
    1: 'blue',
    2: 'orange',
    3: 'red',
    4: 'purple'
}

clMarkers = ['o', 'x', '*', '+', '^']

fig, ax = plt.subplots(1, 1, figsize=(12, 12))
s = 75
for n, X_pca_row in enumerate(X):

    score = int(y[n])
    col = score_cols[score]

    for clN in range(n_clusters):

        if y_clust_labels[n] == clN:
            
            ax.scatter(
                X[n, 0], X[n, 1],
                label=f'Cluster-{clN}, Tap-Score {score}',
                s=s, color=col, alpha=.8,
                marker=clMarkers[clN],
            )

    # if y_clust_labels[n] == 0:
    #     ax.scatter(
    #         X[n, 0], X[n, 1],
    #         label=f'Cluster-0, Tap-Score {score}',
    #         # edgecolor=col, facecolor='w',
    #         s=s, color=col, marker='*',
    #     )
    
    # elif y_clust_labels[n] == 1:
    #     ax.scatter(
    #         X[n, 0], X[n, 1], marker='+',
    #         label=f'Cluster-1, Tap-Score {score}',
    #         color=col, s=s, alpha=.7,
    #     )

    # elif y_clust_labels[n] == 2:
    #     ax.scatter(
    #         X[n, 0], X[n, 1],
    #         label=f'Cluster-2, Tap-Score {score}',
    #         color=col, s=s, alpha=.7,
    #     )

for c in range(centroids.shape[0]):
    ax.scatter(
        centroids[c, 0], centroids[c, 1],
        edgecolor='k', s=s + 50, fc='w',
        label='Cluster centers'
    )

handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(
    by_label.values(), by_label.keys(),
    frameon=False, fontsize=16,
    ncol=1,
    loc='upper left',
    bbox_to_anchor=[1.01, .95]
)

ax.set_xlabel('PCA-1', fontsize=18)
ax.set_ylabel('PCA-2', fontsize=18)
ax.set_title(
    'kMeans Clustering 10-seconds of Finger Tapping',
    fontsize=20
)

# plt.savefig(
#     os.path.join(
#         fig_dir,
#         f'kMeans_{n_clusters}clusts_1AX_20220727'),
#     dpi=150, facecolor='w',
# )

plt.show()

# plt.hist(labels0, color='purple')
# plt.hist(labels1, color='green')

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### 3c. Classification

In [80]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix

#### Classifying UPDRS 0 - 1 - 2 - 3 - 4

In [82]:
# CLassification Settings
nFolds = 4

# Shuffle order
X = Xdf.values

allData = np.hstack((X, y))
X_shf = allData[:, :14]
y_shf = allData[:, 14]
np.random.seed(27)
np.random.shuffle(allData)

skf = StratifiedKFold(n_splits=nFolds,)
skf.get_n_splits(X_shf, y_shf)

y_pred, y_true = {}, {}

for F, (train_index, test_index) in enumerate(
    skf.split(X, y)
):
    print(f'Linear Support Vector, fold #{F}')

    X_train, X_test = X_shf[train_index], X_shf[test_index]
    y_train, y_test = y_shf[train_index], y_shf[test_index]

    clf = LinearSVC(penalty='l2', C=1.0,)
    clf.fit(X=X_train, y=y_train)
    print(clf.score(X=X_test, y=y_test))
    # for own scoring
    y_pred[F] = clf.predict(X=X_test)
    y_true[F] = y_test
    # print(multilabel_confusion_matrix(y_true[F], y_pred[F]))

    



Linear Support Vector, fold #0


ValueError: Unknown label type: 'continuous'

#### Boolean Classifying (UPDRS 0 or 4 vs The Rest)

In [83]:
# CLassification Settings
nFolds = 4
score_to_predict = 0

# Shuffle order
X = Xdf.values
y_bool = y == score_to_predict

allData = np.hstack((X, y_bool))

np.random.seed(27)
np.random.shuffle(allData)

X_shf = allData[:, :14]
y_shf = allData[:, 14]

skf = StratifiedKFold(n_splits=nFolds,)
skf.get_n_splits(X_shf, y_shf)

y_pred, y_true = {}, {}
print(
    'Classification of UPDRS subscore '
    f'{score_to_predict} versus thre rest')
for F, (train_index, test_index) in enumerate(
    skf.split(X, y)
):
    print(f'\nLinear Support Vector, fold #{F}')

    X_train, X_test = X_shf[train_index], X_shf[test_index]
    y_train, y_test = y_shf[train_index], y_shf[test_index]

    clf = LinearSVC(penalty='l2', C=1.0,)
    clf.fit(X=X_train, y=y_train)
    print(f'Accuracy: {clf.score(X=X_test, y=y_test)}')
    # for own scoring
    y_pred[F] = clf.predict(X=X_test)
    y_true[F] = y_test
    print(classification_report(y_true[F], y_pred[F]))
    



Classification of UPDRS subscore 0 versus thre rest

Linear Support Vector, fold #0


ValueError: Unknown label type: 'continuous'