This notebook loads the block-level feature table and the corresponding clinical scores, builds the feature matrix (X) and label vector (y), and evaluates a linear SVM classifier with stratified k-fold cross-validation

Import public packages and functions

In [1]:
import os
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from itertools import compress


import openpyxl
from datetime import datetime
import math
import statistics as stat
import json

In [2]:
def get_repo_path_in_notebook():
    """
    Finds path of repo from Notebook.
    Start running this once to correctly find
    other modules/functions

    Walks upwards from the current working directory until a path
    ending in the repository name is found.

    Returns
    -------
    str
        Path of the closest ancestor directory (or the working
        directory itself) whose path ends with 'ultraleap_analysis'.

    Raises
    ------
    FileNotFoundError
        If the filesystem root is reached without finding the
        repository, instead of looping forever.
    """
    repo_name = 'ultraleap_analysis'
    start_path = os.getcwd()
    path = start_path

    while not path.endswith(repo_name):

        parent = os.path.dirname(path)

        # os.path.dirname() of the filesystem root returns the root itself,
        # so an unchanged path means there is nothing left to climb
        if parent == path:
            raise FileNotFoundError(
                f'Could not find repo "{repo_name}" in any parent '
                f'directory of {start_path}'
            )

        path = parent

    return path

In [3]:
# Resolve the repository root and make its 'code' sub-folder the working
# directory.
# NOTE(review): presumably this is what lets the project-local packages
# imported further down resolve - confirm.
repo_path = get_repo_path_in_notebook()
code_path = os.path.join(repo_path, 'code')
os.chdir(code_path)

Import own functions

In [4]:
import import_data.import_and_convert_data as import_dat
import import_data.find_paths as find_paths
import import_data.preprocessing_meta_info as meta_info
import sig_processing.segment_tasks as seg_tasks
import movement_calc.helpfunctions as hp
import feature_extraction.get_features as get_feat
import feature_extraction.get_files as get_files

### 1. Loading Features (X) and scores (y)
Load features from csv

In [None]:
# Directory <repo>/data/features/dataframes/patientdata
feat_df_path = os.path.join(repo_path, 'data', 'features', 'dataframes', 'patientdata')

# Feature table; the first column of the csv is used as the row index
X_df = pd.read_csv(os.path.join(feat_df_path, 'ft_block_features.csv'), index_col=0)


Load scores / labels

In [5]:
def get_labels_for_feat_df(ft_df):
    """
    Collects the score (label) belonging to every row of a
    feature dataframe.

    Each entry of the 'filename' column is expected to look like
    '[feat_]<block>_<sub>_<cond>_<cam>_<task>_<side>' followed by
    a five character suffix (presumably '.json' - TODO confirm).

    Parameters
    ----------
    ft_df : pd.DataFrame
        Feature dataframe containing a 'filename' column.

    Returns
    -------
    list
        One value returned by get_scores() per row of ft_df,
        in row order. Empty list for an empty dataframe.

    Raises
    ------
    ValueError
        If a filename does not split into exactly six
        underscore-separated parts.
    """
    y = []  # list to store labels

    # use the dataframe that was passed in (not a notebook global) and
    # iterate over values, so the result is independent of the row index
    for filename in ft_df['filename']:

        # drop the optional 'feat_' prefix (5 chars) and the 5 char suffix
        if filename.startswith('feat'):
            file_id = filename[5:-5]
        else:
            file_id = filename[:-5]

        block, sub, cond, cam, task, side = file_id.split('_')
        y.append(get_scores(sub, cond, cam, task, side, block))

    return y

In [6]:
def get_scores(sub, cond, cam, task, side, block):
    """
    Reads the score of one block from the Excel score sheet.

    The sheet 'scores_JB_JH_JR.xlsx' (columns A:I) is read from the
    folder returned by find_paths.find_onedrive_path('patientdata').
    Rows are looked up via the 'sub_cond_cam' column and columns via
    '<task>_<side>', with side abbreviated to 'lh' / 'rh'.

    A cell may hold a single number or a text containing several
    single-digit scores (0-4), one per block in block order.

    Parameters
    ----------
    sub, cond, cam : str
        Combined to the row label '<sub>_<cond>_<cam>'.
    task : str
        Task part of the column name.
    side : str
        'left' / 'right' are translated to 'lh' / 'rh'; any other
        value is used unchanged.
    block : str
        'b1', 'b2' or 'b3'.

    Returns
    -------
    int or float
        Score of the requested block. np.nan if the cell is empty
        or if the block name is unknown. For 'b2' the first score is
        returned when the cell holds only one score.

    Raises
    ------
    KeyError
        If the row or column does not exist in the sheet.
    IndexError
        If 'b1' / 'b3' is requested but the cell holds too few scores.
    """
    score_file = os.path.join(
        find_paths.find_onedrive_path('patientdata'),
        'scores_JB_JH_JR.xlsx'
    )
    read_scores = pd.read_excel(score_file, usecols='A:I').set_index('sub_cond_cam')

    if side == 'left': side = 'lh'
    elif side == 'right': side = 'rh'

    # read scores for all blocks of a subject in the same cond, cam per side
    ext_scores = read_scores.loc[f'{sub}_{cond}_{cam}'][f'{task}_{side}']

    # Numeric cells can arrive as Python or as NumPy scalars, depending on
    # the dtype pandas inferred for the column - handle both alike
    if isinstance(ext_scores, (float, np.floating)):
        if np.isnan(ext_scores):
            # empty cell: no score available
            return np.nan
        ls_int_sc = [int(ext_scores),]
    elif isinstance(ext_scores, (int, np.integer)):
        ls_int_sc = [int(ext_scores),]
    else:
        # text cell: keep every character that is a valid single-digit score
        ls_int_sc = [int(s) for s in ext_scores if s in ['0', '1', '2', '3', '4']]

    if block == 'b1':
        score = ls_int_sc[0]
    elif block == 'b2':
        try:
            score = ls_int_sc[1]
        except IndexError:
            # only one score given: it is used for the second block as well
            score = ls_int_sc[0]
    elif block == 'b3':
        score = ls_int_sc[2]
    else:
        print(f'no scores for block {block} or block does not exist')
        score = np.nan

    return score

In [7]:
# Look up the score of every row of the feature table and keep the
# labels as a numpy array
y = np.array(get_labels_for_feat_df(X_df))

NameError: name 'X_df' is not defined

### 2. Select which features you want to use !!

In [8]:
# Feature matrix: all columns of X_df except the first one (json name).
# NOTE(review): assumes 'filename' is the first column of X_df - confirm
# against the column order of the csv.
X = X_df.values[:, 1:]

NameError: name 'X_df' is not defined

### 3. Data splitting, create training and test data

In [None]:
# import data splitting functions
from sklearn.model_selection import StratifiedKFold, KFold
# Alternative without stratification:
# kf = KFold(n_splits=4, )

# 4 folds, split with respect to the labels in y
skf = StratifiedKFold(n_splits=4)

# Same preview for the unstratified alternative:
# for i, (train_index, test_index) in enumerate(kf.split(X)):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

# Preview which row indices end up in the training / test part of each fold
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

### 3. Define classifiers

In [None]:
# import classifiers
from sklearn.svm import LinearSVC

# TODO: try multiclass with Random Forest
# import metrics
from sklearn.metrics import accuracy_score

# linear support vector classifier with default settings
clf = LinearSVC()

### 4. Run Cross validation

In [None]:
# 4 folds, split with respect to the labels in y
skf = StratifiedKFold(n_splits=4)

# Optional: go from multiclass to binary
# y_binary = y > 1

# y = y_binary

# loops over all folds
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    # get training and testing split for current fold
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    # train classifier with train X and y (clf is re-fitted in every fold)
    clf.fit(train_X, train_y)

    # predict the held-out part of the current fold
    y_pred = clf.predict(test_X)
    y_true = test_y

    # accuracy of the current fold
    print(accuracy_score(y_true, y_pred))