In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import copy
import pandas as pd

In [None]:
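# Open question: do these still need to be imported here if the helper .py file already imports them?
# (`from module import *` also brings in the public names that module imported, unless it defines `__all__`.)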
#from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE
#from sklearn.decomposition import IncrementalPCA
#from sklearn.decomposition import KernelPCA
#from umap import UMAP
#from sklearn.manifold import MDS
#from sklearn.manifold import Isomap

In [None]:
from subspace_clustering_helper_funcs import *

- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [None]:
# Dataset notes:
#   - each participant has 100 experimenter-defined files and 50 user-defined files
#     (10 experimenter-defined gestures and 5 user-defined gestures)
#   - pID 101 is left out because it doesn't exist
#   - pID 131 is left out because it is missing gestures
#     (it doesn't have enough user-defined gestures)
#   - P001 and P003 are left out because they don't have duplicate or open gestures

# Per-participant sub-folder names and the experiment type(s) iterated by load_data.
file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

# Impaired participants (no P101 / P131, see notes above).
pIDs_impaired = [
    'P102', 'P103', 'P104', 'P105', 'P106', 'P107', 'P108', 'P109',
    'P110', 'P111', 'P112', 'P114', 'P115', 'P116', 'P118', 'P119',
    'P121', 'P122', 'P123', 'P124', 'P125', 'P126', 'P127', 'P128',
    'P132',
]

# Unimpaired participants (no P001 / P003, see notes above).
pIDs_unimpaired = [
    'P004', 'P005', 'P006',
    'P008', 'P010', 'P011',
]

# Full cohort: impaired participants first, then unimpaired.
pIDs_both = [*pIDs_impaired, *pIDs_unimpaired]

Original loader from Ben (builds a nested dictionary):

In [10]:
def load_data(pIDs, plot_raw, plot_interpolated, data_dir_path="C:\\Users\\kdmen\\Box Sync\\$M data segmented\\segmented_filtered_data\\"):    
    """
    Load the gesture recordings of the given participants into a nested dictionary.

    The result is indexed as ``data[pid][file_type][gestureID]``:
      * first level: participant id (the entries of ``pIDs``)
      * second level: file type, i.e. the entries of the module-level ``file_types``
        list ("movavg_files" rows get the EMG column names, anything else the IMU ones)
      * third level: gesture id, taken from the 5th underscore-separated field
        of each file name
    Each leaf is a single DataFrame with every trial found for that gesture
    stacked vertically, in the order ``os.listdir`` returned the files.
    Every file is passed through ``interpolate_dataframe(df, num_rows=64)`` first
    (helper defined outside this cell; presumably resamples each trial to 64 rows
    -- TODO confirm).

    Parameters
    ----------
    pIDs : iterable of str
        Participant ids; each must be a sub-folder of ``data_dir_path``.
    plot_raw, plot_interpolated :
        Currently unused. Kept so existing calls keep working.
    data_dir_path : str
        Root folder containing one sub-folder per participant. A trailing
        separator is optional; paths are built with ``os.path.join``.

    Returns
    -------
    dict
        ``{pid: {file_type: {gestureID: pandas.DataFrame}}}``

    Raises
    ------
    FileNotFoundError
        If a participant / file-type folder does not exist (from ``os.listdir``).
    IndexError
        If a file name has fewer than 5 underscore-separated fields.
    """
    # Column names depend only on the file type, so build them once up front.
    emg_headers = [f"EMG{channel}" for channel in range(1, 17)]
    imu_sensors = (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 15)
    imu_headers = [f"IMU{sensor}_{axis}"
                   for sensor in imu_sensors
                   for axis in ("ax", "ay", "az", "vx", "vy", "vz")]

    print("Loading Data")
    data = {}
    # NOTE: expt_type does not influence the path below, so with more than one
    # entry in expt_types the same files would simply be read again. The loop is
    # kept to preserve the existing behaviour.
    for expt_type in expt_types:
        for pid in pIDs:
            print(pid)
            data[pid] = {}
            participant_dir = os.path.join(data_dir_path, pid)
            for file_type in file_types:
                headers = emg_headers if file_type == "movavg_files" else imu_headers
                sub_path = os.path.join(participant_dir, file_type)

                # Collect the trials per gesture and concatenate once at the end;
                # concatenating inside the loop re-copies the growing frame each time.
                trials_by_gesture = {}
                for file in os.listdir(sub_path):
                    gestureID = file.split("_")[4]

                    # header=0 discards the file's own header row in favour of `headers`
                    df = pd.read_csv(os.path.join(sub_path, file), names=headers, header=0)
                    df_interpolated = interpolate_dataframe(df, num_rows=64)
                    trials_by_gesture.setdefault(gestureID, []).append(df_interpolated)

                data[pid][file_type] = {
                    gestureID: pd.concat(trials)
                    for gestureID, trials in trials_by_gesture.items()
                }
    print("Loading Complete")
    return data

**Note:** the data from Box Sync has not been loaded yet, so the loading cells below have not been run.

In [None]:
# Build the nested-dictionary dataset for every participant; both plotting flags are off.
data = load_data(pIDs_both, plot_raw=False, plot_interpolated=False)

Rewritten version that returns a single DataFrame (note: this redefines `load_data`, replacing the dictionary-based version above):

In [None]:
# Same values as the constants defined in the data-loading section above;
# repeated here so this cell can be run on its own.
file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

# NOTE: this definition shadows the earlier nested-dictionary load_data once the cell runs.
def load_data(pIDs, data_dir_path="C:\\Users\\kdmen\\Box Sync\\$M data segmented\\segmented_filtered_data\\"):
    """
    Load every recording of the given participants into one long DataFrame.

    Each file under ``<data_dir_path>/<pid>/<file_type>/`` is read with the column
    names of its file type ("movavg_files" -> EMG1..EMG16, otherwise the IMU
    columns) and tagged with metadata columns before all files are concatenated:

      * ``Participant``  - the participant id
      * ``Gesture_ID``   - 5th underscore-separated field of the file name
      * ``Gesture_Num``  - 6th underscore-separated field of the file name
      * ``Gesture_Type`` - the experiment type from ``expt_types``
      * ``File_Type``    - the entry of ``file_types`` the row was read from

    Because EMG and IMU files have different signal columns, rows read from one
    file type hold NaN in the other type's columns; filter on ``File_Type`` to
    get a single modality.

    Parameters
    ----------
    pIDs : iterable of str
        Participant ids; each must be a sub-folder of ``data_dir_path``.
    data_dir_path : str
        Root folder containing one sub-folder per participant.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index. If no file was read
        (e.g. ``pIDs`` is empty) an empty frame holding only the metadata
        columns is returned.

    Raises
    ------
    FileNotFoundError
        If a participant / file-type folder does not exist (from ``os.listdir``).
    IndexError
        If a file name has fewer than 6 underscore-separated fields.
    """
    # Column names depend only on the file type, so build them once up front.
    emg_headers = [f"EMG{channel}" for channel in range(1, 17)]
    imu_sensors = (1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 15)
    imu_headers = [f"IMU{sensor}_{axis}"
                   for sensor in imu_sensors
                   for axis in ("ax", "ay", "az", "vx", "vy", "vz")]
    metadata_columns = ['Participant', 'Gesture_ID', 'Gesture_Num', 'Gesture_Type', 'File_Type']

    data_list = []

    for pid in pIDs:
        path = os.path.join(data_dir_path, pid)
        for expt_type in expt_types:
            for file_type in file_types:
                headers = emg_headers if file_type == "movavg_files" else imu_headers
                sub_path = os.path.join(path, file_type)
                for file in os.listdir(sub_path):
                    split_filename = file.split("_")
                    gestureID = split_filename[4]
                    gestureNum = split_filename[5]

                    # header=0 discards the file's own header row in favour of `headers`
                    df = pd.read_csv(os.path.join(sub_path, file), names=headers, header=0)
                    df['Participant'] = pid
                    df['Gesture_ID'] = gestureID
                    df['Gesture_Num'] = gestureNum
                    df['Gesture_Type'] = expt_type
                    # Without this tag EMG rows and IMU rows cannot be told apart
                    # after concatenation (apply_dim_reduc filters on it).
                    df['File_Type'] = file_type
                    data_list.append(df)

    # pd.concat raises on an empty list; hand back an empty, correctly labelled frame instead.
    if not data_list:
        return pd.DataFrame(columns=metadata_columns)

    # Concatenate all dataframes into one
    return pd.concat(data_list, ignore_index=True)


In [None]:
# Load every participant into a single long DataFrame (default data directory).
data_df = load_data(pIDs=pIDs_both)

In [None]:
# Report the (rows, columns) size of the combined frame, then preview its first rows.
print(data_df.shape)
data_df.head()

## Applying dimensionality reduction algorithms:

Modified version of Ben's `PCA_all_participants()` function:

In [None]:
def apply_dim_reduc(model_str, data_df, num_dims, hp=None, modality=['EMG and IMU'], participant_inclusion=['All Participants'], apply_mode='all'):
    '''
    Run a dimensionality-reduction model on selected subsets of data_df.

    model_str: what kind of model to use (eg PCA, T-SNE, ...); passed to apply_model
    data_df: df containing all the (training) data. Must have the columns
        'Participant', 'Gesture_ID' and 'File_Type' ('movavg_files' for EMG rows,
        'IMU_extract' for IMU rows)
    num_dims: how many dimensions/components should be used [HYPERPARAM!]
    hp: [hyperparams] passed through unchanged to apply_model
    modality: list drawn from ['EMG', 'IMU', 'EMG and IMU']. 'EMG and IMU' fits
        the EMG rows and the IMU rows separately, not one joint model
    participant_inclusion: list drawn from ['All Participants',
        'Impaired Participants', 'Unimpaired Participants']
    apply_mode: how the selected rows are grouped before fitting
        'all'        -> one fit per file type on all selected rows
        'by_user'    -> one fit per participant and file type
        'by_gesture' -> one fit per gesture and file type

    Returns a dict mapping (modality, participant_type, file_type, group) to the
    value returned by apply_model, where group is None for 'all', the participant
    id for 'by_user' and the gesture name for 'by_gesture'. Subsets with no rows
    are skipped and therefore have no entry.

    Raises ValueError for an unsupported modality, participant type or apply_mode,
    and KeyError if data_df lacks a required column.

    NOTE(review): each subset is handed to apply_model with all of its columns
    (metadata and the other modality's NaN columns included), as before;
    presumably apply_model picks the columns it needs -- confirm.
    '''

    gestures = ['pan', 'duplicate', 'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete', 'close', 'open']

    # Modality label -> File_Type values of the rows that belong to it.
    file_types_by_modality = {
        'EMG': ['movavg_files'],
        'IMU': ['IMU_extract'],
        'EMG and IMU': ['movavg_files', 'IMU_extract'],
    }

    if apply_mode not in ('all', 'by_user', 'by_gesture'):
        raise ValueError(f"Apply mode {apply_mode} not supported, use 'all', 'by_user' or 'by_gesture'.")

    missing_columns = {'Participant', 'Gesture_ID', 'File_Type'}.difference(data_df.columns)
    if missing_columns:
        raise KeyError(f"data_df is missing required column(s): {sorted(missing_columns)}")

    results = {}
    for f_type in modality:
        if f_type not in file_types_by_modality:
            raise ValueError(f"Modality {f_type} not supported, check supported versions.")
        file_types = file_types_by_modality[f_type]

        for p_type in participant_inclusion:

            if p_type == "All Participants":
                pIDs = data_df['Participant'].unique()
            elif p_type == "Impaired Participants":
                pIDs = data_df[data_df['Participant'].isin(pIDs_impaired)]['Participant'].unique()
            elif p_type == "Unimpaired Participants":
                pIDs = data_df[data_df['Participant'].isin(pIDs_unimpaired)]['Participant'].unique()
            else:
                raise ValueError(f"Participant type {p_type} not supported, check supported versions.")

            # Restrict to the requested cohort for every mode, not only 'by_user'.
            cohort_df = data_df[data_df['Participant'].isin(pIDs)]

            for file_type in file_types:
                # Gesture_Type holds the experiment type, not the modality, so the
                # modality is selected through File_Type alone.
                modality_df = cohort_df[cohort_df['File_Type'] == file_type]

                if apply_mode == 'all':
                    subsets = [(None, modality_df)]
                elif apply_mode == 'by_user':
                    subsets = [(pid, modality_df[modality_df['Participant'] == pid]) for pid in pIDs]
                else:  # 'by_gesture'
                    subsets = [(gesture, modality_df[modality_df['Gesture_ID'] == gesture]) for gesture in gestures]

                for group, df in subsets:
                    # Nothing to fit on; skip instead of passing an empty frame on.
                    if df.empty:
                        continue
                    results[(f_type, p_type, file_type, group)] = apply_model(model_str, df, num_dims, hp)

    # Returned only after every requested combination has been processed.
    return results
                

## Evaluation

In [None]:
def eval_dim_reduc():
    """Placeholder for evaluating a dimensionality-reduction result.

    Not implemented yet: takes no arguments and returns None.
    """
    # TODO: decide on an evaluation criterion (AIC? ...)
    return None

## Application

In [None]:
# TODO: do we need train/test splits here?
# I have plenty of clustering metrics, but none that are specific to dimensionality reduction...