In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import copy
import pandas as pd
import time

In [2]:
from subspace_clustering_helper_funcs import *

- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [3]:
# remove pID 101 because it doesn't exist
# remove pID 131 because it  doesnt have enough user defined gestures
# each participant has 100 experimenter defined files and 50 user defined files
# 10 experimenter defined gestures and 5 user defined gestures

file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

#remove participant 131 because they are missing gestures 
pIDs_impaired = ['P102','P103','P104','P105','P106','P107','P108','P109','P110','P111',
       'P112','P114','P115','P116','P118','P119','P121','P122','P123','P124','P125',
       'P126','P127','P128', 'P132']
# remove participants P001 and P003 because they dont have duplicate or open gestures
pIDs_unimpaired = ['P004','P005','P006','P008','P010','P011']

pIDs_both = pIDs_impaired + pIDs_unimpaired

Version using dataframes (wasted way too much time writing this...)

In [4]:
def load_data(pIDs, data_dir_path="C:\\Users\\kdmen\\Box Sync\\$M data segmented\\segmented_filtered_data\\", file_types=["IMU_extract", "movavg_files"], expt_types=["experimenter-defined"]):
    
    data_dict = {}
    for expt_type in expt_types:
        for pid in pIDs:
            print(pid)
            pid_path = os.path.join(data_dir_path, pid)
            for file_type in file_types:
                sub_path = os.path.join(pid_path, file_type)
                if not os.path.exists(sub_path):
                    print(f"Subpath does not exist: {sub_path}")
                    continue
                for file in os.listdir(sub_path):
                    split_filename = file.split("_")
                    if len(split_filename) < 6:
                        print(f"Unexpected filename format: {file}")
                        continue
                    gestureID = split_filename[4]
                    gestureNum = split_filename[5]

                    if file_type == "movavg_files":
                        headers = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'EMG5',
                                   'EMG6', 'EMG7', 'EMG8', 'EMG9', 'EMG10',
                                   'EMG11', 'EMG12', 'EMG13', 'EMG14', 'EMG15',
                                   'EMG16']
                    else:
                        headers = ['IMU1_ax', 'IMU1_ay', 'IMU1_az', 'IMU1_vx', 'IMU1_vy', 'IMU1_vz',
                                   'IMU2_ax', 'IMU2_ay', 'IMU2_az', 'IMU2_vx', 'IMU2_vy', 'IMU2_vz',
                                   'IMU3_ax', 'IMU3_ay', 'IMU3_az', 'IMU3_vx', 'IMU3_vy', 'IMU3_vz',
                                   'IMU4_ax', 'IMU4_ay', 'IMU4_az', 'IMU4_vx', 'IMU4_vy', 'IMU4_vz',
                                   'IMU5_ax', 'IMU5_ay', 'IMU5_az', 'IMU5_vx', 'IMU5_vy', 'IMU5_vz',
                                   'IMU6_ax', 'IMU6_ay', 'IMU6_az', 'IMU6_vx', 'IMU6_vy', 'IMU6_vz',
                                   'IMU7_ax', 'IMU7_ay', 'IMU7_az', 'IMU7_vx', 'IMU7_vy', 'IMU7_vz',
                                   'IMU8_ax', 'IMU8_ay', 'IMU8_az', 'IMU8_vx', 'IMU8_vy', 'IMU8_vz',
                                   'IMU9_ax', 'IMU9_ay', 'IMU9_az', 'IMU9_vx', 'IMU9_vy', 'IMU9_vz',
                                   'IMU11_ax', 'IMU11_ay', 'IMU11_az', 'IMU11_vx', 'IMU11_vy', 'IMU11_vz',
                                   'IMU13_ax', 'IMU13_ay', 'IMU13_az', 'IMU13_vx', 'IMU13_vy', 'IMU13_vz',
                                   'IMU15_ax', 'IMU15_ay', 'IMU15_az', 'IMU15_vx', 'IMU15_vy', 'IMU15_vz']

                    file_path = os.path.join(sub_path, file)
                    if not os.path.exists(file_path):
                        print(f"File does not exist: {file_path}")
                        continue
                    df = pd.read_csv(file_path, names=headers, header=0)
                    if df.empty:
                        print(f"DataFrame is empty for file: {file_path}")
                        continue

                    df['Participant'] = pid
                    df['Gesture_ID'] = gestureID
                    df['Gesture_Num'] = gestureNum

                    # Interpolate the data 
                    df_interpolated = interpolate_df(df, num_rows=64, columns_to_exclude=['Participant', 'Gesture_ID', 'Gesture_Num'])

                    # Create a unique key for dict access, based on PID, Gesture_ID, and Gesture_Num
                    unique_key = (pid, gestureID, gestureNum)

                    if unique_key in data_dict:
                        # Merge the DataFrames on index to avoid duplicate columns
                        existing_df = data_dict[unique_key]
                        merged_df = existing_df.merge(df_interpolated, left_index=True, right_index=True, suffixes=('', '_dup'))
                        # Drop duplicate columns if necessary
                        for col in merged_df.columns:
                            if col.endswith('_dup'):
                                merged_df.drop(columns=col, inplace=True)
                        data_dict[unique_key] = merged_df
                    else:
                        data_dict[unique_key] = df_interpolated

    # Convert the dictionary to a list of DataFrames
    data_lst = list(data_dict.values())

    # Concatenate all dataframes into one
    print(f"Shape of first df: {data_lst[0].shape} (expected shape is (64, 91))")
    # ... does this work as expected... shouldn't it also be ele_df[0].shape
    edited_data_lst = [ele_df for ele_df in data_lst if ele_df.shape == data_lst[0].shape]
    dataframe = pd.concat(edited_data_lst, ignore_index=True)

    # Check for NaN values in the resulting dataframe
    nan_participant_rows = dataframe[dataframe['Participant'].isna()]
    print(f"Number of rows with NaN Participant: {nan_participant_rows.shape[0]}")
        
    return dataframe


# EMG and IMU Dataset

In [6]:
start_time = time.time()
data_df = load_data(pIDs_both)
end_time = time.time()

print(f"\nCompleted in {end_time - start_time}s")

P102
P103
P104
P105
P106
P107
P108
P109
P110
P111
P112
P114
P115
P116
P118
P119
P121
P122
P123
P124
P125
P126
P127
P128
P132
P004
P005
P006
P008
P010
P011
Shape of first df: (64, 91) (expected shape is (64, 91))
Number of rows with NaN Participant: 0

Completed in 2179.995052576065


In [9]:
print(data_df.shape)
data_df.head()

(426752, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,P102,pan,1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,P102,pan,1,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,P102,pan,1,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,P102,pan,1,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [None]:
# Example dataframe, assuming 'df' is your dataframe
# Count NaNs per row
nans_per_row = data_df.isna().sum(axis=1)

# Count NaNs per column
nans_per_column = data_df.isna().sum(axis=0)


In [None]:
# Summary statistics for NaNs per row
nans_per_row.describe()

# Summary statistics for NaNs per column
#nans_per_column.describe()


# EMG Only Dataset

In [5]:
start_time = time.time()
emg_df = load_data(pIDs_both, file_types=["movavg_files"])
end_time = time.time()

print(f"\nCompleted in {end_time - start_time}s")

P102
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P102\movavg_files
P103
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P103\movavg_files
P104
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P104\movavg_files
P105
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P105\movavg_files
P106
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P106\movavg_files
P107
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P107\movavg_files
P108
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P108\movavg_files
P109
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P109\movavg_files
P110
Subpath does not exist: C:\Users\kdmen\Box Sync\$M data segmented\segmented_filtered_data\P

IndexError: list index out of range

In [None]:
print(emg_df.shape)
emg_df.head()

In [None]:
# Example dataframe, assuming 'df' is your dataframe
# Count NaNs per row
nans_per_row = emg_df.isna().sum(axis=1)

# Count NaNs per column
nans_per_column = emg_df.isna().sum(axis=0)


In [None]:
# Summary statistics for NaNs per row
nans_per_row.describe()

# Summary statistics for NaNs per column
#nans_per_column.describe()


# IMU Only Dataset

In [None]:
start_time = time.time()
imu_df = load_data(pIDs_both, file_types=["IMU_extract"])
end_time = time.time()

print(f"\nCompleted in {end_time - start_time}s")

In [None]:
print(imu_df.shape)
imu_df.head()

In [None]:
# Example dataframe, assuming 'df' is your dataframe
# Count NaNs per row
nans_per_row = imu_df.isna().sum(axis=1)

# Count NaNs per column
nans_per_column = imu_df.isna().sum(axis=0)


In [None]:
# Summary statistics for NaNs per row
nans_per_row.describe()

# Summary statistics for NaNs per column
#nans_per_column.describe()


# Save the dataframe

In [None]:
assert(1==0)

In [10]:
## Pickle is theoretically faster for Python...

data_df.to_pickle('metadata_IMU_EMG_allgestures_allusers.pkl')
#df = pd.read_pickle('your_dataframe.pkl')