In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import copy
import pandas as pd
import time

from sklearn.decomposition import PCA


In [2]:
from utils.subspace_clustering_helper_funcs import *
from utils.preprocessing import *

- Reference — scikit-learn `sklearn.manifold` API: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [3]:
# Participant / file configuration shared by the cells below.
# Exclusions (per the original author's notes):
#   - pID 101 does not exist
#   - pID 131 does not have enough user-defined gestures
# Each participant has 100 experimenter-defined files and 50 user-defined files
# (10 experimenter-defined gestures and 5 user-defined gestures).

file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

# Impaired cohort; participant 131 is left out because they are missing gestures.
pIDs_impaired = ['P102','P103','P104','P105','P106','P107','P108','P109','P110','P111',
       'P112','P114','P115','P116','P118','P119','P121','P122','P123','P124','P125',
       'P126','P127','P128', 'P132']
# Unimpaired cohort; P001 and P003 are left out because they don't have the duplicate or open gestures.
pIDs_unimpaired = ['P004','P005','P006','P008','P010','P011']

# Full participant list: impaired first, then unimpaired.
pIDs_both = pIDs_impaired + pIDs_unimpaired

In [4]:
## Pickle is theoretically faster for Python...
# Load the combined metadata + IMU + EMG frame and report how long the read took.
# NOTE(review): the path is an absolute, machine-specific Windows path, so this cell only
# runs on the machine that has this drive layout. pd.read_pickle can execute arbitrary
# code while unpickling, so only load files from a trusted source.

print("Loading")
start_time = time.time()
# Alternative location of the same file on another machine:
#data_df = pd.read_pickle('C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\Filtered_Datasets\\metadata_IMU_EMG_allgestures_allusers.pkl')
data_df = pd.read_pickle('D:\\Kai_MetaGestureClustering_24\\saved_datasets\\Filtered_Datasets\\metadata_IMU_EMG_allgestures_allusers.pkl')
end_time = time.time()
print(f"Completed in {end_time - start_time}s")

Loading
Completed in 0.3856058120727539s


In [5]:
# Sanity check of the loaded frame: (rows, columns) followed by the first rows.
print(data_df.shape)
data_df.head()

(426752, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,P102,pan,1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,P102,pan,1,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,P102,pan,1,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,P102,pan,1,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [6]:
# Separate the three identifying columns from the numeric sensor columns.
metadata_cols = ['Participant', 'Gesture_ID', 'Gesture_Num']
# Metadata only (one row per sample, same index as data_df).
metadata_cols_df = data_df[metadata_cols]
# X keeps every remaining column, i.e. the sensor features without the metadata.
X = data_df.drop(metadata_cols, axis=1)

In [7]:
# Confirm the metadata frame kept every row and exactly the three metadata columns.
print(metadata_cols_df.shape)
metadata_cols_df.head()

(426752, 3)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num
0,P102,pan,1
1,P102,pan,1
2,P102,pan,1
3,P102,pan,1
4,P102,pan,1


## NEED TO MEAN SUBTRACT THE EMG!!!

In [8]:
# Confirm the feature frame: same row count as data_df, minus the three metadata columns.
print(X.shape)
X.head()

(426752, 88)


Unnamed: 0,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,IMU2_ay,IMU2_az,IMU2_vx,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,-0.888184,-0.334961,0.124514,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,-0.905297,-0.344967,0.144735,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,-0.90525,-0.343246,0.14063,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,-0.896252,-0.3371,0.185074,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,-0.903902,-0.34337,0.1895,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [9]:
# Number of 64-row blocks in X; a whole number means the row count is divisible by 64.
# NOTE(review): presumably each recorded gesture trial is 64 samples long — confirm.
X.shape[0]/64

6668.0

In [10]:
# Look at the last 16 columns of X to check which channels they hold.
X.iloc[:,-16:].head()

Unnamed: 0,EMG1,EMG2,EMG3,EMG4,EMG5,EMG6,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,2e-06,2e-06,1e-06,2e-06,3e-06,4e-06,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,2e-06,2e-06,1e-06,2e-06,3e-06,5e-06,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,2e-06,2e-06,1e-06,2e-06,4e-06,5e-06,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,2e-06,2e-06,2e-06,2e-06,5e-06,6e-06,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,3e-06,2e-06,2e-06,2e-06,5e-06,7e-06,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


The last 16 columns of `X` are the EMG channels (`EMG1`–`EMG16`).

In [12]:
# THIS VERSION DOESNT WORK

#block_size = 64 # --> I bet this is why it thinks it has 64 columns... this must be processed wrong...
## Group the DataFrame into blocks of 64 rows based on the index
#grouped_df = data_df.groupby(data_df.index // block_size)
#
#print("Started")
#start_time = time.time()
## Apply mean subtraction to each block separately
#mean_subtracted_df = grouped_df.transform(mean_subtraction_blockwise)
#print(f"Completed in {time.time() - start_time}s")
#
## Concatenate the mean-subtracted blocks back into a single DataFrame
#mean_subtracted_df = mean_subtracted_df.reset_index(drop=True)

In [13]:
# Preview of X before mean subtraction (X has not been reassigned since the split above).
X.head()

Unnamed: 0,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,IMU2_ay,IMU2_az,IMU2_vx,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,-0.888184,-0.334961,0.124514,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,-0.905297,-0.344967,0.144735,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,-0.90525,-0.343246,0.14063,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,-0.896252,-0.3371,0.185074,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,-0.903902,-0.34337,0.1895,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [14]:
# Split X positionally: the last 16 of its 88 columns are the EMG channels, so the
# first 72 columns are treated as the IMU block.
imu_split_df = X.iloc[:, :72]
emg_split_df = X.iloc[:, 72:]

# Mean-subtract each modality separately with the project helper.
# NOTE(review): metacol_start_idx=0 presumably tells the helper that these frames carry no
# leading metadata columns — confirm against utils.preprocessing, and confirm how the
# helper groups "by gesture" when the gesture columns are not part of its input.
ms_imu_df = meansubtract_df_by_gesture(imu_split_df, metacol_start_idx=0)
ms_emg_df = meansubtract_df_by_gesture(emg_split_df, metacol_start_idx=0)

# Re-attach the metadata columns in front of the mean-subtracted features.
# pd.concat(axis=1) aligns on the index, so this assumes the helper preserves the row index.
X_ms = pd.concat([metadata_cols_df, ms_imu_df, ms_emg_df], axis=1)

In [15]:
# The mean-subtracted frame should have the same shape as data_df (metadata + features).
print(X_ms.shape)
X_ms.head()

(426752, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,-0.154795,-0.207561,-0.276788,0.051098,0.016745,0.024443,0.296552,...,-1.519137e-06,-1.470037e-07,-4.804797e-06,-6e-06,-3.194128e-06,-3.955994e-06,-2.761547e-06,-1e-05,-7.029408e-07,-5.178945e-07
1,P102,pan,1,-0.160414,-0.230805,-0.273866,0.068143,0.018924,-0.006994,0.282865,...,-6.918069e-07,4.930801e-07,-4.487795e-06,-1.1e-05,-3.454802e-08,-7.666755e-07,-2.022076e-06,-1e-06,-6.147716e-07,-2.144813e-07
2,P102,pan,1,-0.143053,-0.231324,-0.266054,0.154514,0.00389,-0.136441,0.279005,...,-3.763653e-07,6.437183e-07,-3.674076e-06,-1.9e-05,-2.892269e-06,-2.629805e-06,-1.651926e-06,1e-06,6.229549e-07,2.599464e-08
3,P102,pan,1,-0.14375,-0.217908,-0.266255,0.117384,0.024499,-0.064445,0.288182,...,-3.238861e-07,4.452346e-07,-2.333567e-06,-2e-05,-3.138945e-06,-4.262033e-06,-7.956578e-07,-3e-06,8.054968e-07,1.099523e-06
4,P102,pan,1,-0.123971,-0.258893,-0.247918,0.352494,0.030614,-0.262239,0.281369,...,-2.16007e-08,2.283233e-07,-9.088692e-08,-2.2e-05,-7.021894e-07,1.475079e-05,-7.793838e-07,-1e-06,-1.959684e-07,9.493486e-07


## Applying chosen dimensionality reduction algorithm:
> For now, just PCA

In [16]:
def apply_dim_reduc(data_df, model_str='PCA', use_full_dataset=False, num_dims=40, hp=None, modality=['EMG and IMU'], participant_inclusion=['All'], apply='ALL'):
    """Fit a dimensionality-reduction model and return the transformed data.

    The fitting itself is delegated to ``apply_model`` (provided by the
    project's ``utils`` helpers); this wrapper only decides which rows of
    ``data_df`` are handed to it.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input frame. Unless ``use_full_dataset`` is True it must contain a
        'Participant' column, and a 'Gesture_ID' column for
        ``apply='BY GESTURE'``.
    model_str : str
        Model identifier forwarded to ``apply_model``.
    use_full_dataset : bool
        If True, ``data_df`` is passed to ``apply_model`` as-is and
        ``modality``, ``participant_inclusion`` and ``apply`` are ignored.
    num_dims : int
        Number of output dimensions, forwarded to ``apply_model``.
    hp : object, optional
        Hyper-parameters, forwarded to ``apply_model`` unchanged.
    modality : list of str
        Data modalities to process. Only 'EMG and IMU' is implemented.
    participant_inclusion : list of str
        Any of 'All', 'Impaired', 'Unimpaired'. The last two select rows
        using the notebook-level ``pIDs_impaired`` / ``pIDs_unimpaired`` lists.
    apply : str
        'ALL' (one model on the selected rows), 'BY USER' (one model per
        participant) or 'BY GESTURE' (one model per gesture in the built-in
        experimenter-defined gesture list). Case-insensitive.

    Returns
    -------
    tuple
        ``(df_t, dim_reduc_model)`` exactly as returned by ``apply_model``.
        When several subsets are processed (multiple participant groups,
        'BY USER', 'BY GESTURE') only the result of the LAST fit is returned.

    Raises
    ------
    ValueError
        For an unsupported modality, participant type or ``apply`` mode, or
        when the options select nothing to fit.
    """
    print("Start")

    gestures = ['pan', 'duplicate', 'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete', 'close', 'open']

    if use_full_dataset:
        df_t, dim_reduc_model = apply_model(model_str, data_df, num_dims, hp)
        print("Success")
        return df_t, dim_reduc_model

    # Validate the mode up front: previously an unknown mode fell through every
    # branch and crashed at the return statement with an UnboundLocalError.
    apply_mode = apply.upper()
    if apply_mode not in ('ALL', 'BY USER', 'BY GESTURE'):
        raise ValueError(f"apply mode {apply} not supported, expected one of ['ALL', 'BY USER', 'BY GESTURE']")

    result = None
    for f_type in modality:
        # Only the combined modality is implemented; slicing out the IMU-only
        # or EMG-only columns is still to be added.
        if f_type != 'EMG and IMU':
            raise ValueError(f"f_type {f_type} not found in [EMG, IMU, EMG and IMU]")
        sel_df = data_df

        for p_type in participant_inclusion:
            # Restrict the rows to the requested cohort. The filter is applied to the
            # data itself, so it takes effect in every mode (it used to be computed
            # but ignored for 'ALL' and 'BY GESTURE').
            if p_type == "All":
                cohort_df = sel_df
            elif p_type == "Impaired":
                cohort_df = sel_df[sel_df['Participant'].isin(pIDs_impaired)]
            elif p_type == "Unimpaired":
                cohort_df = sel_df[sel_df['Participant'].isin(pIDs_unimpaired)]
            else:
                raise ValueError(f"Participant type {p_type} not supported, check supported versions.")

            if apply_mode == 'ALL':
                result = apply_model(model_str, cohort_df, num_dims, hp)
            elif apply_mode == 'BY USER':
                for pid in cohort_df['Participant'].unique():
                    user_df = cohort_df[cohort_df['Participant'] == pid]
                    result = apply_model(model_str, user_df, num_dims, hp)
            else:  # 'BY GESTURE'
                for gesture in gestures:
                    gesture_df = cohort_df[cohort_df['Gesture_ID'] == gesture]
                    result = apply_model(model_str, gesture_df, num_dims, hp)

    if result is None:
        raise ValueError("Nothing was fitted: modality / participant_inclusion selected no data.")

    df_t, dim_reduc_model = result
    print("Success")
    return df_t, dim_reduc_model


In [17]:
# 40-component PCA on the mean-subtracted frame.
# With use_full_dataset=True the wrapper hands X_ms to apply_model as-is; the modality,
# participant and apply options (used by the commented-out variant below) are ignored.
#df_t, dim_reduc_model = apply_dim_reduc(X_ms, model_str='PCA', num_dims=40, hp=None, modality=['EMG and IMU'], participant_inclusion=['All'], apply='ALL')
df_t, dim_reduc_model = apply_dim_reduc(X_ms, model_str='PCA', num_dims=40, use_full_dataset=True)

Start
Success


In [18]:
# Earlier direct-sklearn version, kept for reference (superseded by apply_dim_reduc):
#pca = PCA(n_components=40)
#pca.fit(X)
#print(f"Total explained variance: {np.sum(pca.explained_variance_ratio_)}")

# Shape and first rows of the transformed data: one column per retained component.
print(df_t.shape)
df_t.head()

(426752, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,-0.02291,0.066484,0.108729,...,-0.019478,0.063074,-0.026073,0.014037,-0.013245,-0.037501,-0.186015,-0.046802,-0.104549,-0.00246
1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,-0.027368,0.06037,0.074712,...,0.04141,0.03514,-0.057055,-0.009167,-0.02239,-0.022397,-0.160583,-0.048761,-0.073869,0.043859
2,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,-0.04615,0.036385,0.052746,...,-0.014321,0.072193,-0.026726,-0.03462,0.018831,-0.011807,-0.160339,-0.042462,-0.109649,0.027638
3,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,-0.064307,0.074589,0.053055,...,-0.011014,0.060085,-0.09727,-0.05714,-0.000897,-0.007877,-0.165659,-0.05014,-0.108752,0.070623
4,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,-0.124893,0.114817,0.038628,...,0.035711,0.050971,-0.093861,-0.13152,0.018166,0.056297,-0.157751,-0.042673,-0.145383,0.063896


In [19]:
# Fraction of the total variance captured by each retained component, in component order.
dim_reduc_model.explained_variance_ratio_

array([0.23949413, 0.16666576, 0.13205721, 0.07407956, 0.06548491,
       0.05501368, 0.03358861, 0.02753107, 0.02516064, 0.02208645,
       0.01972678, 0.01636877, 0.01332203, 0.01270167, 0.01106796,
       0.00761657, 0.00592563, 0.00551143, 0.00498404, 0.0046687 ,
       0.00408729, 0.00392518, 0.00325429, 0.00310754, 0.00295851,
       0.00277442, 0.00263187, 0.00252613, 0.00233023, 0.00201009,
       0.00196373, 0.00183227, 0.00177018, 0.001713  , 0.00164146,
       0.00156655, 0.00137417, 0.00130314, 0.00125864, 0.00114667])

The first two PCs account for only 23.9% and 16.7% of the variance, respectively (about 40.6% combined), so a 2D plot would hide most of the structure and probably isn't worthwhile.

In [20]:
# Sum of the per-component ratios = share of the variance kept by all retained components together.
print(f"Total explained variance: {np.sum(dim_reduc_model.explained_variance_ratio_)}")

Total explained variance: 0.988230956896386


In [21]:
# Save the post-dim reduc dataframe

#df_t.to_pickle('PCA_ms_IMUEMG_df.pkl')
#metadata_cols_df.to_pickle('metadata_cols_df.pkl')

In [22]:
# 20-component PCA on the same mean-subtracted IMU+EMG frame (X_ms), passed through as-is.
full_PCA20, dim_reduc_model20 = apply_dim_reduc(X_ms, model_str='PCA', num_dims=20, use_full_dataset=True)

Start
Success


In [23]:
# Two machine-specific locations of the EMG-only dataset; only the *_brc path is read below.
emg_file_path_kai = 'C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\Filtered_Datasets\\metadata_EMG_allgestures_allusers.pkl'
emg_file_path_brc = 'D:\\Kai_MetaGestureClustering_24\\saved_datasets\\Filtered_Datasets\\metadata_EMG_allgestures_allusers.pkl'

# Load the EMG-only frame and mean-subtract it with the helper's default arguments.
# NOTE(review): this rebinds ms_emg_df, which earlier held the mean-subtracted EMG slice of X,
# and the new value is not used by any later cell visible in this notebook — confirm whether
# the EMG PCA cells that follow were meant to consume it.
emg_df = pd.read_pickle(emg_file_path_brc)
ms_emg_df = meansubtract_df_by_gesture(emg_df)

In [24]:
# 3-component PCA.
# NOTE(review): the result is named emg_PCA3 but the model is fitted on X_ms (the combined
# IMU+EMG frame), not on the EMG-only frame loaded in the previous cell — confirm which
# input is intended before relying on this as an EMG-only embedding.
emg_PCA3, dim_reduc_model3 = apply_dim_reduc(X_ms, model_str='PCA', num_dims=3, use_full_dataset=True)

Start
Success


In [25]:
# 8-component PCA.
# NOTE(review): the result is named emg_PCA8 but the model is fitted on X_ms (the combined
# IMU+EMG frame), not on the EMG-only frame loaded earlier — confirm which input is intended.
emg_PCA8, dim_reduc_model8 = apply_dim_reduc(X_ms, model_str='PCA', num_dims=8, use_full_dataset=True)

Start
Success


Save the above datasets

Now do train test split

In [26]:
def manual_train_test_split(df_t, metadata_cols_df, save_file_name, save_bool=True, save_path=r'D:\Kai_MetaGestureClustering_24\saved_datasets', user_holdout=True, gesture_holdout=False, held_out_user_pids=['P103','P109','P114','P124','P128','P004','P010']):
    """Re-attach metadata to reduced features and build held-out train/test splits.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Dimensionality-reduced features, one row per sample.
    metadata_cols_df : pandas.DataFrame
        Metadata for the same samples in the same row order. Must contain
        'Participant' (and 'Gesture_ID' when ``gesture_holdout`` is True).
    save_file_name : str
        Name of the sub-directory (created under ``save_path``) that receives
        the pickles of this run.
    save_bool : bool
        If True, every returned frame is pickled to
        ``<save_path>/<save_file_name>/<frame name>.pkl``.
    save_path : str
        Root output directory. It used to be accepted but ignored, with files
        written relative to the working directory.
    user_holdout : bool
        Add a split in which all rows of ``held_out_user_pids`` form the test set.
    gesture_holdout : bool
        Add a split in which the built-in held-out gestures of the held-out
        users form the test set.
    held_out_user_pids : list of str
        Participant IDs that make up the test users.

    Returns
    -------
    list of pandas.DataFrame
        ``[full]``, followed by ``[test_users, training_users]`` when
        ``user_holdout`` is set, followed by
        ``[test_fullgestures, training_fullgestures]`` when ``gesture_holdout``
        is set.

    Raises
    ------
    ValueError
        If ``df_t`` and ``metadata_cols_df`` have different numbers of rows.
    """
    # Randomly chosen gestures to hold out
    held_out_gestures = ['pan', 'gesture-1', 'gesture-4', 'normal', 'delete',
           'close', 'open', 'move', 'two-handed-tap', 'double-pinch',
           'single-pinch', 'single-clench', 'shake-and-release']

    if len(df_t) != len(metadata_cols_df):
        # A row-count mismatch would otherwise be padded with NaNs by concat.
        raise ValueError(
            f"df_t has {len(df_t)} rows but metadata_cols_df has {len(metadata_cols_df)}; "
            "they must describe the same samples."
        )

    # Align both frames on a fresh 0..n-1 index using copies, so the caller's
    # frames are not mutated.
    features_df = df_t.reset_index(drop=True)
    meta_df = metadata_cols_df.reset_index(drop=True)
    metadata_PCA_df = pd.concat([meta_df, features_df], axis=1)

    save_lst = [metadata_PCA_df]
    save_name_lst = ["full_dimreduc_df.pkl"]

    is_test_user = metadata_PCA_df['Participant'].isin(held_out_user_pids)

    if user_holdout:
        # Train is the exact complement of test; a boolean mask avoids the
        # merge-on-every-column anti-join, which is slow and miscounts duplicates.
        test_users_df = metadata_PCA_df[is_test_user]
        training_u_df = metadata_PCA_df[~is_test_user]
        save_lst.extend([test_users_df, training_u_df])
        save_name_lst.extend(['test_users_df.pkl', 'training_users_df.pkl'])

    if gesture_holdout:
        # The gesture mask is built from the frame being split, not from the
        # notebook-global data_df.
        is_test_gesture = is_test_user & metadata_PCA_df['Gesture_ID'].isin(held_out_gestures)
        test_fullgestures_df = metadata_PCA_df[is_test_gesture]
        training_g_df = metadata_PCA_df[~is_test_gesture]
        save_lst.extend([test_fullgestures_df, training_g_df])
        save_name_lst.extend(['test_fullgestures_df.pkl', 'training_fullgestures_df.pkl'])

    if save_bool:
        out_dir = os.path.join(save_path, save_file_name)
        os.makedirs(out_dir, exist_ok=True)
        for frame, pkl_name in zip(save_lst, save_name_lst):
            frame.to_pickle(os.path.join(out_dir, pkl_name))

    return save_lst

In [27]:
# User-holdout split of the 20-component PCA features.
# The path is a raw string: "\K" and "\s" are invalid escape sequences in a normal string literal.
_ = manual_train_test_split(full_PCA20, metadata_cols_df, 'Full_PCA20', save_bool=True, save_path=r'D:\Kai_MetaGestureClustering_24\saved_datasets', user_holdout=True, gesture_holdout=False, held_out_user_pids=['P103','P109','P114','P124','P128','P004','P010'])


In [28]:
# User-holdout split of the 3-component PCA features.
# The path is a raw string: "\K" and "\s" are invalid escape sequences in a normal string literal.
_ = manual_train_test_split(emg_PCA3, metadata_cols_df, 'EMG_PCA3', save_bool=True, save_path=r'D:\Kai_MetaGestureClustering_24\saved_datasets', user_holdout=True, gesture_holdout=False, held_out_user_pids=['P103','P109','P114','P124','P128','P004','P010'])


In [29]:
# User-holdout split of the 8-component PCA features.
# The path is a raw string: "\K" and "\s" are invalid escape sequences in a normal string literal.
_ = manual_train_test_split(emg_PCA8, metadata_cols_df, 'EMG_PCA8', save_bool=True, save_path=r'D:\Kai_MetaGestureClustering_24\saved_datasets', user_holdout=True, gesture_holdout=False, held_out_user_pids=['P103','P109','P114','P124','P128','P004','P010'])


In [30]:
# Deliberate hard stop: always raises AssertionError so that "Run All" halts here and
# does not continue into the original manual train/test split section below.
assert(False)

AssertionError: 

## Original Manual Train/test split

In [31]:
# 25% of the number of distinct participants, used to size the held-out test set.
len(data_df['Participant'].unique())*.25

7.75

With 31 participants, 25% is 7.75, so we round down and hold out 7 participants as the test set.
> Let's create multiple test sets, according to:
1. Held out users
2. Held out entire gestures (eg all of pan from one client in the test set)
3. Hold out half of the existing gesture for a client (eg half of all performed pan gestures are in the train and the other half are in the test)
    - Maybe this one isn't a great idea... let's wait on it

In [32]:
# All participant IDs present in the data, in order of first appearance.
data_df['Participant'].unique()

array(['P102', 'P103', 'P104', 'P105', 'P106', 'P107', 'P108', 'P109',
       'P110', 'P111', 'P112', 'P114', 'P115', 'P116', 'P118', 'P119',
       'P121', 'P122', 'P123', 'P124', 'P125', 'P126', 'P127', 'P128',
       'P132', 'P004', 'P005', 'P006', 'P008', 'P010', 'P011'],
      dtype=object)

In [33]:
# All gesture labels present in the data, in order of first appearance.
data_df['Gesture_ID'].unique()

array(['pan', 'duplicate', 'gesture-1', 'gesture-2', 'gesture-3',
       'gesture-4', 'gesture-5', 'normal', 'frequency', 'range-of-motion',
       'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete',
       'close', 'open', 'two-handed-tap', 'point-and-pinch',
       'pinch-and-scroll', 'air-tap', 'palm-pinch', 'double-pinch',
       'single-pinch', 'single-clench', 'shake-and-release',
       'double-clench'], dtype=object)

In [34]:
# Repetition numbers present in the data; note in the output that they are stored as strings, not ints.
data_df['Gesture_Num'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], dtype=object)

Creating testing sets

In [35]:
# Randomly choosing 7 users for this (a fixed, hand-written list, so the split is reproducible).
# Same IDs as the default of manual_train_test_split's held_out_user_pids parameter.
held_out_user_pids = ['P103','P109','P114','P124','P128','P004','P010']

In [36]:
# Combine the metadata and the reduced features column-wise.
# reset_index(drop=True) returns re-indexed copies, so both frames line up on a fresh
# 0..n-1 index without mutating df_t / metadata_cols_df in place. This keeps the cell
# idempotent and the previews shown by earlier cells valid.
metadata_PCA_df = pd.concat(
    [metadata_cols_df.reset_index(drop=True), df_t.reset_index(drop=True)],
    axis=1,
)

Testing set of users held out:

In [37]:
# Test set for the user-holdout split: every row belonging to one of the held-out participants.
test_users_df = metadata_PCA_df[metadata_PCA_df['Participant'].isin(held_out_user_pids)]

print(test_users_df.shape)
test_users_df.head()

(99584, 43)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
11520,P103,gesture-1,1,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,...,0.063511,0.082009,0.101756,-0.19091,-0.12872,-0.133571,-0.141644,-0.068472,-0.03202,0.019002
11521,P103,gesture-1,1,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,...,0.072782,0.087097,0.10718,-0.189693,-0.127141,-0.140841,-0.143582,-0.066495,-0.03333,0.018676
11522,P103,gesture-1,1,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,...,0.071237,0.081909,0.103853,-0.186424,-0.11959,-0.137423,-0.144509,-0.064755,-0.036093,0.016775
11523,P103,gesture-1,1,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,...,0.065577,0.077761,0.10023,-0.19515,-0.128329,-0.133421,-0.147594,-0.064763,-0.031984,0.016183
11524,P103,gesture-1,1,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,...,0.064754,0.082754,0.104914,-0.185338,-0.125946,-0.138729,-0.14844,-0.06813,-0.030904,0.020221


Testing set of gestures held out
- Open question: should these gestures be held out only for the test users (as the code below does), or across the entire dataset?

In [38]:
# Randomly chosen gestures to hold out
held_out_gestures = ['pan', 'gesture-1', 'gesture-4', 'normal', 'delete',
       'close', 'open', 'move', 'two-handed-tap', 'double-pinch',
       'single-pinch', 'single-clench', 'shake-and-release']

# Build both masks from metadata_PCA_df itself. The original took the gesture mask from
# data_df, which only selects the right rows while the two frames share an identical index.
is_held_out_user = metadata_PCA_df['Participant'].isin(held_out_user_pids)
is_held_out_gesture = metadata_PCA_df['Gesture_ID'].isin(held_out_gestures)
test_fullgestures_df = metadata_PCA_df[is_held_out_user & is_held_out_gesture]

print(test_fullgestures_df.shape)
test_fullgestures_df.head()

(43328, 43)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
11520,P103,gesture-1,1,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,...,0.063511,0.082009,0.101756,-0.19091,-0.12872,-0.133571,-0.141644,-0.068472,-0.03202,0.019002
11521,P103,gesture-1,1,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,...,0.072782,0.087097,0.10718,-0.189693,-0.127141,-0.140841,-0.143582,-0.066495,-0.03333,0.018676
11522,P103,gesture-1,1,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,...,0.071237,0.081909,0.103853,-0.186424,-0.11959,-0.137423,-0.144509,-0.064755,-0.036093,0.016775
11523,P103,gesture-1,1,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,...,0.065577,0.077761,0.10023,-0.19515,-0.128329,-0.133421,-0.147594,-0.064763,-0.031984,0.016183
11524,P103,gesture-1,1,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,...,0.064754,0.082754,0.104914,-0.185338,-0.125946,-0.138729,-0.14844,-0.06813,-0.030904,0.020221


Now create the training df by removing the intersection with the testing dataframes

For test_users:

In [39]:
# Training set = every row of metadata_PCA_df that is not in the user test set.
# test_users_df is a row subset of metadata_PCA_df (unique 0..n-1 index), so dropping its
# index labels gives the exact complement. This replaces a left-merge on all columns,
# which is slow on a frame this size and miscounts rows if any row is duplicated.
training_u_df = metadata_PCA_df.drop(index=test_users_df.index)
training_u_df.head()

Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
0,P102,pan,1,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,...,-0.019478,0.063074,-0.026073,0.014037,-0.013245,-0.037501,-0.186015,-0.046802,-0.104549,-0.00246
1,P102,pan,1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,...,0.04141,0.03514,-0.057055,-0.009167,-0.02239,-0.022397,-0.160583,-0.048761,-0.073869,0.043859
2,P102,pan,1,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,...,-0.014321,0.072193,-0.026726,-0.03462,0.018831,-0.011807,-0.160339,-0.042462,-0.109649,0.027638
3,P102,pan,1,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,...,-0.011014,0.060085,-0.09727,-0.05714,-0.000897,-0.007877,-0.165659,-0.05014,-0.108752,0.070623
4,P102,pan,1,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,...,0.035711,0.050971,-0.093861,-0.13152,0.018166,0.056297,-0.157751,-0.042673,-0.145383,0.063896


For test_fullgestures:

In [40]:
# Training set = every row of metadata_PCA_df that is not in the gesture test set.
# test_fullgestures_df is a row subset of metadata_PCA_df (unique 0..n-1 index), so dropping
# its index labels gives the exact complement. This replaces a left-merge on all columns,
# which is slow on a frame this size and miscounts rows if any row is duplicated.
training_g_df = metadata_PCA_df.drop(index=test_fullgestures_df.index)
training_g_df.head()

Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
0,P102,pan,1,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,...,-0.019478,0.063074,-0.026073,0.014037,-0.013245,-0.037501,-0.186015,-0.046802,-0.104549,-0.00246
1,P102,pan,1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,...,0.04141,0.03514,-0.057055,-0.009167,-0.02239,-0.022397,-0.160583,-0.048761,-0.073869,0.043859
2,P102,pan,1,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,...,-0.014321,0.072193,-0.026726,-0.03462,0.018831,-0.011807,-0.160339,-0.042462,-0.109649,0.027638
3,P102,pan,1,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,...,-0.011014,0.060085,-0.09727,-0.05714,-0.000897,-0.007877,-0.165659,-0.05014,-0.108752,0.070623
4,P102,pan,1,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,...,0.035711,0.050971,-0.093861,-0.13152,0.018166,0.056297,-0.157751,-0.042673,-0.145383,0.063896


## Save the dataframes

In [None]:
# Deliberate hard stop: always raises AssertionError so that "Run All" never reaches the
# save cell below by accident; run that cell manually when the files should be written.
assert(False)

In [None]:
# Persist the four split frames as pickles. The file names are relative, so the files
# land in the notebook's current working directory.
frames_to_save = {
    'test_users_df.pkl': test_users_df,
    'test_fullgestures_df.pkl': test_fullgestures_df,
    'training_u_df.pkl': training_u_df,
    'training_g_df.pkl': training_g_df,
}
for pkl_name, frame in frames_to_save.items():
    frame.to_pickle(pkl_name)