In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import copy
import pandas as pd
import time

from sklearn.decomposition import PCA


In [2]:
from subspace_clustering_helper_funcs import *

- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [3]:
# Participant bookkeeping:
#   - P101 is excluded because it doesn't exist
#   - P131 is excluded because it doesn't have enough user-defined gestures
# Each participant has 100 experimenter-defined files and 50 user-defined files
# (10 experimenter-defined gestures and 5 user-defined gestures).

file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

# Impaired cohort (P131 is left out because they are missing gestures)
pIDs_impaired = [
    'P102', 'P103', 'P104', 'P105', 'P106', 'P107', 'P108', 'P109', 'P110', 'P111',
    'P112', 'P114', 'P115', 'P116', 'P118', 'P119', 'P121', 'P122', 'P123', 'P124',
    'P125', 'P126', 'P127', 'P128', 'P132',
]
# Unimpaired cohort (P001 and P003 are left out because they don't have duplicate or open gestures)
pIDs_unimpaired = ['P004', 'P005', 'P006', 'P008', 'P010', 'P011']

# Impaired participants first, then unimpaired
pIDs_both = [*pIDs_impaired, *pIDs_unimpaired]

In [4]:
## Pickle is theoretically faster for Python...
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.

# Location of the combined metadata + IMU + EMG pickle. It can be overridden with the
# GESTURE_DATA_PKL environment variable so the notebook is not tied to a single machine;
# the default is the original path.
DATA_PKL_PATH = os.environ.get(
    'GESTURE_DATA_PKL',
    'C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\metadata_IMU_EMG_allgestures_allusers.pkl',
)

print("Loading")
# perf_counter is monotonic, so the elapsed time cannot be skewed by wall-clock adjustments
start_time = time.perf_counter()
data_df = pd.read_pickle(DATA_PKL_PATH)
end_time = time.perf_counter()
print(f"Completed in {end_time - start_time}s")

Loading
Completed in 1.0266139507293701s


In [5]:
# Sanity-check the load: print the (rows, columns) shape, then preview the first rows.
print(data_df.shape)
data_df.head()

(426752, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,P102,pan,1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,P102,pan,1,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,P102,pan,1,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,P102,pan,1,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [6]:
# Split the frame in two: the identifying columns, and everything else (the sensor channels).
metadata_cols = ['Participant', 'Gesture_ID', 'Gesture_Num']
metadata_cols_df = data_df.loc[:, metadata_cols]
X = data_df.drop(columns=metadata_cols)

In [7]:
# Shape and first rows of the metadata-only frame (one row per sample of data_df).
print(metadata_cols_df.shape)
metadata_cols_df.head()

(426752, 3)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num
0,P102,pan,1
1,P102,pan,1
2,P102,pan,1
3,P102,pan,1
4,P102,pan,1


## TODO: the EMG channels still need to be mean-subtracted

In [8]:
# Shape and first rows of the sensor-only frame (metadata columns removed).
print(X.shape)
X.head()

(426752, 88)


Unnamed: 0,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,IMU2_ay,IMU2_az,IMU2_vx,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,-0.888184,-0.334961,0.124514,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,-0.905297,-0.344967,0.144735,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,-0.90525,-0.343246,0.14063,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,-0.896252,-0.3371,0.185074,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,-0.903902,-0.34337,0.1895,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [9]:
# Number of 64-row blocks in X; a whole number means the rows split evenly into blocks.
len(X) / 64

6668.0

In [10]:
# Preview the trailing 16 columns for the first rows.
X.head().iloc[:, -16:]

Unnamed: 0,EMG1,EMG2,EMG3,EMG4,EMG5,EMG6,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,2e-06,2e-06,1e-06,2e-06,3e-06,4e-06,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,2e-06,2e-06,1e-06,2e-06,3e-06,5e-06,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,2e-06,2e-06,1e-06,2e-06,4e-06,5e-06,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,2e-06,2e-06,2e-06,2e-06,5e-06,6e-06,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,3e-06,2e-06,2e-06,2e-06,5e-06,7e-06,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


The last 16 columns are the EMG channels (EMG1–EMG16).

In [11]:
def mean_subtraction_blockwise(block, num_imu_cols=72):
    """Centre every column of a 2-D block, handling the IMU and EMG parts separately.

    Parameters
    ----------
    block : array-like, 2-D
        Rows are samples. The first `num_imu_cols` columns are treated as the IMU part
        and the remaining columns as the EMG part. Converted with `np.asarray`, so a
        NumPy array or an all-numeric DataFrame is accepted.
    num_imu_cols : int, default 72
        Column index at which the IMU part ends and the EMG part begins. The default
        keeps the split this function previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `block` in which each column has had its own
        mean (taken over the rows of the block) subtracted.
    """
    block = np.asarray(block)
    imu_block = block[:, :num_imu_cols]  # Extract IMU block
    emg_block = block[:, num_imu_cols:]  # Extract EMG block

    # Perform mean subtraction separately for IMU and EMG blocks (column-wise means)
    imu_block_mean_subtracted = imu_block - imu_block.mean(axis=0)
    emg_block_mean_subtracted = emg_block - emg_block.mean(axis=0)

    # Concatenate the mean subtracted IMU and EMG blocks back into the original column order
    return np.concatenate((imu_block_mean_subtracted, emg_block_mean_subtracted), axis=1)

# Helper that centres a block on its own mean
def mean_subtraction(block):
    """Return `block` minus whatever `block.mean()` evaluates to."""
    block_mean = block.mean()
    return block - block_mean

In [12]:
print("Started")
block_size = 64
# Group the sensor channels into consecutive blocks of `block_size` rows based on the index.
# X (not data_df) is grouped on purpose: data_df still carries the string metadata columns,
# and a mean over those only "worked" because older pandas silently dropped non-numeric
# columns; recent pandas versions raise instead. X shares data_df's index, so the blocks
# are the same rows.
grouped_df = X.groupby(X.index // block_size)
print("Completed")

Started
Completed


This one takes a really long time to run...

In [13]:
print("Started")
# Subtract from every row the per-column mean of the block that row belongs to.
# - Only the numeric sensor columns (those of X) are selected from the grouping, so the
#   string metadata columns can never reach the mean computation.
# - transform('mean') uses pandas' built-in grouped mean and returns a frame aligned with X,
#   which is far faster than calling a Python function once per block. The result matches
#   applying `block - block.mean()` to each block, up to floating-point rounding.
mean_subtracted_df = X - grouped_df[list(X.columns)].transform('mean')
print("Completed")

Started
Completed


In [14]:
print("Started")
# reset_index(drop=True) discards the existing row labels and installs a fresh 0..N-1 index.
# (Nothing is concatenated here; mean_subtracted_df is already a single DataFrame.)
mean_subtracted_df = mean_subtracted_df.reset_index(drop=True)
print("Completed")

Started
Completed


In [15]:
# Shape and first rows after block-wise mean subtraction.
print(mean_subtracted_df.shape)
mean_subtracted_df.head()

(426752, 88)


Unnamed: 0,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,IMU2_ay,IMU2_az,IMU2_vx,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,-0.154795,-0.207561,-0.276788,0.051098,0.016745,0.024443,0.296552,-0.248091,-0.092122,-0.062305,...,-1.519137e-06,-1.470037e-07,-4.804797e-06,-6e-06,-3.194128e-06,-3.955994e-06,-2.761547e-06,-1e-05,-7.029408e-07,-5.178945e-07
1,-0.160414,-0.230805,-0.273866,0.068143,0.018924,-0.006994,0.282865,-0.265204,-0.102128,-0.042085,...,-6.918069e-07,4.930801e-07,-4.487795e-06,-1.1e-05,-3.454802e-08,-7.666755e-07,-2.022076e-06,-1e-06,-6.147716e-07,-2.144813e-07
2,-0.143053,-0.231324,-0.266054,0.154514,0.00389,-0.136441,0.279005,-0.265157,-0.100407,-0.04619,...,-3.763653e-07,6.437183e-07,-3.674076e-06,-1.9e-05,-2.892269e-06,-2.629805e-06,-1.651926e-06,1e-06,6.229549e-07,2.599464e-08
3,-0.14375,-0.217908,-0.266255,0.117384,0.024499,-0.064445,0.288182,-0.256159,-0.094261,-0.001746,...,-3.238861e-07,4.452346e-07,-2.333567e-06,-2e-05,-3.138945e-06,-4.262033e-06,-7.956578e-07,-3e-06,8.054968e-07,1.099523e-06
4,-0.123971,-0.258893,-0.247918,0.352494,0.030614,-0.262239,0.281369,-0.263809,-0.100531,0.00268,...,-2.16007e-08,2.283233e-07,-9.088692e-08,-2.2e-05,-7.021894e-07,1.475079e-05,-7.793838e-07,-1e-06,-1.959684e-07,9.493486e-07


In [16]:
# Shorter alias for the mean-subtracted data (same object, not a copy).
X_ms = mean_subtracted_df

## Applying chosen dimensionality reduction algorithm:
> For now, just PCA

In [17]:
def apply_model(model_str, input_df, num_dims, hp):
    """Fit a dimensionality-reduction model on `input_df` and project the data with it.

    Parameters
    ----------
    model_str : str
        Case-insensitive model name: 'PCA', 'T-SNE' / 'TSNE', 'IncrementalPCA' / 'IPCA',
        'KernelPCA' / 'KPCA', 'MDS' or 'Isomap'.
    input_df : pandas.DataFrame
        Data to reduce. The metadata columns 'Participant', 'Gesture_ID' and 'Gesture_Num'
        are dropped first if they are present; every remaining column is used for fitting.
    num_dims : int
        Passed to the model as `n_components`.
    hp
        Model hyperparameter. Only the t-SNE branch reads it (as `perplexity`).

    Returns
    -------
    (pandas.DataFrame, model)
        The projected data wrapped in a DataFrame, and the fitted model object.

    Raises
    ------
    ValueError
        If nothing is left to fit after dropping the metadata columns, or if
        `model_str` is not one of the supported names.

    Notes
    -----
    Only `PCA` is imported explicitly by this notebook; the other estimator classes
    (TSNE, IncrementalPCA, KernelPCA, MDS, Isomap) must already be in scope when their
    branch is selected.
    """
    # Drop the metadata columns (eg cols that are not the actual timeseries data),
    # but only the ones that are actually present in this frame
    metadata_cols = ['Participant', 'Gesture_ID', 'Gesture_Num']
    columns_to_drop = [col for col in metadata_cols if col in input_df.columns]
    training_df = input_df.drop(columns=columns_to_drop)

    if training_df.empty:
        raise ValueError("training_df is empty!")

    model_name = model_str.upper()
    if model_name == 'PCA':
        dim_reduc_model = PCA(n_components=num_dims)
        dim_reduc_model.fit(training_df)
        reduced_df = pd.DataFrame(dim_reduc_model.transform(training_df))
    elif model_name in ('T-SNE', 'TSNE'):
        dim_reduc_model = TSNE(n_components=num_dims, perplexity=hp, random_state=42)
        # Fit on training_df like every other branch (this used to reference an undefined `df`)
        reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    elif model_name in ('INCREMENTALPCA', 'IPCA'):
        dim_reduc_model = IncrementalPCA(n_components=num_dims)
        reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    elif model_name in ('KERNELPCA', 'KPCA'):
        dim_reduc_model = KernelPCA(n_components=num_dims)
        reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    #elif model_name == 'UMAP':
    #    raise ValueError("Need to install the umap library first...")
    #    dim_reduc_model = UMAP(n_components=num_dims)
    #    reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    elif model_name == 'MDS':
        dim_reduc_model = MDS(n_components=num_dims, random_state=42)
        reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    elif model_name == 'ISOMAP':
        dim_reduc_model = Isomap(n_components=num_dims)
        reduced_df = pd.DataFrame(dim_reduc_model.fit_transform(training_df))
    else:
        raise ValueError(f"{model_str} not implemented. Choose an implemented model.")

    return reduced_df, dim_reduc_model

In [18]:
def apply_dim_reduc(data_df, model_str='PCA', use_full_dataset=False, num_dims=40, hp=None, modality=['EMG and IMU'], participant_inclusion=['All'], apply='ALL'):
    """Run `apply_model` on the whole dataset or on a selected slice of it.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Data to reduce. A 'Participant' column is needed for the 'Impaired' / 'Unimpaired'
        selections and for apply='BY USER'; a 'Gesture_ID' column is needed for
        apply='BY GESTURE'.
    model_str, num_dims, hp
        Forwarded unchanged to `apply_model`.
    use_full_dataset : bool, default False
        If True, fit once on `data_df` as given; `modality`, `participant_inclusion`
        and `apply` are then ignored.
    modality : list of str
        Only 'EMG and IMU' is supported (no column slicing is implemented yet).
    participant_inclusion : list of str
        Each entry is 'All', 'Impaired' (participants listed in the module-level
        `pIDs_impaired`) or 'Unimpaired' (`pIDs_unimpaired`). The rows of the chosen
        cohort are the ones the model is fit on.
    apply : str
        Case-insensitive: 'ALL' (one fit on the cohort), 'BY USER' (one fit per
        participant) or 'BY GESTURE' (one fit per experimenter-defined gesture).

    Returns
    -------
    (pandas.DataFrame, model)
        The result of the LAST `apply_model` call that was made. With several
        modalities / cohorts / users / gestures, earlier fits are overwritten.

    Raises
    ------
    ValueError
        For an unsupported modality, participant type or `apply` value, when the
        selection produced no fit at all, or when `apply_model` rejects an empty slice.
    """
    # The list defaults are only read, never mutated, so sharing them between calls is safe.
    print("Start")

    gestures = ['pan', 'duplicate', 'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete', 'close', 'open']

    df_t, dim_reduc_model = None, None

    if use_full_dataset:
        df_t, dim_reduc_model = apply_model(model_str, data_df, num_dims, hp)
    else:
        apply_mode = apply.upper()
        if apply_mode not in ('ALL', 'BY USER', 'BY GESTURE'):
            # Fail with a clear message instead of an UnboundLocalError at the return below
            raise ValueError(f"apply={apply!r} not supported, choose one of 'ALL', 'BY USER', 'BY GESTURE'.")

        for f_type in modality:
            # My code assumes you are doing EMG and IMU together...
            ## Add slicing functionality later (IMU-only / EMG-only column subsets)
            if f_type != 'EMG and IMU':
                raise ValueError(f"f_type {f_type} not found in [EMG, IMU, EMG and IMU]")
            sel_df = data_df

            for p_type in participant_inclusion:
                # Restrict the rows to the requested cohort. 'All' needs no filtering
                # (and therefore no 'Participant' column).
                if p_type == "All":
                    cohort_df = sel_df
                elif p_type == "Impaired":
                    cohort_df = sel_df[sel_df['Participant'].isin(pIDs_impaired)]
                elif p_type == "Unimpaired":
                    cohort_df = sel_df[sel_df['Participant'].isin(pIDs_unimpaired)]
                else:
                    raise ValueError(f"Participant type {p_type} not supported, check supported versions.")

                if apply_mode == 'ALL':
                    df_t, dim_reduc_model = apply_model(model_str, cohort_df, num_dims, hp)
                elif apply_mode == 'BY USER':
                    # One fit per participant present in the cohort
                    for pid in cohort_df['Participant'].unique():
                        user_df = cohort_df[cohort_df['Participant'] == pid]
                        df_t, dim_reduc_model = apply_model(model_str, user_df, num_dims, hp)
                else:  # 'BY GESTURE'
                    for gesture in gestures:
                        # The mask is built from the same frame it indexes
                        gesture_df = cohort_df[cohort_df['Gesture_ID'] == gesture]
                        df_t, dim_reduc_model = apply_model(model_str, gesture_df, num_dims, hp)

    if dim_reduc_model is None:
        raise ValueError("No model was fit: check that modality and participant_inclusion are non-empty and that the selection contains data.")

    print("Success")
    return df_t, dim_reduc_model


In [19]:
# Fit a 40-component PCA on every mean-subtracted row in one go.
# use_full_dataset=True bypasses the modality / participant / apply selection logic.
df_t, dim_reduc_model = apply_dim_reduc(
    X_ms,
    model_str='PCA',
    use_full_dataset=True,
    num_dims=40,
)

Start
Success


In [20]:
# (Disabled) direct PCA fit on X, i.e. on the data before mean subtraction; kept for reference.
#pca = PCA(n_components=40)
#pca.fit(X)
#print(f"Total explained variance: {np.sum(pca.explained_variance_ratio_)}")

# Shape and first rows of the projection returned by apply_dim_reduc.
print(df_t.shape)
df_t.head()

(426752, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,-0.02291,0.066484,0.108729,...,-0.019378,0.063114,-0.025896,0.014243,-0.013221,-0.037526,-0.186648,-0.047065,-0.105093,-0.002803
1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,-0.027368,0.06037,0.074712,...,0.04151,0.035183,-0.056877,-0.008949,-0.022367,-0.022429,-0.161205,-0.049021,-0.074391,0.04352
2,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,-0.04615,0.036385,0.052746,...,-0.014227,0.072234,-0.026563,-0.034422,0.018859,-0.011837,-0.160937,-0.042706,-0.110184,0.027303
3,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,-0.064307,0.074589,0.053055,...,-0.010918,0.060124,-0.097102,-0.056927,-0.000876,-0.007895,-0.166284,-0.050364,-0.109238,0.070306
4,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,-0.124893,0.114817,0.038628,...,0.035804,0.051011,-0.093713,-0.131315,0.018181,0.056281,-0.158343,-0.042877,-0.145842,0.063618


In [21]:
# Fraction of the total variance captured by each retained component.
dim_reduc_model.explained_variance_ratio_

array([0.23949413, 0.16666576, 0.13205721, 0.07407956, 0.06548491,
       0.05501368, 0.03358861, 0.02753107, 0.02516064, 0.02208645,
       0.01972678, 0.01636877, 0.01332203, 0.01270167, 0.01106796,
       0.00761657, 0.00592563, 0.00551143, 0.00498404, 0.0046687 ,
       0.00408729, 0.00392518, 0.00325429, 0.00310754, 0.00295851,
       0.00277442, 0.00263187, 0.00252613, 0.00233023, 0.00201009,
       0.00196373, 0.00183227, 0.00177019, 0.00171301, 0.00164146,
       0.00156655, 0.00137422, 0.00130313, 0.00125862, 0.00114667])

The first two PCs account for only 23.9% and 16.7% of the variance, respectively (about 40.6% together), so plotting in 2D probably isn't worthwhile...

In [22]:
# Sum of the per-component ratios = share of the variance kept by all retained components.
print(f"Total explained variance: {np.sum(dim_reduc_model.explained_variance_ratio_)}")

Total explained variance: 0.9882309721328306


In [None]:
# This assertion always fails (1 never equals 0), so executing the cell raises AssertionError.
# NOTE(review): presumably a deliberate guard that halts "Run All" before the save cell
# that follows; confirm, and run the later cells manually if that is the intent.
assert(1==0)

In [44]:
# Write the dimensionality-reduced frame and its matching metadata frame to disk
# (paths are relative to the current working directory).

pd.to_pickle(df_t, 'PCA_ms_IMUEMG_df.pkl')
pd.to_pickle(metadata_cols_df, 'metadata_cols_df.pkl')

## Train/test split

In [51]:
# 25% of the number of distinct participants, used to size the held-out set.
0.25 * len(data_df['Participant'].unique())

7.75

Thus, roughly 7 participants (25% of the 31 participants, rounded down) should be held out to form the test set.
> Let's create multiple test sets, according to:
1. Held out users
2. Held out entire gestures (eg all of pan from one client in the test set)
3. Hold out half of the existing gesture for a client (eg half of all performed pan gestures are in the train and the other half are in the test)
    - Maybe this one isn't a great idea... let's wait on it

In [46]:
# Distinct participant IDs, in order of first appearance.
data_df['Participant'].unique()

array(['P102', 'P103', 'P104', 'P105', 'P106', 'P107', 'P108', 'P109',
       'P110', 'P111', 'P112', 'P114', 'P115', 'P116', 'P118', 'P119',
       'P121', 'P122', 'P123', 'P124', 'P125', 'P126', 'P127', 'P128',
       'P132', 'P004', 'P005', 'P006', 'P008', 'P010', 'P011'],
      dtype=object)

In [47]:
# Distinct gesture labels, in order of first appearance.
data_df['Gesture_ID'].unique()

array(['pan', 'duplicate', 'gesture-1', 'gesture-2', 'gesture-3',
       'gesture-4', 'gesture-5', 'normal', 'frequency', 'range-of-motion',
       'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete',
       'close', 'open', 'two-handed-tap', 'point-and-pinch',
       'pinch-and-scroll', 'air-tap', 'palm-pinch', 'double-pinch',
       'single-pinch', 'single-clench', 'shake-and-release',
       'double-clench'], dtype=object)

In [48]:
# Distinct repetition numbers; note from the output that they are stored as strings.
data_df['Gesture_Num'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], dtype=object)

Creating testing sets

In [49]:
# The 7 held-out users (picked at random once, then fixed here so the split is repeatable):
held_out_user_pids = [
    'P103', 'P109', 'P114', 'P124', 'P128',  # from the impaired ID range
    'P004', 'P010',                          # from the unimpaired ID range
]

In [52]:
# Combine these two dataframes again (metadata columns + PCA projection), row for row.
# The two frames must describe the same rows in the same order; a length mismatch would
# otherwise be padded with NaNs by concat instead of failing.
assert len(metadata_cols_df) == len(df_t), "metadata and PCA frames must have the same number of rows"
# Give both frames a fresh 0..N-1 index as copies, so rows are paired purely by position
# and neither df_t nor metadata_cols_df is modified in place.
metadata_PCA_df = pd.concat(
    [metadata_cols_df.reset_index(drop=True), df_t.reset_index(drop=True)],
    axis=1,
)

Testing set of users held out:

In [53]:
# Every row belonging to one of the held-out participants forms the held-out-user test set.
test_users_df = metadata_PCA_df.loc[metadata_PCA_df['Participant'].isin(held_out_user_pids)]

print(test_users_df.shape)
test_users_df.head()

(99584, 43)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
11520,P103,gesture-1,1,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,...,0.063422,0.081947,0.101682,-0.190886,-0.128849,-0.133675,-0.141095,-0.068262,-0.032521,0.018558
11521,P103,gesture-1,1,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,...,0.072693,0.087035,0.107107,-0.189667,-0.127273,-0.14095,-0.143033,-0.066269,-0.033821,0.018198
11522,P103,gesture-1,1,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,...,0.07115,0.081851,0.103781,-0.1864,-0.119714,-0.137523,-0.143988,-0.064562,-0.03661,0.016388
11523,P103,gesture-1,1,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,...,0.065488,0.077696,0.100158,-0.195122,-0.128462,-0.13353,-0.147047,-0.064546,-0.032468,0.015712
11524,P103,gesture-1,1,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,...,0.064663,0.082686,0.104846,-0.185309,-0.126085,-0.138847,-0.147857,-0.067871,-0.031347,0.019686


Testing set of gestures held out
- Should I hold out these gestures just from test users, or should I hold out all these gestures across the entire dataset...

In [54]:
# Randomly chosen gestures to hold out
held_out_gestures = ['pan', 'gesture-1', 'gesture-4', 'normal', 'delete',
       'close', 'open', 'move', 'two-handed-tap', 'double-pinch',
       'single-pinch', 'single-clench', 'shake-and-release']

# Held-out gestures performed by held-out users. Both conditions are evaluated on
# metadata_PCA_df itself, so the mask never depends on another frame's index lining up.
test_fullgestures_df = metadata_PCA_df[
    metadata_PCA_df['Participant'].isin(held_out_user_pids)
    & metadata_PCA_df['Gesture_ID'].isin(held_out_gestures)
]

print(test_fullgestures_df.shape)
test_fullgestures_df.head()

(43328, 43)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
11520,P103,gesture-1,1,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,...,0.063422,0.081947,0.101682,-0.190886,-0.128849,-0.133675,-0.141095,-0.068262,-0.032521,0.018558
11521,P103,gesture-1,1,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,...,0.072693,0.087035,0.107107,-0.189667,-0.127273,-0.14095,-0.143033,-0.066269,-0.033821,0.018198
11522,P103,gesture-1,1,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,...,0.07115,0.081851,0.103781,-0.1864,-0.119714,-0.137523,-0.143988,-0.064562,-0.03661,0.016388
11523,P103,gesture-1,1,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,...,0.065488,0.077696,0.100158,-0.195122,-0.128462,-0.13353,-0.147047,-0.064546,-0.032468,0.015712
11524,P103,gesture-1,1,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,...,0.064663,0.082686,0.104846,-0.185309,-0.126085,-0.138847,-0.147857,-0.067871,-0.031347,0.019686


Now create the training df by removing the intersection with the testing dataframes

For test_users:

In [55]:
# Training set = every row of metadata_PCA_df that is not in the held-out-user test set.
# test_users_df is a row subset of metadata_PCA_df, so its index labels identify exactly the
# rows to remove. Dropping by index avoids merging on all 43 columns (40 of them floats),
# which is slow and decides membership by value equality rather than by row identity.
training_u_df = metadata_PCA_df.drop(index=test_users_df.index)
assert len(training_u_df) + len(test_users_df) == len(metadata_PCA_df), "train/test rows must partition the data"
training_u_df.head()

Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
0,P102,pan,1,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,...,-0.019453,0.062983,-0.025869,0.014303,-0.013387,-0.037645,-0.18627,-0.046251,-0.10463,-0.002939
1,P102,pan,1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,...,0.041438,0.035053,-0.056843,-0.008895,-0.022542,-0.022563,-0.160826,-0.048161,-0.073771,0.043268
2,P102,pan,1,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,...,-0.014298,0.072109,-0.026536,-0.034365,0.018695,-0.01194,-0.16058,-0.041831,-0.109653,0.027043
3,P102,pan,1,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,...,-0.010992,0.05999,-0.097073,-0.05687,-0.001038,-0.008015,-0.165858,-0.049424,-0.108671,0.069886
4,P102,pan,1,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,...,0.035735,0.05088,-0.093678,-0.131263,0.018035,0.056185,-0.157963,-0.041911,-0.145308,0.063311


For test_fullgestures:

In [56]:
# Training set = every row of metadata_PCA_df that is not in the held-out-gesture test set.
# test_fullgestures_df is a row subset of metadata_PCA_df, so its index labels identify
# exactly the rows to remove. Dropping by index avoids merging on all 43 columns (40 of them
# floats), which is slow and decides membership by value equality rather than row identity.
training_g_df = metadata_PCA_df.drop(index=test_fullgestures_df.index)
assert len(training_g_df) + len(test_fullgestures_df) == len(metadata_PCA_df), "train/test rows must partition the data"
training_g_df.head()

Unnamed: 0,Participant,Gesture_ID,Gesture_Num,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
0,P102,pan,1,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,...,-0.019453,0.062983,-0.025869,0.014303,-0.013387,-0.037645,-0.18627,-0.046251,-0.10463,-0.002939
1,P102,pan,1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,...,0.041438,0.035053,-0.056843,-0.008895,-0.022542,-0.022563,-0.160826,-0.048161,-0.073771,0.043268
2,P102,pan,1,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,...,-0.014298,0.072109,-0.026536,-0.034365,0.018695,-0.01194,-0.16058,-0.041831,-0.109653,0.027043
3,P102,pan,1,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,...,-0.010992,0.05999,-0.097073,-0.05687,-0.001038,-0.008015,-0.165858,-0.049424,-0.108671,0.069886
4,P102,pan,1,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,...,0.035735,0.05088,-0.093678,-0.131263,0.018035,0.056185,-0.157963,-0.041911,-0.145308,0.063311


In [None]:
# This assertion always fails (1 never equals 0), so executing the cell raises AssertionError.
# NOTE(review): presumably a deliberate guard that halts "Run All" before the save cell
# that follows; confirm, and run the later cells manually if that is the intent.
assert(1==0)

In [57]:
# Write both test sets and both training sets to disk
# (paths are relative to the current working directory).

pd.to_pickle(test_users_df, 'test_users_df.pkl')
pd.to_pickle(test_fullgestures_df, 'test_fullgestures_df.pkl')
pd.to_pickle(training_u_df, 'training_u_df.pkl')
pd.to_pickle(training_g_df, 'training_g_df.pkl')