In [1]:
import numpy as np
import pandas as pd
import time

from sklearn.decomposition import PCA

from utils.subspace_clustering_helper_funcs import *
from utils.preprocessing import *
from utils.traintestsplits import *

- Reference (scikit-learn `sklearn.manifold` API): https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [4]:
## Pickle is theoretically faster for Python...

# Location of the combined metadata + IMU + EMG pickle.
# Alternate (D: drive) location of the same-named file:
#data_path = 'D:\\Kai_MetaGestureClustering_24\\saved_datasets\\filtered_datasets\\metadata_IMU_EMG_allgestures_allusers.pkl'
data_path = 'C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\saved_datasets\\filtered_datasets\\metadata_IMU_EMG_allgestures_allusers.pkl'

print("Loading")
start_time = time.time()
data_df = pd.read_pickle(data_path)
end_time = time.time()

# Wall-clock duration of the read, reported in seconds.
elapsed_s = end_time - start_time
print(f"Completed in {elapsed_s}s")

Loading
Completed in 0.5634138584136963s


In [5]:
# Sanity check on the loaded frame: print its (rows, columns) shape, then leave
# .head() as the last expression so the first rows render as the cell output.
print(data_df.shape)
data_df.head()

(204800, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,P102,pan,1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,P102,pan,1,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,P102,pan,1,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,P102,pan,1,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [6]:
# Identifier columns that describe each sample rather than measure it.
metadata_cols = ['Participant', 'Gesture_ID', 'Gesture_Num']

# X holds every column except the identifiers.
X = data_df.drop(columns=metadata_cols)

# The identifiers are kept in their own frame so they can be re-attached later.
metadata_cols_df = data_df[metadata_cols]

In [7]:
# Check the metadata frame's shape and preview its first rows.
print(metadata_cols_df.shape)
metadata_cols_df.head()

(204800, 3)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num
0,P102,pan,1
1,P102,pan,1
2,P102,pan,1
3,P102,pan,1
4,P102,pan,1


## Mean subtract each column, standardize each gesture

In [8]:
# Positional split of X by modality: the first 72 columns are the IMU channels,
# everything after that is EMG.
imu_split_df, emg_split_df = X.iloc[:, :72], X.iloc[:, 72:]

# Preprocess each modality on its own (per the section heading: mean-subtract /
# standardize -- the exact behavior lives in utils.preprocessing).
ppd_emg_df = preprocess_df_by_gesture(emg_split_df, '$B', biosignal_switch_ix_lst=[])
ppd_imu_df = preprocess_df_by_gesture(imu_split_df, '$B', biosignal_switch_ix_lst=[])

# Combined frame: metadata, then IMU, then EMG.
X_ppd = pd.concat([metadata_cols_df, ppd_imu_df, ppd_emg_df], axis=1)

# Rebind the per-modality names to versions that carry the metadata columns in
# front. From here on ppd_imu_df / ppd_emg_df include the three identifiers.
ppd_imu_df, ppd_emg_df = [
    pd.concat([metadata_cols_df, modality_df], axis=1)
    for modality_df in (ppd_imu_df, ppd_emg_df)
]

print(X_ppd.shape)
X_ppd.head()

(204800, 91)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,-0.551109,-0.738972,-0.985439,0.181924,0.059616,0.087024,1.055804,...,-0.276292,-0.026736,-0.87387,-1.036152,-0.58093,-0.719494,-0.502255,-1.750091,-0.127847,-0.094192
1,P102,pan,1,-0.571115,-0.821726,-0.975036,0.242607,0.067375,-0.0249,1.007074,...,-0.125822,0.089679,-0.816215,-2.082635,-0.006283,-0.139439,-0.367764,-0.208084,-0.111811,-0.039009
2,P102,pan,1,-0.509305,-0.823575,-0.947221,0.550111,0.013848,-0.485765,0.993332,...,-0.068451,0.117076,-0.668221,-3.403064,-0.52603,-0.478294,-0.300443,0.203266,0.1133,0.004728
3,P102,pan,1,-0.511788,-0.77581,-0.947939,0.417919,0.087222,-0.229441,1.026003,...,-0.058907,0.080977,-0.424416,-3.709413,-0.570894,-0.775155,-0.14471,-0.619539,0.146499,0.199975
4,P102,pan,1,-0.441369,-0.921726,-0.882652,1.25497,0.108993,-0.933639,1.001748,...,-0.003929,0.041526,-0.01653,-4.07515,-0.12771,2.682791,-0.14175,-0.208404,-0.035642,0.172662


In [9]:
# Shape and first rows of the preprocessed IMU frame (metadata columns included).
print(ppd_imu_df.shape)
ppd_imu_df.head()

(204800, 75)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,IMU13_az,IMU13_vx,IMU13_vy,IMU13_vz,IMU15_ax,IMU15_ay,IMU15_az,IMU15_vx,IMU15_vy,IMU15_vz
0,P102,pan,1,-0.551109,-0.738972,-0.985439,0.181924,0.059616,0.087024,1.055804,...,-0.047646,-0.04523,0.104149,-0.046899,-0.027093,0.181512,0.003823,0.002525,0.05564,-0.009292
1,P102,pan,1,-0.571115,-0.821726,-0.975036,0.242607,0.067375,-0.0249,1.007074,...,-0.008147,-0.074827,0.006427,-0.049691,0.007038,0.18416,-0.014212,0.009704,0.034081,-0.02625
2,P102,pan,1,-0.509305,-0.823575,-0.947221,0.550111,0.013848,-0.485765,0.993332,...,-0.009861,-0.125477,0.203083,-0.09508,-0.077138,0.186451,0.034482,-0.061853,0.137769,-0.009996
3,P102,pan,1,-0.511788,-0.77581,-0.947939,0.417919,0.087222,-0.229441,1.026003,...,0.008394,-0.174411,-0.027033,-0.137434,-0.025918,0.193092,0.016815,-0.050325,-0.040597,-0.04018
4,P102,pan,1,-0.441369,-0.921726,-0.882652,1.25497,0.108993,-0.933639,1.001748,...,0.003086,-0.25905,0.11823,-0.111992,-0.081781,0.165432,0.029207,-0.19725,-0.006715,-0.030144


In [10]:
# Shape and first rows of the preprocessed EMG frame (metadata columns included).
print(ppd_emg_df.shape)
ppd_emg_df.head()

(204800, 19)


Unnamed: 0,Participant,Gesture_ID,Gesture_Num,EMG1,EMG2,EMG3,EMG4,EMG5,EMG6,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,-0.362743,-0.801651,-0.383077,-0.195299,-0.203047,-0.464472,-0.276292,-0.026736,-0.87387,-1.036152,-0.58093,-0.719494,-0.502255,-1.750091,-0.127847,-0.094192
1,P102,pan,1,-0.351553,-0.775334,-0.382545,-0.154773,-0.131977,-0.295204,-0.125822,0.089679,-0.816215,-2.082635,-0.006283,-0.139439,-0.367764,-0.208084,-0.111811,-0.039009
2,P102,pan,1,-0.380825,-0.762588,-0.398388,-0.085411,0.017528,-0.205675,-0.068451,0.117076,-0.668221,-3.403064,-0.52603,-0.478294,-0.300443,0.203266,0.1133,0.004728
3,P102,pan,1,-0.366795,-0.765464,-0.374423,-0.073225,0.183172,0.009277,-0.058907,0.080977,-0.424416,-3.709413,-0.570894,-0.775155,-0.14471,-0.619539,0.146499,0.199975
4,P102,pan,1,-0.245578,-0.761283,-0.303976,-0.081947,0.224996,0.103319,-0.003929,0.041526,-0.01653,-4.07515,-0.12771,2.682791,-0.14175,-0.208404,-0.035642,0.172662


## Applying chosen dimensionality reduction algorithm:
> For now, just PCA

In [11]:
# Earlier call form, kept for reference only (it uses the names df_t and X_ms,
# neither of which is assigned by any live code in the cells shown here):
#df_t, dim_reduc_model = apply_dim_reduc(X_ms, model_str='PCA', num_dims=40, hp=None, modality=['EMG and IMU'], participant_inclusion=['All'], apply='ALL')
# Reduce the combined IMU+EMG frame to 40 PCA dimensions. The call yields the
# transformed frame and the fitted model.
# NOTE(review): X_ppd still carries the three metadata columns; presumably
# apply_dim_reduc drops them before fitting -- confirm in utils.
full_PCA40, dim_reduc_model = apply_dim_reduc(X_ppd, model_str='PCA', num_dims=40, use_full_dataset=True)

Start
Success


In [12]:
# Direct scikit-learn equivalent, left disabled in favor of apply_dim_reduc:
#pca = PCA(n_components=40)
#pca.fit(X)
#print(f"Total explained variance: {np.sum(pca.explained_variance_ratio_)}")

# Shape and first rows of the 40-dimensional projection.
print(full_PCA40.shape)
full_PCA40.head()

(204800, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.03073,-0.052976,0.176556,-0.048078,-0.39079,-2.119829,0.875118,-0.18769,0.685004,0.870079,...,-0.253432,-0.247214,0.287789,-0.155467,-0.084334,-0.029493,-0.005024,0.124663,-0.126503,-0.141775
1,0.077533,-0.108477,0.12897,-0.032167,-0.182756,-1.604412,0.7015,-0.068444,0.667338,1.098287,...,-0.423593,0.095906,0.180436,0.041204,0.113957,-0.417156,-0.210102,0.277506,0.032508,-0.058298
2,0.268279,-0.369027,0.26875,0.105239,-0.190292,-2.093812,1.012038,-0.080958,0.510502,0.75894,...,-0.312033,0.143082,-0.078348,0.199307,-0.051143,-0.478996,-0.082199,0.398729,0.039489,-0.11479
3,-0.106155,-0.291958,0.270681,0.035371,-0.204142,-2.314179,0.801514,-0.155267,0.519967,0.523682,...,-0.585471,-0.099189,0.079148,0.047709,-0.178641,-0.10566,-0.040289,0.387405,-0.054154,-0.153793
4,0.673338,-0.345606,0.24674,0.238184,0.948365,-0.041183,0.279347,0.034288,1.173712,2.11572,...,-0.261451,0.019803,-0.363926,0.194755,-0.400416,-0.350606,0.101981,0.374861,0.202281,-0.096663


In [13]:
# Fraction of total variance explained by each of the 40 components of the
# model fitted on X_ppd.
dim_reduc_model.explained_variance_ratio_

array([0.19556914, 0.09905124, 0.07405321, 0.06060998, 0.05812266,
       0.04590841, 0.04439215, 0.03269277, 0.03184176, 0.02794496,
       0.02182327, 0.02065164, 0.01993278, 0.01934186, 0.0174894 ,
       0.01414135, 0.01335692, 0.01196144, 0.01162076, 0.01099271,
       0.01017437, 0.00974402, 0.00860753, 0.0082805 , 0.0065139 ,
       0.00624817, 0.00603539, 0.00595793, 0.00566937, 0.0056236 ,
       0.0052173 , 0.00501134, 0.00485189, 0.00482584, 0.00451897,
       0.00432917, 0.00423094, 0.00403372, 0.00377313, 0.00351227])

The first two PCs account for only 19.6% and 9.9% of the variance, respectively, so plotting in 2D probably isn't worthwhile.

In [14]:
# Sum the per-component ratios to get the share of variance the 40 components
# retain together.
print(f"Total explained variance: {np.sum(dim_reduc_model.explained_variance_ratio_)}")

Total explained variance: 0.9486577659555508


In [15]:
# Same reduction as the 40-dimensional run, but keeping 20 PCA dimensions; the
# fitted model is stored under its own name so dim_reduc_model is not overwritten.
full_PCA20, dim_reduc_model20 = apply_dim_reduc(X_ppd, model_str='PCA', num_dims=20, use_full_dataset=True)

Start
Success


In [16]:
# Preview the first rows of the 20-dimensional projection.
full_PCA20.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.03073,-0.052977,0.176557,-0.048082,-0.390792,-2.119846,0.875128,-0.187688,0.684884,0.86996,-0.682863,0.06937,-0.520705,0.373229,-0.047855,0.933528,0.988589,0.342934,-0.312565,-0.21596
1,0.077533,-0.108477,0.12897,-0.032173,-0.182759,-1.604429,0.701507,-0.068515,0.667234,1.098191,-0.922688,0.264686,-0.635698,0.670112,0.127756,0.804255,1.453505,0.762946,-0.567068,-0.723159
2,0.268279,-0.369027,0.26875,0.105231,-0.190294,-2.093835,1.01204,-0.081049,0.510353,0.758916,-1.272393,-0.09473,-0.447659,0.881619,0.079153,0.493427,1.85739,0.955008,-0.581848,-1.587918
3,-0.106155,-0.291958,0.270681,0.035363,-0.204144,-2.314207,0.801518,-0.155334,0.519777,0.523658,-1.211119,-0.15616,-0.47708,1.03882,0.120556,0.482149,1.946075,1.183615,-0.705848,-2.004887
4,0.673338,-0.345606,0.24674,0.238174,0.948365,-0.041217,0.279349,0.03423,1.173458,2.115754,-2.096405,-1.04832,-0.90814,0.846074,0.036501,0.959733,2.119603,1.551716,-0.910794,-2.500869


In [17]:
# Guard flag: flip to True only when the pickles below need to be (re)written.
not_saved_yet = False

if not_saved_yet:
    # ADD FULL PATHS TO THESE SO THEY DONT JUST SAVE IN THE REPO!
    # (bare filenames are resolved against the current working directory)
    # Note: ppd_emg_df and X_ppd both include the three metadata columns.
    ppd_emg_df.to_pickle('$BStand_EMG_df.pkl')
    X_ppd.to_pickle('$BStand_IMUEMG_df.pkl')
    # The 40-component projection lives in full_PCA40. The name used here
    # before, df_t, is only assigned in a commented-out call, so enabling this
    # branch raised NameError.
    full_PCA40.to_pickle('$BStand_PCA40_IMUEMG_df.pkl')
    full_PCA20.to_pickle('$BStand_PCA20_IMUEMG_df.pkl')
    metadata_cols_df.to_pickle('metadata_cols_df.pkl')

## EMG Only

In [None]:
#emg_file_path_kai = 'C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\filtered_datasets\\metadata_EMG_allgestures_allusers.pkl'
#emg_file_path_brc = 'D:\\Kai_MetaGestureClustering_24\\saved_datasets\\filtered_datasets\\metadata_EMG_allgestures_allusers.pkl'
#emg_df = pd.read_pickle(emg_file_path_brc)
#ms_emg_df = meansubtract_df_by_gesture(emg_df)

# ALREADY HAVE ppd_emg_df!

In [19]:
# EMG-only reduction to 3 PCA dimensions.
# NOTE(review): ppd_emg_df includes the metadata columns at this point;
# presumably apply_dim_reduc drops them before fitting -- confirm in utils.
emg_PCA3, dim_reduc_model3 = apply_dim_reduc(ppd_emg_df, model_str='PCA', num_dims=3, use_full_dataset=True)

Start
Success


In [20]:
# EMG-only reduction to 8 PCA dimensions (same input frame as the 3-dimensional run).
emg_PCA8, dim_reduc_model8 = apply_dim_reduc(ppd_emg_df, model_str='PCA', num_dims=8, use_full_dataset=True)

Start
Success


In [24]:
# Preview the first rows of the 8-dimensional EMG projection.
emg_PCA8.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.341323,-0.812209,0.089633,-0.231389,-1.102774,-0.123494,-0.452216,-0.563469
1,-0.644262,-0.860011,0.240803,-0.036121,-1.728382,-0.271787,-1.100263,0.143207
2,-1.251829,-0.823973,-0.213519,0.139521,-2.368628,-0.276726,-2.114243,0.46832
3,-1.494897,-0.633561,-0.233368,0.250943,-2.434067,-0.437012,-2.570415,0.370891
4,1.61575,-1.806795,-1.070433,-0.33264,-2.581807,-0.507581,-3.102347,0.600151


Save the above datasets

In [21]:
# Guard flag: flip to True only when the EMG PCA pickles need to be (re)written.
not_saved_yet = False

if not_saved_yet:
    # Each reduced EMG frame is written to its own pickle; bare filenames are
    # resolved against the current working directory.
    for reduced_df, pkl_name in (
        (emg_PCA3, '$BStand_EMG_PCA3_df.pkl'),
        (emg_PCA8, '$BStand_EMG_PCA8_df.pkl'),
    ):
        reduced_df.to_pickle(pkl_name)

Now do the train/test split

In [30]:
# Split the 40-dimensional IMU+EMG projection with users held out (user_holdout=True,
# gesture_holdout=False); save_bool=True asks the helper to write results under save_path.
# save_name_lst is rebound by every split call, so it only reflects the latest one.
full_PCA40_return_lst, save_name_lst = manual_train_test_split(full_PCA40, metadata_cols_df, 'Both_PCA40', save_bool=True, save_path='D:\\Kai_MetaGestureClustering_24\\saved_datasets', user_holdout=True, gesture_holdout=False)


In [31]:
# Same user-holdout split for the 20-dimensional IMU+EMG projection.
# save_name_lst is rebound here, replacing the value from the previous split call.
full_PCA20_return_lst, save_name_lst = manual_train_test_split(full_PCA20, metadata_cols_df, 'Both_PCA20', save_bool=True, save_path='D:\\Kai_MetaGestureClustering_24\\saved_datasets', user_holdout=True, gesture_holdout=False)


In [32]:
# Same user-holdout split for the 3-dimensional EMG-only projection.
# save_name_lst is rebound here, replacing the value from the previous split call.
emg_PCA3_return_lst, save_name_lst = manual_train_test_split(emg_PCA3, metadata_cols_df, 'EMG_PCA3', save_bool=True, save_path='D:\\Kai_MetaGestureClustering_24\\saved_datasets', user_holdout=True, gesture_holdout=False)


In [33]:
# Same user-holdout split for the 8-dimensional EMG-only projection.
# Unpack both return values, as the other split calls do, so that
# emg_PCA8_return_lst holds the split list itself rather than the whole
# (split list, save names) pair.
emg_PCA8_return_lst, save_name_lst = manual_train_test_split(emg_PCA8, metadata_cols_df, 'EMG_PCA8', save_bool=True, save_path='D:\\Kai_MetaGestureClustering_24\\saved_datasets', user_holdout=True, gesture_holdout=False)
