In [9]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import copy
import pandas as pd
import time

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [11]:
from subspace_clustering_helper_funcs import *

- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold

## Loading in the data

In [12]:
# Participant bookkeeping:
#   - pID 101 is left out because it doesn't exist.
#   - pID 131 is left out because it doesn't have enough user-defined gestures.
# Each participant has 100 experimenter-defined files and 50 user-defined files
# (10 experimenter-defined gestures and 5 user-defined gestures).

file_types = ["IMU_extract", "movavg_files"]
expt_types = ["experimenter-defined"]

# Impaired cohort (P131 omitted because they are missing gestures).
pIDs_impaired = [
    f"P{num}"
    for num in (
        102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 114, 115, 116, 118, 119, 121, 122, 123, 124, 125,
        126, 127, 128, 132,
    )
]
# Unimpaired cohort (P001 and P003 omitted: they don't have duplicate or open gestures).
pIDs_unimpaired = [f"P{num:03d}" for num in (4, 5, 6, 8, 10, 11)]

# Impaired participants first, then unimpaired.
pIDs_both = [*pIDs_impaired, *pIDs_unimpaired]

In [18]:
## Pickle is theoretically faster for Python...
# NOTE(review): data_path is an absolute, machine-specific Windows path, so this
# cell only runs on the original author's machine as written.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.

data_path = "C:\\Users\\kdmen\\Desktop\\Research\\Data\\$M\\"
metadata_cols = ['Participant', 'Gesture_ID', 'Gesture_Num']

print("Loading")
start_time = time.time()

PCA_df = pd.read_pickle(data_path+'PCA_ms_IMUEMG_df.pkl')
metadata_cols_df = pd.read_pickle(data_path+'metadata_cols_df.pkl')

# The four splits are stored with their metadata columns; those columns are
# stripped immediately on load so only the remaining columns are kept.
test_users_df = pd.read_pickle(data_path+'test_users_df.pkl').drop(columns=metadata_cols)
test_fullgestures_df = pd.read_pickle(data_path+'test_fullgestures_df.pkl').drop(columns=metadata_cols)
training_u_df = pd.read_pickle(data_path+'training_u_df.pkl').drop(columns=metadata_cols)
training_g_df = pd.read_pickle(data_path+'training_g_df.pkl').drop(columns=metadata_cols)

end_time = time.time()
print(f"Completed in {end_time - start_time}s")

Loading
Completed in 0.5221610069274902s


In [19]:
# Report the (rows, columns) of training_u_df, then display its first five rows.
print(training_u_df.shape)
training_u_df.head(n=5)

(327168, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,-0.02291,0.066484,0.108729,...,-0.019453,0.062983,-0.025869,0.014303,-0.013387,-0.037645,-0.18627,-0.046251,-0.10463,-0.002939
1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,-0.027368,0.06037,0.074712,...,0.041438,0.035053,-0.056843,-0.008895,-0.022542,-0.022563,-0.160826,-0.048161,-0.073771,0.043268
2,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,-0.04615,0.036385,0.052746,...,-0.014298,0.072109,-0.026536,-0.034365,0.018695,-0.01194,-0.16058,-0.041831,-0.109653,0.027043
3,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,-0.064307,0.074589,0.053055,...,-0.010992,0.05999,-0.097073,-0.05687,-0.001038,-0.008015,-0.165858,-0.049424,-0.108671,0.069886
4,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,-0.124893,0.114817,0.038628,...,0.035735,0.05088,-0.093678,-0.131263,0.018035,0.056185,-0.157963,-0.041911,-0.145308,0.063311


In [20]:
# Report the (rows, columns) of test_users_df, then display its first five rows.
print(test_users_df.shape)
test_users_df.head(n=5)

(99584, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
11520,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,0.03636,-0.069783,-0.009365,...,0.063422,0.081947,0.101682,-0.190886,-0.128849,-0.133675,-0.141095,-0.068262,-0.032521,0.018558
11521,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,0.038657,-0.073921,-0.017769,...,0.072693,0.087035,0.107107,-0.189667,-0.127273,-0.14095,-0.143033,-0.066269,-0.033821,0.018198
11522,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,0.037957,-0.077119,-0.026219,...,0.07115,0.081851,0.103781,-0.1864,-0.119714,-0.137523,-0.143988,-0.064562,-0.03661,0.016388
11523,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,0.037962,-0.074883,-0.020225,...,0.065488,0.077696,0.100158,-0.195122,-0.128462,-0.13353,-0.147047,-0.064546,-0.032468,0.015712
11524,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,0.038253,-0.068708,-0.025581,...,0.064663,0.082686,0.104846,-0.185309,-0.126085,-0.138847,-0.147857,-0.067871,-0.031347,0.019686


In [21]:
# Report the (rows, columns) of training_g_df, then display its first five rows.
print(training_g_df.shape)
training_g_df.head(n=5)

(383424, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.027903,0.001411,-0.019509,0.013428,-0.019699,0.027333,-0.031254,-0.02291,0.066484,0.108729,...,-0.019453,0.062983,-0.025869,0.014303,-0.013387,-0.037645,-0.18627,-0.046251,-0.10463,-0.002939
1,-0.038982,0.00647,-0.000111,0.010904,-0.015323,0.031336,-0.007901,-0.027368,0.06037,0.074712,...,0.041438,0.035053,-0.056843,-0.008895,-0.022542,-0.022563,-0.160826,-0.048161,-0.073771,0.043268
2,-0.116782,0.003824,0.01155,-0.014612,-0.093325,0.081718,-0.013155,-0.04615,0.036385,0.052746,...,-0.014298,0.072109,-0.026536,-0.034365,0.018695,-0.01194,-0.16058,-0.041831,-0.109653,0.027043
3,-0.030245,-0.017409,0.02254,-0.048905,-0.029129,0.090026,-0.024645,-0.064307,0.074589,0.053055,...,-0.010992,0.05999,-0.097073,-0.05687,-0.001038,-0.008015,-0.165858,-0.049424,-0.108671,0.069886
4,-0.11295,0.026262,0.004837,-0.063254,-0.108892,0.198729,-0.010583,-0.124893,0.114817,0.038628,...,0.035735,0.05088,-0.093678,-0.131263,0.018035,0.056185,-0.157963,-0.041911,-0.145308,0.063311


In [22]:
# Report the (rows, columns) of test_fullgestures_df, then display its first five rows.
print(test_fullgestures_df.shape)
test_fullgestures_df.head(n=5)

(43328, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
11520,0.068817,0.042105,-0.044102,0.053256,0.080938,0.106052,-0.022074,0.03636,-0.069783,-0.009365,...,0.063422,0.081947,0.101682,-0.190886,-0.128849,-0.133675,-0.141095,-0.068262,-0.032521,0.018558
11521,0.074061,0.047283,-0.042408,0.054238,0.074058,0.102463,-0.015359,0.038657,-0.073921,-0.017769,...,0.072693,0.087035,0.107107,-0.189667,-0.127273,-0.14095,-0.143033,-0.066269,-0.033821,0.018198
11522,0.079972,0.043184,-0.055275,0.046477,0.079097,0.100367,-0.017235,0.037957,-0.077119,-0.026219,...,0.07115,0.081851,0.103781,-0.1864,-0.119714,-0.137523,-0.143988,-0.064562,-0.03661,0.016388
11523,0.066582,0.035923,-0.04061,0.045127,0.083319,0.103084,-0.026479,0.037962,-0.074883,-0.020225,...,0.065488,0.077696,0.100158,-0.195122,-0.128462,-0.13353,-0.147047,-0.064546,-0.032468,0.015712
11524,0.070243,0.048672,-0.050055,0.045911,0.075328,0.10219,-0.026261,0.038253,-0.068708,-0.025581,...,0.064663,0.082686,0.104846,-0.185309,-0.126085,-0.138847,-0.147857,-0.067871,-0.031347,0.019686


In [29]:
# Largest value included in the sweep below.
max_clusters = 20

# Candidate values: every integer from 1 through max_clusters, inclusive.
num_clusters_range = [*range(1, max_clusters + 1)]

In [None]:
## 4. Clustering and Classification Performance

# Clustering: silhouette score of a KMeans fit on PCA_df for each candidate
# number of clusters.
#
# silhouette_score is only defined when there are at least 2 distinct labels,
# so n = 1 (present in num_clusters_range) is skipped instead of raising
# ValueError on the first iteration.
# NOTE(review): silhouette_score computes pairwise distances; if PCA_df is large,
# consider its `sample_size` argument to keep this cell tractable.
silhouette_cluster_counts = [n for n in num_clusters_range if n >= 2]
silhouette_scores = []

for n in silhouette_cluster_counts:
    kmeans = KMeans(n_clusters=n)
    cluster_labels = kmeans.fit_predict(PCA_df)
    silhouette_scores.append(silhouette_score(PCA_df, cluster_labels))

# Plot the silhouette scores against the cluster counts that were actually
# evaluated (the original referenced an undefined `components_range`).
plt.plot(silhouette_cluster_counts, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.grid(True)
plt.show()

I'm not sure I can use this one: it needs ground-truth labels `y`, which I don't have...

In [None]:
if False:
    # Disabled draft: classification accuracy and weighted F1 per candidate value.
    # NOTE(review): X_train_reduced, X_test_reduced, y_train and y_test are not
    # defined in any visible cell, and the loop body never uses `n`, so every
    # iteration would fit the same model. Resolve both before enabling this cell.
    classification_accuracies = []
    classification_f1_scores = []

    # Assuming `y` is your target variable
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # This actually did a train/test split... I should do this manually...
    for n in num_clusters_range:
        # This one needs train/test split!
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_reduced, y_train)
        y_pred = clf.predict(X_test_reduced)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        classification_accuracies.append(accuracy)
        classification_f1_scores.append(f1)

    # Plot against the range that was iterated (the original referenced an
    # undefined `components_range`).
    plt.plot(num_clusters_range, classification_accuracies, marker='o', label='Accuracy')
    plt.plot(num_clusters_range, classification_f1_scores, marker='o', label='F1 Score')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Classification Performance vs Number of Clusters')
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
## Topological Measures: Neighborhood Preservation
# This one also uses an arbitrary number of clusters...

from sklearn.neighbors import NearestNeighbors

def knn_preservation_score(X_orig, X_reduced, k=5):
    """Mean fraction of each sample's k nearest neighbours kept after reduction.

    For every row i, the k nearest neighbours of row i are found in ``X_orig``
    and, separately, in ``X_reduced``; the score is the size of the overlap of
    the two neighbour sets divided by ``k``, averaged over all rows.

    Parameters
    ----------
    X_orig : array-like of shape (n_samples, n_features_orig)
        Data in the original space.
    X_reduced : array-like of shape (n_samples, n_features_reduced)
        The same samples, in the same row order, in the reduced space.
    k : int, default=5
        Number of neighbours compared per sample (the sample itself excluded).

    Returns
    -------
    float
        Value in [0, 1]; 1 means every neighbourhood is fully preserved.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.
    """
    n_samples = X_orig.shape[0]
    if X_reduced.shape[0] != n_samples:
        raise ValueError(
            "X_orig and X_reduced must have the same number of rows "
            f"(got {n_samples} and {X_reduced.shape[0]})"
        )

    # kneighbors() without a query matrix returns neighbours of the fitted
    # points and does not count a point as its own neighbour. Passing the
    # training matrix back in (as before) makes every point its own nearest
    # neighbour, which inflates the score and makes k=1 trivially ~1.0.
    neighbors_orig = (
        NearestNeighbors(n_neighbors=k).fit(X_orig).kneighbors(return_distance=False)
    )
    neighbors_reduced = (
        NearestNeighbors(n_neighbors=k).fit(X_reduced).kneighbors(return_distance=False)
    )

    overlap_fractions = [
        len(set(orig_row.tolist()) & set(reduced_row.tolist())) / k
        for orig_row, reduced_row in zip(neighbors_orig, neighbors_reduced)
    ]
    return np.mean(overlap_fractions)

In [None]:
preservation_scores = []

# Full original dataset (metadata columns removed) and its PCA-reduced counterpart.
# NOTE(review): assumes the rows of X and PCA_df describe the same samples in
# the same order — confirm against how PCA_df was produced.
X = pd.read_pickle(data_path+'metadata_IMU_EMG_allgestures_allusers.pkl').drop(metadata_cols, axis=1)
X_reduced = PCA_df

# The swept value is passed as `k`, i.e. it is a neighbour count here, not a
# cluster count, even though the range is named num_clusters_range.
for n in num_clusters_range:
    score = knn_preservation_score(X, X_reduced, k=n)
    preservation_scores.append(score)

# Plot the neighborhood preservation scores against the values actually swept
# (the original referenced an undefined `components_range`); the axis text is
# corrected to name the quantity being varied.
plt.plot(num_clusters_range, preservation_scores, marker='o')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Neighborhood Preservation Score')
plt.title('Neighborhood Preservation vs Number of Neighbors')
plt.grid(True)
plt.show()

## Hierarchy / Dendrogram
Now, let's investigate a more informed choice, based on measured statistical heterogeneity. We could look at Shannon or KL divergence (heatmap), but I'll just look at a dendrogram for now.

In [38]:
# Preview the first five rows of data_df.
# NOTE(review): data_df is not assigned in any cell visible above — presumably it
# comes from a removed cell or the star import; confirm on a fresh kernel.
data_df.head(n=5)

Unnamed: 0,Participant,Gesture_ID,Gesture_Num,IMU1_ax,IMU1_ay,IMU1_az,IMU1_vx,IMU1_vy,IMU1_vz,IMU2_ax,...,EMG7,EMG8,EMG9,EMG10,EMG11,EMG12,EMG13,EMG14,EMG15,EMG16
0,P102,pan,1,0.341797,-0.939941,0.000977,-0.00745,-0.192625,0.005321,-0.380859,...,2e-06,2e-06,3e-06,2e-05,4e-06,4e-06,2e-06,9e-06,1e-06,2e-06
1,P102,pan,1,0.336178,-0.963185,0.003898,0.009595,-0.190446,-0.026116,-0.394547,...,3e-06,3e-06,3e-06,1.4e-05,7e-06,7e-06,2e-06,1.7e-05,1e-06,2e-06
2,P102,pan,1,0.353539,-0.963704,0.011711,0.095966,-0.20548,-0.155563,-0.398406,...,3e-06,3e-06,4e-06,7e-06,4e-06,5e-06,3e-06,2e-05,3e-06,2e-06
3,P102,pan,1,0.352841,-0.950288,0.011509,0.058836,-0.184871,-0.083567,-0.38923,...,3e-06,3e-06,6e-06,5e-06,4e-06,3e-06,4e-06,1.5e-05,3e-06,3e-06
4,P102,pan,1,0.372621,-0.991273,0.029847,0.293946,-0.178756,-0.281361,-0.396043,...,3e-06,2e-06,8e-06,3e-06,7e-06,2.2e-05,4e-06,1.7e-05,2e-06,3e-06


In [39]:
# Distinct gesture identifiers, in order of first appearance.
pd.unique(data_df['Gesture_ID'])

array(['pan', 'duplicate', 'gesture-1', 'gesture-2', 'gesture-3',
       'gesture-4', 'gesture-5', 'normal', 'frequency', 'range-of-motion',
       'zoom-out', 'zoom-in', 'move', 'rotate', 'select-single', 'delete',
       'close', 'open', 'two-handed-tap', 'point-and-pinch',
       'pinch-and-scroll', 'air-tap', 'palm-pinch', 'double-pinch',
       'single-pinch', 'single-clench', 'shake-and-release',
       'double-clench'], dtype=object)

In [40]:
# Number of distinct gesture identifiers (missing values, if any, count as one).
data_df['Gesture_ID'].nunique(dropna=False)

28

In [41]:
# Number of distinct Gesture_Num values (missing values, if any, count as one).
data_df['Gesture_Num'].nunique(dropna=False)

10

In [42]:
# Number of distinct participants (missing values, if any, count as one).
data_df['Participant'].nunique(dropna=False)

31

In [None]:
# Keep only the rows recorded for participant P102 and preview them.
subset_df = data_df.loc[data_df['Participant'] == 'P102']
subset_df.head(n=5)

In [None]:
# Print the (rows, columns) of each participant's slice, in order of first appearance.
for p in pd.unique(data_df['Participant']):
    subset_df = data_df.loc[data_df['Participant'] == p]
    print(subset_df.shape)

In [None]:
# Always fails, so nothing below it in this cell executes.
assert(1==0)

# This code is not ready to run yet...
## Does not handle data asymmetries (e.g. different dataset sizes between clients)
# NOTE(review): data_clients2, squareform, linkage and dendrogram are not defined
# or imported in any visible cell — presumably they come from the star import or
# a removed cell; confirm before enabling.

n_clients = len(data_df['Participant'].unique())
n_trials = 8
# Flatten data for easy computation
flattened_data = np.vstack(data_clients2)
# Reshape data to separate trials for each client
# (requires the total element count to be divisible by n_clients * n_trials)
reshaped_data = flattened_data.reshape(n_clients, n_trials, -1)
# Calculate pairwise distances between clients' trial data
inter_subject_distances = np.zeros((n_clients, n_clients))
for i in range(n_clients):
    for j in range(i + 1, n_clients):
        # Euclidean norm of the per-trial difference, averaged over trials
        distance = np.mean(np.linalg.norm(reshaped_data[i] - reshaped_data[j], axis=1))
        # Fill both triangles so the matrix is symmetric with a zero diagonal
        inter_subject_distances[i, j] = distance
        inter_subject_distances[j, i] = distance
# Convert inter-subject distances to a condensed distance matrix
condensed_distances = squareform(inter_subject_distances)

# Perform hierarchical clustering
linkage_matrix_single = linkage(condensed_distances, method='single')
# Plot dendrogram, labelling leaves with the client index 0..n_clients-1
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix_single, labels=np.arange(n_clients))
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Client Index")
plt.ylabel("Distance")
plt.show()

## Choosing Optimal Number of Clusters

In [None]:
# Always raises AssertionError; presumably a deliberate stop so that "Run All"
# halts before the cells below — confirm with the author.
assert(1==0)

> Elbow Plot

In [None]:
from sklearn.cluster import KMeans

# Upper bound on k, using the simple rule of thumb k_max = sqrt(n_samples // 2).
# NOTE(review): dataset_new is not defined in any visible cell — confirm where it
# is created before running this on a fresh kernel.
limit = int((dataset_new.shape[0]//2)**0.5)

# wcss - within cluster sum of squared distances (KMeans inertia), keyed by k
wcss = {}

for k in range(2,limit+1):
    model = KMeans(n_clusters=k)
    model.fit(dataset_new)
    wcss[k] = model.inertia_

# Plot the wcss values to find the elbow value. The dict views are turned into
# lists first so matplotlib receives plain sequences rather than view objects.
plt.plot(list(wcss.keys()), list(wcss.values()), 'gs-')
plt.xlabel('Values of "k"')
plt.ylabel('WCSS')
plt.show()

# `limit` (computed once above) is reused by the following silhouette cell; the
# identical recomputation that used to sit here was redundant and is removed.

> Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# For every k from 2 through limit: fit KMeans, label the same data with the
# fitted model, and print the resulting silhouette score.
for k in range(2, limit + 1):
    model = KMeans(n_clusters=k)
    pred = model.fit(dataset_new).predict(dataset_new)
    score = silhouette_score(dataset_new, pred)
    message = 'Silhouette Score for k = {}: {:<.3f}'.format(k, score)
    print(message)