> __Purpose:__ Implement an initial privacy attack to quantify how much re-identification and linkability risk exists in filtered EMG data (which should be highly unique). To do so, implement a basic ML model that links the EMG data back to the corresponding subject.  

-- 7 subjects, therefore pure guessing would be correct 1/7 ≈ 14.29% of the time on average

In [1]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn
import time
import pandas as pd
import pickle
import random
from kcs_ml_infr import *

# Seed Python's built-in `random` module so that code drawing from it is repeatable.
# NOTE(review): this does not seed NumPy's global RNG, which scikit-learn estimators
# fall back to when no random_state is given — confirm whether np.random.seed is also wanted.
random.seed(a=1)

# Load In Data

In [2]:
# Subject identifiers; used to index the per-subject data loaded below
# (e.g. Ws_block1[keys[0]]).
keys = [
    'METACPHS_S106',
    'METACPHS_S107',
    'METACPHS_S108',
    'METACPHS_S109',
    'METACPHS_S110',
    'METACPHS_S111',
    'METACPHS_S112',
]

# Paths use forward slashes: they work on Windows and POSIX alike, and avoid the
# invalid "\c" escape sequence that the backslash form puts inside a normal string.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('Data/cphs_data_block1.pickle', 'rb') as handle:
    # Stored tuple order: refs, poss, dec_vels, int_vel, emgs, Ws, Hs, alphas, pDs, times, conditions
    _, _, _, _, emgs_block1, Ws_block1, _, _, _, times_block1, _ = pickle.load(handle)

with open('Data/cphs_data_block2.pickle', 'rb') as handle:
    # Same tuple layout as block 1; only emgs, Ws and times are kept.
    _, _, _, _, emgs_block2, Ws_block2, _, _, _, times_block2, _ = pickle.load(handle)

In [None]:
# Decoder updates
# Find the time indices at which the decoder of the first subject / first entry changes.
W = Ws_block1[keys[0]][0]  # per the original note: time points x (2 x 6 decoder)

# Compare each decoder with the one that follows it. `ix` indexes W[:-1], so a
# recorded ix means W[ix + 1] differs from W[ix] (same indices the running
# "previous decoder" comparison produced).
# NOTE(review): the recorded index is the last sample of the old decoder, not the
# first sample of the new one — confirm this is the intended convention.
change_ix = [
    ix
    for ix, (prev_dec, next_dec) in enumerate(zip(W[:-1], W[1:]))
    if not np.array_equal(prev_dec, next_dec)
]

# Bracket the change points with the first and last time index.
update_ix = np.asarray([0, *change_ix, len(W) - 1])

print(update_ix.shape)
print(update_ix)

Load in the other data
> This cell takes forever to run... not sure if it was actually any quicker to import vs just make it all over again

In [8]:
# Time the CSV import (the original note reports that this cell is very slow).
t0 = time.time()

# Paths use forward slashes: portable across Windows and POSIX, and free of the
# invalid "\e" escape sequence that the backslash form puts inside a normal string.
emg_data_df1 = pd.read_csv("Data/emg_data1.csv")
emg_labels_df1 = pd.read_csv("Data/emg_labels1.csv")
emg_data_df2 = pd.read_csv("Data/emg_data2.csv")
emg_labels_df2 = pd.read_csv("Data/emg_labels2.csv")

t1 = time.time()
total = t1-t0
# Elapsed wall-clock seconds for the four reads.
print(total)

251.7547857761383


In [9]:
# Stack block 1 and block 2 row-wise. ignore_index=True gives each combined frame a
# fresh 0..N-1 index; without it every block keeps its own row labels, so labels
# repeat and no longer line up with frames that are rebuilt with a default index.
emg_data_df = pd.concat((emg_data_df1, emg_data_df2), ignore_index=True)
emg_labels_df = pd.concat((emg_labels_df1, emg_labels_df2), ignore_index=True)

In [10]:
# Shape is printed before the drop, so it still counts the 'Unnamed: 0' column.
print(emg_data_df.shape)
# 'Unnamed: 0' is presumably the index that was written into the CSV; it is not a feature.
# errors='ignore' lets this cell be re-run without a KeyError once the column is gone.
emg_data_df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
emg_data_df.head()

(7232, 20771)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20760,20761,20762,20763,20764,20765,20766,20767,20768,20769
0,0.0,0.0,0.0,10.778148,10.778148,10.778148,26.874088,43.189501,43.189501,38.594581,...,54.343173,54.343173,53.363208,53.363208,53.363208,53.363208,59.557374,59.557374,59.557374,55.634152
1,0.0,0.0,0.0,9.891218,9.891218,9.891218,23.589349,36.477933,36.477933,31.296507,...,79.894291,79.894291,86.860329,86.860329,86.860329,86.860329,71.319955,71.319955,71.319955,56.606641
2,0.0,0.0,0.0,1.606057,1.606057,1.606057,8.623857,15.845217,15.845217,17.957593,...,89.500295,89.500295,84.266738,84.266738,84.266738,84.266738,71.979639,71.979639,71.979639,65.918534
3,0.0,0.0,0.0,3.668527,3.668527,3.668527,7.165376,11.62829,11.62829,15.308951,...,68.943668,68.943668,66.983974,66.983974,66.983974,66.983974,64.104558,64.104558,64.104558,61.848159
4,0.0,0.0,0.0,1.41181,1.41181,1.41181,9.707134,15.677262,15.677262,18.92241,...,43.565918,43.565918,42.34359,42.34359,42.34359,42.34359,42.235306,42.235306,42.235306,41.818073


In [11]:
# Shape is printed before the drop, so it still counts the 'Unnamed: 0' column.
print(emg_labels_df.shape)
# 'Unnamed: 0' is presumably the index that was written into the CSV; it is not a label.
# errors='ignore' lets this cell be re-run without a KeyError once the column is gone.
emg_labels_df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
emg_labels_df.head()

(7232, 4)


Unnamed: 0,Subject,Condition,Channel
0,METACPHS_S106,0,0
1,METACPHS_S106,0,1
2,METACPHS_S106,0,2
3,METACPHS_S106,0,3
4,METACPHS_S106,0,4


# ML Pipeline

In [14]:
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Set the number of k-folds (this value is what gets passed to train_model as `cv`)
cv = 10

In [21]:
# Two-way lookup between subject ID strings and integer class labels, where the
# label is the subject's position in `keys`.
key_to_num = {key: idx for idx, key in enumerate(keys)}
num_to_key = {idx: key for idx, key in enumerate(keys)}

In [23]:
# Result logs: the column names of the results table, and an empty table using them.
my_metrics_cols = [
    'Algorithm',
    'One Off Acc',
    'CV Acc',
    'K Folds',
]
res_df = pd.DataFrame(columns=my_metrics_cols)

In [24]:
# Candidate classifiers, all with default hyper-parameters; every one is tried on each feature set.
# NOTE(review): none of them is given a random_state, so the stochastic ones may not be
# repeatable between runs — confirm whether that matters for the reported accuracies.
my_models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GaussianNB(),
    LinearSVC(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
]

# Filtered EMG Data Privacy Evaluation
>Linking EMG Data to Participants
1. Does the channel matter / improve performance? Within the same block (assuming the strap is re-applied between blocks), the same muscles should presumably act in similar ways.  __Just do PCA on it and don't worry about it__
2. Does the condition matter? Presumably, but the question is how much.  __Only look at condition for conditions that changed performance (e.g., learning rate speed)__
3. Ways to compress the input data: PCA, LDA, a nonlinear variant, or norms of the vectors... Is it even necessary / beneficial to performance?  __Focus on just PCA for now__
4. __No standard scaler, since negative filtered EMG data has no meaning__

## 1) Create Envelope of Filtered EMG Data

In [None]:
# Plot one EMG trace for the first subject.
# NOTE(review): emg_data_df is built from the CSV files as a 2-D DataFrame, so indexing it
# by subject key and then with [0, :, 0] looks like a leftover from a dict-of-arrays layout
# (possibly emgs_block1) — confirm the intended data source before running this cell.
plt.plot(emg_data_df[keys[0]][0, :, 0])

## 2) PCA On the Channels

In [None]:
# Show the feature table's dimensions and its first rows before PCA is applied.
print(emg_data_df.shape)
emg_data_df.head()

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

Choosing just 2 components was super arbitrary, ought to do some analysis

In [52]:
def do_pca(num_cs, input_df):
    """Fit a PCA on ``input_df`` and return ``input_df`` projected onto it.

    Args:
        num_cs: Value passed to ``PCA(n_components=...)``.
        input_df: Data the PCA is fitted on and that is then transformed.

    Returns:
        The output of ``PCA.transform(input_df)`` for the fitted model.
    """
    # NOTE(review): the projection is learned from whatever is passed in. Fitting on the
    # full dataset before a train/test split lets held-out rows influence the features —
    # confirm this is acceptable for the evaluation. No random_state is set either.
    reducer = PCA(n_components=num_cs)
    return reducer.fit(input_df).transform(input_df)

Could drop the channel column, or combine subject and channel into one label. Dropping the channel column is easiest but presumably loses some information (e.g., if a channel is placed over the same muscle, that could provide a better matching criterion), though this may be counteracted if the two blocks had the sensors placed in different locations.

In [43]:
# Preview the label table (Subject, Condition and Channel for each EMG row).
emg_labels_df.head()

Unnamed: 0,Subject,Condition,Channel
0,METACPHS_S106,0,0
1,METACPHS_S106,0,1
2,METACPHS_S106,0,2
3,METACPHS_S106,0,3
4,METACPHS_S106,0,4


In [44]:
# Also drop condition for now
# Remove the channel and condition columns, then encode each subject ID string as the
# integer class label from key_to_num (IDs missing from the mapping become NaN).
emg_labels_NC_df = emg_labels_df.drop(columns=['Channel', 'Condition'])
subject_codes = emg_labels_NC_df["Subject"].map(key_to_num)
emg_labels_NC_df["Subject"] = subject_codes
print(emg_labels_NC_df.shape)
emg_labels_NC_df.head()

(7232, 1)


Unnamed: 0,Subject
0,METACPHS_S106
1,METACPHS_S106
2,METACPHS_S106
3,METACPHS_S106
4,METACPHS_S106


In [47]:
# Empty results table for the PCA experiments, using the shared metric column names.
emgPCA_res_df = pd.DataFrame(columns=my_metrics_cols)
emgPCA_res_df.head()

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds


In [None]:

# One PCA-reduced copy of the EMG features per candidate number of components.
pca_component_counts = (2, 3, 5, 7, 10, 20)
my_pca_dfs = [
    pd.DataFrame(do_pca(n_comp, emg_data_df))
    for n_comp in pca_component_counts
]

# Keep the individually named frames available, in the same order as my_pca_dfs.
(emg_PCA_df2, emg_PCA_df3, emg_PCA_df5,
 emg_PCA_df7, emg_PCA_df10, emg_PCA_df20) = my_pca_dfs

In [None]:
# Run every model on every PCA-reduced feature set, accumulating results in emgPCA_res_df.
for my_input_df in my_pca_dfs:
    # NOTE(review): train_test_val_split is not defined in this notebook (presumably it comes
    # from the kcs_ml_infr star import); the unpacking order is assumed to match its return
    # order — confirm. Only the training split is used in this cell.
    X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(my_input_df, emg_labels_NC_df)
    # Flatten the training labels to a 1-D array.
    y_train = np.ravel(y_train)

    # Need to keep track of what condition was used when... appending a column after?
    #emgPCA_res_df = pd.DataFrame(columns=my_metrics_cols)

    for model in my_models:
        # train_model's return value replaces the results table on every call, so rows from
        # all feature sets end up in one table with nothing recording which PCA frame was used.
        emgPCA_res_df = train_model(model, X_train, y_train, cv, emgPCA_res_df)

In [None]:
# Display up to the first 100 result rows.
emgPCA_res_df.head(100)

In [None]:
# Do some code to add a column
# Placeholder (all zeros for now) with one entry per (PCA frame, model) pair.
# NOTE(review): presumably meant to hold the number of PCA components behind each result
# row, which assumes train_model adds exactly one row per call — confirm before filling it.
num_comp_col = np.zeros((len(my_pca_dfs)*len(my_models), 1))

# TODO: unfinished loop that was meant to fill num_comp_col.
#for i in range():
#    num_comp_col

In [None]:
# Display up to the first 100 result rows again (the table is unchanged by the cell above).
emgPCA_res_df.head(100)

## 3) Make ML Model Attack