> __Purpose:__ Implement an initial privacy attack to quantify how much re-identiifcation and linkability risk exists from both filtered EMG data and for personalization parameters (the decoder). Thus, implement a basic ML model to link the EMG data back to the corresponding subject, and likewise to link the decoder matrices back to the corresponding subject.  

-- 7 Subjects, therefore pure guessing would be 14.28% correct on average

In [1]:
import numpy as np
import matplotlib.pyplot as plt
# import matplotlib.image as mpimg
# import seaborn
import scipy
from scipy.optimize import minimize,least_squares
# from scipy.io import loadmat
import copy as copy
import time
import glob
import os
import pandas as pd
import pickle
from scipy.stats import wilcoxon as wilcoxon

# Load In Data

In [2]:
keys = ['METACPHS_S106', 'METACPHS_S107','METACPHS_S108', 'METACPHS_S109', 'METACPHS_S110', 'METACPHS_S111', 'METACPHS_S112']

with open('cphs_data_block1.pickle', 'rb') as handle:
    #refs_block1, poss_block1, dec_vels_block1, int_vel_block1, emgs_block1, Ws_block1, Hs_block1, alphas_block1, pDs_block1, times_block1, conditions_block1 = pickle.load(handle)
    _, _, _, _, emgs_block1, Ws_block1, _, _, _, times_block1, _ = pickle.load(handle)

with open('cphs_data_block2.pickle', 'rb') as handle:
    #refs_block2, poss_block2, dec_vels_block2, int_vel_block2, emgs_block2, Ws_block2, Hs_block2, alphas_block2, pDs_block2, times_block2, conditions_block2 = pickle.load(handle)
    _, _, _, _, emgs_block2, Ws_block2, _, _, _, times_block2, _ = pickle.load(handle)

In [3]:
# trying to figure out how often decoder updates -- every 1202 samples

W = Ws_block1[keys[0]][0]
W[1:,:,:].shape # 7199 time points x (decoder dimensions is 2 x 6)
dold = W[0]
update_ix = []
for ix,d in enumerate(W[1:]):
  if (np.array_equal(dold,d)==False):
    update_ix.append(ix)
    dold = d

update_ix.append(len(W) - 1) 
update_ix = np.asarray(update_ix)
update_ix = np.hstack([[0],update_ix])
print("update index in time indices")
print(update_ix)

# only go up to 20432

update_times = times_block1[keys[0]][0][update_ix]
print("")
print("update times in seconds")
print(update_times)

update_mins = update_times/60
print("")
print("update times in minutes")
print(update_mins)

tscale = update_ix[-1]/update_times[-1]
print("")
print("time scale conversion (index --> seconds): ", tscale)

update index in time indices
[    0  1200  2402  3604  4806  6008  7210  8412  9614 10816 12018 13220
 14422 15624 16826 18028 19230 20432 20769]

update times in seconds
[  0.          16.81372571  33.70942521  50.44435     67.45188546
  84.74962473 101.82288647 118.90209508 136.32002926 153.7001555
 170.77472734 188.02292895 205.35715556 222.46085095 239.64881945
 256.75217056 274.14669037 291.41761136 296.34003878]

update times in minutes
[0.         0.28022876 0.56182375 0.84073917 1.12419809 1.41249375
 1.69704811 1.98170158 2.27200049 2.56166926 2.84624546 3.13371548
 3.42261926 3.70768085 3.99414699 4.27920284 4.56911151 4.85696019
 4.93900065]

time scale conversion (index --> seconds):  70.08502828627614


# Make Input and Label DFs

In [4]:
print("The subject keys are the labels")
print(keys)

The subject keys are the labels
['METACPHS_S106', 'METACPHS_S107', 'METACPHS_S108', 'METACPHS_S109', 'METACPHS_S110', 'METACPHS_S111', 'METACPHS_S112']


In [5]:
print("Processed EMG Data:")
print("(number of conditions, all data points, number of channels)")
print(emgs_block1[keys[0]].shape)

Processed EMG Data:
(number of conditions, all data points, number of channels)
(8, 20770, 64)


In [6]:
print("Decoder AKA Wiener Filter:")
print("(number of conditions, all data points, XY?, number of channels)")
print(Ws_block1[keys[0]].shape)

Decoder AKA Wiener Filter:
(number of conditions, all data points, XY?, number of channels)
(8, 20770, 2, 64)


In [7]:
emg_labels_df = pd.DataFrame(columns=["Subject", "Condition", "Channel"])
emg_labels_df.head()

Unnamed: 0,Subject,Condition,Channel


In [8]:
emg_data_df = pd.DataFrame()
emg_data_df.head()

In [9]:
print("I don't think I can actually just pass in the matrix...")

dec_df = pd.DataFrame(columns=["Subject", "Condition", "Update Number", "????"])
dec_df.head()

I don't think I can actually just pass in the matrix...


Unnamed: 0,Subject,Condition,Update Number,????


In [10]:
dec_norms_df = pd.DataFrame(columns=["Subject", "Condition", "Update Number", "Frobenius Norm"])
dec_norms_df.head()

Unnamed: 0,Subject,Condition,Update Number,Frobenius Norm


In [11]:
num_conds = 8
num_channels = 64

t0 = time.time()

for key in keys:
    patient_emgs = emgs_block1[key]
    patient_dec = Ws_block1[key]
    
    for my_cond in range(num_conds):
        for my_channel in range(num_channels):
            emg_labels_df.loc[len(emg_labels_df)] = [key, my_cond, my_channel]
            # Ought to make sure all of these are the same length...
            emg_data_df = pd.concat([emg_data_df, pd.DataFrame(patient_emgs[my_cond, :, my_channel]).transpose()], ignore_index=True)
            
        for update_number, update_idx in enumerate(update_ix):
            dec_norms_df.loc[len(dec_norms_df)] = [key, my_cond, update_number, np.linalg.norm(patient_dec[my_cond, update_idx, :, :])]
        
t1 = time.time()
total = t1-t0  
print(total)

346.0781581401825


In [12]:
print(emg_data_df.shape)
emg_data_df.head()

(3584, 20770)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20760,20761,20762,20763,20764,20765,20766,20767,20768,20769
0,0.0,0.0,0.0,10.778148,10.778148,10.778148,26.874088,43.189501,43.189501,38.594581,...,54.343173,54.343173,53.363208,53.363208,53.363208,53.363208,59.557374,59.557374,59.557374,55.634152
1,0.0,0.0,0.0,9.891218,9.891218,9.891218,23.589349,36.477933,36.477933,31.296507,...,79.894291,79.894291,86.860329,86.860329,86.860329,86.860329,71.319955,71.319955,71.319955,56.606641
2,0.0,0.0,0.0,1.606057,1.606057,1.606057,8.623857,15.845217,15.845217,17.957593,...,89.500295,89.500295,84.266738,84.266738,84.266738,84.266738,71.979639,71.979639,71.979639,65.918534
3,0.0,0.0,0.0,3.668527,3.668527,3.668527,7.165376,11.62829,11.62829,15.308951,...,68.943668,68.943668,66.983974,66.983974,66.983974,66.983974,64.104558,64.104558,64.104558,61.848159
4,0.0,0.0,0.0,1.41181,1.41181,1.41181,9.707134,15.677262,15.677262,18.92241,...,43.565918,43.565918,42.34359,42.34359,42.34359,42.34359,42.235306,42.235306,42.235306,41.818073


In [13]:
print(emg_labels_df.shape)
emg_labels_df.head()

(3584, 3)


Unnamed: 0,Subject,Condition,Channel
0,METACPHS_S106,0,0
1,METACPHS_S106,0,1
2,METACPHS_S106,0,2
3,METACPHS_S106,0,3
4,METACPHS_S106,0,4


In [14]:
print(dec_norms_df.shape)
dec_norms_df.head()

(1064, 4)


Unnamed: 0,Subject,Condition,Update Number,Frobenius Norm
0,METACPHS_S106,0,0,0.06636
1,METACPHS_S106,0,1,0.06636
2,METACPHS_S106,0,2,9.70939
3,METACPHS_S106,0,3,8.20908
4,METACPHS_S106,0,4,10.406943


# ML Pipeline

In [15]:
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Standard model fitting

def fit_ml_algo(algo, X_train, y_train, cv, verbose=False, num_decimals=3, testing=False):
    '''Runs given algorithm and returns the accuracy metrics'''
    
    model = algo.fit(X_train, y_train)
    
    # Notice that this is tested on the data it just trained on...
    acc = round(model.score(X_train, y_train) * 100, 3)
    
    # Cross Validation - this fixes that issue of validating on the data that the model was trained on
    train_pred = model_selection.cross_val_predict(algo, 
                                                  X_train, 
                                                  y_train, 
                                                  cv=cv, 
                                                  n_jobs=-1)
    # Cross-validation metric
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, num_decimals)
    #pre_cv = round(metrics.precision_score(y_train, train_pred) * 100, num_decimals)
    #rec_cv = round(metrics.recall_score(y_train, train_pred) * 100, num_decimals)
    
    if verbose:
        print("Training predictions:")
        print(train_pred)
        print("Ground Truth:")
        print(y_train)
        print(f"One Off Accuracy: {acc}")
        print(f"CV Accuracy: {acc_cv}")
    
    if testing:
        return train_pred, acc, acc_cv, model
    
    return train_pred, acc, acc_cv

In [87]:
def train_test_val_split(input_df, label_df, rng_seed=2, validation=False, test_percent=0.3, val_percent=0.3):
    ''''''
    
    x_train = input_df.copy(deep=True)
    y_train_reg = label_df

    ## TRAIN / TEST
    # Stratify might be good to ensure that all classes are represented, I'm not sure if it'll do that by default
    X_train, X_test, y_train, y_test = train_test_split(
        x_train, y_train_reg, test_size=test_percent, random_state=rng_seed)

    if validation:
        ## TRAIN / VAL
        # Might not use... easier to just use cross validation I think
        X_train_pv, X_val, y_train_pv, y_val = train_test_split(
            X_train, y_train, test_size=val_percent, random_state=rng_seed)

        return X_train_pv, y_train_pv, X_test, y_test, X_val, y_val
    else:
        return X_train, y_train, X_test, y_test

## Classification
1. Logistic Regression
2. K-Nearest Neighbor
3. Gaussian Naive Bayes
4. Linear SVC
5. Stochastic Gradient Descent
6. Decision Tree Classifier
7. Gradient Boosting Trees

In [88]:
# Set the number of k-folds
cv = 10

In [89]:
# Should I do standard scaler?

#from sklearn.preprocessing import StandardScaler
#scalerX = StandardScaler().fit(X_train)
#scalery = StandardScaler().fit(y_train)

#X_train = scalerX.transform(X_train)
#y_train = scalery.transform(y_train)
#X_test = scalerX.transform(X_test)
#y_test = scalery.transform(y_test)

## 1) Linking Matrix Norms to Patients

In [90]:
norm_input_df = dec_norms_df.drop(["Subject",  "Condition", "Update Number"], axis=1)
norm_label_df = dec_norms_df.drop(["Condition", "Update Number", "Frobenius Norm"], axis=1)

norm_input_df.head()

Unnamed: 0,Frobenius Norm
0,0.06636
1,0.06636
2,9.70939
3,8.20908
4,10.406943


In [91]:
key_to_num = dict()
num_to_key = dict()
for idx, key in enumerate(keys):
    key_to_num[key] = idx
    num_to_key[idx] = key

In [92]:
norm_label_df["Subject"] = norm_label_df["Subject"].map(key_to_num)
norm_label_df.head()

Unnamed: 0,Subject
0,0
1,0
2,0
3,0
4,0


In [93]:
try:
    X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(norm_input_df, norm_label_df)
except ValueError:
    # You probably turned off validation so just remove the val sets
    X_train, y_train, X_test, y_test = train_test_val_split(norm_input_df, norm_label_df)
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(744, 1)


Unnamed: 0,Frobenius Norm
581,2.734792
926,6.368164
542,3.122834
80,3.092765
903,5.662591


In [94]:
# Result logs
my_metrics_cols = ['Algorithm', 'One Off Acc', 'CV Acc', 'K Folds']
res_df = pd.DataFrame(columns=my_metrics_cols)

Logistic Regression

In [95]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(LogisticRegression(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['Logistic Regression', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 19.624
Accuracy CV 10-Fold: 18.683
Run Time: 0.137


K-Nearest Neighbors

In [96]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(KNeighborsClassifier(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['KNN', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 50.403
Accuracy CV 10-Fold: 27.957
Run Time: 0.150


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Gaussian Naive Bayes

In [97]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(GaussianNB(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['Gaussian NB', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 20.296
Accuracy CV 10-Fold: 19.758
Run Time: 0.075


Linear SVC

In [98]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(LinearSVC(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['Linear SVC', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))



Accuracy: 19.489
Accuracy CV 10-Fold: 18.952
Run Time: 0.617


SGD

In [99]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(SGDClassifier(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['SGD', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 13.978
Accuracy CV 10-Fold: 15.054
Run Time: 0.158


Decision Tree

In [100]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(DecisionTreeClassifier(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['Decision Tree', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 100.0
Accuracy CV 10-Fold: 27.285
Run Time: 0.070


Gradient Boosted Trees

In [101]:
start_time = time.time()
train_pred_log, acc, acc_cv = fit_ml_algo(GradientBoostingClassifier(), X_train, y_train, cv)
log_time = (time.time() - start_time)
print(f"Accuracy: {acc}")
print(f"Accuracy CV 10-Fold: {acc_cv}")
print(f"Run Time: {log_time:.3f}")

temp_df = pd.DataFrame([['Gradient Boosted Trees', acc, acc_cv, cv]], columns=my_metrics_cols)
res_df = pd.concat((res_df, temp_df))

Accuracy: 77.016
Accuracy CV 10-Fold: 29.839
Run Time: 2.950


In [102]:
res_df.head(100)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds
0,Logistic Regression,19.624,18.683,10
0,KNN,50.403,27.957,10
0,Gaussian NB,20.296,19.758,10
0,Linear SVC,19.489,18.952,10
0,SGD,13.978,15.054,10
0,Decision Tree,100.0,27.285,10
0,Gradient Boosted Trees,77.016,29.839,10


Testing

In [106]:
# Still using a cv of 5, as set way earlier
num_decimals = 3
test_df = pd.DataFrame(columns=my_metrics_cols)

print("KNN")
_, _, acc_cv, my_dt = fit_ml_algo(KNeighborsClassifier(), X_train, y_train, cv, testing=True)
y_pred = my_dt.predict(X_test)
acc_cv = round(metrics.accuracy_score(y_test, y_pred) * 100, num_decimals)
print(acc_cv)
temp_df = pd.DataFrame([['KNN', 'NA', acc_cv, cv]], columns=my_metrics_cols)
test_df = pd.concat((test_df, temp_df))

KNN
34.062


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [108]:
print("GBT")
_, _, acc_cv, my_dt = fit_ml_algo(GradientBoostingClassifier(), X_train, y_train, cv, testing=True)
y_pred = my_dt.predict(X_test)
acc_cv = round(metrics.accuracy_score(y_test, y_pred) * 100, num_decimals)
print(acc_cv)
temp_df = pd.DataFrame([['GBT', 'NA', acc_cv, cv]], columns=my_metrics_cols)
test_df = pd.concat((test_df, temp_df))

GBT
34.688


In [109]:
test_df.head()

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds
0,KNN,,34.062,10
0,GBT,,34.688,10


## 2) Linking EEG Data to Patients

## Cluster Decoder Matrix Norms