> __Purpose:__ Implement an initial privacy attack to quantify how much re-identification and linkability risk exists in both the filtered EMG data and the personalization parameters (the decoder). To do so, implement a basic ML model that links the EMG data back to the corresponding subject, and likewise links the decoder matrices back to the corresponding subject.  

-- 7 subjects, so pure guessing would be correct about 14.29% of the time on average (1/7)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
# import matplotlib.image as mpimg
# import seaborn
import time
import os
import pandas as pd
import pickle
import random

# Seed both Python's and NumPy's global RNGs. scikit-learn estimators created
# without an explicit random_state draw from NumPy's global generator, which
# random.seed() alone does not control.
random.seed(a=1)
np.random.seed(1)

# Load In Data

In [2]:
# Subject identifiers; also used as dictionary keys into the per-block data.
keys = [
    'METACPHS_S106',
    'METACPHS_S107',
    'METACPHS_S108',
    'METACPHS_S109',
    'METACPHS_S110',
    'METACPHS_S111',
    'METACPHS_S112',
]

# Each pickle unpacks into 11 items; per the original unpacking the order is
# (refs, poss, dec_vels, int_vel, emgs, Ws, Hs, alphas, pDs, times, conditions).
# Only emgs, Ws and times are kept here.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('cphs_data_block1.pickle', 'rb') as block1_file:
    (_, _, _, _,
     emgs_block1, Ws_block1,
     _, _, _,
     times_block1,
     _) = pickle.load(block1_file)

with open('cphs_data_block2.pickle', 'rb') as block2_file:
    (_, _, _, _,
     emgs_block2, Ws_block2,
     _, _, _,
     times_block2,
     _) = pickle.load(block2_file)

In [3]:
# Decoder updates: locate the time steps at which the decoder matrix changes
# for the first subject / first condition of block 1.
# Per the original note, W[1:] is 7199 time points of 2 x 6 decoders.
W = Ws_block1[keys[0]][0]

prev_decoder = W[0]
change_points = [0]  # always start from the first sample
for step, decoder in enumerate(W[1:]):
    # NOTE(review): `step` counts from 0 over W[1:], so a recorded value k means
    # W[k + 1] is the first sample of the new decoder -- confirm that downstream
    # code expects this convention.
    if not np.array_equal(prev_decoder, decoder):
        change_points.append(step)
        prev_decoder = decoder

change_points.append(len(W) - 1)  # close the final segment at the last sample
update_ix = np.asarray(change_points)

In [4]:
# Per-block CSV exports: EMG rows, their labels, and the decoder norms.
emg_data_df1, emg_data_df2 = pd.read_csv("emg_data1.csv"), pd.read_csv("emg_data2.csv")
emg_labels_df1, emg_labels_df2 = pd.read_csv("emg_labels1.csv"), pd.read_csv("emg_labels2.csv")
dec_norms_df1, dec_norms_df2 = pd.read_csv("decoder_norms1.csv"), pd.read_csv("decoder_norms2.csv")

In [5]:
# Stack block 1 on top of block 2. ignore_index=True renumbers the rows 0..n-1;
# without it each block keeps its own 0-based labels, so every label appears
# twice and label-based lookups (.loc) become ambiguous.
emg_data_df = pd.concat([emg_data_df1, emg_data_df2], ignore_index=True)
emg_labels_df = pd.concat([emg_labels_df1, emg_labels_df2], ignore_index=True)
dec_norms_df = pd.concat([dec_norms_df1, dec_norms_df2], ignore_index=True)

In [6]:
print(emg_data_df.shape)
# 'Unnamed: 0' is presumably the row index saved into the CSV. Drop it by
# reassignment with errors='ignore' so re-running this cell does not raise a
# KeyError once the column is already gone.
emg_data_df = emg_data_df.drop(columns='Unnamed: 0', errors='ignore')
emg_data_df.head()

(7232, 20771)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20760,20761,20762,20763,20764,20765,20766,20767,20768,20769
0,0.0,0.0,0.0,10.778148,10.778148,10.778148,26.874088,43.189501,43.189501,38.594581,...,54.343173,54.343173,53.363208,53.363208,53.363208,53.363208,59.557374,59.557374,59.557374,55.634152
1,0.0,0.0,0.0,9.891218,9.891218,9.891218,23.589349,36.477933,36.477933,31.296507,...,79.894291,79.894291,86.860329,86.860329,86.860329,86.860329,71.319955,71.319955,71.319955,56.606641
2,0.0,0.0,0.0,1.606057,1.606057,1.606057,8.623857,15.845217,15.845217,17.957593,...,89.500295,89.500295,84.266738,84.266738,84.266738,84.266738,71.979639,71.979639,71.979639,65.918534
3,0.0,0.0,0.0,3.668527,3.668527,3.668527,7.165376,11.62829,11.62829,15.308951,...,68.943668,68.943668,66.983974,66.983974,66.983974,66.983974,64.104558,64.104558,64.104558,61.848159
4,0.0,0.0,0.0,1.41181,1.41181,1.41181,9.707134,15.677262,15.677262,18.92241,...,43.565918,43.565918,42.34359,42.34359,42.34359,42.34359,42.235306,42.235306,42.235306,41.818073


In [7]:
print(emg_labels_df.shape)
# 'Unnamed: 0' is presumably the row index saved into the CSV. Drop it by
# reassignment with errors='ignore' so re-running this cell does not raise a
# KeyError once the column is already gone.
emg_labels_df = emg_labels_df.drop(columns='Unnamed: 0', errors='ignore')
emg_labels_df.head()

(7232, 4)


Unnamed: 0,Subject,Condition,Channel
0,METACPHS_S106,0,0
1,METACPHS_S106,0,1
2,METACPHS_S106,0,2
3,METACPHS_S106,0,3
4,METACPHS_S106,0,4


In [8]:
print(dec_norms_df.shape)
# 'Unnamed: 0' is presumably the row index saved into the CSV. Drop it by
# reassignment with errors='ignore' so re-running this cell does not raise a
# KeyError once the column is already gone.
dec_norms_df = dec_norms_df.drop(columns='Unnamed: 0', errors='ignore')
dec_norms_df.head()

(2128, 5)


Unnamed: 0,Subject,Condition,Update Number,Frobenius Norm
0,METACPHS_S106,0,0,0.06636
1,METACPHS_S106,0,1,0.06636
2,METACPHS_S106,0,2,9.70939
3,METACPHS_S106,0,3,8.20908
4,METACPHS_S106,0,4,10.406943


# ML Pipeline

In [9]:
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Standard model fitting

def fit_ml_algo(algo, X_train, y_train, cv, verbose=False, num_decimals=3, testing=False):
    '''Fit ``algo`` on the training data and report training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit`` and ``score``; it is fitted in place.
    X_train, y_train : array-like
        Training features and labels.
    cv : int or cross-validation splitter
        Forwarded as ``cv`` to ``cross_val_predict``.
    verbose : bool, default False
        If True, print the CV predictions, the ground truth and both accuracies.
    num_decimals : int, default 3
        Number of decimals both accuracy percentages are rounded to.
    testing : bool, default False
        If True, additionally return the model fitted on all of ``X_train``.

    Returns
    -------
    tuple
        ``(train_pred, acc, acc_cv)``, or ``(train_pred, acc, acc_cv, model)``
        when ``testing`` is True. ``train_pred`` holds the cross-validated
        predictions; ``acc`` and ``acc_cv`` are percentages.
    '''
    model = algo.fit(X_train, y_train)

    # Accuracy on the very data the model was trained on -- an optimistic figure,
    # kept only for comparison with the cross-validated score below.
    acc = round(model.score(X_train, y_train) * 100, num_decimals)

    # Cross-validation: each sample is predicted by a model that did not see it
    # during fitting, which avoids scoring on the training data.
    train_pred = model_selection.cross_val_predict(algo,
                                                  X_train,
                                                  y_train,
                                                  cv=cv,
                                                  n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, num_decimals)

    if verbose:
        print("Training predictions:")
        print(train_pred)
        print("Ground Truth:")
        print(y_train)
        print(f"One Off Accuracy: {acc}")
        print(f"CV Accuracy: {acc_cv}")

    if testing:
        return train_pred, acc, acc_cv, model

    return train_pred, acc, acc_cv

In [11]:
def train_test_val_split(input_df, label_df, rng_seed=2, validation=False, test_percent=0.3, val_percent=0.3, stratify=False):
    '''Split features and labels into train/test (and optionally validation) sets.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Feature table; a deep copy is split, so the caller's frame is not touched.
    label_df : array-like
        Labels aligned row-for-row with ``input_df``.
    rng_seed : int, default 2
        ``random_state`` used for every split.
    validation : bool, default False
        If True, carve a validation set out of the training portion.
    test_percent : float, default 0.3
        Fraction of all rows held out as the test set.
    val_percent : float, default 0.3
        Fraction of the remaining training rows (after the test split) held out
        as the validation set; only used when ``validation`` is True.
    stratify : bool, default False
        If True, stratify each split on the labels so class proportions are
        preserved in every subset. The default keeps plain random splits.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``validation`` is False,
        otherwise ``(X_train, y_train, X_test, y_test, X_val, y_val)``.
    '''
    features = input_df.copy(deep=True)

    ## TRAIN / TEST
    X_train, X_test, y_train, y_test = train_test_split(
        features, label_df, test_size=test_percent, random_state=rng_seed,
        stratify=label_df if stratify else None)

    if not validation:
        return X_train, y_train, X_test, y_test

    ## TRAIN / VAL
    X_train_pv, X_val, y_train_pv, y_val = train_test_split(
        X_train, y_train, test_size=val_percent, random_state=rng_seed,
        stratify=y_train if stratify else None)

    return X_train_pv, y_train_pv, X_test, y_test, X_val, y_val

## Classification
1. Logistic Regression
2. K-Nearest Neighbor
3. Gaussian Naive Bayes
4. Linear SVC
5. Stochastic Gradient Descent
6. Decision Tree Classifier
7. Gradient Boosting Trees

In [12]:
# Number of cross-validation folds (k); passed as `cv` to the model-fitting
# helpers and recorded in the 'K Folds' column of the result tables.
cv = 10

In [13]:
# Should I do standard scaler?

#from sklearn.preprocessing import StandardScaler
#scalerX = StandardScaler().fit(X_train)
#scalery = StandardScaler().fit(y_train)

#X_train = scalerX.transform(X_train)
#y_train = scalery.transform(y_train)
#X_test = scalerX.transform(X_test)
#y_test = scalery.transform(y_test)

## 1) Linking Matrix Norms to Patients

In [14]:
# Features: everything in dec_norms_df except the metadata columns.
non_feature_cols = ["Subject", "Condition", "Update Number"]
norm_input_df = dec_norms_df.drop(columns=non_feature_cols)

# Labels: everything in dec_norms_df except the condition, update number and norm.
non_label_cols = ["Condition", "Update Number", "Frobenius Norm"]
norm_label_df = dec_norms_df.drop(columns=non_label_cols)

norm_input_df.head()

Unnamed: 0,Frobenius Norm
0,0.06636
1,0.06636
2,9.70939
3,8.20908
4,10.406943


In [15]:
# Two-way lookup between subject ID strings and integer class labels, where the
# label is the subject's position in `keys`.
key_to_num = {subject: label for label, subject in enumerate(keys)}
num_to_key = dict(enumerate(keys))

In [16]:
# Encode subject IDs as integer class labels. The mapping is read from the
# untouched dec_norms_df column (row order is identical because norm_label_df
# was derived from it), so re-running this cell is idempotent; mapping the
# already-encoded integers through key_to_num would turn every label into NaN.
norm_label_df["Subject"] = dec_norms_df["Subject"].map(key_to_num).to_numpy()
assert norm_label_df["Subject"].notna().all(), "Subject ID missing from key_to_num"
norm_label_df.head()

Unnamed: 0,Subject
0,0
1,0
2,0
3,0
4,0


In [17]:
# train_test_val_split returns four values unless validation=True is passed, so
# unpack four directly. Attempting a six-way unpack first and catching
# ValueError ran the split twice and would also hide genuine ValueErrors raised
# inside the split itself.
X_train, y_train, X_test, y_test = train_test_val_split(norm_input_df, norm_label_df)
# Flatten the single-column label frame to the 1-D array the estimators expect.
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(1489, 1)


Unnamed: 0,Frobenius Norm
420,3.567582
146,5.589772
266,0.069974
695,5.796815
372,17.938752


In [21]:
# Result logs
# Column layout shared by every results table.
my_metrics_cols = ['Algorithm', 'One Off Acc', 'CV Acc', 'K Folds']

# Four independent, empty tables: a general log plus one per input
# representation (1D norm, flattened decoder, vector of norms).
res_df, norm1d_res_df, flatten_dec_res_df, norm_vec_res_df = (
    pd.DataFrame(columns=my_metrics_cols) for _ in range(4)
)

## Case 1: 1D Norm

In [19]:
# Fully functionalized

def train_model(my_model, X_train, y_train, cv, res_df, verbose=False):
    '''Fit one model via ``fit_ml_algo`` and append its accuracies to a results table.

    Parameters
    ----------
    my_model : estimator
        Model passed to ``fit_ml_algo``; its ``str()`` is stored as the algorithm name.
    X_train, y_train : array-like
        Training features and labels.
    cv : int or cross-validation splitter
        Cross-validation setting forwarded to ``fit_ml_algo`` and recorded in
        the 'K Folds' column.
    res_df : pandas.DataFrame
        Existing results table; it is not modified.
    verbose : bool, default False
        If True, print the model name and both accuracies.

    Returns
    -------
    pandas.DataFrame
        A new table made of ``res_df`` plus one row for this model, with rows
        renumbered 0..n-1.
    '''
    _, acc, acc_cv = fit_ml_algo(my_model, X_train, y_train, cv)
    if verbose:
        print(f"{str(my_model)}")
        print(f"Accuracy: {acc}")
        # Report the fold count actually used rather than a fixed "10".
        print(f"Accuracy CV {cv}-Fold: {acc_cv}")
        print()

    my_metrics_cols = ['Algorithm', 'One Off Acc', 'CV Acc', 'K Folds']
    temp_df = pd.DataFrame([[str(my_model), acc, acc_cv, cv]], columns=my_metrics_cols)
    # ignore_index avoids every appended row carrying the same label 0.
    return pd.concat((res_df, temp_df), ignore_index=True)

In [22]:
# Repeat the full model sweep num_runs times so the CV accuracy can be averaged per model.
# NOTE: estimators created without random_state (e.g. LinearSVC, SGDClassifier)
# can give different results from run to run.
num_runs = 1

my_models = [LogisticRegression(), KNeighborsClassifier(), GaussianNB(), LinearSVC(), SGDClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier()]
# Rows follow the order of my_models, columns are trials; entries are CV accuracy (%).
running_average_array = np.zeros((len(my_models), num_runs))

for trial_num in range(num_runs):
    # 1-based counter so that the final trial reads "Trial N of N".
    print(f"Trial {trial_num + 1} of {num_runs}")
    for model_num, model in enumerate(my_models):
        norm1d_res_df = train_model(model, X_train, y_train, cv, norm1d_res_df)
        # The last row belongs to the model just trained; read its CV accuracy by column name.
        running_average_array[model_num, trial_num] = norm1d_res_df['CV Acc'].iloc[-1]

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 18.603
Accuracy CV 10-Fold: 18.805

SGDClassifier()
Accuracy: 14.641
Accuracy CV 10-Fold: 15.581

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 20.618
Accuracy CV 10-Fold: 17.999

SGDClassifier()
Accuracy: 14.305
Accuracy CV 10-Fold: 15.245

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 19.342
Accuracy CV 10-Fold: 18.334

SGDClassifier()
Accuracy: 15.044
Accuracy CV 10-Fold: 13.969

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 19.342
Accuracy CV 10-Fold: 18.267

SGDClassifier()
Accuracy: 14.171
Accuracy CV 10-Fold: 13.969

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 20.752
Accuracy CV 10-Fold: 18.67

SGDClassifier()
Accuracy: 13.768
Accuracy CV 10-Fold: 14.305

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 19.879
Accuracy CV 10-Fold: 17.126

SGDClassifier()
Accuracy: 14.372
Accuracy CV 10-Fold: 14.372

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 16.454
Accuracy CV 10-Fold: 18.2

SGDClassifier()
Accuracy: 14.372
Accuracy CV 10-Fold: 15.245

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 19.342
Accuracy CV 10-Fold: 18.469

SGDClassifier()
Accuracy: 13.902
Accuracy CV 10-Fold: 14.372

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 20.215
Accuracy CV 10-Fold: 18.536

SGDClassifier()
Accuracy: 14.305
Accuracy CV 10-Fold: 15.178

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304

LogisticRegression()
Accuracy: 17.931
Accuracy CV 10-Fold: 16.857



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
Accuracy: 52.048
Accuracy CV 10-Fold: 30.356

GaussianNB()
Accuracy: 19.073
Accuracy CV 10-Fold: 17.26





LinearSVC()
Accuracy: 20.081
Accuracy CV 10-Fold: 19.073

SGDClassifier()
Accuracy: 14.641
Accuracy CV 10-Fold: 14.238

DecisionTreeClassifier()
Accuracy: 100.0
Accuracy CV 10-Fold: 28.946

GradientBoostingClassifier()
Accuracy: 65.682
Accuracy CV 10-Fold: 32.304



In [23]:
# CV accuracy (%) per model and trial: rows follow the order of my_models,
# columns are trials.
# NOTE(review): the saved output below shows the same value in every row, which
# does not match the per-model CV accuracies logged above -- it looks stale; re-run to refresh.
running_average_array

array([[16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857],
       [16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857, 16.857,
        16.857, 16.857]])

In [None]:
#avg_cv_acc_lst = running_average_lst/10

#running_average_lst = np.mean(running_average_array)

In [None]:
#res_df.head(100)

Choose the Top Performers for Testing

In [None]:
# Held-out test evaluation of the best cross-validated models.
# `cv` is the fold count set earlier in the notebook (10); it is recorded in the
# 'K Folds' column only to document how the models were selected.
num_decimals = 3
test_df = pd.DataFrame(columns=my_metrics_cols)

print("KNN")
# Fit directly on the training split; no cross-validation is needed here
# because only the held-out test accuracy is reported.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
# Accuracy (%) on the test split; stored under the 'CV Acc' column of test_df.
test_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, num_decimals)
print(test_acc)
temp_df = pd.DataFrame([['KNN', 'NA', test_acc, cv]], columns=my_metrics_cols)
test_df = pd.concat((test_df, temp_df))

In [None]:
print("GBT")
# Fit directly on the training split; no cross-validation is needed here
# because only the held-out test accuracy is reported.
gbt_model = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = gbt_model.predict(X_test)
# Accuracy (%) on the test split; stored under the 'CV Acc' column of test_df.
test_acc = round(metrics.accuracy_score(y_test, y_pred) * 100, num_decimals)
print(test_acc)
temp_df = pd.DataFrame([['GBT', 'NA', test_acc, cv]], columns=my_metrics_cols)
test_df = pd.concat((test_df, temp_df))

In [None]:
# Held-out test accuracy of the selected models, one row per model
# (the accuracy is stored in the 'CV Acc' column).
test_df.head()

## Case 2: Flatten Decoder Matrices as Input

## Case 3: Feed in a Vector of Norms

## 2) Linking EMG Data to Patients

## Cluster Decoder Matrix Norms