> __Purpose:__ Implement an initial privacy attack to quantify how much re-identification and linkability risk exists from personalization parameters (the decoder used in the co-adaptation algorithm). Thus, implement a basic ML model to link the decoder matrices back to the corresponding subject.  

-- 7 subjects, so pure guessing would be correct 1/7 ≈ 14.29% of the time on average

In [2]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn
import time
import pandas as pd
import pickle
import random
from kcs_ml_infr import *

# Seed both generators: random.seed() only covers Python's `random` module,
# while scikit-learn objects created with random_state=None draw from NumPy's
# global RNG, which would otherwise stay unseeded.
random.seed(a=1)
np.random.seed(1)

# Load In Data

In [3]:
# Subject IDs; used below to index the per-subject decoder containers.
keys = ['METACPHS_S106', 'METACPHS_S107', 'METACPHS_S108', 'METACPHS_S109',
        'METACPHS_S110', 'METACPHS_S111', 'METACPHS_S112']


def _load_block(path):
    """Return ``(Ws, times)`` from one pickled data block.

    Each pickle holds an 11-tuple laid out as: refs, poss, dec_vels, int_vel,
    emgs, Ws, Hs, alphas, pDs, times, conditions. Only Ws (position 5) and
    times (position 9) are kept.

    NOTE: unpickling can execute arbitrary code -- only load trusted files.
    """
    with open(path, 'rb') as handle:
        _, _, _, _, _, Ws, _, _, _, times, _ = pickle.load(handle)
    return Ws, times


# Forward slashes work on Windows and POSIX alike and avoid the invalid
# backslash-c escape sequence that the previous path literals contained.
Ws_block1, times_block1 = _load_block('Data/cphs_data_block1.pickle')
Ws_block2, times_block2 = _load_block('Data/cphs_data_block2.pickle')

In [4]:
# Indices used below to pick one decoder per update out of each subject's array.
# A forward slash keeps the relative path valid on Windows and POSIX alike.
update_ix = np.load("Data/update_ix.npy")

print(update_ix.shape)
print(update_ix)

(19,)
[    0  1200  2402  3604  4806  6008  7210  8412  9614 10816 12018 13220
 14422 15624 16826 18028 19230 20432 20769]


Build the input DataFrame of flattened decoder arrays here.

In [4]:
# Two empty frames (one per data block) sharing a single column layout.
flattened_columns = ["Subject", "Condition", "Update Number", "Flattened Decoder"]
dec_flattened_df1, dec_flattened_df2 = (
    pd.DataFrame(columns=flattened_columns) for _ in range(2)
)

dec_flattened_df1.head()

Unnamed: 0,Subject,Condition,Update Number,Flattened Decoder


In [5]:
num_conds = 8

t0 = time.perf_counter()

# Collect rows in plain lists and build each DataFrame once at the end.
# Growing a frame with `.loc[len(df)] = ...` copies data on every insert, and
# re-running the cell used to append duplicate rows to the existing frames.
flattened_cols = ["Subject", "Condition", "Update Number", "Flattened Decoder"]
rows_block1 = []
rows_block2 = []

for key in keys:
    participant_dec1 = Ws_block1[key]
    participant_dec2 = Ws_block2[key]

    for my_cond in range(num_conds):
        for update_number, update_idx in enumerate(update_ix):
            # Flatten the 2-D decoder stored at (condition, update index).
            rows_block1.append(
                [key, my_cond, update_number, np.ravel(participant_dec1[my_cond, update_idx, :, :])]
            )
            rows_block2.append(
                [key, my_cond, update_number, np.ravel(participant_dec2[my_cond, update_idx, :, :])]
            )

dec_flattened_df1 = pd.DataFrame(rows_block1, columns=flattened_cols)
dec_flattened_df2 = pd.DataFrame(rows_block2, columns=flattened_cols)

t1 = time.perf_counter()
total = t1 - t0
print(total)

9.705183982849121


Load in the other data

In [8]:
t0 = time.perf_counter()

# Forward slashes replace the old backslash paths, whose backslash-d sequence
# is an invalid string escape and which only resolved on Windows.
dec_norms_df1 = pd.read_csv("Data/decoder_norms1.csv")
dec_norms_df2 = pd.read_csv("Data/decoder_norms2.csv")

t1 = time.perf_counter()
total = t1 - t0
print(total)

0.02067089080810547


In [9]:
# Stack the block-1 and block-2 frames row-wise. Row labels are kept as they
# were, so index values from the two halves can repeat.
dec_norms_df = pd.concat([dec_norms_df1, dec_norms_df2], axis=0)
dec_flattened_df = pd.concat([dec_flattened_df1, dec_flattened_df2], axis=0)
# This one isn't working yet
#dec_vec_norm_df = pd.concat((dec_vec_norm_df1, dec_vec_norm_df2))

In [10]:
print(dec_norms_df.shape)
# Drop the leftover 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps the cell re-runnable: a second run used to raise
# KeyError because the column was already gone.
dec_norms_df.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
dec_norms_df.head()

(2128, 5)


Unnamed: 0,Subject,Condition,Update Number,Frobenius Norm
0,METACPHS_S106,0,0,0.06636
1,METACPHS_S106,0,1,0.06636
2,METACPHS_S106,0,2,9.70939
3,METACPHS_S106,0,3,8.20908
4,METACPHS_S106,0,4,10.406943


In [11]:
# Sanity check: print the combined frame's (rows, columns) and preview the first rows.
print(dec_flattened_df.shape)
dec_flattened_df.head()

(2128, 4)


Unnamed: 0,Subject,Condition,Update Number,Flattened Decoder
0,METACPHS_S106,0,0,"[0.002722144351611262, 0.002605931562722017, 0..."
1,METACPHS_S106,0,1,"[0.002722144351611262, 0.002605931562722017, 0..."
2,METACPHS_S106,0,2,"[-0.34157085409453486, 0.08410593293763585, -0..."
3,METACPHS_S106,0,3,"[-0.1738624752800762, 0.003705171262358347, 0...."
4,METACPHS_S106,0,4,"[0.8321061501817386, -1.3261242289666402, 0.28..."


# Classification
1. Logistic Regression
2. K-Nearest Neighbor
3. Gaussian Naive Bayes
4. Linear SVC
5. Stochastic Gradient Descent
6. Decision Tree Classifier
7. Gradient Boosting Trees

In [12]:
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Number of cross-validation folds (k) handed to train_model / test_model below
cv = 10

## 1) Linking Matrix Norms to Participants

In [14]:
# Label frame: strip the condition, update-number and norm columns from the norms table.
norm_label_df = dec_norms_df.drop(columns=["Condition", "Update Number", "Frobenius Norm"])

In [15]:
# Two-way lookup between subject IDs and their integer class codes
# (the code is the subject's position in `keys`).
key_to_num = {subject: code for code, subject in enumerate(keys)}
num_to_key = dict(enumerate(keys))

In [16]:
# Swap each subject ID string for its integer class code.
subject_codes = norm_label_df["Subject"].map(key_to_num)
norm_label_df["Subject"] = subject_codes
norm_label_df.head()

Unnamed: 0,Subject
0,0
1,0
2,0
3,0
4,0


In [17]:
# Result logs
my_metrics_cols = ['Algorithm', 'One Off Acc', 'CV Acc', 'K Folds']
res_df = pd.DataFrame(columns=my_metrics_cols)

norm1d_res_df = pd.DataFrame(columns=my_metrics_cols)
SSnorm1d_res_df = pd.DataFrame(columns=my_metrics_cols)
flatten_dec_res_df = pd.DataFrame(columns=my_metrics_cols)
SSflatten_dec_res_df = pd.DataFrame(columns=my_metrics_cols)
norm_vec_res_df = pd.DataFrame(columns=my_metrics_cols)
SSnorm_vec_res_df = pd.DataFrame(columns=my_metrics_cols)

In [18]:
# Candidate classifiers, all with default hyper-parameters, in the order
# listed under "Classification" above.
my_models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GaussianNB(),
    LinearSVC(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
]

## Case 1: 1D Norm

In [19]:
# Feature frame for case 1: remove the subject, condition and update-number columns.
norm_input_df = dec_norms_df.drop(columns=["Subject", "Condition", "Update Number"])
norm_input_df.head()

Unnamed: 0,Frobenius Norm
0,0.06636
1,0.06636
2,9.70939
3,8.20908
4,10.406943


In [20]:
# Split the norm features and labels into train / test / validation parts.
# The helper's return order is (X_train, y_train, X_test, y_test, X_val, y_val).
norm_splits = train_test_val_split(norm_input_df, norm_label_df)
X_train, y_train, X_test, y_test, X_val, y_val = norm_splits
# Flatten the training labels to 1-D before they are passed to the models.
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(1489, 1)


Unnamed: 0,Frobenius Norm
420,3.567582
146,5.589772
266,0.069974
695,5.796815
372,17.938752


In [21]:
# Run every candidate model through train_model; the results log is
# reassigned from its return value on each pass.
for model in my_models:
    norm1d_res_df = train_model(model, X_train, y_train, cv, norm1d_res_df)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [22]:
# Show up to the first 100 rows of the 1-D norm training log.
norm1d_res_df.head(100)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds
0,LogisticRegression(),17.931,16.857,10
0,KNeighborsClassifier(),52.048,30.356,10
0,GaussianNB(),19.073,17.26,10
0,LinearSVC(),19.342,18.469,10
0,SGDClassifier(),13.835,14.842,10
0,DecisionTreeClassifier(),100.0,28.946,10
0,GradientBoostingClassifier(),65.682,32.304,10


Now Test

In [23]:
# Fresh log for the held-out evaluation; reassigned from test_model's return
# value after each model.
test_columns = ['Algorithm', 'CV Acc', 'Test Acc', 'K Folds']
test_df = pd.DataFrame(columns=test_columns)

for model in my_models:
    test_df = test_model(model, X_train, y_train, X_test, y_test, test_df, cv)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [24]:
# Show up to the first 100 rows of the test log for the 1-D norm case.
test_df.head(100)

Unnamed: 0,Algorithm,CV Acc,Test Acc,K Folds
0,LogisticRegression(),16.857,16.901,10
0,KNeighborsClassifier(),30.356,28.795,10
0,GaussianNB(),17.26,19.718,10
0,LinearSVC(),18.469,17.371,10
0,SGDClassifier(),15.85,14.554,10
0,DecisionTreeClassifier(),28.946,27.856,10
0,GradientBoostingClassifier(),32.304,27.23,10


## Case 2: Flattened Decoder Matrices as Input

In [25]:
# Feature frame for case 2: remove the subject, condition and update-number columns.
flattened_input_df = dec_flattened_df.drop(columns=["Subject", "Condition", "Update Number"])
flattened_input_df.head()

Unnamed: 0,Flattened Decoder
0,"[0.002722144351611262, 0.002605931562722017, 0..."
1,"[0.002722144351611262, 0.002605931562722017, 0..."
2,"[-0.34157085409453486, 0.08410593293763585, -0..."
3,"[-0.1738624752800762, 0.003705171262358347, 0...."
4,"[0.8321061501817386, -1.3261242289666402, 0.28..."


In [26]:
# Take the labels from the same frame that supplies the features, so rows and
# labels cannot drift apart if the norms CSV is ordered differently from the
# flattened-decoder frame (previously norm_label_df, built from the CSV, was reused).
flattened_label_df = dec_flattened_df[["Subject"]].copy()
flattened_label_df["Subject"] = flattened_label_df["Subject"].map(key_to_num)

X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(flattened_input_df, flattened_label_df)
# Flatten the training labels to 1-D before they are passed to the models.
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(1489, 1)


Unnamed: 0,Flattened Decoder
420,"[0.5594479988320739, 0.4812012592595234, -0.31..."
146,"[-0.05469055551972085, 0.08547177500493985, -0..."
266,"[-0.008973604228856694, -0.008313810900701315,..."
695,"[-0.05559843195043232, -1.0993648180373194, 0...."
372,"[2.0976718157092886, -1.4354093970974517, -0.0..."


In [27]:
def _expand_decoder_column(frame):
    """Expand column 0 of *frame* (one 1-D array per row) into one numeric column per element.

    Stacks all rows in a single call instead of concatenating one-row frames in
    a loop, which copied the accumulated frame on every iteration. The result
    has a default 0..n-1 row index (the loop version labelled every row 0).
    All arrays must have the same length.
    """
    vectors = list(frame.iloc[:, 0])
    if not vectors:
        # np.vstack rejects an empty sequence; mirror the old empty result.
        return pd.DataFrame()
    return pd.DataFrame(np.vstack(vectors))


X_train2 = _expand_decoder_column(X_train)
X_test2 = _expand_decoder_column(X_test)

X_train = X_train2
X_test = X_test2

X_train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.559448,0.481201,-0.311722,-0.309007,0.106709,0.161785,-0.301699,-0.321967,-0.08306,0.570488,...,0.039315,0.361549,0.411409,0.181654,0.168465,0.048938,0.070892,-0.032129,0.055582,0.466536
0,-0.054691,0.085472,-0.139612,0.109828,-0.132205,-0.125863,0.872657,0.057642,1.215535,-0.042984,...,0.295885,-0.664531,-0.831946,1.214455,0.905204,-1.332841,-0.037715,0.540905,0.181231,-0.617622
0,-0.008974,-0.008314,-0.006412,-0.001125,-0.003369,-0.001689,-0.008623,-0.004556,-0.009388,-0.00221,...,-0.008424,-0.002359,-0.00153,-0.002416,-0.001757,-0.007078,-0.007921,-0.00653,-0.003725,-0.005481
0,-0.055598,-1.099365,0.040769,-0.288075,0.016066,-0.113498,0.08255,0.019032,-0.090005,-1.670067,...,0.003365,0.346554,-0.189199,0.189891,0.210803,0.022203,-0.021496,0.434912,-0.832817,0.03453
0,2.097672,-1.435409,-0.056953,1.67027,-1.501497,-0.962351,0.083523,2.73304,-1.270545,0.097296,...,-2.02884,-0.35464,2.23102,2.792864,-1.524705,2.493864,-0.648781,-1.369147,1.047059,-0.720754


Expand each row's single array-valued cell into one column per array element!

In [29]:
# Progress counter is 1-based ("1 of 7" ... "7 of 7"), matching the later
# loops; it previously ran from "0 of 7" to "6 of 7".
for model_num, model in enumerate(my_models, start=1):
    print(f"{model_num} of {len(my_models)}")
    flatten_dec_res_df = train_model(model, X_train, y_train, cv, flatten_dec_res_df)

0 of 7
1 of 7


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


2 of 7
3 of 7




4 of 7
5 of 7
6 of 7


In [30]:
# Show up to the first 100 rows of the flattened-decoder training log.
flatten_dec_res_df.head(100)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds
0,LogisticRegression(),87.911,70.853,10
0,KNeighborsClassifier(),83.815,76.494,10
0,GaussianNB(),71.592,69.778,10
0,LinearSVC(),83.479,71.256,10
0,SGDClassifier(),79.987,69.51,10
0,DecisionTreeClassifier(),100.0,66.555,10
0,GradientBoostingClassifier(),100.0,88.046,10


Test the models on the testing data

In [31]:
# Start a new test log, then evaluate each model on the held-out split.
test_df = pd.DataFrame(columns=['Algorithm', 'CV Acc', 'Test Acc', 'K Folds'])
total_models = len(my_models)
for position, model in enumerate(my_models, start=1):
    # 1-based progress indicator
    print(f"{position} of {total_models}")
    test_df = test_model(model, X_train, y_train, X_test, y_test, test_df, cv)

LogisticRegression()
CV Accuracy: 70.853
Test Accuracy: 71.987



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNeighborsClassifier()
CV Accuracy: 76.494
Test Accuracy: 76.526

GaussianNB()
CV Accuracy: 69.778
Test Accuracy: 72.926





LinearSVC()
CV Accuracy: 71.323
Test Accuracy: 73.552

SGDClassifier()
CV Accuracy: 68.972
Test Accuracy: 67.606

DecisionTreeClassifier()
CV Accuracy: 65.749
Test Accuracy: 67.919

GradientBoostingClassifier()
CV Accuracy: 87.911
Test Accuracy: 89.984



In [32]:
# Show up to the first 100 rows of the test log for the flattened-decoder case.
test_df.head(100)

Unnamed: 0,Algorithm,CV Acc,Test Acc,K Folds
0,LogisticRegression(),70.853,71.987,10
0,KNeighborsClassifier(),76.494,76.526,10
0,GaussianNB(),69.778,72.926,10
0,LinearSVC(),71.323,73.552,10
0,SGDClassifier(),68.972,67.606,10
0,DecisionTreeClassifier(),65.749,67.919,10
0,GradientBoostingClassifier(),87.911,89.984,10


Redo but try using Standard Scaler this time

In [33]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features and standardize them in one call.
# NOTE(review): the scaler is fitted on all of X_train before any fold split
# that happens downstream -- confirm that is acceptable for the CV numbers.
scaler_Xtrain = StandardScaler()
XtrainSS = scaler_Xtrain.fit_transform(X_train)

# Scaling must leave the shape unchanged; print both for comparison.
print(X_train.shape, XtrainSS.shape, sep="\n")

(1489, 128)
(1489, 128)


In [34]:
# Same training sweep as before, now on the standardized features.
total_models = len(my_models)
for position, model in enumerate(my_models, start=1):
    # 1-based progress indicator
    print(f"{position} of {total_models}")
    SSflatten_dec_res_df = train_model(model, XtrainSS, y_train, cv, SSflatten_dec_res_df)

1 of 7
2 of 7


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


3 of 7
4 of 7




5 of 7
6 of 7
7 of 7


In [35]:
# Show up to the first 100 rows of the standardized flattened-decoder training log.
SSflatten_dec_res_df.head(100)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds
0,LogisticRegression(),89.322,70.114,10
0,KNeighborsClassifier(),82.74,74.211,10
0,GaussianNB(),71.592,69.778,10
0,LinearSVC(),83.345,71.39,10
0,SGDClassifier(),79.516,68.435,10
0,DecisionTreeClassifier(),100.0,66.42,10
0,GradientBoostingClassifier(),100.0,88.314,10


Thus we see no real performance gains by using standard scaler.

Now try to optimize the best performer (gradient-boosted trees, GBT)
1. max_depth: int, default=3
- The maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables. Values must be in the range [1, inf).

## Case 3: Feed in a Vector of Norms
> This isn't super high priority since the flattened version has high accuracy

In [None]:
# Case 3 draft, disabled by wrapping it in a string literal so none of it runs.
# It relies on dec_vec_norm_df, whose construction is still commented out earlier
# in the notebook. The bare `0` after the string is the cell's only live expression.
# NOTE(review): the split inside the draft passes norm_input_df even though
# dec_vec_norm_input_df was just built -- looks like a copy/paste slip; confirm
# before re-enabling.
'''
#dec_vec_norm_df
dec_vec_norm_input_df = dec_vec_norm_df.drop(["Subject",  "Condition"], axis=1)
dec_vec_norm_input_df.head()



# norm_label_df probably needs to be changed to a new variable... shouldn't have the same shape...
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(norm_input_df, norm_label_df)
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()



X_train2 = pd.DataFrame()
X_test2 = pd.DataFrame()
for my_row in range(X_train.shape[0]):
    test=pd.DataFrame(X_train.iloc[my_row,0]).T
    X_train2 = pd.concat((X_train2, test))
    
for my_row in range(X_test.shape[0]):
    test=pd.DataFrame(X_test.iloc[my_row,0]).T
    X_test2 = pd.concat((X_test2, test))
X_train2.head()



for model_num, model in enumerate(my_models):
    print(f"{model_num} of {len(my_models)}")
    norm_vec_res_df = train_model(model, X_train2, y_train, cv, norm_vec_res_df)



norm_vec_res_df.head(100)
'''
0

## 2) Cluster Decoder Matrix Norms
> Cluster using 2D dataset of (norm, condition), condition being the y axis maybe?
> Still not sure if I should just take the norm or take the norm wrt the first/last.  
> Can I make a similarity matrix? How would I use that for clustering...
> Save this idea for later... clustering is just another form of an attack vector, the ML model above already was successful