> __Purpose:__ Implement an initial privacy attack to quantify how much re-identiifcation and linkability risk exists from personalization parameters (the decoder used in the co-adaptation algorithm). Thus, implement a basic ML model to link the decoder matrices back to the corresponding subject.  

- 14 Subjects, therefore random guessing would be 7.14% correct on average

In [2]:
import numpy as np
import matplotlib.pyplot as plt
# import seaborn
import time
import pandas as pd
import pickle
import random
from kcs_ml_infr import *
from experiment_params import *

import warnings
warnings.filterwarnings("ignore")

random.seed(a=1)

# Load In Data

I'm just making the flatten array input DF here

In [3]:
dec_flattened_df1 = pd.read_csv("Data\dec_full_flattened_df1.csv")
dec_flattened_df2 = pd.read_csv("Data\dec_full_flattened_df2.csv")

dec_norms_df1 = pd.read_csv("Data\decoder_full_norms1.csv")
dec_norms_df2 = pd.read_csv("Data\decoder_full_norms2.csv")

(19,)
[    0  1200  2402  3604  4806  6008  7210  8412  9614 10816 12018 13220
 14422 15624 16826 18028 19230 20432 20769]


In [4]:
dec_norms_df = pd.concat((dec_norms_df1, dec_norms_df2))
dec_flattened_df = pd.concat((dec_flattened_df1, dec_flattened_df2))
if dec_flattened_df.columns[0]=='Unnamed: 0':
    dec_flattened_df.drop('Unnamed: 0', axis=1, inplace=True)

print(dec_flattened_df.shape)
dec_flattened_df.head()

(4256, 4)


Unnamed: 0,Subject,Condition,Update Number,Flattened Decoder
0,METACPHS_S106,0,0,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...
1,METACPHS_S106,0,1,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...
2,METACPHS_S106,0,2,[-0.34157085 0.08410593 -0.54057447 0.444319...
3,METACPHS_S106,0,3,[-0.17386248 0.00370517 0.40721562 -0.515221...
4,METACPHS_S106,0,4,[ 0.83210615 -1.32612423 0.28718983 1.302252...


In [5]:
print(dec_norms_df.shape)
dec_norms_df.drop('Unnamed: 0', axis=1, inplace=True)
dec_norms_df.head()

(4256, 5)


Unnamed: 0,Subject,Condition,Update Number,Frobenius Norm
0,METACPHS_S106,0,0,0.06636
1,METACPHS_S106,0,1,0.06636
2,METACPHS_S106,0,2,9.70939
3,METACPHS_S106,0,3,8.20908
4,METACPHS_S106,0,4,10.406943


# Classification
1. Logistic Regression
2. K-Nearest Neighbor
3. Gaussian Naive Bayes
4. Linear SVC
5. Stochastic Gradient Descent
6. Decision Tree Classifier
7. Gradient Boosting Trees

In [6]:
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

my_models = [LogisticRegression(), KNeighborsClassifier(), LinearSVC(), SGDClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier()]

## 1) Linking Matrix Norms to Participants

In [7]:
norm_label_df = dec_norms_df.drop(["Condition", "Update Number", "Frobenius Norm"], axis=1)

In [9]:
norm_label_df["Subject"] = norm_label_df["Subject"].map(key_to_num)
norm_label_df.head()

Unnamed: 0,Subject
0,0
1,0
2,0
3,0
4,0


In [10]:
# Result logs
norm1d_res_df = pd.DataFrame(columns=my_metrics_cols)
SSnorm1d_res_df = pd.DataFrame(columns=my_metrics_cols)
flatten_dec_res_df = pd.DataFrame(columns=my_metrics_cols)
SSflatten_dec_res_df = pd.DataFrame(columns=my_metrics_cols)
norm_vec_res_df = pd.DataFrame(columns=my_metrics_cols)
SSnorm_vec_res_df = pd.DataFrame(columns=my_metrics_cols)

## Case 1: 1D Norm

In [12]:
norm_input_df = dec_norms_df.drop(["Subject",  "Condition", "Update Number"], axis=1)
norm_input_df.head()

Unnamed: 0,Frobenius Norm
0,0.06636
1,0.06636
2,9.70939
3,8.20908
4,10.406943


In [13]:
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(norm_input_df, norm_label_df)
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(2979, 1)


Unnamed: 0,Frobenius Norm
965,9.668409
1527,10.882174
2044,27.41625
1346,7.021757
526,7.276107


In [14]:
for model_num, model in enumerate(my_models):
    norm1d_res_df = train_model(model, X_train, y_train, cv, norm1d_res_df)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [15]:
norm1d_res_df.head(100)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds,N
0,LogisticRegression(),11.346,11.413,5,
0,KNeighborsClassifier(),41.692,20.913,5,
0,GaussianNB(),10.138,10.138,5,
0,LinearSVC(),11.178,10.339,5,
0,SGDClassifier(),7.184,6.982,5,
0,DecisionTreeClassifier(),100.0,20.443,5,
0,GradientBoostingClassifier(),60.087,21.45,5,


Now Test

In [16]:
test_df = pd.DataFrame(columns=['Algorithm', 'CV Acc', 'Test Acc', 'K Folds'])

for model_num, model in enumerate(my_models):
    test_df = test_model(model, X_train, y_train, X_test, y_test, test_df, cv)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [17]:
test_df.head(10)

Unnamed: 0,Algorithm,CV Acc,Test Acc,K Folds,N
0,LogisticRegression(),11.413,10.023,5,
0,KNeighborsClassifier(),20.913,21.848,5,
0,GaussianNB(),10.138,10.337,5,
0,LinearSVC(),10.507,10.023,5,
0,SGDClassifier(),7.553,5.638,5,
0,DecisionTreeClassifier(),20.443,20.595,5,
0,GradientBoostingClassifier(),21.45,23.179,5,


## Case 2: Flattened Decoder Matrices as Input

In [18]:
flattened_input_df = dec_flattened_df.drop(["Subject",  "Condition", "Update Number"], axis=1)
flattened_input_df.head()

Unnamed: 0,Flattened Decoder
0,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...
1,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...
2,[-0.34157085 0.08410593 -0.54057447 0.444319...
3,[-0.17386248 0.00370517 0.40721562 -0.515221...
4,[ 0.83210615 -1.32612423 0.28718983 1.302252...


Since it was loaded in from a csv, the list in the column needs to be coverted from a string back to a list

In [19]:
#df_obj.apply(lambda x: x.str.strip())
flattened_input_df["List Flat Dec"] = flattened_input_df["Flattened Decoder"].apply(lambda x: x.strip("[]").split())
flattened_input_df.head()

Unnamed: 0,Flattened Decoder,List Flat Dec
0,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...,"[2.72214435e-03, 2.60593156e-03, 3.08748960e-0..."
1,[2.72214435e-03 2.60593156e-03 3.08748960e-03 ...,"[2.72214435e-03, 2.60593156e-03, 3.08748960e-0..."
2,[-0.34157085 0.08410593 -0.54057447 0.444319...,"[-0.34157085, 0.08410593, -0.54057447, 0.44431..."
3,[-0.17386248 0.00370517 0.40721562 -0.515221...,"[-0.17386248, 0.00370517, 0.40721562, -0.51522..."
4,[ 0.83210615 -1.32612423 0.28718983 1.302252...,"[0.83210615, -1.32612423, 0.28718983, 1.302252..."


In [20]:
flattened_input_df.drop("Flattened Decoder", axis=1, inplace=True)
flattened_input_df.head()

Unnamed: 0,List Flat Dec
0,"[2.72214435e-03, 2.60593156e-03, 3.08748960e-0..."
1,"[2.72214435e-03, 2.60593156e-03, 3.08748960e-0..."
2,"[-0.34157085, 0.08410593, -0.54057447, 0.44431..."
3,"[-0.17386248, 0.00370517, 0.40721562, -0.51522..."
4,"[0.83210615, -1.32612423, 0.28718983, 1.302252..."


In [21]:
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(flattened_input_df, norm_label_df)
y_train = np.ravel(y_train)

print(X_train.shape)
X_train.head()

(2979, 1)


Unnamed: 0,List Flat Dec
965,"[-0.00792164, -0.42145286, 0.30272061, -0.2527..."
1527,"[-0.103048, 1.11832736, 0.38169341, 1.54101125..."
2044,"[7.72862531e-01, -1.05491275e+00, 3.80358874e+..."
1346,"[0.11450897, -0.33299429, -1.37520299, -0.1067..."
526,"[-3.76154983e-02, 8.36445761e-01, -4.14212376e..."


In [22]:
X_train2 = pd.DataFrame()
X_test2 = pd.DataFrame()
for my_row in range(X_train.shape[0]):
    test=pd.DataFrame(X_train.iloc[my_row,0]).T
    X_train2 = pd.concat((X_train2, test))
    
for my_row in range(X_test.shape[0]):
    test=pd.DataFrame(X_test.iloc[my_row,0]).T
    X_test2 = pd.concat((X_test2, test))
    
X_train = X_train2
X_test = X_test2
    
X_train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.00792164,-0.42145286,0.30272061,-0.25272956,-0.33508519,-0.04219821,-2.65191646,0.07388736,2.15039853,-1.46173108,...,-0.31101374,0.2905577,0.24299258,-0.51209972,-0.39023449,0.37560675,-0.34286523,-0.65905896,0.15440369,0.95438884
0,-0.103048,1.11832736,0.38169341,1.54101125,1.04691243,-0.59616771,1.79022657,-0.43157433,-0.2690051,0.63446982,...,-0.58316167,0.67792521,-1.57285097,0.79482142,0.51092744,1.07589604,-0.7768124,0.15189033,0.06429752,2.38022212
0,0.772862531,-1.05491275,3.80358874,1.13811241,1.27843309,-0.28429594,-1.35410422,1.90774398,1.30870409,1.67018305,...,-2.50039626,-1.70101834,4.10756574,-3.80735266,1.6631298,0.759469791,2.81032811,-1.81786069,0.952208403,0.401977875
0,0.11450897,-0.33299429,-1.37520299,-0.10673519,0.62908695,-0.43619196,-0.78332833,-0.07576671,0.68623717,-2.03665833,...,0.02652651,0.09112522,2.19220989,0.42864349,-0.20907603,0.11129408,-0.28677815,-0.60538864,0.72347392,0.41170373
0,-0.0376154983,0.836445761,-0.414212376,-0.662608656,0.0138755237,0.566857392,-0.517831971,0.345051666,0.268683558,-1.39033617,...,-0.0167394324,-0.752436349,0.136969029,0.0729869278,-0.762968558,0.150646463,-0.18640582,-0.398605288,-0.301946463,0.366669439


Convert single element of a vector into a vector of single elements!

In [23]:
for model_num, model in enumerate(my_models):
    print(f"{model_num+1} of {len(my_models)}")
    flatten_dec_res_df = train_model(model, X_train, y_train, cv, flatten_dec_res_df)

0 of 7
1 of 7


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


2 of 7
3 of 7




4 of 7
5 of 7
6 of 7


In [24]:
flatten_dec_res_df.head(10)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds,N
0,LogisticRegression(),72.675,56.26,5,
0,KNeighborsClassifier(),71.165,58.006,5,
0,GaussianNB(),51.393,47.667,5,
0,LinearSVC(),67.875,57.872,5,
0,SGDClassifier(),56.831,50.117,5,
0,DecisionTreeClassifier(),100.0,46.66,5,
0,GradientBoostingClassifier(),100.0,75.663,5,


Test the models on the testing data

In [25]:
test_df = pd.DataFrame(columns=['Algorithm', 'CV Acc', 'Test Acc', 'K Folds'])
for model_num, model in enumerate(my_models):
    print(f"{model_num+1} of {len(my_models)}")
    test_df = test_model(model, X_train, y_train, X_test, y_test, test_df, cv)

1 of 7
2 of 7


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


3 of 7
4 of 7




5 of 7
6 of 7
7 of 7


In [26]:
test_df.head(100)

Unnamed: 0,Algorithm,CV Acc,Test Acc,K Folds,N
0,LogisticRegression(),56.26,54.19,5,
0,KNeighborsClassifier(),58.006,59.749,5,
0,GaussianNB(),47.667,47.69,5,
0,LinearSVC(),57.771,58.105,5,
0,SGDClassifier(),50.42,50.196,5,
0,DecisionTreeClassifier(),46.224,43.305,5,
0,GradientBoostingClassifier(),75.764,76.038,5,


Redo but try using Standard Scaler this time

In [27]:
from sklearn.preprocessing import StandardScaler

scaler_Xtrain = StandardScaler().fit(X_train)
XtrainSS = scaler_Xtrain.transform(X_train)

print(X_train.shape)
print(XtrainSS.shape)

(2979, 128)
(2979, 128)


In [28]:
for model_num, model in enumerate(my_models):
    print(f"{model_num+1} of {len(my_models)}")
    SSflatten_dec_res_df = train_model(model, XtrainSS, y_train, cv, SSflatten_dec_res_df)

1 of 7
2 of 7


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


3 of 7
4 of 7




5 of 7
6 of 7
7 of 7


In [29]:
SSflatten_dec_res_df.head(10)

Unnamed: 0,Algorithm,One Off Acc,CV Acc,K Folds,N
0,LogisticRegression(),73.011,56.193,5,
0,KNeighborsClassifier(),71.87,57.838,5,
0,GaussianNB(),51.393,47.667,5,
0,LinearSVC(),68.345,57.268,5,
0,SGDClassifier(),57.503,50.487,5,
0,DecisionTreeClassifier(),100.0,45.821,5,
0,GradientBoostingClassifier(),100.0,76.2,5,


Thus we see no real performance gains by using standard scaler.

Now try and optimize the best performer (GBT)
1. max_depth: int, default=3
- The maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables. Values must be in the range [1, inf).