# Imports, Data Loading, & Misc.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn import svm

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
rand_seed_int = 7739
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline

In [3]:
path_to_data = "./Dataset/"  # generate paths
file_names = ["user_a.csv", "user_b.csv", "user_c.csv", "user_d.csv"]
data_paths = [path_to_data + fn for fn in file_names]
# print(data_paths)

In [4]:
user_dfs = [pd.read_csv(data_path) for data_path in data_paths]

In [5]:
for df, letter in zip(user_dfs, "ABCD"):
    df['user'] = pd.Series(letter, index=df.index) # append categorical variable denoting user

In [6]:
print( [user_dfs[i].shape for i in range(len(user_dfs))])
df = pd.concat(user_dfs, axis=0)

[(2880, 114), (2880, 114), (2880, 114), (2880, 114)]


In [7]:
# df.shape
df

Unnamed: 0,Class,AF3 delta std,AF3 delta m,AF3 theta std,AF3 theta m,AF3 alpha std,AF3 alpha m,AF3 beta std,AF3 beta m,F7 delta std,...,F8 beta m,AF4 delta std,AF4 delta m,AF4 theta std,AF4 theta m,AF4 alpha std,AF4 alpha m,AF4 beta std,AF4 beta m,user
0,1.0,3569.164550,2063.892754,1.673726,4.444736,0.526209,3.002088,1.425022,3.302739,3563.803888,...,72.508750,3701.186330,2182.676835,18.192418,41.349662,16.004756,42.046467,46.280843,73.565719,A
1,1.0,3568.423670,2063.099248,1.897790,3.728823,1.304186,1.854353,1.366575,2.546458,3563.560922,...,66.931186,3725.210509,2180.197439,8.820788,38.012788,19.601233,29.431054,38.559351,67.470041,A
2,1.0,3568.157929,2062.445859,2.798014,2.574504,1.120537,1.958819,0.982433,2.258622,3563.279981,...,66.816547,3724.417296,2176.823208,18.159202,23.612639,14.378291,19.555084,43.210004,67.781924,A
3,1.0,3567.710021,2062.112673,2.181775,3.610507,0.629608,2.155876,0.856275,2.233711,3562.787801,...,63.915386,3725.822160,2177.089059,19.737616,29.484396,15.793034,25.713513,39.250246,65.031031,A
4,1.0,3565.546124,2063.128867,1.685161,3.384311,0.677526,1.795798,0.927924,1.909810,3562.655091,...,64.534645,3723.053978,2167.798335,8.429414,26.374975,14.920736,35.675266,33.901687,66.956313,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,0.0,3572.183440,2063.784708,1.229472,2.118878,0.554802,1.531674,1.515939,2.064786,3564.057879,...,1.589724,3568.988084,2063.376781,0.731822,1.842909,0.271320,1.235039,1.081958,2.239726,D
2876,0.0,3571.739312,2062.870495,1.131872,2.143829,0.600867,2.161191,1.481767,1.981241,3564.245318,...,1.474497,3569.102041,2063.103722,0.914127,2.014998,0.249743,1.573777,1.254084,2.494333,D
2877,0.0,3571.184751,2063.216128,1.139433,2.066175,0.613243,2.005979,1.202326,1.918536,3564.496395,...,1.539265,3569.136435,2063.114849,0.982602,2.111638,0.943919,1.525629,1.244487,2.555489,D
2878,0.0,3570.595817,2062.900476,1.013536,1.579048,0.214779,1.701602,1.314499,1.782709,3564.497194,...,1.475072,3567.959877,2062.375466,1.036738,2.004392,1.395303,1.550127,1.095299,2.372722,D


In [8]:
df = df.sample(frac=1, random_state=rand_seed_int).reset_index(drop=True)  # shuffle and reindex dataframe

In [9]:
df

Unnamed: 0,Class,AF3 delta std,AF3 delta m,AF3 theta std,AF3 theta m,AF3 alpha std,AF3 alpha m,AF3 beta std,AF3 beta m,F7 delta std,...,F8 beta m,AF4 delta std,AF4 delta m,AF4 theta std,AF4 theta m,AF4 alpha std,AF4 alpha m,AF4 beta std,AF4 beta m,user
0,1.0,3572.740122,2065.925458,1.185344,2.313863,1.511347,1.779033,2.151755,2.702375,3568.188014,...,25.944354,3608.920898,2092.725560,3.459469,15.541393,10.217921,19.738413,28.741694,29.452970,A
1,2.0,3575.217662,2066.069769,1.048088,1.567821,1.594162,2.048887,2.070036,2.610019,3566.182191,...,2.234100,3572.622773,2064.425045,1.262161,1.510096,0.621931,1.985719,1.495773,2.064559,B
2,1.0,3569.482772,2062.449313,0.775122,1.993172,0.789177,2.411156,1.337159,2.445601,3561.976864,...,2.013047,3568.347818,2061.632880,0.986714,2.487093,1.127154,2.075247,1.134185,2.337065,D
3,0.0,3566.556878,2062.149296,1.272005,1.885792,0.580135,2.125361,0.932530,2.268825,3563.785619,...,2.060122,3566.443267,2060.940128,1.127451,2.665813,1.528374,2.223407,1.027889,2.193454,D
4,0.0,3569.869503,2063.306770,0.719125,1.153106,0.886325,2.028228,2.932595,3.581840,3565.626072,...,28.696374,3644.890238,2116.975932,4.794681,11.841362,4.188648,14.678540,38.839145,35.139374,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11515,1.0,3565.689279,2065.479288,0.580389,2.823328,0.771873,3.011752,3.148538,4.433364,3575.116521,...,26.749057,3572.314504,2111.389578,3.112227,12.396598,2.247408,6.379228,1.532029,4.262639,C
11516,1.0,3571.098846,2064.594995,1.729041,3.425471,1.128479,1.624798,3.489636,3.943141,3574.792999,...,12.119590,3563.429987,2068.539135,1.923759,2.602696,1.248980,2.665588,2.753850,3.565360,C
11517,2.0,3574.547889,2066.745873,0.332495,2.616315,0.708282,1.615514,2.682437,3.009752,3568.716436,...,1.943236,3573.420786,2065.512823,0.868336,2.006091,0.339171,1.000358,1.656243,2.944557,B
11518,1.0,3569.127228,2063.362552,0.358300,1.791695,0.930024,2.264602,0.981592,2.290724,3564.184687,...,1.963345,3570.610368,2063.540624,0.641076,1.268576,1.192215,1.969478,1.153423,2.596662,D


# Train-Val-Test Splitting, & Data Scaling

In [10]:
def data_process(df):
    # generate one hot encoded dummy variables for user categories
    user_dummies = pd.get_dummies(df['user'], prefix='user')

    # user_dummies
    # concatenate dummies and drop orig user feature
    df = pd.concat([df, user_dummies], axis=1)
    df = df.drop('user', axis=1)
    
    # separate target feature class from data
    y = df['Class'].copy()
    X = df.drop('Class', axis=1)
    
    # dummy columns to not transform
    cols_to_std = list(X.columns[:-4])
    
    # cast target as integer
    y = y.astype(int)
    
    # split into training and testing sets: 80% train, 20% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=rand_seed_int)
    
    # decided to use k-fold cv rather than leave-one-out cross-validation
    # I think k-fold makes more sense with respect to the # of observations
    # split training into training and validation sets: 70% train, 30% validation
    # X_train, X_val, y_train, y_val = train_test_split(X_Train, y_Train, train_size=0.7, random_state=rand_seed_int)
    
    # scale frequencies s.t. features are normalized and dummies are left alone

    ct = ColumnTransformer([
            ('blah_standard_scaler', StandardScaler(), cols_to_std)
        ], remainder='passthrough')
    
    ct.fit(X_train)
    # return as dataframes
    X_train = pd.DataFrame(ct.transform(X_train), columns=X.columns)
    # X_val   = pd.DataFrame(ct.transform(X_val), columns=X.columns)
    X_test  = pd.DataFrame(ct.transform(X_test), columns=X.columns)
    
    # print(ct.named_transformers_)
    
    # return X_train, y_train, X_val, y_val, X_test, y_test
    return X_train, y_train, X_test, y_test

In [11]:
# X_train, y_train, X_val, y_val, X_test, y_test = data_process(df)
X_train, y_train, X_test, y_test = data_process(df)

In [12]:
X_train

Unnamed: 0,AF3 delta std,AF3 delta m,AF3 theta std,AF3 theta m,AF3 alpha std,AF3 alpha m,AF3 beta std,AF3 beta m,F7 delta std,F7 delta m,...,AF4 theta std,AF4 theta m,AF4 alpha std,AF4 alpha m,AF4 beta std,AF4 beta m,user_A,user_B,user_C,user_D
0,-0.512224,-0.672548,0.185588,0.125307,2.764138,0.764986,-1.296539,-0.955065,-0.879263,-0.562871,...,-0.440912,-0.405671,-0.377965,-0.496969,-0.539410,-0.528835,0.0,0.0,0.0,1.0
1,-0.351427,-0.349637,-0.318874,-0.598020,0.639897,1.267067,0.950030,1.652566,1.624179,1.644434,...,-0.529805,-0.450101,-0.130521,-0.422896,-0.493541,-0.482568,0.0,0.0,1.0,0.0
2,-0.202096,-0.182844,0.160312,0.124356,-0.214642,0.122180,-0.862074,-0.464742,-0.500001,-0.521718,...,1.463166,0.574310,3.377459,1.870127,0.525448,1.744057,1.0,0.0,0.0,0.0
3,-0.428892,-0.265401,-0.225800,-0.991139,0.121445,1.005085,1.893015,0.930144,0.626798,0.422674,...,-0.060217,-0.189309,-0.475156,-0.433042,-0.499584,-0.526896,0.0,0.0,1.0,0.0
4,-0.193209,-0.476222,-0.145892,-0.465681,-1.787153,-1.766610,-0.872115,-1.053123,-1.460612,-0.778014,...,-0.525641,-0.409948,-0.240856,-0.492232,-0.540042,-0.549973,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9211,-0.065508,-0.359403,-0.506200,0.166998,-0.715704,-0.267043,-0.865888,-1.078102,-1.129743,-1.500955,...,-0.341631,-0.488714,-0.451617,-0.567404,-0.509289,-0.540810,0.0,0.0,0.0,1.0
9212,-0.511870,-0.500803,1.553980,0.499305,1.820197,0.880486,-1.290339,-1.262719,-0.654466,-0.425025,...,-0.323490,-0.255664,-0.372590,-0.485677,-0.544481,-0.541396,0.0,0.0,0.0,1.0
9213,-0.204451,-0.078696,1.389852,0.120734,0.881718,0.072730,-1.181189,-0.726111,-0.983185,-0.721704,...,-0.045325,-0.331241,-0.535916,-0.460987,-0.527359,-0.567371,0.0,0.0,0.0,1.0
9214,0.044088,-0.270784,0.377097,-0.852714,-0.466895,-0.954969,-0.686164,-0.834179,-0.419934,-0.243665,...,-0.185725,-0.460024,-0.488154,-0.574551,-0.532168,-0.549059,0.0,1.0,0.0,0.0


In [13]:
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)

X_train shape (9216, 116)
X_test shape (2304, 116)


# Hyperparameter Tuning

In [14]:
# LDA Tuning
param_grid = {
    'shrinkage': [1.0, 0, 0.1, 0.01, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
    'solver': ['svd', 'lsqr', 'eigen']
}

gs = GridSearchCV(
    LDA(),
    param_grid=param_grid,
    verbose=1,
    cv=5,
    n_jobs=-1
)

lda_gs_results = gs.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


        nan 0.4364155  0.4364155         nan 0.44531322 0.44531322
        nan 0.45117181 0.45117181        nan 0.45225612 0.45225612
        nan 0.45193074 0.45193074        nan 0.45214778 0.45214778
        nan 0.45214778 0.45214778        nan 0.45214778 0.45214778]


In [15]:
# print best results for the LDA Tuning
print("%-30s %4.10f" % ("LDA Best Score:", lda_gs_results.best_score_))
print("%-30s %4s"    % ("LDA Best Parameters:", lda_gs_results.best_params_))

LDA Best Score:                0.4522561183
LDA Best Parameters:           {'shrinkage': 0.0001, 'solver': 'lsqr'}


In [16]:
# QDA Tuning
param_grid = {
    'reg_param': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1.0, 2.0],
    'tol': [0.0001]
}

gs = GridSearchCV(
    QDA(),
    param_grid=param_grid,
    verbose=1,
    cv=5,
    n_jobs=-1
)

qda_gs_results = gs.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [17]:
# print best results for the QDA Tuning
print("%-30s %4.10f" % ("QDA Best Score:", qda_gs_results.best_score_))
print("%-30s %4s"    % ("QDA Best Parameters:", qda_gs_results.best_params_))

QDA Best Score:                0.4976132355
QDA Best Parameters:           {'reg_param': 0.1, 'tol': 0.0001}


In [18]:
# KNC Tuning
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs = GridSearchCV(
    KNC(),
    param_grid=param_grid,
    verbose=1,
    cv=5,
    n_jobs=-1
)

knc_gs_results = gs.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [19]:
# print best results for the KNC Tuning
print("%-30s %4.10f" % ("KNC Best Score:", knc_gs_results.best_score_))
print("%-30s %4s"    % ("KNC Best Parameters:", knc_gs_results.best_params_))

KNC Best Score:                0.8068557466
KNC Best Parameters:           {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}


In [20]:
# SVM tunning
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
gs = GridSearchCV(
    svm.SVC(),
    param_grid, 
    refit = True, 
    verbose = 1,
    n_jobs=-1
)

svm_gs_results = gs.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [21]:
# print best results for the SVM Tuning
print("%-30s %4.10f" % ("SVM Best Score:", svm_gs_results.best_score_))
print("%-30s %4s"    % ("SVM Best Parameters:", svm_gs_results.best_params_))

SVM Best Score:                0.7145166739
SVM Best Parameters:           {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
