In [167]:
import numpy as np
import pandas as pd
from time import sleep

from scipy.io import arff

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score

from thundersvm import SVC
#from thundergbm import TGBMClassifier

OSError: libcusparse.so.9.0: cannot open shared object file: No such file or directory

# EEG Data

In [159]:
data = arff.loadarff('data/eeg/EEG Eye State.arff')

In [160]:
df = pd.DataFrame(data[0])

df.eyeDetection = df.eyeDetection.astype('int32')

df = df.rename(columns = {'eyeDetection' : 'y'})

df = df.sample(frac = 1).reset_index(drop = True)

X = df.iloc[:, :-1]
y = df.y

df

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,y
0,4266.15,4057.95,4280.51,4170.77,4395.38,4662.56,4102.56,4641.54,4213.85,4237.95,4194.87,4276.92,4590.77,4314.87,0
1,4279.49,4013.33,4245.64,4116.41,4322.56,4601.03,4070.26,4613.85,4189.23,4215.38,4192.31,4264.10,4601.54,4338.46,0
2,4291.79,4042.56,4265.64,4144.62,4346.15,4615.90,4047.69,4596.92,4174.36,4218.46,4184.10,4268.72,4564.62,4323.08,1
3,4311.28,4004.10,4267.69,4139.49,4328.21,4613.85,4066.15,4618.97,4198.46,4227.69,4191.28,4280.00,4605.13,4366.67,1
4,4304.10,4012.31,4265.64,4124.10,4325.13,4607.69,4058.46,4622.05,4198.97,4228.21,4205.13,4288.21,4605.64,4377.95,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,4303.08,4006.67,4276.92,4136.41,4345.13,4617.44,4060.51,4626.67,4199.49,4230.26,4170.26,4284.10,4603.08,4358.97,1
14976,4313.85,4010.77,4260.00,4124.62,4337.44,4580.51,4088.72,4618.46,4203.59,4223.59,4198.97,4267.69,4613.33,4370.77,0
14977,4268.21,3982.56,4240.51,4106.15,4331.79,4606.67,4055.90,4616.92,4197.95,4212.82,4200.51,4265.64,4585.13,4330.26,1
14978,4257.44,3967.69,4235.38,4104.10,4327.18,4608.21,4039.49,4611.79,4192.31,4217.44,4175.38,4255.90,4560.00,4323.59,1


In [161]:
scaler = RobustScaler().fit(X)
X = scaler.transform(X)

In [162]:
def run_svm(X, y):
    
    clf = SVC()
    C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
    gamma_list = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1/(X.shape[1])]

    search_params = {
        'C': C_list,
        'gamma': gamma_list
    }
    
    metrics = ['accuracy', 'f1', 'roc_auc']

    raw_train_arr = []
    raw_test_arr = []
    
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
        
        search_results = GridSearchCV(clf, search_params, scoring = metrics, refit = False)
        
        search_results.fit(X_train, y_train)
        
        results = pd.DataFrame(search_results.cv_results_['params'])

        results['mean_accuracy'] = search_results.cv_results_['mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_['mean_test_roc_auc']
        
        opt_acc_inf = results.sort_values(by = 'mean_accuracy', ascending = False).iloc[0]
        opt_f1_inf = results.sort_values(by = 'mean_f1', ascending = False).iloc[0]
        opt_auc_inf = results.sort_values(by = 'mean_auc', ascending = False).iloc[0]
        
        opt_acc_clf = SVC(C = opt_acc_inf.C, gamma = opt_acc_inf.gamma)
        opt_f1_clf = SVC(C = opt_f1_inf.C, gamma = opt_f1_inf.gamma)
        opt_auc_clf = SVC(C = opt_auc_inf.C, gamma = opt_auc_inf.gamma)
        
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)
        
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train, opt_auc_clf.predict(X_train))
                        
        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))
        
        raw_train_arr.append([train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])
        
                
    
    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)
    
    raw_train_df = pd.DataFrame(data = raw_train_arr, columns = ['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data = raw_test_arr, columns = ['accuracy', 'f1', 'auc'])
    
    return raw_train_df, raw_test_df

In [164]:
raw_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9706,0.966825,0.969817
1,0.9754,0.972782,0.974868
2,0.9702,0.966614,0.969014
3,0.9728,0.969831,0.972007
4,0.9692,0.965839,0.5


In [155]:
raw_test_df

Unnamed: 0,accuracy,f1,auc
0,0.937475,0.929793,0.935784
1,0.935571,0.92768,0.934615
2,0.936573,0.928804,0.935377
3,0.938878,0.931229,0.937751
4,0.940381,0.932912,0.5
