In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import dask_ml.model_selection as dcv

from time import sleep

from scipy.io import arff

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

#from thundersvm import SVC
#from thundergbm import TGBMClassifier

from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier
from cuml.neighbors import KNeighborsClassifier

# EEG Data

In [4]:
data = arff.loadarff('data/eeg/EEG Eye State.arff')

In [5]:
df = pd.DataFrame(data[0])

df.eyeDetection = df.eyeDetection.astype('int32')

df = df.rename(columns = {'eyeDetection' : 'y'})

df = df.sample(frac = 1).reset_index(drop = True)

X = df.iloc[:, :-1]
y = df.y

df

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,y
0,4296.92,4005.13,4263.08,4103.59,4329.23,4605.64,4045.64,4602.56,4190.26,4227.18,4191.79,4263.59,4599.49,4343.59,1
1,4254.87,4017.44,4248.72,4152.31,4343.59,4614.87,4061.54,4591.28,4178.46,4197.44,4149.74,4252.31,4543.08,4297.44,1
2,4300.00,4047.69,4280.51,4149.74,4380.51,4654.87,4103.08,4656.41,4233.85,4260.00,4216.92,4291.28,4612.31,4357.95,1
3,4296.92,4006.67,4269.74,4129.23,4346.15,4618.97,4058.46,4606.15,4202.05,4233.33,4210.77,4281.54,4612.31,4342.56,0
4,4280.00,3975.90,4237.44,4114.36,4327.18,4600.51,4071.79,4592.31,4193.33,4232.82,4180.00,4269.23,4606.15,4352.31,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14975,4307.18,4011.28,4265.64,4136.92,4348.21,4631.79,4052.31,4626.67,4218.97,4238.97,4206.67,4270.26,4606.15,4356.41,0
14976,4290.26,3984.62,4233.85,4110.77,4342.56,4621.03,4053.85,4610.77,4201.03,4231.79,4172.31,4260.00,4596.92,4337.95,0
14977,4288.72,4032.31,4274.87,4155.38,4372.31,4642.56,4097.44,4635.90,4201.03,4214.36,4195.90,4274.36,4590.26,4335.38,0
14978,4308.21,3994.36,4279.49,4140.51,4354.36,4633.85,4084.10,4631.79,4213.85,4237.95,4210.77,4283.59,4624.10,4354.36,1


In [6]:
scaler = RobustScaler().fit(X)
X = scaler.transform(X)

In [5]:
def run_svm(X, y):
    
    clf = SVC()
    C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
    gamma_list = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1/(X.shape[1])]

    search_params = {
        'C': C_list,
        'gamma': gamma_list
    }
    
    metrics = ['accuracy', 'f1', 'roc_auc']

    raw_train_arr = []
    raw_test_arr = []
    
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 5000)
        
        search_results = GridSearchCV(clf, search_params, scoring = metrics, refit = False)
        
        search_results.fit(X_train, y_train)
        
        results = pd.DataFrame(search_results.cv_results_['params'])

        results['mean_accuracy'] = search_results.cv_results_['mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_['mean_test_roc_auc']
        
        opt_acc_inf = results.sort_values(by = 'mean_accuracy', ascending = False).iloc[0]
        opt_f1_inf = results.sort_values(by = 'mean_f1', ascending = False).iloc[0]
        opt_auc_inf = results.sort_values(by = 'mean_auc', ascending = False).iloc[0]
        
        opt_acc_clf = SVC(C = opt_acc_inf.C, gamma = opt_acc_inf.gamma)
        opt_f1_clf = SVC(C = opt_f1_inf.C, gamma = opt_f1_inf.gamma)
        opt_auc_clf = SVC(C = opt_auc_inf.C, gamma = opt_auc_inf.gamma)
        
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)
        
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train, opt_auc_clf.predict(X_train))
                        
        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))
        
        raw_train_arr.append([train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])
        
                
    
    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)
    
    raw_train_df = pd.DataFrame(data = raw_train_arr, columns = ['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data = raw_test_arr, columns = ['accuracy', 'f1', 'auc'])
    
    return raw_train_df, raw_test_df

In [6]:
raw_train_df, raw_test_df = run_svm(X, y)

In [7]:
raw_train_df

Unnamed: 0,accuracy,f1,auc
0,0.9718,0.968435,0.970676
1,0.9718,0.968049,0.970709
2,0.994,0.993421,0.971136
3,0.9706,0.967967,0.970118
4,0.972,0.968124,0.971087


In [8]:
raw_test_df

Unnamed: 0,accuracy,f1,auc
0,0.939479,0.93215,0.938449
1,0.937375,0.929878,0.935901
2,0.93477,0.927303,0.932673
3,0.936373,0.928273,0.935703
4,0.940982,0.934006,0.939417


In [12]:
def GridSearch_random_forest(X_train, y_train):
    X_train = X_train.astype('float32')
    y_train = y_train.to_numpy().astype('float32')
    
    folds = KFold(n_splits = 5)
    
    n_estimators_lst = [128, 256, 512, 1024]
    max_features_lst = [1, 2, 4, 6, 8, 12, 16, 20]
    max_features_lst = [1 / item for item in max_features_lst]
        
    fin_arr = []    
    
    for n_estimators in n_estimators_lst:
        
        for max_features in max_features_lst:

            clf = RandomForestClassifier(n_estimators = n_estimators, max_features = max_features)

            predicted_y = []
            true_y = []

            for train, holdout in folds.split(X_train):
                clf.fit(X_train[train], y_train[train])

                predicted_y.append(clf.predict(X_train[holdout]))

                true_y.append(y_train[holdout])

            predicted_y = np.concatenate(predicted_y)
            true_y = np.concatenate(true_y)
            
            accuracy_train = accuracy_score(true_y, predicted_y)
            f1_train = f1_score(true_y, predicted_y)
            roc_auc_train = roc_auc_score(true_y, predicted_y)
            
            fin_arr.append([n_estimators, max_features, accuracy_train, f1_train, roc_auc_train])
        
    fin_arr = np.array(fin_arr).reshape(40, 5)
    
    columns = ['n_estimators', 'max_features', 'mean_accuracy', 'mean_f1', 'mean_auc']
    
    results = pd.DataFrame(data = fin_arr, columns = columns)    
    

    return results
    

In [13]:
search_results = run_random_forest(X_train, y_train)

KeyboardInterrupt: 

In [None]:

opt_acc_inf = results.sort_values(by = 'mean_accuracy', ascending = False).iloc[0]
opt_f1_inf = results.sort_values(by = 'mean_f1', ascending = False).iloc[0]
opt_auc_inf = results.sort_values(by = 'mean_auc', ascending = False).iloc[0]

opt_acc_clf = RandomForestClassifier(n_estimators = opt_acc_inf.n_estimators,
                                     max_features = opt_acc_inf.max_features)

opt_f1_clf = RandomForestClassifier(n_estimators = opt_f1_inf.n_estimators,
                                     max_features = opt_f1_inf.max_features)

opt_auc_clf = RandomForestClassifier(n_estimators = opt_auc_inf.n_estimators,
                                     max_features = opt_auc_inf.max_features)
