# **SUPPORT VECTOR CLASSIFIER** 🏛️

This notebook sets up, runs, and evaluates the support vector classifier (SVC) algorithm.

In [21]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import funcs as f
import scipy.stats as sts
from sklearn.svm import SVC

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
'''
Notebook setup: create the classifier, load the feature tables produced by
data_organization.ipynb, and seed NumPy's global random generator
'''
# polynomial-kernel SVC; this single instance is what the experiment cells pass to f.kfold_crossval
clf = SVC(C=.1, gamma=10, kernel='poly')

# every CSV carries an 'Unnamed: 0' column, which is dropped right after loading
df_all = pd.read_csv('../data/df_all.csv').drop(columns='Unnamed: 0')
df_gus = pd.read_csv('../data/df_gus.csv').drop(columns='Unnamed: 0')
df_tgus = pd.read_csv('../data/df_tgus.csv').drop(columns='Unnamed: 0')
# NOTE(review): the '*' is passed to read_csv as part of the literal path; it is
# presumably the real file name rather than a wildcard - confirm against ../data
df_tgus_st = pd.read_csv('../data/df_tgus*.csv').drop(columns='Unnamed: 0')
df_raw = pd.read_csv('../data/df_raw.csv').drop(columns='Unnamed: 0')

# fixed seed for NumPy's legacy global RNG
np.random.seed(8)

In [23]:
'''
Experiment on df_all: GUS, TGUS, TGUS*, and raw values together with the
other features. Runs f.kfold_crossval 100 times and reports the scores.
'''
all_train, all_vals, all_tests = [], [], []
for i in range(100):
    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_all, clf, 'svc')

    # np.append(new, old) places the newest score at the front of each array
    all_train, all_vals, all_tests = (
        np.append(train_acc, all_train),
        np.append(val_acc, all_vals),
        np.append(test_acc, all_tests),
    )

    # remember the run whose test accuracy matches or beats every run so far
    # (ties go to the most recent run)
    if test_acc >= all_tests.max():
        all_model, all_train_acc, all_val_acc, all_test_acc = (
            model, train_acc, val_acc, test_acc
        )

# one print call; the emitted text is the same seven lines
print('\n'.join((
    'All - Results:',
    f'Best Scenario Training Accuracy: {all_train_acc}%',
    f'Average Training Accuracy: {round(np.mean(all_train),1)}%',
    f'Best Scenario Validation Accuracy: {all_val_acc}%',
    f'Average Validation Accuracy: {round(np.mean(all_vals),1)}%',
    f'Best Scenario Test Accuracy: {all_test_acc}%',
    f'Average Test Accuracy: {round(np.mean(all_tests),1)}%',
)))

All - Results:
Best Scenario Training Accuracy: 99.0%
Average Training Accuracy: 99.7%
Best Scenario Validation Accuracy: 100.0%
Average Validation Accuracy: 99.2%
Best Scenario Test Accuracy: 100.0%
Average Test Accuracy: 96.9%


In [29]:
# number of runs on df_all whose test accuracy equals exactly 100
print(all_tests[all_tests == 100].size)

2


In [24]:
'''
Experiment on df_gus: just GUS together with the other features.
Runs f.kfold_crossval 100 times and reports the scores.
'''
gus_train, gus_vals, gus_tests = [], [], []
for i in range(100):
    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_gus, clf, 'svc')

    # np.append(new, old) places the newest score at the front of each array
    gus_train, gus_vals, gus_tests = (
        np.append(train_acc, gus_train),
        np.append(val_acc, gus_vals),
        np.append(test_acc, gus_tests),
    )

    # remember the run whose test accuracy matches or beats every run so far
    # (ties go to the most recent run)
    if test_acc >= gus_tests.max():
        gus_model, gus_train_acc, gus_val_acc, gus_test_acc = (
            model, train_acc, val_acc, test_acc
        )

# one print call; the emitted text is the same seven lines
print('\n'.join((
    'GUS - Results:',
    f'Best Scenario Training Accuracy: {gus_train_acc}%',
    f'Average Training Accuracy: {round(np.mean(gus_train),1)}%',
    f'Best Scenario Validation Accuracy: {gus_val_acc}%',
    f'Average Validation Accuracy: {round(np.mean(gus_vals),1)}%',
    f'Best Scenario Test Accuracy: {gus_test_acc}%',
    f'Average Test Accuracy: {round(np.mean(gus_tests),1)}%',
)))


GUS - Results:
Best Scenario Training Accuracy: 99.0%
Average Training Accuracy: 99.2%
Best Scenario Validation Accuracy: 100.0%
Average Validation Accuracy: 98.9%
Best Scenario Test Accuracy: 100.0%
Average Test Accuracy: 96.4%


In [25]:
'''
Experiment on df_tgus: just TGUS together with the other features.
Runs f.kfold_crossval 100 times and reports the scores.
'''
tgus_train, tgus_vals, tgus_tests = [], [], []
for i in range(100):
    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_tgus, clf, 'svc')

    # np.append(new, old) places the newest score at the front of each array
    tgus_train, tgus_vals, tgus_tests = (
        np.append(train_acc, tgus_train),
        np.append(val_acc, tgus_vals),
        np.append(test_acc, tgus_tests),
    )

    # remember the run whose test accuracy matches or beats every run so far
    # (ties go to the most recent run)
    if test_acc >= tgus_tests.max():
        tgus_model, tgus_train_acc, tgus_val_acc, tgus_test_acc = (
            model, train_acc, val_acc, test_acc
        )

# one print call; the emitted text is the same seven lines
print('\n'.join((
    'TGUS - Results:',
    f'Best Scenario Training Accuracy: {tgus_train_acc}%',
    f'Average Training Accuracy: {round(np.mean(tgus_train),1)}%',
    f'Best Scenario Validation Accuracy: {tgus_val_acc}%',
    f'Average Validation Accuracy: {round(np.mean(tgus_vals),1)}%',
    f'Best Scenario Test Accuracy: {tgus_test_acc}%',
    f'Average Test Accuracy: {round(np.mean(tgus_tests),1)}%',
)))

TGUS - Results:
Best Scenario Training Accuracy: 99.0%
Average Training Accuracy: 98.9%
Best Scenario Validation Accuracy: 98.6%
Average Validation Accuracy: 99.2%
Best Scenario Test Accuracy: 100.0%
Average Test Accuracy: 96.8%


In [26]:
# NOTE: this entire cell is one triple-quoted string, so the TGUS* experiment
# written inside it is disabled: none of it executes, and because the string is
# the cell's only expression the notebook just echoes it back as the output.
# NOTE(review): the disabled loop uses range(30) while the active experiment
# cells use range(100) - align the run count if this cell is ever re-enabled.
'''
Considering just TGUS* and raw values with other features


tgus_st_train =[]
tgus_st_vals =[]
tgus_st_tests=[]
for i in range(30):
    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_tgus_st, clf, 'svc')
    tgus_st_train = np.append(train_acc, tgus_st_train)
    tgus_st_vals = np.append(val_acc, tgus_st_vals)
    tgus_st_tests = np.append(test_acc, tgus_st_tests)

    # keep best model
    if test_acc >= np.max(tgus_st_tests):
        tgus_st_model = model
        tgus_st_train_acc = train_acc
        tgus_st_val_acc = val_acc
        tgus_st_test_acc = test_acc

print('TGUS* - Results:')
print(f'Best Scenario Training Accuracy: {tgus_st_train_acc}%')
print(f'Average Training Accuracy: {round(np.mean(tgus_st_train),1)}%')
print(f'Best Scenario Validation Accuracy: {tgus_st_val_acc}%')
print(f'Average Validation Accuracy: {round(np.mean(tgus_st_vals),1)}%')
print(f'Best Scenario Test Accuracy: {tgus_st_test_acc}%')
print(f'Average Test Accuracy: {round(np.mean(tgus_st_tests),1)}%')
'''

"\nConsidering just TGUS* and raw values with other features\n\n\ntgus_st_train =[]\ntgus_st_vals =[]\ntgus_st_tests=[]\nfor i in range(30):\n    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_tgus_st, clf, 'svc')\n    tgus_st_train = np.append(train_acc, tgus_st_train)\n    tgus_st_vals = np.append(val_acc, tgus_st_vals)\n    tgus_st_tests = np.append(test_acc, tgus_st_tests)\n\n    # keep best model\n    if test_acc >= np.max(tgus_st_tests):\n        tgus_st_model = model\n        tgus_st_train_acc = train_acc\n        tgus_st_val_acc = val_acc\n        tgus_st_test_acc = test_acc\n\nprint('TGUS* - Results:')\nprint(f'Best Scenario Training Accuracy: {tgus_st_train_acc}%')\nprint(f'Average Training Accuracy: {round(np.mean(tgus_st_train),1)}%')\nprint(f'Best Scenario Validation Accuracy: {tgus_st_val_acc}%')\nprint(f'Average Validation Accuracy: {round(np.mean(tgus_st_vals),1)}%')\nprint(f'Best Scenario Test Accuracy: {tgus_st_test_acc}%')\nprint(f'Average Test Accuracy: {

In [27]:
'''
Experiment on df_raw: just raw values together with the other features.
Runs f.kfold_crossval 100 times and reports the scores.
'''
raw_train, raw_vals, raw_tests = [], [], []
for i in range(100):
    model, train_acc, val_acc, test_acc = f.kfold_crossval(df_raw, clf, 'svc')

    # np.append(new, old) places the newest score at the front of each array
    raw_train, raw_vals, raw_tests = (
        np.append(train_acc, raw_train),
        np.append(val_acc, raw_vals),
        np.append(test_acc, raw_tests),
    )

    # remember the run whose test accuracy matches or beats every run so far
    # (ties go to the most recent run)
    if test_acc >= raw_tests.max():
        raw_model, raw_train_acc, raw_val_acc, raw_test_acc = (
            model, train_acc, val_acc, test_acc
        )

# one print call; the emitted text is the same seven lines
print('\n'.join((
    'Raw - Results:',
    f'Best Scenario Training Accuracy: {raw_train_acc}%',
    f'Average Training Accuracy: {round(np.mean(raw_train),1)}%',
    f'Best Scenario Validation Accuracy: {raw_val_acc}%',
    f'Average Validation Accuracy: {round(np.mean(raw_vals),1)}%',
    f'Best Scenario Test Accuracy: {raw_test_acc}%',
    f'Average Test Accuracy: {round(np.mean(raw_tests),1)}%',
)))

Raw - Results:
Best Scenario Training Accuracy: 99.0%
Average Training Accuracy: 99.6%
Best Scenario Validation Accuracy: 98.6%
Average Validation Accuracy: 99.0%
Best Scenario Test Accuracy: 100.0%
Average Test Accuracy: 96.5%


In [28]:
'''
Pairwise two-sided, equal-variance t-tests comparing the mean train,
validation, and test scores of the feature sets
'''
cols = ['All', 'GUS', 'TGUS', 'Raw']

# one column of per-run scores for each feature set
train_scores = pd.DataFrame({
    'All': all_train,
    'GUS': gus_train,
    'TGUS': tgus_train,
    'Raw': raw_train,
})
val_scores = pd.DataFrame({
    'All': all_vals,
    'GUS': gus_vals,
    'TGUS': tgus_vals,
    'Raw': raw_vals,
})
test_scores = pd.DataFrame({
    'All': all_tests,
    'GUS': gus_tests,
    'TGUS': tgus_tests,
    'Raw': raw_tests,
})

# empty square tables, labelled by feature set on both axes, to receive the p-values
comp_train = pd.DataFrame(index = train_scores.columns, columns = train_scores.columns)
comp_val = pd.DataFrame(index = train_scores.columns, columns = train_scores.columns)
comp_test = pd.DataFrame(index = train_scores.columns, columns = train_scores.columns)

# every ordered pair is tested, including a feature set against itself
for i in cols:
    for j in cols:
        # only the p-value is written to the table; the statistic is unpacked but not stored
        # the p-value is rounded to 100 decimal places and assigned as a one-element list
        stat_train, p_train = sts.ttest_ind(
            train_scores[i], train_scores[j],
            equal_var = True, alternative = 'two-sided',
        )
        comp_train.loc[i, j] = [round(p_train, 100)]

        stat_val, p_val = sts.ttest_ind(
            val_scores[i], val_scores[j],
            equal_var = True, alternative = 'two-sided',
        )
        comp_val.loc[i, j] = [round(p_val, 100)]

        stat_test, p_test = sts.ttest_ind(
            test_scores[i], test_scores[j],
            equal_var = True, alternative = 'two-sided',
        )
        comp_test.loc[i, j] = [round(p_test, 100)]