# Support vector classifier (SVC)

## Setup

In [None]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import fetch_openml
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer


from shared import RANDOM_STATE, DIABETES_DATASET_ID, BANKNOTE_DATASET_ID, CREDIT_DATASET_ID, SPAMBASE_DATASET_ID
from shared.utilities import prepare_and_split, create_preprocessor

In [None]:
import os

# Local alias for the project-wide seed imported from `shared`.
random_state = RANDOM_STATE

# Build the result paths with os.path.join instead of hard-coded backslashes:
# on Windows this yields exactly the same strings as before, while on
# Linux/macOS the CSVs now land in ../results/svc/ rather than in a file whose
# *name* contains backslashes.
_results_dir = os.path.join("..", "results", "svc")

# Create the target directory up front so the searches cannot run to
# completion and then fail at the final `to_csv` call.
os.makedirs(_results_dir, exist_ok=True)

output_path = os.path.join(_results_dir, "random_search_results.csv")
bayes_output_path = os.path.join(_results_dir, "bayes_search_results.csv")

# Ask scikit-learn transformers to return pandas DataFrames.
set_config(transform_output="pandas")

In [None]:
# --- Regularisation strength C: log-spaced grid bounds and resolution ---
C_MIN = 1e-2
C_MAX = 1e2
N_C_POINTS = 25

# --- Kernels tried by both search strategies ---
KERNEL = ['linear', 'sigmoid', 'rbf']

# --- Kernel coefficient gamma: log-spaced grid bounds and resolution ---
GAMMA_MIN = 1e-3
GAMMA_MAX = 1e1
N_GAMMA_POINTS = 25

# --- Search budget ---
N_JOBS = -1    # passed as `n_jobs` to RandomizedSearchCV
N_ITERS = 100  # number of configurations evaluated per search

In [None]:
def _fetch_as_frame(data_id):
    """Fetch one OpenML dataset by id, requesting pandas output (`as_frame=True`)."""
    return fetch_openml(data_id=data_id, as_frame=True)


# One fetch per benchmark dataset; the ids come from the `shared` module.
diabetes_dataset = _fetch_as_frame(DIABETES_DATASET_ID)
banknote_authentication_dataset = _fetch_as_frame(BANKNOTE_DATASET_ID)
credit_dataset = _fetch_as_frame(CREDIT_DATASET_ID)
spambase_dataset = _fetch_as_frame(SPAMBASE_DATASET_ID)

In [None]:
# `prepare_and_split` (from shared.utilities) returns a pair for each dataset,
# unpacked here into an x/y couple.
# NOTE(review): presumably these are the training features and labels, with
# any held-out part kept inside the helper -- confirm against shared.utilities.
diabetes_train_x, diabetes_train_y = prepare_and_split(diabetes_dataset)
banknotes_train_x, banknotes_train_y = prepare_and_split(banknote_authentication_dataset)
credit_train_x, credit_train_y = prepare_and_split(credit_dataset)
spambase_train_x, spambase_train_y = prepare_and_split(spambase_dataset)

## Randomized Search

In [None]:
def _log_grid(low, high, n_points):
    """Return `n_points` floats spaced evenly on a log10 scale from `low` to `high`."""
    exponents = (np.log10(low), np.log10(high))
    return np.logspace(*exponents, num=n_points).astype(float)


# Discrete candidate values for every tuned SVC hyperparameter. The `model__`
# prefix routes each entry to the 'model' step of the pipeline below.
param_distributions = {
    'model__C': _log_grid(C_MIN, C_MAX, N_C_POINTS),
    'model__kernel': KERNEL,
    'model__gamma': _log_grid(GAMMA_MIN, GAMMA_MAX, N_GAMMA_POINTS),
}

# Preprocessing followed by an SVC left at its default hyperparameters.
pipeline = Pipeline(
    steps=[
        ('preprocessing', create_preprocessor()),
        ('model', SVC()),
    ]
)

# Random search over the grid above: N_ITERS sampled configurations, each
# scored by 5-fold cross-validated ROC AUC.
randomized_search_CV = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=N_ITERS,
    scoring='roc_auc',
    cv=5,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    verbose=1,
)

In [None]:
# (features, target, label) triples iterated by every experiment below; the
# label ends up in the 'dataset' column of the result tables.
datasets = [
    (diabetes_train_x, diabetes_train_y, "diabetes"),
    (banknotes_train_x, banknotes_train_y, "banknotes"),
    (credit_train_x, credit_train_y, "credit"),
    (spambase_train_x, spambase_train_y, "spambase"),
]

In [None]:
# Run the same randomized search on every dataset and collect one
# cv_results_ table per dataset, tagged with the dataset label.
random_search_results = []

for features, target, dataset_name in datasets:
    randomized_search_CV.fit(features, target)
    per_dataset_table = pd.DataFrame(randomized_search_CV.cv_results_).assign(
        dataset=dataset_name
    )
    random_search_results.append(per_dataset_table)

# Stack the per-dataset tables and persist them as a single CSV.
all_results = pd.concat(random_search_results)
all_results.to_csv(output_path, index=False)

In [None]:
# Best mean cross-validated score reached on each dataset by the random search.
all_results.groupby('dataset')[['mean_test_score']].max()

## Default params score

In [None]:
# Baseline: cross-validated score of the pipeline with SVC left at its
# default hyperparameters, one row per dataset.
#
# The baseline is scored with ROC AUC -- the same metric the randomized and
# Bayesian searches in this notebook optimise and report -- so that the default
# and tuned scores can be compared directly. (Scoring the baseline with
# accuracy, as before, made that comparison meaningless.)
results = []
scoring = 'roc_auc'
for x, y, name in datasets:
    fold_scores = cross_validate(pipeline, x, y, cv=5, scoring=scoring)['test_score']
    results.append({'dataset': name, 'score': fold_scores.mean()})
pd.DataFrame(results)

## Bayesian Optimization

In [None]:
from smac.scenario import Scenario
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter
from smac.facade.hyperparameter_optimization_facade import HyperparameterOptimizationFacade
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# One row per (configuration, dataset) evaluation, appended by objective_function.
bayes_results_log = []


def objective_function(config, seed=RANDOM_STATE):
    """Return the cost of one SVC configuration, averaged over all datasets.

    For every entry of the module-level ``datasets`` list a fresh
    preprocessing + SVC pipeline is built with the hyperparameters in
    ``config`` and scored by 5-fold cross-validated ROC AUC. Each per-dataset
    score is also appended to ``bayes_results_log``.

    Args:
        config: Mapping-like configuration providing ``'C'``, ``'kernel'``
            and ``'gamma'``.
        seed: Accepted as part of the signature but not used in the body.

    Returns:
        ``1 - mean ROC AUC`` across the datasets, so that lower is better.
    """
    hyperparams = {key: config[key] for key in ('C', 'kernel', 'gamma')}
    per_dataset_auc = []

    for features, target, dataset_name in datasets:
        candidate = Pipeline([
            ('preprocessing', create_preprocessor()),
            ('model', SVC(**hyperparams)),
        ])

        fold_scores = cross_val_score(candidate, features, target, cv=5, scoring='roc_auc')
        auc = fold_scores.mean()
        per_dataset_auc.append(auc)

        # Key order (C, kernel, gamma, score, dataset) fixes the CSV column order.
        bayes_results_log.append({**hyperparams, 'score': auc, 'dataset': dataset_name})

    # Turn the averaged score into a cost by subtracting it from 1.
    return 1 - np.mean(per_dataset_auc)



In [None]:
def _log_grid_values(low, high, n_points):
    """Return `n_points` log10-spaced values from `low` to `high` as a plain list."""
    return np.logspace(np.log10(low), np.log10(high), num=n_points).tolist()


# The same discrete grids as the randomized search, expressed as categorical
# choices.
c_bayes_values = _log_grid_values(C_MIN, C_MAX, N_C_POINTS)
gamma_bayes_values = _log_grid_values(GAMMA_MIN, GAMMA_MAX, N_GAMMA_POINTS)

# NOTE(review): the numeric defaults are expected to be exact members of the
# generated grids (1.0 and 0.001) -- re-check if the bounds or point counts change.
C = CategoricalHyperparameter("C", c_bayes_values, default_value=1.0)
kernel = CategoricalHyperparameter("kernel", KERNEL, default_value="rbf")
gamma = CategoricalHyperparameter("gamma", gamma_bayes_values, default_value=0.001)

cs = ConfigurationSpace()
cs.add([C, kernel, gamma])

# Same trial budget as the randomized search (N_ITERS).
scenario = Scenario(cs, deterministic=True, n_trials=N_ITERS)
smac = HyperparameterOptimizationFacade(scenario, objective_function)
bayes_best_hyperparameters = smac.optimize()

# Persist every (configuration, dataset) evaluation recorded by the objective.
bayes_log_frame = pd.DataFrame(bayes_results_log)
bayes_log_frame.to_csv(bayes_output_path, index=False)

In [None]:
# Best ROC AUC reached on each dataset across all configurations SMAC evaluated.
pd.DataFrame(bayes_results_log).groupby('dataset')[['score']].max()