# Support Vector Classifier Pipeline

The explanation for this notebook is available at https://youranalystbuddy.com/support-vector-machine-pipeline/

As a classification example, we use the heart disease dataset (`heart_disease.csv`). The target is `HeartDisease`, which is binary.

### Import and split data

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Read the heart-disease table from disk and preview its first two rows.
# Note: the preview shows an 'Unnamed: 0' column in addition to the features.
data = pd.read_csv('heart_disease.csv')
data.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The seed is fixed so that the split,
# and therefore every score computed from it, is reproducible when the
# notebook is re-run from a fresh kernel.
train, test = train_test_split(data, test_size=0.25, random_state=42)

### Processing pipeline

In [4]:
# Label column to predict.
target = 'HeartDisease'

# Columns sent through the numeric preprocessing branch.
num_cols = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'FastingBS',
    'MaxHR',
    'Oldpeak',
]

# Columns sent through the categorical preprocessing branch.
cat_cols = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def remove_0(X):
    """Return a copy of ``X`` in which zero readings are marked as missing.

    A value of 0 in the ``Cholesterol`` or ``RestingBP`` column is replaced
    by NaN so that a later imputation step can fill it in.

    The input frame is never modified. Working on a copy keeps the function
    safe to call repeatedly on the same data and avoids pandas warnings
    about assigning into a slice or writing NaN into an integer column.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``Cholesterol`` and ``RestingBP``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``X``.

    Raises
    ------
    KeyError
        If either of the two columns is absent from ``X``.
    """
    cleaned = X.copy()
    for col in ('Cholesterol', 'RestingBP'):
        # mask() swaps the flagged entries for NaN and returns a new Series,
        # which then replaces the whole column (dtype may become float).
        cleaned[col] = cleaned[col].mask(cleaned[col] == 0)
    return cleaned

# Numeric branch: apply remove_0, impute missing values with the column
# median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('remove 0', FunctionTransformer(func=remove_0, validate=False)),
        ('impute', SimpleImputer(strategy='median')),
        ('standardize', StandardScaler()),
    ]
)

# Categorical branch: one-hot encode each column.
# handle_unknown='ignore' makes a category that was absent when the encoder
# was fitted encode as all zeros instead of raising an error, which could
# otherwise happen inside a cross-validation fold or when predicting on new data.
cat_pipeline = Pipeline([
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its own branch. Columns that appear in neither
# group are dropped (ColumnTransformer's default remainder='drop').
process_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', num_pipeline, num_cols),
        ('class', cat_pipeline, cat_cols),
    ]
)

### Modeling pipeline

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Full model: preprocessing followed by a support vector classifier.
# The 'svc' step name is the prefix of the svc__* keys in the parameter grid.
svc = Pipeline(
    steps=[
        ('processing', process_pipeline),
        ('svc', SVC()),
    ]
)

# One sub-grid per kernel, so each kernel is only combined with the
# hyperparameters listed for it.
param_grid = [
    dict(
        svc__kernel=['linear'],
        svc__C=[0.001, 0.1, 1, 10, 100],
    ),
    dict(
        svc__kernel=['poly'],
        svc__degree=[2, 3, 4],
        svc__coef0=[0, 1, 10],
        svc__C=[0.001, 0.1, 1, 10, 100],
    ),
    dict(
        svc__kernel=['rbf'],
        svc__gamma=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        svc__C=[0.001, 0.01, 0.1, 1, 10, 100],
    ),
]

# 5-fold cross-validated search over every combination in param_grid,
# scored by accuracy; training-fold scores are recorded as well.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

### Train and test

In [6]:
# The whole training frame is passed as X; the column transformer selects
# only num_cols and cat_cols from it. The label column is passed as y.
grid_search.fit(X=train, y=train[target])

Let's check the best model found by the grid search:

In [7]:
# Best hyperparameter combination, then its cross-validation score.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

{'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
0.8735216333439121


Now evaluate it on the held-out test data:

In [8]:
# Score the fitted search on the held-out test rows.
grid_search.score(X=test, y=test[target])

0.8782608695652174