In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
sns.set()

from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# SVM and Multi-Category Models

As we often are given situtations where the required analysis is underpinned by a multiclass response, there are many different families of models that allow us to perform this type of analysis. In this notebook, we will review how to implement many varieties of Support Vector Machines to expand our toolbox of skills. Let's start by looking at a simple example of a Support Vector Machine for classification.

In [2]:
Xs,y = datasets.make_classification(n_features=5,n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.2)

In [5]:
#Simple Train-Test
svc = SVC()
svc.fit(X_train,y_train)
print(svc.score(X_test,y_test))

0.905




In [4]:
#Cross-Validated
svc = SVC(gamma='auto')
print(cross_val_score(svc,Xs,y,cv=10).mean().round(4))

0.9129


In [10]:
#Hyperparameter Tuned Cross-Validated
svc = SVC()
grid = {'C':[100,.001,.01,1000,10000],
        'kernel':['rbf','linear','poly','sigmoid'],
        'gamma':['auto','scale']}
gscv = GridSearchCV(svc,grid,cv=4,verbose=True,return_train_score=False)
gscv.fit(Xs,y)

Fitting 4 folds for each of 40 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
scores = pd.DataFrame(gscv.cv_results_).filter(regex='param_+|mean_test_score'
                                            ).sort_values('mean_test_score',
                                                          ascending=False).reset_index().drop(['index'],axis=1)
scores.head(15)

And that's it for the simplest model! After all the pain of the last few weeks, we can finally quickly and thoroughly implement an SVM model using everything we learned over the past few weeks.

Which do you think is the best?

## Regression
As this runs, since have included more tuning parameters, take note of the amount of time it takes to do all of the computations. Since this is not optimized for large amounts of data, you will see a greater than multinomial scaling of computation time. We will discuss ways of going about optimizing this solutions in a few minutes - for now however we simply suffer.

In [11]:
Xs,y = datasets.make_regression(n_features=5,n_samples=1000,noise=0.3)

In [12]:
svr = SVR()
grid = {'C':[0.1,0.2,0.3],
        'kernel':['rbf','linear','poly','sigmoid'],
        'gamma':['auto','scale'],
        'tol':[1e-5,1e-4,1e-3,1e-2],
        'epsilon':[0,0.5,1,2]}
gscv = GridSearchCV(svr,grid,cv=4,verbose=True,return_train_score=False)
gscv.fit(Xs,y)

Fitting 4 folds for each of 384 candidates, totalling 1536 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1536 out of 1536 | elapsed:   24.4s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 0.2, 0.3], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'gamma': ['auto', 'scale'], 'tol': [1e-05, 0.0001, 0.001, 0.01], 'epsilon': [0, 0.5, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=True)

In [13]:
scores = pd.DataFrame(gscv.cv_results_).filter(regex='param_+|mean_test_score'
                                            ).sort_values('mean_test_score',
                                                          ascending=True).reset_index().drop(['index'],axis=1)
scores.head(15)

Unnamed: 0,param_C,param_epsilon,param_gamma,param_kernel,param_tol,mean_test_score
0,0.1,0.0,auto,rbf,1e-05,0.030404
1,0.1,0.0,auto,rbf,0.0001,0.030404
2,0.1,0.0,auto,rbf,0.001,0.030404
3,0.1,0.0,auto,rbf,0.01,0.030404
4,0.1,0.0,scale,rbf,1e-05,0.030497
5,0.1,0.0,scale,rbf,0.01,0.030497
6,0.1,0.0,scale,rbf,0.0001,0.030497
7,0.1,0.0,scale,rbf,0.001,0.030497
8,0.1,0.5,auto,rbf,0.01,0.030597
9,0.1,0.5,auto,rbf,0.001,0.030597


Sometimes it just won't work! You must test models to discover this.