In [2]:
#!/usr/bin/env python

'''
GA Data Science Q2 2016

Code walk-through 11: Support vector machines
'''

try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

import numpy as np
import pandas as pd

from sklearn import preprocessing, svm, cross_validation as cv, grid_search

from sklearn.pipeline import Pipeline

ImportError: No module named 'numpy'

In [2]:
# Select different Machine Learning Repository mirror if needed
#MLR_MIRROR = 'http://archive.ics.uci.edu/ml/machine-learning-databases/'
MLR_MIRROR = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/'

REDS_URL = urljoin(MLR_MIRROR, 'wine-quality/winequality-red.csv')

WHITES_URL = urljoin(MLR_MIRROR, 'wine-quality/winequality-white.csv')

In [3]:
# Read in the Wine Quality datasets
reds = pd.read_csv(REDS_URL, sep=';')
whites = pd.read_csv(WHITES_URL, sep=';')

In [4]:
# Add a new indicator variable for the type of wine
reds['red'] = 1
whites['red'] = 0

# Merge the two datasets
wines = pd.concat([reds, whites], axis=0)

# Prepare the data for use in scikit-learn
X = wines.drop(['quality', 'red'], axis=1)
y = wines.red.astype('int')

# Create a pipeline that scales the data and trains a support vector classifier
ssvc = Pipeline([
    ('scale', preprocessing.StandardScaler()),
    ('svc', svm.SVC())
])

In [5]:
# Train a support vector classifier with linear (= no) kernel
# if set kernel to anything else but linear, it would be a non-linear classifier
ssvc.set_params(
    svc__kernel='linear'
)
ssvc.fit(X, y)

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [6]:
# Coefficients defining the separating hyperplane (NOT regression coefficients)
ssvc.named_steps['svc'].coef_

array([[-0.24150874,  0.4545384 , -0.18612511, -1.80159802,  0.2984833 ,
         0.50382491, -1.272486  ,  2.51064324, -0.13961128,  0.21443596,
         0.9648553 ]])

In [7]:
# Support vectors, number of support vectors 79 red wines that touch margin on the red side and 79 on white site
ssvc.named_steps['svc'].n_support_

array([79, 79])

In [8]:
# if you want to know which points are the support vectors you can see here:
ssvc.named_steps['svc'].support_

array([1807, 1899, 1901, 2083, 2208, 2282, 2429, 2433, 2454, 2505, 2535,
       2547, 2590, 2592, 2595, 2627, 2635, 2636, 2639, 2641, 2652, 2663,
       2713, 2751, 2788, 2838, 2854, 2871, 2984, 3014, 3095, 3160, 3163,
       3176, 3247, 3265, 3287, 3307, 3382, 3424, 3442, 3452, 3455, 3497,
       3550, 3560, 3561, 3569, 3625, 3629, 3662, 3674, 3753, 3785, 3856,
       3858, 3869, 3873, 3914, 3918, 4074, 4188, 4267, 4329, 4330, 4380,
       4785, 5127, 5170, 5261, 5448, 5478, 5500, 6072, 6325, 6328, 6391,
       6414, 6438,   20,   49,   53,   59,   90,  124,  138,  139,  146,
        154,  155,  156,  157,  163,  164,  188,  189,  190,  192,  195,
        207,  208,  215,  219,  230,  255,  311,  313,  332,  354,  463,
        480,  494,  523,  524,  561,  591,  649,  679,  772,  800,  836,
        837,  861,  978, 1017, 1018, 1044, 1079, 1081, 1087, 1112, 1114,
       1125, 1131, 1157, 1228, 1233, 1235, 1244, 1286, 1322, 1355, 1356,
       1375, 1389, 1397, 1419, 1431, 1456, 1475, 14

In [9]:
ssvc.named_steps['svc'].support_vectors_

array([[-0.32037042,  1.39915368, -0.19705367, ...,  0.63131247,
         1.40281954, -0.2446721 ],
       [-0.86035472,  0.42724077, -2.19283252, ...,  0.32031851,
         0.9323718 ,  0.25842195],
       [-0.86035472,  0.42724077, -2.19283252, ...,  0.32031851,
         0.9323718 ,  0.25842195],
       ..., 
       [ 0.45103572,  1.58138735, -0.40351355, ..., -0.05287424,
        -0.07573051, -0.49621913],
       [ 0.45103572,  1.58138735, -0.40351355, ..., -0.05287424,
        -0.07573051, -0.49621913],
       [-1.2460578 , -0.1802048 ,  3.17512437, ...,  1.06670401,
        -0.34455779,  0.00687492]])

In [10]:
# Define stratified folds for cross-validation
kf = cv.StratifiedKFold(y, n_folds=10, shuffle=True)

In [11]:
# Compute average AUC across folds
aucs = cv.cross_val_score(ssvc, X, y, scoring='roc_auc', cv=kf)
np.mean(aucs)

0.99622304893906488

In [12]:
# Train using the Radial Basis Function (RBF) kernel
ssvc.set_params(
    svc__kernel='rbf'
)
ssvc.fit(X, y)

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [13]:
# Compute average AUC across folds
aucs = cv.cross_val_score(ssvc, X, y, scoring='roc_auc', cv=kf)
np.mean(aucs)

0.99759293766219681

In [14]:
# Determine ‘optimal’ kernel and value of C by cross-validation using AUC
# C - how much are you wiling to sacrifice to get a larger margin, 
# scoring
gs = grid_search.GridSearchCV(
    estimator=ssvc,
    param_grid={
        'svc__C': [1e-15, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'svc__kernel': ['linear', 'rbf']
    },
    scoring='roc_auc',
    cv=kf
)
gs.fit(X, y)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[1 1 ..., 0 0], n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'svc__kernel': ['linear', 'rbf'], 'svc__C': [1e-15, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [15]:
gs.best_score_

0.9989919129346142

In [16]:
gs.best_estimator_

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [17]:
#when set error to basically zer, we are not penalising error
#strong penalising is best (10) 
gs.grid_scores_

[mean: 0.50000, std: 0.00000, params: {'svc__kernel': 'linear', 'svc__C': 1e-15},
 mean: 0.50000, std: 0.00000, params: {'svc__kernel': 'rbf', 'svc__C': 1e-15},
 mean: 0.99402, std: 0.00327, params: {'svc__kernel': 'linear', 'svc__C': 0.0001},
 mean: 0.99587, std: 0.00286, params: {'svc__kernel': 'rbf', 'svc__C': 0.0001},
 mean: 0.99483, std: 0.00289, params: {'svc__kernel': 'linear', 'svc__C': 0.001},
 mean: 0.99586, std: 0.00286, params: {'svc__kernel': 'rbf', 'svc__C': 0.001},
 mean: 0.99553, std: 0.00256, params: {'svc__kernel': 'linear', 'svc__C': 0.01},
 mean: 0.99623, std: 0.00283, params: {'svc__kernel': 'rbf', 'svc__C': 0.01},
 mean: 0.99601, std: 0.00246, params: {'svc__kernel': 'linear', 'svc__C': 0.1},
 mean: 0.99697, std: 0.00265, params: {'svc__kernel': 'rbf', 'svc__C': 0.1},
 mean: 0.99622, std: 0.00251, params: {'svc__kernel': 'linear', 'svc__C': 1},
 mean: 0.99759, std: 0.00249, params: {'svc__kernel': 'rbf', 'svc__C': 1},
 mean: 0.99629, std: 0.00250, params: {'svc__k

In [None]:
#can also do regression with svm