In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport helpers, loaders, plotters

In [4]:
from itertools import product
from math import ceil
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

from helpers import load_best_result, save_search_result, scikit_cv_result_to_df
from loaders import load_adult, load_mnist
from plotters import plot_means_w_stds, gen_and_plot_learning_curve, plot_learning_curve, gen_and_plot_validation_curve, plot_validation_curve

### Load Data

In [5]:
# Run configuration: the dataset/learner labels identify this experiment when
# results are saved, and loader_func is the callable that produces the data.
dataset, learner_type = 'adult', 'SVM'
loader_func = load_adult

In [6]:
# Load the dataset through the loader selected in the configuration cell.
# NOTE(review): the effect of preprocess=True is defined in the loaders module, not shown here.
df = loader_func(preprocess=True)

### Split data into training and testing

In [7]:
# Every column except 'target' is a feature; 'target' is the label.
feature_cols = [col for col in df.columns if col != 'target']
X, y = df[feature_cols], df['target']

# 70/30 split, stratified on the label, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

### Scale Data

In [8]:
# Use min max scaling because input dataset is sparse, i.e. mostly zeros.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Support Vector Machine Model Complexity
Sources:
1. https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0
2. https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

#### Hyperparameters for SVMs

**Kernel:** The kernel function used to perform the "kernel trick"

**C:** Penalty parameter of the error term. Larger values penalize misclassified training points more heavily, i.e. apply less regularization.

**gamma:** Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. If gamma is ‘auto’ then 1/n_features will be used instead.

In [9]:
# Untuned SVC (SVC is imported with the other dependencies in the import cell).
# It is the base estimator handed to the grid search; get_params() is shown so
# the library defaults are visible next to the values searched later.
base_clf = SVC()

base_clf.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [10]:
# Search space for the grid search: 5 values of C x 4 values of gamma x 2 kernels
# = 40 candidates. The linear kernel is deliberately left out of this grid.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    gamma=[0.001, 0.01, 0.1, 1],
    kernel=['poly', 'rbf'],
)

In [11]:
# Exhaustive search over param_grid, scored by accuracy with 4-fold CV.
# Train scores are requested as well because the analysis cells plot them.
search_options = dict(
    scoring='accuracy',
    cv=4,
    return_train_score=True,
    verbose=10,
    n_jobs=-1,
)
grid_search = GridSearchCV(estimator=base_clf, param_grid=param_grid, **search_options)

In [None]:
# Run the full grid search on the scaled training split.
# NOTE: slow — the captured log reports 160 fits at roughly 30-45 s each.
grid_search.fit(X_train_scaled, y_train)

Fitting 4 folds for each of 40 candidates, totalling 160 fits
[CV] C=0.001, gamma=0.001, kernel=poly ...............................
[CV] C=0.001, gamma=0.001, kernel=poly ...............................
[CV] C=0.001, gamma=0.001, kernel=poly ...............................
[CV] C=0.001, gamma=0.001, kernel=poly ...............................
[CV]  C=0.001, gamma=0.001, kernel=poly, score=0.751042061386889, total=  34.6s
[CV] C=0.001, gamma=0.001, kernel=rbf ................................
[CV]  C=0.001, gamma=0.001, kernel=poly, score=0.751042061386889, total=  34.8s
[CV] C=0.001, gamma=0.001, kernel=rbf ................................
[CV]  C=0.001, gamma=0.001, kernel=poly, score=0.7510892214434552, total=  34.9s
[CV] C=0.001, gamma=0.001, kernel=rbf ................................
[CV]  C=0.001, gamma=0.001, kernel=poly, score=0.751042061386889, total=  34.9s
[CV] C=0.001, gamma=0.001, kernel=rbf ................................
[CV]  C=0.001, gamma=0.001, kernel=rbf, score=0.7

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.8min


[CV] C=0.001, gamma=0.01, kernel=poly ................................
[CV]  C=0.001, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  37.3s
[CV] C=0.001, gamma=0.01, kernel=poly ................................
[CV]  C=0.001, gamma=0.01, kernel=poly, score=0.751042061386889, total=  34.1s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV]  C=0.001, gamma=0.01, kernel=poly, score=0.751042061386889, total=  34.0s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.7min


[CV]  C=0.001, gamma=0.01, kernel=poly, score=0.751042061386889, total=  34.4s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV]  C=0.001, gamma=0.01, kernel=poly, score=0.7510892214434552, total=  34.8s
[CV] C=0.001, gamma=0.01, kernel=rbf .................................
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.7510892214434552, total=  38.7s
[CV] C=0.001, gamma=0.1, kernel=poly .................................
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  38.8s
[CV] C=0.001, gamma=0.1, kernel=poly .................................
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  38.8s
[CV] C=0.001, gamma=0.1, kernel=poly .................................
[CV]  C=0.001, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  38.8s
[CV] C=0.001, gamma=0.1, kernel=poly .................................
[CV]  C=0.001, gamma=0.1, kernel=poly, score=0.7510892214434552, total=  36.1s
[CV] C=0.001, gamma=0.1

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.6min


[CV]  C=0.001, gamma=0.1, kernel=poly, score=0.751042061386889, total=  36.0s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV]  C=0.001, gamma=0.1, kernel=poly, score=0.751042061386889, total=  36.3s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV]  C=0.001, gamma=0.1, kernel=poly, score=0.751042061386889, total=  36.3s
[CV] C=0.001, gamma=0.1, kernel=rbf ..................................
[CV]  C=0.001, gamma=0.1, kernel=rbf, score=0.751042061386889, total=  41.1s
[CV] C=0.001, gamma=1, kernel=poly ...................................
[CV]  C=0.001, gamma=0.1, kernel=rbf, score=0.7510892214434552, total=  41.8s
[CV] C=0.001, gamma=1, kernel=poly ...................................
[CV]  C=0.001, gamma=0.1, kernel=rbf, score=0.751042061386889, total=  41.4s
[CV] C=0.001, gamma=1, kernel=poly ...................................
[CV]  C=0.001, gamma=0.1, kernel=rbf, score=0.751042061386889, total=  41.2s
[CV] C=0.001, gamma=1, kernel=p

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.6min


[CV]  C=0.001, gamma=1, kernel=poly, score=0.8344383405948096, total=  30.0s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV]  C=0.001, gamma=1, kernel=poly, score=0.8309965896172793, total=  29.7s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV]  C=0.001, gamma=1, kernel=poly, score=0.8395225464190982, total=  30.3s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV]  C=0.001, gamma=1, kernel=poly, score=0.8336491095111784, total=  30.2s
[CV] C=0.001, gamma=1, kernel=rbf ....................................
[CV]  C=0.001, gamma=1, kernel=rbf, score=0.751042061386889, total=  37.9s
[CV] C=0.01, gamma=0.001, kernel=poly ................................
[CV]  C=0.001, gamma=1, kernel=rbf, score=0.7510892214434552, total=  39.9s
[CV] C=0.01, gamma=0.001, kernel=poly ................................
[CV]  C=0.001, gamma=1, kernel=rbf, score=0.751042061386889, total=  38.4s
[CV] C=0.01, gamma=0.001, kernel=poly ..

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.1min


[CV]  C=0.01, gamma=0.001, kernel=poly, score=0.751042061386889, total=  33.8s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=poly, score=0.751042061386889, total=  35.2s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=poly, score=0.751042061386889, total=  34.1s
[CV] C=0.01, gamma=0.001, kernel=rbf .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.7510892214434552, total=  37.3s
[CV] C=0.01, gamma=0.01, kernel=poly .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  37.0s
[CV] C=0.01, gamma=0.01, kernel=poly .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  37.2s
[CV] C=0.01, gamma=0.01, kernel=poly .................................
[CV]  C=0.01, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  37.5s
[CV] C=0.01, gamma=0.01,

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.0min


[CV]  C=0.01, gamma=0.01, kernel=poly, score=0.751042061386889, total=  33.2s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=poly, score=0.751042061386889, total=  34.3s
[CV] C=0.01, gamma=0.01, kernel=rbf ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, score=0.7510892214434552, total=  36.8s
[CV] C=0.01, gamma=0.1, kernel=poly ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  37.2s
[CV] C=0.01, gamma=0.1, kernel=poly ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  37.2s
[CV] C=0.01, gamma=0.1, kernel=poly ..................................
[CV]  C=0.01, gamma=0.01, kernel=rbf, score=0.751042061386889, total=  37.8s
[CV] C=0.01, gamma=0.1, kernel=poly ..................................
[CV]  C=0.01, gamma=0.1, kernel=poly, score=0.808675885584391, total=  35.2s
[CV] C=0.01, gamma=0.1, kernel=r

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 12.8min


[CV]  C=0.01, gamma=0.1, kernel=rbf, score=0.797271693823418, total=  42.8s
[CV] C=0.01, gamma=1, kernel=poly ....................................
[CV]  C=0.01, gamma=0.1, kernel=rbf, score=0.7978400909435392, total=  42.3s
[CV] C=0.01, gamma=1, kernel=poly ....................................
[CV]  C=0.01, gamma=0.1, kernel=rbf, score=0.7968927624100038, total=  43.2s
[CV] C=0.01, gamma=1, kernel=poly ....................................
[CV]  C=0.01, gamma=1, kernel=poly, score=0.8408789543474143, total=  31.3s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=poly, score=0.8364910951117848, total=  32.1s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=poly, score=0.8408488063660478, total=  31.8s
[CV] C=0.01, gamma=1, kernel=rbf .....................................
[CV]  C=0.01, gamma=1, kernel=poly, score=0.8366805608184918, total=  32.6s
[CV] C=0.01, gamma=1, kernel=rbf .......

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 15.3min


[CV]  C=0.1, gamma=0.001, kernel=poly, score=0.7510892214434552, total=  33.9s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=poly, score=0.751042061386889, total=  31.7s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=poly, score=0.751042061386889, total=  32.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=poly, score=0.751042061386889, total=  32.3s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.7510892214434552, total=  38.4s
[CV] C=0.1, gamma=0.01, kernel=poly ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  37.5s
[CV] C=0.1, gamma=0.01, kernel=poly ..................................
[CV]  C=0.1, gamma=0.001, kernel=rbf, score=0.751042061386889, total=  38.1s
[CV] C=0.1, gamma=0.01, kerne

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 18.7min


[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.8156498673740054, total=  36.9s
[CV] C=0.1, gamma=0.1, kernel=poly ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.8207654414550967, total=  37.6s
[CV] C=0.1, gamma=0.1, kernel=poly ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.8234179613489958, total=  38.1s
[CV] C=0.1, gamma=0.1, kernel=poly ...................................
[CV]  C=0.1, gamma=0.1, kernel=poly, score=0.8247774199659026, total=  32.2s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=poly, score=0.8192497158014399, total=  31.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=poly, score=0.8270178097764305, total=  30.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=poly, score=0.8273967411898446, total=  28.5s
[CV] C=0.1, gamma=0.1, kernel=rbf .

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 21.3min


[CV]  C=0.1, gamma=1, kernel=poly, score=0.8357332322849564, total=  40.7s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=poly, score=0.8313755210306935, total=  37.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


In [None]:
# Hand the fitted search to the project helper, keyed by dataset and learner type.
# NOTE(review): presumably this persists the result to disk — confirm in helpers.save_search_result.
save_search_result(grid_search, dataset, learner_type)

### Analysis

In [None]:
# Raw cross-validation results of the fitted search; the analysis cells below read
# its 'params' and 'rank_test_score' entries and convert it to a DataFrame.
res = grid_search.cv_results_

In [None]:
# Convert the raw results to a DataFrame, then discard the per-fold 'split*'
# columns so only the aggregated columns remain.
full_res_df = scikit_cv_result_to_df(res)
per_split_cols = [col for col in full_res_df.columns if col.startswith('split')]
res_df = full_res_df.drop(labels=per_split_cols, axis=1)

In [None]:
# Show the ten best-ranked parameter combinations (lowest rank_test_score first).
res_df.sort_values(by='rank_test_score').head(10)

In [None]:
# One flat frame per kernel: take that kernel's cross-section from the 'kernel'
# index level, then move the remaining index levels back into ordinary columns
# (the plots below read 'C' and 'gamma' as columns).
# No frame is built for the linear kernel because it is not in the search grid.
rbf = res_df.xs('rbf', level='kernel').reset_index()
poly = res_df.xs('poly', level='kernel').reset_index()

In [None]:
# Test-score view of the grid on log-log C/gamma axes: colour encodes the mean
# test score, marker size encodes the std across folds (multiplied by 100000 so
# the small values are visible).
test_scatter_opts = dict(
    kind='scatter', x='C', y='gamma', c='mean_test_score',
    cmap='winter', logx=True, logy=True,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.set_title('Radial Basis Function')
rbf.plot(s=rbf['std_test_score'] * 100000, ax=ax1, **test_scatter_opts)

ax2.set_title('Polynomial Function')
poly.plot(s=poly['std_test_score'] * 100000, ax=ax2, **test_scatter_opts)

# Gamma has no effect for linear kernel; a linear panel is not drawn here.

In [None]:
def std_to_size(std, scale=10000, min_size=10):
    """Convert a standard deviation into a scatter-marker size.

    A plain ``std * scale`` would give a size of zero when ``std == 0.0``;
    this helper substitutes a fixed fallback size in that case.

    Parameters
    ----------
    std : float
        Standard deviation for a single grid point.
    scale : float, optional
        Multiplier applied to a positive ``std``. Defaults to 10000, the
        value this helper originally hard-coded.
    min_size : float, optional
        Size returned when ``std`` is not greater than zero. Defaults to 10.

    Returns
    -------
    float
        ``std * scale`` if ``std > 0``, otherwise ``min_size``.
    """
    if std > 0:
        return std * scale
    return min_size

In [None]:
# Train-score view of the grid on log-log C/gamma axes: colour encodes the mean
# train score, marker size comes from the std across folds via std_to_size
# (which supplies a fallback size when the std is zero).
train_scatter_opts = dict(
    kind='scatter', x='C', y='gamma', c='mean_train_score',
    cmap='winter', logx=True, logy=True,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.set_title('Radial Basis Function')
rbf.plot(s=rbf['std_train_score'].apply(std_to_size), ax=ax1, **train_scatter_opts)

ax2.set_title('Polynomial Function')
poly.plot(s=poly['std_train_score'].apply(std_to_size), ax=ax2, **train_scatter_opts)

# No panel for the linear kernel: it is not part of this search grid.

## SVM Learning Curve Analysis

In [None]:
# Locate the candidate with the lowest rank_test_score (the first one on ties),
# then build a fresh, unfitted SVC from its parameters and display them.
best_idx = np.argmin(res['rank_test_score'])
best_params = res['params'][best_idx]
clf = SVC(**best_params)
clf.get_params()

In [None]:
# Learning curve for the selected estimator: 20 evenly spaced training fractions
# from 5% to 100% of the training split, 4-fold CV at each size.
learning_curve_fractions = np.linspace(.05, 1.0, 20)
train_sizes, _, _, train_mean, train_std, test_mean, test_std = gen_and_plot_learning_curve(
    clf,
    X_train_scaled,
    y_train,
    ylim=None,
    cv=4,
    train_sizes=learning_curve_fractions,
)

### Timing statistics

In [None]:
# NOTE: this redefines the std_to_size helper from the training-score section
# with a smaller default scale; any cell executed after this one (including a
# re-run of the earlier plots) uses this version.
def std_to_size(std, scale=100, min_size=10):
    """Convert a fit-time standard deviation into a scatter-marker size.

    A plain ``std * scale`` would give a size of zero when ``std == 0.0``;
    this helper substitutes a fixed fallback size in that case.

    Parameters
    ----------
    std : float
        Standard deviation for a single grid point.
    scale : float, optional
        Multiplier applied to a positive ``std``. Defaults to 100, the value
        this helper originally hard-coded for the timing plots.
    min_size : float, optional
        Size returned when ``std`` is not greater than zero. Defaults to 10.

    Returns
    -------
    float
        ``std * scale`` if ``std > 0``, otherwise ``min_size``.
    """
    if std > 0:
        return std * scale
    return min_size

In [None]:
# Fit-time view of the grid on log-log C/gamma axes: colour encodes the mean fit
# time, marker size comes from the std of the fit time via std_to_size.
timing_scatter_opts = dict(
    kind='scatter', x='C', y='gamma', c='mean_fit_time',
    cmap='winter', logx=True, logy=True,
)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.set_title('Radial Basis Function')
rbf.plot(s=rbf['std_fit_time'].apply(std_to_size), ax=ax1, **timing_scatter_opts)

ax2.set_title('Polynomial Function')
poly.plot(s=poly['std_fit_time'].apply(std_to_size), ax=ax2, **timing_scatter_opts)