### Tuning hyperparameters via grid search

Reference: [How to grid search with a pipeline](https://medium.com/@yoni.levine/how-to-grid-search-with-a-pipeline-93147835d916)

In [1]:
import os
import sys
import re
import time
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
def value_counts(y):
    """Print each distinct value in ``y`` with its count and percentage share.

    Parameters
    ----------
    y : array-like
        Values to tabulate; passed straight to ``np.unique``.

    Prints one line per distinct value, in the sorted order returned by
    ``np.unique``: the value, its count, and its share of all entries as a
    percentage with two decimals. Nothing is printed for empty input.
    Returns ``None``.
    """
    unique, counts = np.unique(y, return_counts=True)
    # The total is loop-invariant, so compute it once (vectorised) up front
    # instead of re-summing the counts on every iteration.
    total = counts.sum()
    for value, count in zip(unique, counts):
        percentage = (count / total) * 100
        print(value, " ", count, " ", format(percentage, ".2f"), "%")

In [4]:
# Load the breast-cancer-wisconsin `wdbc.data` file from the UCI archive.
# header=None: the first row is read as data, not as column names.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/wdbc.data',
    header=None,
)

In [5]:
# 32 columns in total, not 32 features: column 1 is the class label and
# columns 2-31 are the 30 feature columns (column 0 is not used below).
df.shape

(569, 32)

In [6]:
df.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
           dtype='int64')

In [7]:
# .loc slices by column label and includes both endpoints, so `2:` selects
# the columns labelled 2 through 31, i.e. the 30 feature columns.
X = df.loc[:, 2:].values # features start at the column labelled 2
# The column labelled 1 holds the class label ('B' or 'M').
y = df.loc[:, 1].values

In [8]:
X.shape

(569, 30)

In [9]:
y.shape

(569,)

In [10]:
value_counts(y)

B   357   62.74 %
M   212   37.26 %


In [11]:
np.unique(y) # 'B' = benign, 'M' = malignant
# np.unique returns the sorted unique values, so the printout is alphabetically ordered

array(['B', 'M'], dtype=object)

In [12]:
# For a pandas Series, y.value_counts() would give the same tally directly;
# y is a plain numpy array here, so count with np.unique instead.
unique, counts = np.unique(y, return_counts=True)
for label, count in zip(unique, counts):
    print(label, " ", count)

B   357
M   212


In [13]:
#### transform the categories to integer labels
# Always encode from the raw label column rather than from `y` itself, so that
# re-running this cell does not refit the encoder on already-encoded integers
# (which would turn le.classes_ into [0, 1] and break le.transform(["M", ...])).
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1].values)

In [14]:
#y # M becomes 1, B becomes 0
le.classes_

array(['B', 'M'], dtype=object)

In [15]:
### checking
le.transform(["M", "B", "B", "M"])

array([1, 0, 0, 1], dtype=int64)

In [16]:
print(np.unique(y, return_counts=True))

(array([0, 1]), array([357, 212], dtype=int64))


In [17]:
value_counts(y)

0   357   62.74 %
1   212   37.26 %


#### Train/test split: 80% train, 20% test, stratified on y
#### Stratifying keeps the class percentages the same in the train and test splits

In [18]:
# Hold out 20% of the samples for testing; stratify=y preserves the class
# proportions in both splits, and random_state fixes the shuffle.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

In [19]:
print(X_train.shape, " ", y_train.shape)
print(X_test.shape, " ", y_test.shape)

(455, 30)   (455,)
(114, 30)   (114,)


In [20]:
value_counts(y)

0   357   62.74 %
1   212   37.26 %


In [21]:
value_counts(y_train)

0   285   62.64 %
1   170   37.36 %


In [22]:
value_counts(y_test)

0   72   63.16 %
1   42   36.84 %


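`make_pipeline` is a shorthand for the `Pipeline` constructor: there is no need to name the steps, because each step's name is automatically set to the lowercase name of its type (e.g. `standardscaler`, `svc`). The pipeline below uses `Pipeline` directly with explicit step names (`scaling`, `SVM`), which is why the grid-search parameter keys further down are prefixed with `SVM__`.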

In [23]:
# Two named steps: standardise the features, then fit an SVM classifier.
# The step names ("scaling", "SVM") are how each step is addressed later.
pipe_svc = Pipeline(steps=[
    ("scaling", StandardScaler()),
    ("SVM", SVC(random_state=1)),
])

In [24]:
# give transformer and then estimator
#pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

In [25]:
#pipe_svc

In [26]:
#pipe_svc.steps

In [27]:
pipe_svc

Pipeline(memory=None,
     steps=[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=1,
  shrinking=True, tol=0.001, verbose=False))])

In [28]:
pipe_svc.steps

[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=1,
    shrinking=True, tol=0.001, verbose=False))]

In [29]:
[i[0] for i in pipe_svc.steps] # how the names of the steps are taken

['scaling', 'SVM']

In [49]:
pipe_svc.steps[0]

('scaling', StandardScaler(copy=True, with_mean=True, with_std=True))

In [30]:
pipe_svc.steps[1][1].decision_function_shape

'ovr'

In [31]:
# Candidate values shared by C and gamma: powers of ten from 1e-4 to 1e3.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Keys follow the "<step name>__<parameter>" convention; the SVC step of
# pipe_svc is named "SVM", hence the "SVM__" prefix. (With make_pipeline the
# step would be named "svc" and the keys would start with "svc__" instead.)
# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over both C and gamma.
param_grid = [
    dict(SVM__C=param_range, SVM__kernel=['linear']),
    dict(SVM__C=param_range, SVM__gamma=param_range, SVM__kernel=['rbf']),
]

In [32]:
param_range

[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

In [33]:
param_grid

[{'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__kernel': ['linear']},
 {'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__kernel': ['rbf']}]

In [46]:
param_grid[0].get("SVM__kernel")

['linear']

In [47]:
param_grid[1].get("SVM__kernel")

['rbf']

In [48]:
# Show which kernel each sub-grid of the search covers.
for grid in param_grid:
    print(grid.get("SVM__kernel"))

['linear']
['rbf']


In [37]:
# Exhaustive search over param_grid, scored by accuracy with 10-fold
# cross-validation; n_jobs=-1 runs the fits on all available CPU cores.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [38]:
gs

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=1,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'SVM__kernel': ['linear']}, {'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'SVM__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], 'SVM__kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [39]:
gs.estimator.steps[1][1].decision_function_shape

'ovr'

In [40]:
gs.param_grid

[{'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__kernel': ['linear']},
 {'SVM__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
  'SVM__kernel': ['rbf']}]

In [41]:
gs.scoring

'accuracy'

In [42]:
# Time the grid search. perf_counter() is a monotonic, high-resolution clock
# meant for measuring elapsed time; time.time() is a wall clock that can jump
# if the system clock is adjusted while the fit is running.
start_time = time.perf_counter()
gs.fit(X_train, y_train)
print("--- %s seconds --- " %(time.perf_counter() - start_time))

--- 8.336163759231567 seconds --- 


```
--- 8.790812015533447 seconds --- 
```

In [43]:
print(dir(gs))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_check_is_fitted', '_estimator_type', '_format_results', '_get_param_names', '_run_search', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'fit_params', 'get_params', 'iid', 'inverse_transform', 'multimetric_', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispatch', 'predict', 'predict_log_proba', 'predict_proba', 'refit', 'refit_time_', 'return_train_score', 'score', 'scorer_', 'scoring', 'set_params', 'tr

In [44]:
gs.best_score_

0.9846153846153847

In [45]:
gs.best_params_

{'SVM__C': 100.0, 'SVM__gamma': 0.001, 'SVM__kernel': 'rbf'}