# Grid search CV

- Note that if I set `test_size=0.20` as in the textbook, the test accuracy comes out higher than the cross-validation accuracy. I observed the same behavior with the CNC data.
- This might raise a red flag (e.g. the test set may be too small, as pointed out in https://stats.stackexchange.com/a/59632).
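- However, keep in mind that by the time the test data is used, `gs.fit()` has already refit the best model on the entire training set. The final model is therefore trained on more data than any model trained during the grid search, where each cross-validation split holds part of the training set out for validation, so the two accuracies are not directly comparable.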

## Setting up

In [14]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the breast-cancer dataset and pull out the feature matrix and labels
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples for testing; stratify on y so both splits are
# drawn with respect to the class labels, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=1
)

In [15]:
from sklearn.svm import SVC

# Two-step pipeline: 'scl' standardises the features, 'clf' is the SVM classifier.
# The step names become the prefixes used in the grid-search parameter keys.
pipe_svc = Pipeline(
    steps=[
        ('scl', StandardScaler()),
        ('clf', SVC(random_state=1)),
    ]
)

In [16]:
# List every parameter the pipeline exposes, one per line, with the name and
# the value each padded/truncated to 35 characters
pipe_params = pipe_svc.get_params()
for k in pipe_params:
    v = pipe_params[k]
    print(f"{k:35.35s}: {str(v):35.35s}...")

memory                             : None                               ...
steps                              : [('scl', StandardScaler()), ('clf',...
verbose                            : False                              ...
scl                                : StandardScaler()                   ...
clf                                : SVC(random_state=1)                ...
scl__copy                          : True                               ...
scl__with_mean                     : True                               ...
scl__with_std                      : True                               ...
clf__C                             : 1.0                                ...
clf__break_ties                    : False                              ...
clf__cache_size                    : 200                                ...
clf__class_weight                  : None                               ...
clf__coef0                         : 0.0                                ...
clf__decisio

In [17]:
# Candidate values (powers of ten from 1e-4 to 1e3) shared by C and gamma
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Sub-grid 1: linear kernel, only C is varied
set1 = dict(clf__C=param_range, clf__kernel=['linear'])

# Sub-grid 2: RBF kernel, C and gamma are varied together
set2 = dict(clf__C=param_range, clf__gamma=param_range, clf__kernel=['rbf'])

# A list of dicts: each dict is searched as its own independent grid
param_grid = [set1, set2]

In [18]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid using 5-fold cross-validation, ranking
# candidates by accuracy; n_jobs=-1 lets the search run in parallel
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)

In [19]:
# Run the search on the training split only; the test split stays untouched
gs.fit(X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('clf', SVC(random_state=1))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__kernel': ['linear']},
                         {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                                         100.0, 1000.0],
                          'clf__kernel': ['rbf']}],
             scoring='accuracy')

In [20]:
# Same listing for the search object: the wrapped pipeline's parameters show
# up under an `estimator__` prefix
gs_params = gs.get_params()
for k in gs_params:
    v = gs_params[k]
    print(f"{k:35.35s}: {str(v):35.35}...")

cv                                 : 5                                  ...
error_score                        : nan                                ...
estimator__memory                  : None                               ...
estimator__steps                   : [('scl', StandardScaler()), ('clf',...
estimator__verbose                 : False                              ...
estimator__scl                     : StandardScaler()                   ...
estimator__clf                     : SVC(random_state=1)                ...
estimator__scl__copy               : True                               ...
estimator__scl__with_mean          : True                               ...
estimator__scl__with_std           : True                               ...
estimator__clf__C                  : 1.0                                ...
estimator__clf__break_ties         : False                              ...
estimator__clf__cache_size         : 200                                ...
estimator__c

In [21]:
# Tabulate the cross-validation results (one row per parameter combination)
df = pd.DataFrame(data=gs.cv_results_)
print(df.shape)
display(df.head(n=5))

(72, 16)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_clf__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005,1.168008e-07,0.0014,0.0004900571,0.0001,linear,,"{'clf__C': 0.0001, 'clf__kernel': 'linear'}",0.625,0.625,0.6375,0.632911,0.632911,0.630665,0.004919,30
1,0.004399,0.0004898235,0.001,3.234067e-07,0.001,linear,,"{'clf__C': 0.001, 'clf__kernel': 'linear'}",0.925,0.95,0.9125,0.949367,0.911392,0.929652,0.017041,23
2,0.0032,0.000399971,0.0006,0.0004899793,0.01,linear,,"{'clf__C': 0.01, 'clf__kernel': 'linear'}",0.975,0.9875,0.9375,0.974684,0.962025,0.967342,0.016957,9
3,0.002401,0.0004892979,0.0004,0.0004900961,0.1,linear,,"{'clf__C': 0.1, 'clf__kernel': 'linear'}",0.975,1.0,0.9625,0.974684,1.0,0.982437,0.015032,1
4,0.002199,0.0004001857,0.0006,0.000490135,1.0,linear,,"{'clf__C': 1.0, 'clf__kernel': 'linear'}",0.9625,0.9875,0.95,0.974684,0.987342,0.972405,0.014534,8


In [22]:
# Order the candidates from best (rank 1) to worst and show the top rows
df = df.sort_values('rank_test_score', ascending=True)
display(df.head(n=5))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_clf__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.002401,0.0004892979,0.0004,0.00049,0.1,linear,,"{'clf__C': 0.1, 'clf__kernel': 'linear'}",0.975,1.0,0.9625,0.974684,1.0,0.982437,0.015032,1
57,0.003001,2.132481e-07,0.0004,0.00049,100.0,rbf,0.001,"{'clf__C': 100.0, 'clf__gamma': 0.001, 'clf__k...",0.975,1.0,0.975,0.974684,0.987342,0.982405,0.010032,2
56,0.003,5.560829e-07,0.0012,0.0004,100.0,rbf,0.0001,"{'clf__C': 100.0, 'clf__gamma': 0.0001, 'clf__...",0.9625,1.0,0.95,0.974684,1.0,0.977437,0.020008,3
50,0.003201,0.0003998518,0.0008,0.0004,10.0,rbf,0.01,"{'clf__C': 10.0, 'clf__gamma': 0.01, 'clf__ker...",0.95,1.0,0.975,0.974684,0.987342,0.977405,0.016574,4
64,0.003001,4.909339e-07,0.0006,0.00049,1000.0,rbf,0.0001,"{'clf__C': 1000.0, 'clf__gamma': 0.0001, 'clf_...",0.95,1.0,0.975,0.974684,0.987342,0.977405,0.016574,4


In [23]:
# Best mean cross-validated accuracy and the parameter combination that achieved it
print(gs.best_score_, gs.best_params_, sep="\n")

0.9824367088607595
{'clf__C': 0.1, 'clf__kernel': 'linear'}


- Note that the grid search has already refit a model with the best parameters on the entire training set. You can confirm this by checking the `refit` setting below.


In [24]:
# Display the `refit` setting of the search object (shown as True in the output)
gs.refit

True

In [25]:
# Evaluate the search object on the held-out test split
testing_accuracy = gs.score(X_test, y_test)
y_pred = gs.predict(X_test)  # predicted labels for the test split
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.971


In [26]:
# To do this manually: take a fresh, unfitted copy of the pipeline, apply the
# winning hyper-parameters, and fit it on the whole training split.
#
# `gs.best_estimator_` is deliberately not reused here: it is the estimator held
# by the search object, so calling .fit() on it would refit that object in place
# instead of demonstrating an independent, manual refit.
from sklearn.base import clone

clf = clone(pipe_svc).set_params(**gs.best_params_)
clf.fit(X_train, y_train)
testing_accuracy = clf.score(X_test, y_test)
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.971
