# Grid search CV

## Setting up

In [40]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the breast-cancer dataset bundled with scikit-learn and pull out
# the feature matrix and the class labels.
dataObj = load_breast_cancer()
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples for testing. Stratifying on y keeps the
# class ratio the same in both splits; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=1
)

In [41]:
from sklearn.svm import SVC

# Two-step pipeline: standardise the features ('scl'), then classify with
# a support-vector machine ('clf'). The step names become the prefixes of
# the hyper-parameter names used in the grid search below.
pipe_svc = Pipeline(
    steps=[
        ('scl', StandardScaler()),
        ('clf', SVC(random_state=1)),
    ]
)

In [42]:
# List every parameter the pipeline exposes. Step parameters appear as
# '<step>__<param>' (e.g. 'clf__C'). Names and values are padded or
# truncated to 35 characters so the listing lines up in columns.
for param_name, param_value in pipe_svc.get_params().items():
    print(f"{param_name:35.35s}: {str(param_value):35.35s}...")

memory                             : None                               ...
steps                              : [('scl', StandardScaler()), ('clf',...
verbose                            : False                              ...
scl                                : StandardScaler()                   ...
clf                                : SVC(random_state=1)                ...
scl__copy                          : True                               ...
scl__with_mean                     : True                               ...
scl__with_std                      : True                               ...
clf__C                             : 1.0                                ...
clf__break_ties                    : False                              ...
clf__cache_size                    : 200                                ...
clf__class_weight                  : None                               ...
clf__coef0                         : 0.0                                ...
clf__decisio

In [43]:
# Candidate values (powers of ten) shared by the C and gamma searches.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Linear kernel: only C is searched.
set1 = dict(clf__C=param_range, clf__kernel=['linear'])

# RBF kernel: C and gamma are searched jointly.
set2 = dict(clf__C=param_range, clf__gamma=param_range, clf__kernel=['rbf'])

# Each dict is its own sub-grid, so the search covers 8 + 8*8 = 72 candidates.
param_grid = [set1, set2]

In [44]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scoring each candidate by accuracy
# under 10-fold cross-validation. n_jobs=-1 runs the fits in parallel on
# all available cores.
gs = GridSearchCV(
    pipe_svc,
    param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [45]:
# Run the search on the training split only; the test split stays unseen
# until the final evaluation.
gs.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', StandardScaler()),
                                       ('clf', SVC(random_state=1))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__kernel': ['linear']},
                         {'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                     1000.0],
                          'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                                         100.0, 1000.0],
                          'clf__kernel': ['rbf']}],
             scoring='accuracy')

In [46]:
# List the parameters of the search object itself. The wrapped pipeline's
# parameters show up under the 'estimator__' prefix.
for param_name, param_value in gs.get_params().items():
    print(f"{param_name:35.35s}: {str(param_value):35.35}...")

cv                                 : 10                                 ...
error_score                        : nan                                ...
estimator__memory                  : None                               ...
estimator__steps                   : [('scl', StandardScaler()), ('clf',...
estimator__verbose                 : False                              ...
estimator__scl                     : StandardScaler()                   ...
estimator__clf                     : SVC(random_state=1)                ...
estimator__scl__copy               : True                               ...
estimator__scl__with_mean          : True                               ...
estimator__scl__with_std           : True                               ...
estimator__clf__C                  : 1.0                                ...
estimator__clf__break_ties         : False                              ...
estimator__clf__cache_size         : 200                                ...
estimator__c

In [47]:
# Put the cross-validation results in a table: one row per candidate
# parameter combination, with timing, per-fold scores, mean/std and rank.
df = pd.DataFrame(data=gs.cv_results_)
print(df.shape)
display(df.head())

(72, 21)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_clf__gamma,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007938,0.000435,0.001187,0.00017,0.0001,linear,,"{'clf__C': 0.0001, 'clf__kernel': 'linear'}",0.625,0.625,...,0.65,0.625,0.65,0.625,0.625,0.641026,0.641026,0.633205,0.010442,27
1,0.005526,0.000987,0.001005,0.000135,0.001,linear,,"{'clf__C': 0.001, 'clf__kernel': 'linear'}",0.925,0.925,...,0.95,0.95,0.875,0.975,0.925,0.923077,0.923077,0.932115,0.025251,23
2,0.004326,0.00067,0.000807,0.000156,0.01,linear,,"{'clf__C': 0.01, 'clf__kernel': 'linear'}",0.975,0.975,...,0.975,1.0,0.875,0.975,0.975,1.0,1.0,0.975,0.035355,6
3,0.003924,0.000652,0.000805,3.8e-05,0.1,linear,,"{'clf__C': 0.1, 'clf__kernel': 'linear'}",0.95,1.0,...,1.0,1.0,0.925,0.975,0.975,1.0,1.0,0.9825,0.025125,1
4,0.004248,0.000538,0.000672,7.7e-05,1.0,linear,,"{'clf__C': 1.0, 'clf__kernel': 'linear'}",0.95,0.975,...,0.975,1.0,0.925,0.95,0.975,0.974359,1.0,0.972436,0.023579,9


In [48]:
# Order the candidates from best (rank 1) to worst and show the top five.
df = df.sort_values('rank_test_score')
display(df.head())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_clf__gamma,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3,0.003924,0.000652,0.000805,3.8e-05,0.1,linear,,"{'clf__C': 0.1, 'clf__kernel': 'linear'}",0.95,1.0,...,1.0,1.0,0.925,0.975,0.975,1.0,1.0,0.9825,0.025125,1
64,0.004353,0.000292,0.000842,4.3e-05,1000.0,rbf,0.0001,"{'clf__C': 1000.0, 'clf__gamma': 0.0001, 'clf_...",0.95,1.0,...,1.0,1.0,0.95,0.975,0.975,0.974359,1.0,0.982436,0.019551,2
50,0.006433,0.002841,0.000973,0.000199,10.0,rbf,0.01,"{'clf__C': 10.0, 'clf__gamma': 0.01, 'clf__ker...",0.95,1.0,...,0.975,1.0,0.95,0.975,0.975,0.974359,1.0,0.979936,0.018726,3
57,0.004724,0.000686,0.00115,0.000845,100.0,rbf,0.001,"{'clf__C': 100.0, 'clf__gamma': 0.001, 'clf__k...",0.95,1.0,...,1.0,1.0,0.95,0.975,0.975,0.974359,1.0,0.979936,0.018726,3
42,0.005076,0.000104,0.001099,0.000119,1.0,rbf,0.01,"{'clf__C': 1.0, 'clf__gamma': 0.01, 'clf__kern...",0.975,0.975,...,1.0,1.0,0.875,0.975,0.975,1.0,1.0,0.9775,0.036142,5


In [49]:
# Mean cross-validated accuracy of the winning candidate, then the
# parameter setting that produced it.
print(gs.best_score_, gs.best_params_, sep="\n")

0.9824999999999999
{'clf__C': 0.1, 'clf__kernel': 'linear'}


- Note that the grid search has already refit the pipeline on the entire training set using the best parameters. You can confirm this by checking the `refit` setting.


In [50]:
# Show the search's `refit` setting (displayed as True in the output below).
gs.refit

True

In [51]:
# Evaluate on the held-out test split. predict/score on the search object
# delegate to the refit best estimator.
testing_accuracy = gs.score(X_test, y_test)
y_pred = gs.predict(X_test)  # predicted labels; not used again in this cell
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.971


In [52]:
# To do this manually: rebuild the winning model from the unfitted pipeline
# and the best hyper-parameters, then fit it on the full training split.
#
# gs.best_estimator_ is already fitted (refit=True), so calling .fit on it
# would only repeat work and mutate the object stored inside the search.
# Cloning gives an independent, unfitted copy to train instead.
from sklearn.base import clone

clf = clone(pipe_svc).set_params(**gs.best_params_)
clf.fit(X_train, y_train)
testing_accuracy = clf.score(X_test, y_test)
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.971
