# Randommized seach CV

## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load data
dataObj = load_breast_cancer()
X = dataObj.data
y = dataObj.target

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
    stratify=y,
    test_size=0.30,
    random_state=1)

In [2]:
from sklearn.svm import SVC

pipe_svc = Pipeline([('scl', StandardScaler()),
            ('clf', SVC(random_state=1))])

In [3]:
# Get parameter names
for k, v in pipe_svc.get_params().items():
    print(f"{k:35.35s}: {str(v):35.35s}...")

memory                             : None                               ...
steps                              : [('scl', StandardScaler()), ('clf',...
verbose                            : False                              ...
scl                                : StandardScaler()                   ...
clf                                : SVC(random_state=1)                ...
scl__copy                          : True                               ...
scl__with_mean                     : True                               ...
scl__with_std                      : True                               ...
clf__C                             : 1.0                                ...
clf__break_ties                    : False                              ...
clf__cache_size                    : 200                                ...
clf__class_weight                  : None                               ...
clf__coef0                         : 0.0                                ...
clf__decisio

In [4]:
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

set1 = {'clf__C': param_range,
        'clf__kernel': ['linear']}

set2 = {'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf']}

param_grid = [set1, set2]

In [5]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(estimator=pipe_svc, 
                  param_distributions=param_grid,
                  n_iter = 10,
                  scoring='accuracy', 
                  cv=5,
                  n_jobs=-1)

In [6]:
rs.fit(X_train,y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scl', StandardScaler()),
                                             ('clf', SVC(random_state=1))]),
                   n_jobs=-1,
                   param_distributions=[{'clf__C': [0.0001, 0.001, 0.01, 0.1,
                                                    1.0, 10.0, 100.0, 1000.0],
                                         'clf__kernel': ['linear']},
                                        {'clf__C': [0.0001, 0.001, 0.01, 0.1,
                                                    1.0, 10.0, 100.0, 1000.0],
                                         'clf__gamma': [0.0001, 0.001, 0.01,
                                                        0.1, 1.0, 10.0, 100.0,
                                                        1000.0],
                                         'clf__kernel': ['rbf']}],
                   scoring='accuracy')

In [7]:
# Get parameter names
for k, v in rs.get_params().items():
    print(f"{k:35.35s}: {str(v):35.35}...")

cv                                 : 5                                  ...
error_score                        : nan                                ...
estimator__memory                  : None                               ...
estimator__steps                   : [('scl', StandardScaler()), ('clf',...
estimator__verbose                 : False                              ...
estimator__scl                     : StandardScaler()                   ...
estimator__clf                     : SVC(random_state=1)                ...
estimator__scl__copy               : True                               ...
estimator__scl__with_mean          : True                               ...
estimator__scl__with_std           : True                               ...
estimator__clf__C                  : 1.0                                ...
estimator__clf__break_ties         : False                              ...
estimator__clf__cache_size         : 200                                ...
estimator__c

In [8]:
df = pd.DataFrame(rs.cv_results_)
display(df)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__kernel,param_clf__gamma,param_clf__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005601,0.00049,0.002801,0.0003998757,rbf,0.001,1.0,"{'clf__kernel': 'rbf', 'clf__gamma': 0.001, 'c...",0.925,0.95,0.9375,0.974684,0.936709,0.944778,0.016917,3
1,0.015803,0.0004,0.006001,4.623108e-07,rbf,1000.0,10.0,"{'clf__kernel': 'rbf', 'clf__gamma': 1000.0, '...",0.625,0.625,0.625,0.632911,0.632911,0.628165,0.003876,6
2,0.004801,0.0004,0.0008,0.0003999234,linear,,10.0,"{'clf__kernel': 'linear', 'clf__C': 10.0}",0.9625,0.95,0.925,0.962025,0.962025,0.95231,0.014448,2
3,0.006001,0.000633,0.002401,0.0004900767,rbf,0.1,1.0,"{'clf__kernel': 'rbf', 'clf__gamma': 0.1, 'clf...",0.9625,0.9875,0.9625,0.949367,0.962025,0.964778,0.012424,1
4,0.015003,0.000633,0.005802,0.0003996849,rbf,100.0,0.01,"{'clf__kernel': 'rbf', 'clf__gamma': 100.0, 'c...",0.625,0.625,0.625,0.632911,0.632911,0.628165,0.003876,6
5,0.014003,0.000633,0.006002,3.015783e-07,rbf,100.0,0.001,"{'clf__kernel': 'rbf', 'clf__gamma': 100.0, 'c...",0.625,0.625,0.625,0.632911,0.632911,0.628165,0.003876,6
6,0.009402,0.00049,0.004601,0.0008000732,rbf,1.0,1000.0,"{'clf__kernel': 'rbf', 'clf__gamma': 1.0, 'clf...",0.625,0.625,0.625,0.64557,0.632911,0.630696,0.008043,5
7,0.010403,0.0008,0.004201,0.0003997327,rbf,1000.0,0.0001,"{'clf__kernel': 'rbf', 'clf__gamma': 1000.0, '...",0.625,0.625,0.625,0.632911,0.632911,0.628165,0.003876,6
8,0.010802,0.0004,0.003801,0.000400281,rbf,10.0,10.0,"{'clf__kernel': 'rbf', 'clf__gamma': 10.0, 'cl...",0.625,0.625,0.625,0.632911,0.632911,0.628165,0.003876,6
9,0.006002,0.000633,0.003201,0.0004001141,rbf,0.1,0.1,"{'clf__kernel': 'rbf', 'clf__gamma': 0.1, 'clf...",0.9125,0.95,0.8875,0.962025,0.911392,0.924684,0.027361,4


In [9]:
df = df.sort_values(by=['rank_test_score'])
display(df.head())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__kernel,param_clf__gamma,param_clf__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.006001,0.000633,0.002401,0.00049,rbf,0.1,1.0,"{'clf__kernel': 'rbf', 'clf__gamma': 0.1, 'clf...",0.9625,0.9875,0.9625,0.949367,0.962025,0.964778,0.012424,1
2,0.004801,0.0004,0.0008,0.0004,linear,,10.0,"{'clf__kernel': 'linear', 'clf__C': 10.0}",0.9625,0.95,0.925,0.962025,0.962025,0.95231,0.014448,2
0,0.005601,0.00049,0.002801,0.0004,rbf,0.001,1.0,"{'clf__kernel': 'rbf', 'clf__gamma': 0.001, 'c...",0.925,0.95,0.9375,0.974684,0.936709,0.944778,0.016917,3
9,0.006002,0.000633,0.003201,0.0004,rbf,0.1,0.1,"{'clf__kernel': 'rbf', 'clf__gamma': 0.1, 'clf...",0.9125,0.95,0.8875,0.962025,0.911392,0.924684,0.027361,4
6,0.009402,0.00049,0.004601,0.0008,rbf,1.0,1000.0,"{'clf__kernel': 'rbf', 'clf__gamma': 1.0, 'clf...",0.625,0.625,0.625,0.64557,0.632911,0.630696,0.008043,5


In [10]:
print(rs.best_score_)
print(rs.best_params_)

0.9647784810126583
{'clf__kernel': 'rbf', 'clf__gamma': 0.1, 'clf__C': 1.0}


- Note that grid search already refit the entire training data with the best parameters. You can check this from this setting.


In [11]:
rs.refit

True

In [12]:
y_pred = rs.predict(X_test)
testing_accuracy = rs.score(X_test,y_test)
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.942


In [13]:
# To do this manually
clf = rs.best_estimator_
clf.fit(X_train, y_train)
testing_accuracy = clf.score(X_test,y_test)
print(f"Testing accuracy: {testing_accuracy:6.3f}")

Testing accuracy:  0.942
