In [1]:
#pipeline is an estimator, gridsearchcv that you generate is also an estimator

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

In [3]:
X = fetch_california_housing()['data']
y = fetch_california_housing()['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0 )

In [7]:
type(X)

numpy.ndarray

In [3]:
pipe = Pipeline([('scaler', StandardScaler()), ('regresor', Ridge())])
pipe.fit(X_train, y_train)

print (pipe.score(X_test, y_test))

0.5943141338604156


In [4]:
pipe.get_params() #retrun all parameters for the pipeline

{'memory': None,
 'steps': [('scaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('regresor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
         normalize=False, random_state=None, solver='auto', tol=0.001))],
 'verbose': False,
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'regresor': Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'regresor__alpha': 1.0,
 'regresor__copy_X': True,
 'regresor__fit_intercept': True,
 'regresor__max_iter': None,
 'regresor__normalize': False,
 'regresor__random_state': None,
 'regresor__solver': 'auto',
 'regresor__tol': 0.001}

In [14]:
GridSearchCV?

In [29]:
import numpy as np
param_grid = {'regresor__alpha': np.logspace(-3, 3 ,20)}

# gridsearch refits all estimators that are upstream, which is a lttle slow

In [36]:
grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=2, verbose=1)


In [31]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('regresor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'regresor__alpha': array([1.00000e-03, 2.06914e-03, 4.28133e-03, 8.85867e-03, 1.83298e-02,
       3.79269e-02, 7.84760e-02, 1.62378e-01, 3.35982e-01, 6.95193e-01,
       1.43845e+00, 2.97635e+00, 6.15848e+00, 1.27427e+01, 2.63665e+01,
       5.45559e+01, 1.12884e+02, 2.33572e+02, 4.83293e+02, 1.00000e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [33]:
grid_search.best_score_

0.6053956962874546

In [34]:
np.logspace?

# to fix this issue, use this feature called memory

In [37]:
from tempfile import mkdtemp
from shutil import rmtree

cachedir = mkdtemp() #creates a temp diectory
pipe_cache = Pipeline([('scaler', StandardScaler()), ('regresor', Ridge())], memory=cachedir)

In [41]:
pipe_cache.fit (X_train, y_train)

Pipeline(memory='C:\\Users\\KSULAI~1\\AppData\\Local\\Temp\\tmpa9n5lpqb',
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('regresor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [42]:
rmtree(cachedir)

# another approach is to apply grid on the last step of the pipeline, which in this case is Ridge()
we can only do this if we wanna tune hyperparams for the last step

In [43]:
param_grid = {'alpha': np.logspace(-3,3,20)}
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, n_jobs=2, verbose=1)

In [48]:
pipe2 = Pipeline([('scaler', StandardScaler()), ('grid_search', grid_search)])

In [50]:
pipe2.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.1s finished


Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('grid_search', GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=N...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1))])

In [52]:
pipe2.named_steps['grid_search'].best_params_

{'alpha': 26.366508987303554}

# lets say we have 3 steps, scaler, dimension reduction, and Ridge
if we wanna tune the last two steps, we need to pass the pipeline to the gridsearch

In [75]:
from sklearn.decomposition import PCA #dimensional reducution

cachedir = mkdtemp()
pipe_3 = Pipeline([("scaler", StandardScaler()), ("dim-red", PCA()), ("regressor", Ridge())], memory=cachedir)

param_grid = {"dim-red__n_components": [2,3,4,5,6], "regressor__alpha": np.logspace(-3, 3, 20)}


grid_search = GridSearchCV(pipe_3, param_grid, cv=5, n_jobs=2, verbose=1)
grid_search.fit (X_train, y_train);


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=2)]: Done 211 tasks      | elapsed:    4.9s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    9.3s finished


In [76]:
grid_search.best_params_

{'dim-red__n_components': 6, 'regressor__alpha': 26.366508987303554}

# with *RandomizedSearchCV* we can tune different hyperparameters at the same time/

In [78]:
from sklearn.model_selection import RandomizedSearchCV

In [79]:
RandomizedSearchCV?

In [93]:
cachedir = mkdtemp()
pipe_4 = Pipeline([("scaler", StandardScaler()), ("dim-red", PCA()), ("regressor", Ridge())], memory=cachedir)
param_grid = {"dim-red__n_components": range(1,9), "regressor__alpha": np.logspace(-3, 3, 200)}

random_search = RandomizedSearchCV(pipe_4, param_grid, cv=5, verbose=1, n_jobs=2, n_iter=100)

In [94]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=2)]: Done  69 tasks      | elapsed:    4.4s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:   11.1s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory='C:\\Users\\KSULAI~1\\AppData\\Local\\Temp\\tmpch650k2m',
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('dim-red', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=2,
          param_distributions={'dim-red__n_components': range(1, 9), 'regressor__alpha': array([1.00000e-03, 1.07189e-03, ..., 9.32930e+02, 1.00000e+03])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [99]:
print (random_search.best_score_)
random_search.score(X_test, y_test)

0.6053921829118217


0.5942308273119916

In [90]:
type(random_search)

sklearn.model_selection._search.RandomizedSearchCV

In [92]:
%%timeit

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.3s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.2s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits
3.47 s ± 97.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.0s finished
