# Find Best Preprocessing Steps During Model Selection
### Preliminaries

In [3]:
# Import libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

# Fix NumPy's global random state so repeated runs start from the same seed
SEED = 0
np.random.seed(SEED)

### Load Iris Dataset

In [4]:
# Load the iris dataset bundled with scikit-learn and split it into
# the feature matrix (X) and the target vector (y)
iris = datasets.load_iris()
X, y = iris.data, iris.target

### Create Preprocessing Object

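We include two different preprocessing steps: principal component analysis (PCA) and k-best feature selection. A `FeatureUnion` applies both to the data and concatenates their outputs into a single feature matrix.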

In [5]:
# Create a combined preprocessing object: PCA components and the single
# best-scoring original feature are computed and stacked side by side
preprocess = FeatureUnion(
    transformer_list=[
        ('pca', PCA()),
        ('kbest', SelectKBest(k=1)),
    ]
)

### Create Pipeline

In [7]:
# Create a Pipeline: the combined preprocessing step feeds a logistic
# regression classifier.
#
# The solver is set explicitly because the hyperparameter search in this
# notebook tries both 'l1' and 'l2' penalties. scikit-learn's default solver
# has been 'lbfgs' since version 0.22, and it does not support the L1 penalty,
# so every L1 candidate would fail to fit. 'liblinear' handles both penalties.
# NOTE(review): recent scikit-learn releases may warn about 'liblinear' on
# multiclass targets -- confirm against the installed version.
pipe = Pipeline([
    ('preprocess', preprocess),
    ('classifier', LogisticRegression(solver='liblinear')),
])

### Create Search Space of Hyperparameter Values

In [8]:
# Candidate hyperparameter values. Keys use the `<step>__<param>` convention
# to reach into the nested pipeline steps.
search_space = [
    {
        'preprocess__pca__n_components': [1, 2, 3],
        'classifier__penalty': ['l1', 'l2'],
        # 10 values of C spaced evenly on a log scale from 10**0 to 10**4
        'classifier__C': np.logspace(start=0, stop=4, num=10),
    }
]

### Create Grid Search

In [9]:
# Set up an exhaustive search over the candidate values using 5-fold
# cross-validation, running silently on all available CPU cores
clf = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

### Conduct Grid Search

In [10]:
# Run the grid search on the full dataset; fit() returns the fitted
# search object itself, which is kept under the name `best_model`
best_model = clf.fit(X, y)

### View Best Model's Hyperparameters

In [14]:
# Fetch the parameters of the best estimator found by the search once,
# then report the three hyperparameters that were tuned
best_params = best_model.best_estimator_.get_params()
print('Best Number of Principal Components: ', best_params['preprocess__pca__n_components'])
print('Best Penalty: ', best_params['classifier__penalty'])
print('Best C: ', best_params['classifier__C'])

Best Number of Principal Components:  3
Best Penalty:  l1
Best C:  59.9484250319


https://chrisalbon.com/machine-learning/find_best_preprocessing_steps_during_model_selection.html