**Pipeline**
  

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the breast-cancer dataset directly as a (features, labels) pair.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Pipeline steps as (name, estimator) pairs: standardize the features, then
# classify with linear discriminant analysis.
estimators = [
    ("standardize", StandardScaler()),
    ("model", LinearDiscriminantAnalysis()),
]
model = Pipeline(steps=estimators)

# Fit every step on the training split only. The call is the cell's last
# expression, so the notebook displays its return value.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardize',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 LinearDiscriminantAnalysis(n_components=None, priors=None,
                                            shrinkage=None, solver='svd',
                                            store_covariance=False,
                                            tol=0.0001))],
         verbose=False)

In [None]:
# Score the fitted pipeline on the held-out test split; the returned value is
# displayed as the cell output.
model.score(X_test, y_test)

0.9736842105263158

**Pipeline variant with multiple preprocessing steps**

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

In [None]:
# Reload the dataset so this section can be read on its own.
X, y = load_breast_cancer(return_X_y=True)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Two feature extractors combined in one FeatureUnion:
#   - "pca": PCA keeping 3 components
#   - "stc": SelectKBest keeping 6 features
features = [
    ("pca", PCA(n_components=3)),
    ("stc", SelectKBest(k=6)),
]

feature_union = FeatureUnion(transformer_list=features)

In [None]:
# Full pipeline: feature extraction -> standardization -> logistic regression.
# NOTE(review): standardization runs *after* the feature union, so PCA and
# SelectKBest see the unscaled features - confirm this ordering is intended.
estimators = [
    ("feature", feature_union),
    ("standardize", StandardScaler()),
    ("model", LogisticRegression()),
]

model = Pipeline(steps=estimators)

In [None]:
# 10-fold cross-validation of the whole pipeline on the full dataset.
#
# KFold only uses `random_state` when `shuffle=True`. Passing a seed without
# shuffling has no effect on the splits, and newer scikit-learn releases
# reject that combination with a ValueError. Dropping the seed keeps the same
# consecutive, unshuffled folds as before. For shuffled folds use
# KFold(n_splits=10, shuffle=True, random_state=7) instead.
kfold = KFold(n_splits=10)

# n_jobs=-1 runs the per-fold fits in parallel on all available cores.
results = cross_val_score(model, X, y, cv=kfold, n_jobs=-1)



In [None]:
# Average the per-fold scores returned by cross_val_score into one number.
results.mean()

0.9578634085213033

**Pipeline and GridSearchCV**

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Scale the features, then classify with an SVC (default settings).
estimators = [
    ("standardize", StandardScaler()),
    ("svm", SVC()),
]

model = Pipeline(steps=estimators)

# NOTE(review): GridSearchCV presumably fits its own copies of the estimator,
# so this standalone fit looks unnecessary for the search below - confirm
# before removing it.
model.fit(X_train, y_train)

# Parameters of a pipeline step are addressed as "<step name>__<parameter>".
# Both C and gamma are searched over the same six powers of ten.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold cross-validated search on the training split only. The fit call is
# the last expression, so the notebook displays its return value.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardize',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('svm',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=

**`make_pipeline`**

In [12]:
from sklearn.pipeline import make_pipeline
# make_pipeline builds a Pipeline without explicit step names; the displayed
# repr shows the steps named 'standardscaler' and 'svc'.
pipe_short = make_pipeline(StandardScaler(), SVC(C=100))
# Bare last expression: display the resulting pipeline.
pipe_short

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=100, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [13]:
# Show the pipeline's steps as a list of (name, estimator) tuples.
pipe_short.steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('svc',
  SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False))]

In [14]:
# Show the same steps as a mapping from step name to estimator.
pipe_short.named_steps

{'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'svc': SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False)}

**GridSearchCV over pipeline steps and models**

In [28]:
# Placeholder pipeline; the grid below swaps whole steps as well as tuning
# their hyperparameters.
pipe = Pipeline(
    steps=[
        ('preprocessing', StandardScaler()),
        ('classifier', SVC()),
    ]
)

# One dict per candidate model:
#   1. SVC on standardized features, searching C and gamma over six powers
#      of ten each.
#   2. LinearDiscriminantAnalysis on standardized features, with no
#      hyperparameters searched.
param_grid = [
    {
        'classifier': [SVC()],
        'preprocessing': [StandardScaler()],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    {
        'classifier': [LinearDiscriminantAnalysis()],
        'preprocessing': [StandardScaler()],
    },
]

# 5-fold cross-validated search on the training split.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Display the winning combination of steps and hyperparameters.
grid.best_params_

{'classifier': SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'classifier__C': 10,
 'classifier__gamma': 0.01,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}