# Pipeline

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
scaler = MinMaxScaler((0,1))
scaler.fit(X_train)
X_train_transfoemd = scaler.transform(X_train)

lr = LogisticRegression()
lr.fit(X_train_transformed)

X_test_tr = scaler.transform(X_test)
lr.predict(X_test_tr, y_test)


## 1. `sklearn.pipeline.Pipeline` 이용

In [3]:
norm_pipe = Pipeline([
    ('Normalization', MinMaxScaler(feature_range=(0, 1))), 
    ('LR', LogisticRegression(random_state=1234))
])  

In [4]:
print(norm_pipe)

Pipeline(memory=None,
     steps=[('Normalization', MinMaxScaler(copy=True, feature_range=(0, 1))), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [5]:
stand_pipe = Pipeline([
    ('Standardization', StandardScaler()),
    ('LR', LogisticRegression(random_state=1234))
])

In [6]:
print(stand_pipe)

Pipeline(memory=None,
     steps=[('Standardization', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [7]:
param_grid = {
    'Normalization__feature_range': [(0, 1), (-0.5, 0.5), (-1, 1)],
    'LR__C': [0.1, 0.5, 1.0, 5.0],
    'LR__penalty': ['l1', 'l2']
}

In [8]:
norm_grid = GridSearchCV(norm_pipe, param_grid, cv=5, verbose=1)
norm_grid.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('Normalization', MinMaxScaler(copy=True, feature_range=(0, 1))), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'Normalization__feature_range': [(0, 1), (-0.5, 0.5), (-1, 1)], 'LR__C': [0.1, 0.5, 1.0, 5.0], 'LR__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [9]:
norm_grid.grid_scores_



[mean: 0.66000, std: 0.01333, params: {'LR__C': 0.1, 'LR__penalty': 'l1', 'Normalization__feature_range': (0, 1)},
 mean: 0.72000, std: 0.04000, params: {'LR__C': 0.1, 'LR__penalty': 'l1', 'Normalization__feature_range': (-0.5, 0.5)},
 mean: 0.73333, std: 0.04216, params: {'LR__C': 0.1, 'LR__penalty': 'l1', 'Normalization__feature_range': (-1, 1)},
 mean: 0.69333, std: 0.02494, params: {'LR__C': 0.1, 'LR__penalty': 'l2', 'Normalization__feature_range': (0, 1)},
 mean: 0.80667, std: 0.06110, params: {'LR__C': 0.1, 'LR__penalty': 'l2', 'Normalization__feature_range': (-0.5, 0.5)},
 mean: 0.84000, std: 0.07424, params: {'LR__C': 0.1, 'LR__penalty': 'l2', 'Normalization__feature_range': (-1, 1)},
 mean: 0.87333, std: 0.04899, params: {'LR__C': 0.5, 'LR__penalty': 'l1', 'Normalization__feature_range': (0, 1)},
 mean: 0.89333, std: 0.06464, params: {'LR__C': 0.5, 'LR__penalty': 'l1', 'Normalization__feature_range': (-0.5, 0.5)},
 mean: 0.92000, std: 0.06182, params: {'LR__C': 0.5, 'LR__penal

In [10]:
print(norm_grid.best_params_)
print(norm_grid.best_score_)

{'LR__C': 5.0, 'LR__penalty': 'l1', 'Normalization__feature_range': (-1, 1)}
0.9533333333333334


In [11]:
param_grid = {
    'LR__C': [0.1, 0.5, 1.0, 5.0],
    'LR__penalty': ['l1', 'l2']
}

In [12]:
stand_grid = GridSearchCV(stand_pipe, param_grid, cv=5, verbose=1)
stand_grid.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('Standardization', StandardScaler(copy=True, with_mean=True, with_std=True)), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'LR__C': [0.1, 0.5, 1.0, 5.0], 'LR__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
stand_grid.grid_scores_



[mean: 0.86667, std: 0.05578, params: {'LR__C': 0.1, 'LR__penalty': 'l1'},
 mean: 0.83333, std: 0.06992, params: {'LR__C': 0.1, 'LR__penalty': 'l2'},
 mean: 0.91333, std: 0.06864, params: {'LR__C': 0.5, 'LR__penalty': 'l1'},
 mean: 0.88000, std: 0.06864, params: {'LR__C': 0.5, 'LR__penalty': 'l2'},
 mean: 0.93333, std: 0.04714, params: {'LR__C': 1.0, 'LR__penalty': 'l1'},
 mean: 0.90667, std: 0.06464, params: {'LR__C': 1.0, 'LR__penalty': 'l2'},
 mean: 0.95333, std: 0.03399, params: {'LR__C': 5.0, 'LR__penalty': 'l1'},
 mean: 0.93333, std: 0.04714, params: {'LR__C': 5.0, 'LR__penalty': 'l2'}]

In [14]:
print(stand_grid.best_params_)
print(stand_grid.best_score_)

{'LR__C': 5.0, 'LR__penalty': 'l1'}
0.9533333333333334


## 2. `sklearn.pipeline.make_pipeline`

In [15]:
norm_pipe = make_pipeline(
    MinMaxScaler(feature_range=(0,1)), 
    LogisticRegression(random_state=1234)
)  

In [16]:
print(norm_pipe)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [17]:
stand_pipe = make_pipeline(
    StandardScaler(), 
    LogisticRegression(random_state=1234)
)

In [18]:
# 모듈에 이름을 붙이면 error 발생
stand_pipe = make_pipeline(
    ('aa', StandardScaler()), 
    ('bb', LogisticRegression(random_state=1234))
)

TypeError: All intermediate steps should be transformers and implement fit and transform. '('aa', StandardScaler(copy=True, with_mean=True, with_std=True))' (type <class 'tuple'>) doesn't

In [19]:
print(stand_pipe)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


In [20]:
param_grid = {
    'logisticregression__C': [0.1, 0.5, 1.0, 5.0],
    'logisticregression__penalty': ['l1', 'l2']
}

In [21]:
norm_grid = GridSearchCV(norm_pipe, param_grid, cv=5, verbose=1)
norm_grid.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.1, 0.5, 1.0, 5.0], 'logisticregression__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [22]:
stand_grid = GridSearchCV(stand_pipe, param_grid, cv=4, verbose=1)
stand_grid.fit(X, y)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    0.1s finished


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1234, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.1, 0.5, 1.0, 5.0], 'logisticregression__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)