## Pipelines in Sklearn
**https://github.com/krishnaik06/Pipelines-Using-Sklearn/blob/master/SklearnPipeline.ipynb**

In [27]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import  PCA
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier

import numpy as np

In [2]:
# Load the Iris dataset that ships with scikit-learn; later cells read the
# feature matrix from `iris.data` and the class labels from `iris.target`.
iris = load_iris()

In [3]:
# Peek at the first five rows of the feature matrix
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [4]:
# Shape of the full feature matrix (before the train/test split)
iris.data.shape

(150, 4)

In [5]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [6]:
# Pipeline creation: each pipeline below chains three steps
#   1. Preprocess the data with StandardScaler
#   2. Reduce the dimensionality with PCA
#   3. Apply a classifier

In [7]:
# Logistic-regression pipeline: standardize -> project onto 2 principal components -> classify
pipeline_lr = Pipeline(
    steps=[
        ("scalar1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("lr_classifier", LogisticRegression(random_state=0)),
    ]
)

In [8]:
# Decision-tree pipeline: standardize -> project onto 2 principal components -> classify.
# The tree is seeded (like the logistic-regression pipeline) so that re-running the
# notebook reproduces the same fitted model and the same test accuracy.
pipeline_dt = Pipeline([
                        ("scalar2", StandardScaler()),
                        ("pca2", PCA(n_components=2)),
                        ("dt_classifier", DecisionTreeClassifier(random_state=0))
])

In [9]:
# Random-forest pipeline: standardize -> project onto 2 principal components -> classify.
# The forest is seeded (like the logistic-regression pipeline) so that re-running the
# notebook reproduces the same fitted model and the same test accuracy.
pipeline_randomforest = Pipeline([
                                  ("scalar3", StandardScaler()),
                                  ("pca3", PCA(n_components=2)),
                                  ("rf_classifier", RandomForestClassifier(random_state=0))
])

In [10]:
# Collect the pipelines in a list; the position of each entry is the key used
# to look up its display name later on
pipelines = [
    pipeline_lr,
    pipeline_dt,
    pipeline_randomforest,
]

In [17]:
# Starting values for the best-model search: lowest possible accuracy,
# index of the first pipeline, and an empty placeholder for the winning pipeline
best_accuracy, best_classifier, best_pipeline = 0.0, 0, ""


In [18]:
# Display names for the classifiers, keyed by each pipeline's position in `pipelines`
pipe_dict = dict(enumerate(["Logistic Regression", "Decision Tree", "RandomForest"]))

# Train every pipeline on the training split
for pipe in pipelines:
    pipe.fit(x_train, y_train)


In [19]:
# Report the accuracy of each fitted pipeline on the held-out test split
for i, model in enumerate(pipelines):
    name = pipe_dict[i]
    accuracy = model.score(x_test, y_test)
    print("{} Test Accuracy: {}".format(name, accuracy))

Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111


In [21]:
# Pick the pipeline with the highest test accuracy.
# Each pipeline is scored exactly once per iteration (the score is reused for both
# the comparison and the assignment). The strict ">" means that on a tie the
# earlier pipeline in `pipelines` is kept.
for i, model in enumerate(pipelines):
    accuracy = model.score(x_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Decision Tree


### Hyperparameter Tuning of Pipelines Using GridSearchCV

In [24]:
from sklearn.model_selection import  GridSearchCV

In [29]:
## Create a pipeline; the "classifier" step is a placeholder that each grid below replaces
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create a list of dictionaries with candidate learning algorithms and their hyperparameters.
# The logistic-regression grids are grouped by solver capability so that every
# solver/penalty combination tried is valid:
#   * liblinear and saga support both the L1 and the L2 penalty
#   * newton-cg, sag and lbfgs support only the L2 penalty
# (Searching penalty="l1" with an L2-only solver makes the fit fail.)

grid_param = [
              {
                  "classifier" : [LogisticRegression()],
               "classifier__penalty":["l2", "l1"],
               "classifier__C" : np.logspace(0,4,10),
               "classifier__solver" : ["liblinear", "saga"] # These solvers allow both L1 and L2 penalties
              },
              {
                  "classifier" : [LogisticRegression()],
               "classifier__penalty":["l2"],
               "classifier__C" : np.logspace(0,4,10),
               "classifier__solver" : ["newton-cg", "sag", "lbfgs"] # These solvers don't allow the L1 penalty
              },
              {"classifier" : [RandomForestClassifier()],
               "classifier__n_estimators" : [10,100,1000],
               "classifier__max_depth": [5,8, 15,25, 30 , None],
               "classifier__min_samples_leaf": [1,2,5,10,15,100],
               "classifier__max_leaf_nodes" : [2,5,10]}]


In [30]:
# Set up a 5-fold grid search over the pipeline and its candidate parameter grids,
# using all available cores
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

# Run the search (time consuming); fit() hands back the fitted search object
best_model = gridsearch.fit(x_train, y_train)



In [31]:
# Show the winning pipeline, then its accuracy on the held-out test split
print(best_model.best_estimator_)
test_accuracy = best_model.score(x_test, y_test)
print("The mean accuracy of the model is : ", test_accuracy)

Pipeline(memory=None,
         steps=[('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
The mean accuracy of the model is :  0.9555555555555556


## Building Pipelines with `make_pipeline` in scikit-learn

In [40]:
from sklearn.pipeline import  make_pipeline

# Build the pipeline with make_pipeline; the step is addressed below by the
# lowercased class name, "randomforestclassifier"
pipe = make_pipeline(RandomForestClassifier())

# Candidate estimator and the hyperparameter values to search over
grid_param = [
    {
        "randomforestclassifier": [RandomForestClassifier()],
        "randomforestclassifier__n_estimators": [10, 100, 1000],
        "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
        "randomforestclassifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "randomforestclassifier__max_leaf_nodes": [2, 5, 10],
    }
]


In [42]:
# Set up a 5-fold grid search over the make_pipeline pipeline; fitting happens in the next cell
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split. This can be slow: every parameter
# combination is fit once per cross-validation fold.
best_model = gridsearch.fit(x_train, y_train)

In [None]:
# Evaluate the fitted grid search on the held-out test split
best_model.score(x_test, y_test)