# **Understanding the PipeLines **

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
iris_df = load_iris()

In [3]:
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
x_train, x_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size = 0.3, random_state = 0 )

In [5]:
len(x_train), len(x_test), len(y_train), len(y_test)

(105, 45, 105, 45)

Pipelines depends on 2 basis
1. Preprocessing
2. Fit

In [6]:
""" Pipeline on the Logistic Regression """
pipeline_lr = Pipeline([('firstScaler', StandardScaler()),
                        ('firstPCA', PCA(n_components = 2)),
                        ('lr_classifier', LogisticRegression(random_state = 0))])

In [7]:
""" Pipeline on the Decision Tree """
pipeline_dt = Pipeline([('secondScaler', StandardScaler()),
                        ('secondPCA', PCA(n_components = 2)),
                        ('dt_classifier', DecisionTreeClassifier())])

In [8]:
""" Pipeline on the Random Forest """
pipeline_rf = Pipeline([('thirdScaler', StandardScaler()),
                        ('thirdPCA', PCA(n_components = 2)),
                        ('rf_classifier', RandomForestClassifier())])

In [9]:
""" Pipeline on the MLP Classifier """
pipeline_mlp = Pipeline([('forthScaler', StandardScaler()),
                        ('forthPCA', PCA(n_components = 2)),
                        ('mlp_classifier', MLPClassifier(alpha=1, max_iter=1000))])

In [10]:
""" Pipeline on the K-Nearest Neighbours """
pipeline_knn = Pipeline([('fifthScaler', StandardScaler()),
                        ('fifthPCA', PCA(n_components = 2)),
                        ('knn_classifier', KNeighborsClassifier(3))])

In [11]:
""" Pipeline on the Linear SVM """
pipeline_lsvm = Pipeline([('sixthScaler', StandardScaler()),
                        ('sixthPCA', PCA(n_components = 2)),
                        ('lsvm_classifier', SVC(kernel="linear", C=0.025))])

In [12]:
""" Pipeline on the Gaussian Process Classifier """
pipeline_gaussianprocess = Pipeline([('seventhScaler', StandardScaler()),
                        ('seventhPCA', PCA(n_components = 2)),
                        ('gaussianprocess_classifier', GaussianProcessClassifier(1.0 * RBF(1.0)))])

In [13]:
""" Pipeline on the ADA """
pipeline_ada = Pipeline([('eighthScaler', StandardScaler()),
                        ('eighthPCA', PCA(n_components = 2)),
                        ('ada_classifier', AdaBoostClassifier())])

In [14]:
""" Pipeline on the Naive Bayes Gaussian"""
pipeline_gaussianNB = Pipeline([('ninthScaler', StandardScaler()),
                        ('ninthPCA', PCA(n_components = 2)),
                        ('gaussianNB_classifier', GaussianNB())])

In [15]:
""" Pipeline on the Decision Tree """
pipeline_QDA = Pipeline([('tenthScaler', StandardScaler()),
                        ('tenthPCA', PCA(n_components = 2)),
                        ('QDA_classifier', QuadraticDiscriminantAnalysis())])

In [16]:
#now making list of pipelines for classifiers
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_mlp, pipeline_QDA, pipeline_gaussianprocess, pipeline_ada, pipeline_gaussianNB
             , pipeline_knn, pipeline_lsvm]

In [17]:
# lets try to find the best classifier , pipeline from their accuracy
best_pipeline = 0
best_classifier = 0
best_accuracy = 0.0

In [18]:
pipe_dict = { 0 : 'Logistic Regression', 1 : 'Decision Tree', 2 : 'Random Forest', 3 : 'MLP Classifier', 4 : 'KNeighbours Classifier', 5 : 'Linear SVM', 6 : 'Gaussian Process Classifier', 7 : 'ADA', 8 : 'Naive Bayes Gaussian Classifier', 9 : 'QDA'}

# Fitting the pipelines
for pipes in pipelines:
  pipes.fit(x_train, y_train)

In [19]:
for i, model in enumerate(pipelines):
  print("{} Test Accuracy : {}".format(pipe_dict[i], model.score(x_test, y_test)))

Logistic Regression Test Accuracy : 0.8666666666666667
Decision Tree Test Accuracy : 0.9111111111111111
Random Forest Test Accuracy : 0.9111111111111111
MLP Classifier Test Accuracy : 0.8888888888888888
KNeighbours Classifier Test Accuracy : 0.8888888888888888
Linear SVM Test Accuracy : 0.8888888888888888
Gaussian Process Classifier Test Accuracy : 0.7333333333333333
ADA Test Accuracy : 0.9111111111111111
Naive Bayes Gaussian Classifier Test Accuracy : 0.9111111111111111
QDA Test Accuracy : 0.8666666666666667


In [20]:
for i, model in enumerate(pipelines):
  if model.score(x_test, y_test) > best_accuracy:
    best_accuracy = model.score(x_test, y_test)
    best_classifier = i
    best_pipeline = model

print("Classifier with best accuracy among all  : {}". format(pipe_dict[best_classifier]))

Classifier with best accuracy among all  : Decision Tree
