In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import joblib
from numpy import ravel
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier # The k-nearest neighbor classifier
from sklearn.feature_selection import VarianceThreshold # Feature selector
# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV  # For optimization


In [2]:
iris_df = load_iris()

In [17]:
iris_df.data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size=0.3, random_state=2)

In [5]:
## Pipeline description
# 1. Data Preprocessing by using Standard Scaler
# 2. Reducing the dimesion of the PCA
# 3. Apply Classifier

In [25]:

pipeline_lr = Pipeline(steps=[("scalar1", StandardScaler()),
                              ("pca1", PCA(n_components=2)),
                              ("lr_classifier", LogisticRegression())])


In [26]:
pipeline_dt = Pipeline(steps=[("scaler2", StandardScaler()),
                              ("pca2", PCA(n_components=2)),
                              ("dt_classifier", DecisionTreeClassifier())])


In [27]:
pipeline_rf = Pipeline(steps=[("scaler3", StandardScaler()), 
                              ("pca3", PCA(n_components=2)), 
                              ("rf_classifier", RandomForestClassifier())])


In [28]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf]

In [29]:
best_accuracy   = 0.0
best_classifier = 0
best_pipeline   = " "

In [31]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: "Logistic Regression", 1: "Decision Tree", 2: "Random Forest"}

# Fit the Pipeline
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [34]:
for i, model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))

Logistic Regression Test Accuracy: 0.9111111111111111
Decision Tree Test Accuracy: 0.9333333333333333
Random Forest Test Accuracy: 0.9333333333333333


In [39]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
        
print("Classifier with the best Accuracy: {}".format(pipe_dict.get(best_classifier)))


Classifier with the best Accuracy: Decision Tree
