# ML Data Pipeline Project

### Importing Packages and Data

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.svm import SVC
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the bundled Iris dataset. Despite the `_df` suffix this is the
# dict-like Bunch returned by load_iris() (fields such as .data and .target
# are used below), not a pandas DataFrame.
iris_df = load_iris()

In [3]:
# Display the raw feature matrix (one row per sample, four feature columns).
# Bunch fields are reachable by key as well as by attribute.
iris_df["data"]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
# Hold out 27% of the samples for testing. random_state pins the shuffle so
# the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.data,
    iris_df.target,
    test_size=0.27,
    random_state=0,
)

### Creating Pipeline

1) Preprocessing (feature standardization)
2) Dimensionality Reduction (PCA)
3) Applying Classifiers

In [5]:
# Logistic regression pipeline: standardize the features, project them onto
# 2 principal components, then classify.
pipeline_logreg = Pipeline(
    steps=[
        ("scaler1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("logreg_classifier", LogisticRegression(random_state=1)),
    ]
)

In [6]:
# Decision tree pipeline: standardize the features, project them onto
# 2 principal components, then classify.
# random_state is fixed so that tree construction (and therefore the reported
# accuracy) is the same on every Restart & Run All; the value matches the one
# used for the logistic regression pipeline.
pipeline_dectr = Pipeline(
    steps=[
        ("scaler2", StandardScaler()),
        ("pca2", PCA(n_components=2)),
        ("dectr_classifier", DecisionTreeClassifier(random_state=1)),
    ]
)

In [7]:
# Random forest pipeline: standardize the features, project them onto
# 2 principal components, then classify.
# random_state is fixed so that bootstrap sampling and feature selection (and
# therefore the reported accuracy) are the same on every Restart & Run All;
# the value matches the one used for the logistic regression pipeline.
pipeline_randf = Pipeline(
    steps=[
        ("scaler3", StandardScaler()),
        ("pca3", PCA(n_components=2)),
        ("randf_classifier", RandomForestClassifier(random_state=1)),
    ]
)

In [8]:
# Linear discriminant analysis pipeline: standardize the features, project
# them onto 2 principal components, then classify.
pipeline_lda = Pipeline(
    steps=[
        ("scaler4", StandardScaler()),
        ("pca4", PCA(n_components=2)),
        ("lda_classifier", LDA()),
    ]
)

In [9]:
# Quadratic discriminant analysis pipeline: standardize the features, project
# them onto 2 principal components, then classify.
pipeline_qda = Pipeline(
    steps=[
        ("scaler5", StandardScaler()),
        ("pca5", PCA(n_components=2)),
        ("qda_classifier", QDA()),
    ]
)

In [10]:
# k-nearest-neighbours pipeline: standardize the features, project them onto
# 2 principal components, then classify.
pipeline_knn = Pipeline(
    steps=[
        ("scaler6", StandardScaler()),
        ("pca6", PCA(n_components=2)),
        ("knn_classifier", KNN()),
    ]
)

In [11]:
# Gaussian naive Bayes pipeline: standardize the features, project them onto
# 2 principal components, then classify.
pipeline_gnb = Pipeline(
    steps=[
        ("scaler7", StandardScaler()),
        ("pca7", PCA(n_components=2)),
        ("gnb_classifier", GNB()),
    ]
)

In [12]:
# Support vector classifier pipeline: standardize the features, project them
# onto 2 principal components, then classify.
pipeline_svc = Pipeline(
    steps=[
        ("scaler8", StandardScaler()),
        ("pca8", PCA(n_components=2)),
        ("svc_classifier", SVC()),
    ]
)

#### Initializing Pipelines

In [13]:
# All candidate pipelines. Order matters: a pipeline's position in this list
# is the key used to look up its display name in pipe_dict.
pipelines = [
    pipeline_logreg,
    pipeline_dectr,
    pipeline_randf,
    pipeline_lda,
    pipeline_qda,
    pipeline_knn,
    pipeline_gnb,
    pipeline_svc,
]

In [14]:
# Running best for model selection: highest accuracy seen so far, the index of
# that pipeline in `pipelines`, and the pipeline itself ("" until one is chosen).
most_accurate, best_classifier, best_pipeline = 0.0, 0, ""

In [15]:
# Display names keyed by each pipeline's index in `pipelines`.
pipe_dict = dict(enumerate([
    'Logistic Regression', 'Decision Tree', 'Random Forest',
    'LDA', 'QDA', 'KNN', 'GNB', 'SVC',
]))

# Train every pipeline on the training split only; the scaler and PCA steps
# are fitted as part of each pipeline's fit call.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

#### Testing Models

In [16]:
# Print the accuracy of each fitted pipeline on the held-out test split,
# labelled with its display name from pipe_dict.
for i, model in enumerate(pipelines):
    print(
        "{} Test Accuracy: {}".format(
            pipe_dict[i],
            model.score(X_test, y_test),
        )
    )

Logistic Regression Test Accuracy: 0.8536585365853658
Decision Tree Test Accuracy: 0.9024390243902439
Random Forest Test Accuracy: 0.9024390243902439
LDA Test Accuracy: 0.926829268292683
QDA Test Accuracy: 0.8780487804878049
KNN Test Accuracy: 0.9024390243902439
GNB Test Accuracy: 0.9024390243902439
SVC Test Accuracy: 0.926829268292683


#### Selecting Model

In [17]:
# Pick the pipeline with the highest accuracy on the test split.
#
# The trackers are reset here so this cell gives the same answer however many
# times it is run: without the reset, re-running it after the pipelines have
# been refitted would compare against a stale `most_accurate` from the
# previous run.
#
# NOTE(review): the winner is chosen using X_test, so its reported accuracy is
# an optimistic estimate. Consider selecting via cross-validation on the
# training split and keeping the test split for the final check.
most_accurate=0.0
best_classifier=0
best_pipeline=""
for i, model in enumerate(pipelines):
    # Score each pipeline once and reuse the value.
    accuracy=model.score(X_test, y_test)
    # Strict '>' keeps the earliest pipeline in `pipelines` when scores tie.
    if accuracy>most_accurate:
        most_accurate=accuracy
        best_pipeline=model
        best_classifier=i
print("Most accurate classifier: {}".format(pipe_dict[best_classifier]))

Most accurate classifier: LDA
