# Undestanding Pipelines with GridSearchCV

In [1]:
# Loading neccessary libraries

import pandas as pd
import numpy as np

# Using built-in Iris Flower dataset for understanding the concept of classification problem
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Loading the Iris Flower dataset into the notebook
df_iris= load_iris()
df_iris.data[1]

array([4.9, 3. , 1.4, 0.2])

In [3]:
df_iris.target[1]

0

In [4]:
# Performing the train test split

X_train, X_test, y_train, y_test= train_test_split(df_iris.data, df_iris.target, test_size=0.3, random_state=0)


In [5]:
# Creting Pipelines considering 3 different Machine Learning Algorithm
# 1. Data Preprocssing using StandardScaler
# 2. Reducing Dimensions using PCA
# 3. Apply Classifier

pipeline_lr= Pipeline([('scaler_1',StandardScaler()),
                       ('pca_1', PCA(n_components=2)),
                       ('clf_lr', LogisticRegression(random_state=0))])

pipeline_dt= Pipeline([('scaler2', StandardScaler()),
                       ('pca_2', PCA(n_components=2)),
                       ('clf_dt',DecisionTreeClassifier())])

pipeline_rf= Pipeline([('scaler_3', StandardScaler()),
                       ('pca_3', PCA(n_components=2)),
                       ('clf_rf', RandomForestClassifier())])

pipelines= [pipeline_lr, pipeline_dt, pipeline_rf]


# Initializing the best accuracy, classifier and pipelines
best_accuracy= 0.0
best_classifier=0
best_pipeline=""

pipeline_dict= {0: 'Logistic regression',
                1: 'Decision Tree',
                2: 'Random Forest'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [6]:
for i, model in enumerate(pipelines):
    print("Test Accuracy by {}: {}".format(pipeline_dict[i], model.score(X_test, y_test)))

Test Accuracy by Logistic regression: 0.8666666666666667
Test Accuracy by Decision Tree: 0.9111111111111111
Test Accuracy by Random Forest: 0.9111111111111111


In [7]:
# Performing Hyperparameter Tuning using GridSearchCV

pipe= Pipeline([('clf', RandomForestClassifier())])

# Creating parameter grids

param_grids= [
    {"clf": [LogisticRegression()],
     "clf__penalty": ['l2', 'l1'],
     "clf__C": np.logspace(0,4,10)
    },
    
    {"clf": [LogisticRegression()],
     "clf__penalty": ['l2'],
     "clf__C": np.logspace(0,4,10),
     "clf__solver":['newton-cg', 'saga', 'sag', 'liblinear']
    },
    
    {"clf": [RandomForestClassifier()],
     "clf__n_estimators": [10,100,1000],
     "clf__max_depth": [5,8,15,25,30,None],
     "clf__min_samples_leaf": [1,2,5,10,15,100],
     "clf__max_leaf_nodes": [2,5,10]
    }
]

grid_search= GridSearchCV(pipe, param_grids, cv=5, verbose=0, n_jobs=-1)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('clf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                              

In [8]:
print(grid_search.best_params_)

{'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False), 'clf__C': 1.0, 'clf__penalty': 'l2', 'clf__solver': 'saga'}


In [9]:
# Performing Prediction on test set
y_pred= grid_search.predict(X_test)

# Checking the accuracy score of the model
accuracy_score(y_test, y_pred)

0.9555555555555556

In [10]:
# Performing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[16,  0,  0],
       [ 0, 16,  2],
       [ 0,  0, 11]], dtype=int64)

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.89      0.94        18
           2       0.85      1.00      0.92        11

    accuracy                           0.96        45
   macro avg       0.95      0.96      0.95        45
weighted avg       0.96      0.96      0.96        45



In [12]:
df_final= pd.DataFrame(data= y_pred, columns=['Pred_labels'])
df_final.head()

Unnamed: 0,Pred_labels
0,2
1,1
2,0
3,2
4,0


In [13]:
df_final.to_csv('output.csv')