# 12 Pipeline Practice

## 12.1 Libraries and Data Processing

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [7]:
df = pd.read_pickle("./df.pkl") 

In [9]:
# Split the predictor and target variables
y = df['status_group']
X = df[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 
             'population', 'construction_year']]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## 12.2 Create a KNN Pipeline
Use the numerical data to create a KNN pipline

In [12]:
scaled_pipeline_1 = Pipeline([('ss', StandardScaler()), 
                              ('knn', KNeighborsClassifier())])

In [13]:
# Fit the training data to pipeline
scaled_pipeline_1.fit(X_train, y_train)

# Print the accuracy on test set
scaled_pipeline_1.score(X_test, y_test)

0.6645791245791246

## 12.3 Create a Random Forest Pipeline 
Use all the columns scaled to build a random forest pipline

In [14]:
scaled_pipeline_2 = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=123))])

In [15]:
# Fit the training data to pipeline
scaled_pipeline_2.fit(X_train, y_train)

# Print the accuracy on test set
scaled_pipeline_2.score(X_test, y_test)

0.7228956228956229

# 12.3 Create a Bagged Tree Pipline

In [24]:
scaled_pipeline_3 = Pipeline([('ss', StandardScaler()), ('BT', BaggingClassifier(n_estimators=20))])

In [25]:
# Fit the training data to pipeline
scaled_pipeline_2.fit(X_train, y_train)

# Print the accuracy on test set
scaled_pipeline_2.score(X_test, y_test)

0.7228956228956229

## 12.4 Stack Models in a Pipeline - Single Layer
Stack multiple models in a pipeline

In [26]:
base_learners = [
                 ('rf_1', RandomForestClassifier(n_estimators=10, random_state=42)),
                 ('knn_1', KNeighborsClassifier(n_neighbors=5))             
                ]

In [31]:
clf = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())

In [32]:
clf.fit(X_train, y_train).score(X_test, y_test)

0.720942760942761

## 12.5 Stack Models in a Pipeline - Multiple Layers

In [36]:
layer_one_estimators = [
                        ('rf_1', RandomForestClassifier(n_estimators=10, random_state=42)),
                        ('knn_1', KNeighborsClassifier(n_neighbors=5))             
                       ]
layer_two_estimators = [
                        ('dt_2', DecisionTreeClassifier()),
                        ('rf_2', RandomForestClassifier(n_estimators=50, random_state=42)),
                       ]
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=LogisticRegression())

In [37]:
# Create Final model by 
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

clf.fit(X_train, y_train).score(X_test, y_test)

0.7118518518518518