# Pipelines #

In [14]:
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [5]:
import pandas as pd
import seaborn as sns

In [11]:
# Load the iris sample dataset through seaborn's dataset loader
iris = sns.load_dataset(name='iris')

# Report (rows, columns), then preview the first five records
print(iris.shape)
iris.head(n=5)

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Modeling without Pipelines ##

In [8]:
# target: the species label to predict
y = iris['species']

# features: every remaining column (the four sepal/petal measurements)
X = iris.drop(columns=['species'])

In [12]:
# Hold out part of the data for evaluation; the fixed seed keeps the
# shuffle (and therefore the split) the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=99,
)

### Logistic Regression ###

In [61]:
# Standardize the features: the scaler learns its statistics from the
# training split only and is then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate the logistic regression and fit it on the scaled training data
logreg = LogisticRegression().fit(X_train_scaled, y_train)

# Predict labels for both splits
y_train_logpred = logreg.predict(X_train_scaled)
y_test_logpred = logreg.predict(X_test_scaled)

# Report accuracy on the training and test splits
print('Training Accuracy', metrics.accuracy_score(y_true=y_train, y_pred=y_train_logpred))
print('Test Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=y_test_logpred))

Training Accuracy 0.9732142857142857
Test Accuracy: 0.9736842105263158


### KNN ###

In [62]:
# Rescale the features with min-max scaling: the scaler learns the feature
# ranges from the training split only and is then applied to both splits
minmax = MinMaxScaler().fit(X_train)
X_train_minmax = minmax.transform(X_train)
X_test_minmax = minmax.transform(X_test)

# Instantiate the k-nearest-neighbours classifier and fit it on the rescaled training data
knn = KNeighborsClassifier().fit(X_train_minmax, y_train)

# Predict labels for both splits
y_train_knnpred = knn.predict(X_train_minmax)
y_test_knnpred = knn.predict(X_test_minmax)

# Report accuracy on the training and test splits
print('Training Accuracy', metrics.accuracy_score(y_true=y_train, y_pred=y_train_knnpred))
print('Test Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=y_test_knnpred))

Training Accuracy 0.9732142857142857
Test Accuracy: 0.9473684210526315


### Decision Tree ###

In [64]:
# Standardize the features; the scaler is fit on the training split only
scaler2 = StandardScaler()
X_train_scaled2 = scaler2.fit_transform(X_train)
X_test_scaled2 = scaler2.transform(X_test)

# A float n_components tells PCA to keep the fewest components whose
# cumulative explained variance reaches that fraction (here 95%)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled2)
X_test_pca = pca.transform(X_test_scaled2)

# Seed the tree: features are randomly permuted at each split, so ties between
# equally good splits are otherwise broken differently from run to run and the
# reported accuracy would not be reproducible
dtc = DecisionTreeClassifier(random_state=99)

# fit model on the PCA-reduced training data
dtc.fit(X_train_pca, y_train)

# predict on train and test set
y_train_pcatree = dtc.predict(X_train_pca)
y_test_pcatree = dtc.predict(X_test_pca)

# Report accuracy on the training and test splits
print('Training Accuracy', metrics.accuracy_score(y_train, y_train_pcatree))
print('Test Accuracy:', metrics.accuracy_score(y_test, y_test_pcatree))

Training Accuracy 1.0
Test Accuracy: 0.9210526315789473


## Modeling with Pipelines ##

### Logistic Regression ###

In [52]:
# Pipeline for LogisticRegression: standardize the features, then classify.
# Each step is a (name, estimator) pair; the name is how the step is addressed later.
logreg_pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('logreg', LogisticRegression()),
])

In [53]:
# Fit every step of the pipeline (scaler, then model) on the training data
logreg_pipe.fit(X_train, y_train)

# Report accuracy on the training and test splits; predict() pushes the raw
# features through the fitted scaler before they reach the model
print('Train Accuracy:', metrics.accuracy_score(y_true=y_train, y_pred=logreg_pipe.predict(X_train)))
print('Test Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=logreg_pipe.predict(X_test)))

Train Accuracy: 0.9732142857142857
Test Accuracy: 0.9736842105263158


### KNN ###

In [54]:
# Pipeline for KNeighborsClassifier: min-max scale the features, then classify
knn_pipe = Pipeline(steps=[
    ('minmax', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
])

In [55]:
# Fit every step of the pipeline (scaler, then model) on the training data
knn_pipe.fit(X_train, y_train)

# Report accuracy on the training and test splits
print('Train Accuracy:', metrics.accuracy_score(y_true=y_train, y_pred=knn_pipe.predict(X_train)))
print('Test Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=knn_pipe.predict(X_test)))

Train Accuracy: 0.9732142857142857
Test Accuracy: 0.9473684210526315


### Decision Tree Classifier ###

In [59]:
# Pipeline for DecisionTreeClassifier: standardize, reduce with PCA, then classify.
# A float n_components keeps the fewest components whose cumulative explained
# variance reaches 95%. The tree is seeded because features are randomly permuted
# at each split, so without a fixed random_state repeated fits can yield different trees.
dtc_pipe = Pipeline([('ss2', StandardScaler()),
                     ('pca', PCA(n_components=0.95)),
                     ('dtc', DecisionTreeClassifier(random_state=99))])

In [60]:
# Fit every step of the pipeline (scaler, PCA, then tree) on the training data
dtc_pipe.fit(X_train, y_train)

# Report accuracy on the training and test splits
print('Train Accuracy:', metrics.accuracy_score(y_true=y_train, y_pred=dtc_pipe.predict(X_train)))
print('Test Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=dtc_pipe.predict(X_test)))

Train Accuracy: 1.0
Test Accuracy: 0.9210526315789473


### Iterating Through List of Pipelines ###

In [71]:
# Display names and the matching pipelines defined above, kept in the same order
names = ['LogReg', 'KNN', 'DTC']
pipelines = [logreg_pipe, knn_pipe, dtc_pipe]

# Every pipeline exposes the same fit/predict interface, so one loop can
# fit each of them and report its accuracy on the test set
for name, pipeline in zip(names, pipelines):
    pipeline.fit(X_train, y_train)
    print(f'{name} Test Accuracy:',
          metrics.accuracy_score(y_true=y_test, y_pred=pipeline.predict(X_test)))

LogReg Test Accuracy: 0.9736842105263158
KNN Test Accuracy: 0.9473684210526315
DTC Test Accuracy: 0.9210526315789473


## Pipelines and GridSearchCV

In [73]:
# Grid search over the decision-tree pipeline (GridSearchCV is imported in the
# imports cell at the top of the notebook).
# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
# (two underscores), so 'dtc__max_depth' sets max_depth on the 'dtc' step.
params = [{'dtc__max_depth': [4, 5, 6],
           'dtc__min_samples_split': [2, 5, 10],
           'dtc__min_samples_leaf': [1, 3, 5]}]

# 3 x 3 x 3 = 27 candidates, each scored by accuracy with 3-fold cross-validation;
# n_jobs=-1 runs the fits in parallel on all available cores
gridsearch = GridSearchCV(dtc_pipe, params, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# The whole pipeline is refit inside every fold, so the scaler and PCA only
# ever see that fold's training portion
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    2.3s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('ss2', StandardScaler()),
                                       ('pca', PCA(n_components=0.95)),
                                       ('dtc', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid=[{'dtc__max_depth': [4, 5, 6],
                          'dtc__min_samples_leaf': [1, 3, 5],
                          'dtc__min_samples_split': [2, 5, 10]}],
             scoring='accuracy', verbose=1)

In [75]:
# best_estimator_ is the pipeline with the best cross-validated parameters,
# refit on the whole training set (GridSearchCV's default refit=True);
# score it on the held-out test set
print(
    'Test Accuracy:',
    metrics.accuracy_score(
        y_true=y_test,
        y_pred=gridsearch.best_estimator_.predict(X_test),
    ),
)

Test Accuracy: 0.9210526315789473
