# Pipelines

To make your sklearn code neater and potentially easier to move into production, we can use the built-in `pipelines` functionality.

In [1]:
from sklearn.pipeline import Pipeline # For setting up pipeline
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import VotingClassifier
import pandas as pd


First, we will still import our data and set up the train-test split manually.

In [2]:
try:
    data = pd.read_csv("data/processed_data.csv")

except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
                '1804_python_healthcare/master/titanic/data/processed_data.csv'

    data = pd.read_csv(address)

    # Create a data subfolder if one does not already exist
    import os
    data_directory ='./data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)

data = data.astype(float)

# Drop Passengerid (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data

data.drop('PassengerId', inplace=True, axis=1)

X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'

feature_names = X.columns.tolist()

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")

Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179


In [3]:
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,AgeImputed,EmbarkedImputed,CabinLetterImputed,CabinNumber,...,Embarked_missing,CabinLetter_A,CabinLetter_B,CabinLetter_C,CabinLetter_D,CabinLetter_E,CabinLetter_F,CabinLetter_G,CabinLetter_T,CabinLetter_missing
0,0.0,3.0,22.0,1.0,0.0,7.25,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,0.0,0.0,0.0,85.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.925,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,35.0,1.0,0.0,53.1,0.0,0.0,0.0,123.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,35.0,0.0,0.0,8.05,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Now let's build a simple pipeline!

In [4]:
pipe = Pipeline([
('scaler', StandardScaler()),
('classifier', KNeighborsClassifier())
])

In [5]:
pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')

Training set score: 0.858
Test set score: 0.754


## Increasing the complexity of our pipe

Let's add in another step!

In [6]:
knn_classifier = KNeighborsClassifier()
sfs = SequentialFeatureSelector(knn_classifier,
                                n_features_to_select='auto',
                                tol=.01,
                                n_jobs=-1)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])

In [7]:
pipe

In [8]:
pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')

Training set score: 0.787
Test set score: 0.788


## Ensembles with pipelines

Ensembles can easily be added in to the process as well - they are effectively just a classifier like a single model once we have set them up.

In [9]:
voting_classifier_1 = VotingClassifier(
    estimators=[('knn', KNeighborsClassifier()),
                ('logreg', LogisticRegression())],
    voting='soft')

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', voting_classifier_1)
])

pipe.fit(X_train, y_train)
print(f'Training set score: {pipe.score(X_train,y_train):.3f}')
print(f'Test set score: {pipe.score(X_test,y_test):.3f}')

Training set score: 0.851
Test set score: 0.810


## Grid search with pipelines

To use grid search with our pipeline, we just need to add in the step name we defined with a double underscore before each parameter. 

In [10]:
knn_classifier = KNeighborsClassifier()
sfs = SequentialFeatureSelector(knn_classifier,
                                direction="backward",
                                n_jobs=-1)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', sfs),
    ('classifier', knn_classifier)
])

parameter_grid = {
    "feature_selection__n_features_to_select": [18, 20],
    "classifier__n_neighbors": [i for i in range(1,10, 2)],
    "classifier__metric": ["minowski", "manhattan", "euclidean"]
}

In [11]:
random_search = GridSearchCV(
    estimator=pipe, # notice that we're passing our pipeline in here
    param_grid=parameter_grid,
    n_jobs=1, # If n_jobs is not one, you won't get the progress report during the process
    verbose=2, # this controls the level of detail being output
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=18; total time=  10.8s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=18; total time=   9.1s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=18; total time=  10.7s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=18; total time=   9.5s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=18; total time=  12.6s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=20; total time=   7.0s
[CV] END classifier__metric=minowski, classifier__n_neighbors=1, feature_selection__n_features_to_select=20; total time=   6.8s
[CV] END classifier__metric=minowski, clas

50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sammi\anaconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sammi\anaconda3\envs\ml\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Sammi\anaconda3\envs\ml\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\Sammi\anaconda3\envs\ml\Lib\s

In [12]:
print("Best parameters combination found:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

Best parameters combination found:
classifier__metric: euclidean
classifier__n_neighbors: 3
feature_selection__n_features_to_select: 20


In [13]:
test_accuracy = random_search.score(X_test, y_test)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")

Accuracy of the best parameters using the inner CV of the random search: 0.805
Accuracy on test set: 0.771
