# Pipelines in scikit-learn

Pipelines in scikit-learn are a powerful tool that can help streamline your machine learning workflows, reduce the risk of data leakage, and simplify the process of hyperparameter tuning. By chaining together data preprocessing and model training steps, you can ensure that your workflow is consistent, modular, and easy to understand.

# Helpful functions

- **DataFrame.select_dtypes**: The `select_dtypes` method in pandas selects the columns of a DataFrame based on their data types. It is especially useful when you want to apply specific operations or transformations only to columns of certain types, such as numerical, categorical, or boolean data.

In [26]:
import pandas as pd

# Small demo frame: two numeric columns, one string column, two boolean columns.
df = pd.DataFrame(
    {
        "numeric_1": [1, 2, 3],
        "numeric_2": [4.5, 5.6, 6.7],
        "category": ["A", "B", "C"],
        "boolean_1": [True, False, True],
        "boolean_2": [False, False, True],
    }
)

# Split the frame by dtype: integer/float columns, object (string) columns,
# and boolean columns.
numeric_df = df.select_dtypes(include=[int, float])
categorical_df = df.select_dtypes(include=object)
binary_df = df.select_dtypes(include=bool)

# Show each subset in turn: numeric, then categorical, then boolean.
for subset in (numeric_df, categorical_df, binary_df):
    print(subset)

   numeric_1  numeric_2
0          1        4.5
1          2        5.6
2          3        6.7
  category
0        A
1        B
2        C
   boolean_1  boolean_2
0       True      False
1      False      False
2       True       True


# 1 Basic Preprocessing Pipeline

## 1.1 Pipeline for Numerical Data Only
For datasets with only numerical values, we can use a pipeline to handle missing values and scaling.

In [29]:
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fetch the OpenML "house_prices" dataset and keep only its float/int columns
# as features.
data = fetch_openml(name="house_prices", version=1, as_frame=True)
y = data.target
X = data.data.select_dtypes(include=[float, int])

# Report how many values are missing in each selected column.
missing_per_column = X.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# Step 1 fills missing values with the column mean; step 2 standardizes
# every column.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Wrap the transformed values in a DataFrame with the original column names
# so the first rows are readable when displayed.
X_transformed = numeric_pipeline.fit_transform(X)
pd.DataFrame(X_transformed, columns=X.columns).head()


Missing values per column:
Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64


  warn(


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-1.730865,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,...,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
1,-1.728492,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439
2,-1.72612,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
3,-1.723747,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
4,-1.721374,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,...,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777


## 1.2 Pipeline for Categorical Data Only
For a pipeline focused on categorical data only, we can use transformers like SimpleImputer to handle missing values and OneHotEncoder to convert categories into a numerical format.

In [50]:
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Fetch the OpenML "titanic" dataset and keep only the columns whose dtype
# is `category`.
data = fetch_openml(name="titanic", version=1, as_frame=True)
y = data.target
X = data.data.select_dtypes(include="category")

# Report how many values are missing in each categorical column.
missing_per_column = X.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# Step 1 replaces missing entries with the most frequent category; step 2
# one-hot encodes the result.
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder()),
    ]
)

# The encoder output is converted with .toarray() before building the frame;
# column labels come from the fitted encoder step.
X_transformed = cat_pipeline.fit_transform(X)
encoded_names = cat_pipeline.named_steps["encoder"].get_feature_names_out()
pd.DataFrame(data=X_transformed.toarray(), columns=encoded_names)

Missing values per column:
sex         0
embarked    2
dtype: int64


  warn(


Unnamed: 0,x0_female,x0_male,x1_C,x1_Q,x1_S
0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
1304,1.0,0.0,1.0,0.0,0.0
1305,1.0,0.0,1.0,0.0,0.0
1306,0.0,1.0,1.0,0.0,0.0
1307,0.0,1.0,1.0,0.0,0.0


## 1.3 Pipeline for Both Numerical and Categorical Features
For a dataset with both numerical and categorical features, you can use ColumnTransformer to create separate pipelines for each type and then combine them. This approach is very useful for preprocessing datasets with mixed data types.

In [75]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml 
import numpy as np

# Toy dataset with one missing value in every column, used to show how a
# ColumnTransformer routes numerical and categorical columns through
# separate preprocessing pipelines.
data = pd.DataFrame({
    'age': [20, 25, 30, None, 40],
    'salary': [50000, 60000, None, 80000, 90000],
    'city': ['Los Angeles', 'Los Angeles', None, 'Chicago', 'Houston']
})

numerical_cols = ['age', 'salary']
categorical_cols = ['city']

# Numerical columns: fill gaps with the column mean, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler()),
])

# Categorical columns: the missing city is given as None in the data above,
# so the imputer is told to treat None as the missing marker; it fills with
# the most frequent city before one-hot encoding.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=None, strategy="most_frequent")),
    ('encoding', OneHotEncoder(handle_unknown='ignore'))
])

# Use the `numerical_pipeline` defined in this cell. The previous version
# passed `numeric_pipeline`, a variable left over from the numeric-only
# example, which made this cell fail with NameError when run on its own.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Output column names are prefixed with the transformer name
# ("num__", "cat__").
processed_data = preprocessor.fit_transform(data)
pd.DataFrame(data=processed_data, columns=preprocessor.get_feature_names_out())

Unnamed: 0,num__age,num__salary,cat__city_Chicago,cat__city_Houston,cat__city_Los Angeles
0,-1.322876,-1.414214,0.0,0.0,1.0
1,-0.566947,-0.707107,0.0,0.0,1.0
2,0.188982,0.0,0.0,0.0,1.0
3,0.0,0.707107,1.0,0.0,0.0
4,1.70084,1.414214,0.0,1.0,0.0


# 2 Classification Pipeline with Preprocessing

This section builds a pipeline that chains a preprocessing step (feature scaling) with a classification model, so both are fitted and applied together.

In [103]:
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the iris dataset as a feature matrix and a target vector.
data = load_iris()
X = data.data
y = data.target

# Hold out 20% of the samples for testing; the split is seeded so it is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=20
)

# Scaling and the classifier are chained so that the scaler is fitted on
# the training data only and re-applied automatically at prediction time.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", DecisionTreeClassifier()),
    ]
)

# NOTE(review): the tree is created without a random_state, so the score
# may differ slightly between runs — confirm if exact reproducibility matters.
y_hat = pipeline.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_hat)

0.9333333333333333

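# 3 Grid Search with a Pipeline

A whole pipeline can be passed to GridSearchCV as the estimator. Hyperparameters of an individual step are addressed as `<step name>__<parameter name>`, for example `classifier__max_depth`.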

In [24]:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset as a feature matrix and a target vector.
data = load_iris()
X = data.data
y = data.target

# Hold out 20% of the samples for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=10
)

# The estimator being tuned: scaling followed by a decision tree.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", DecisionTreeClassifier()),
    ]
)

# Candidate values, keyed as "<step name>__<parameter>" so the search can
# route each value to the 'classifier' step of the pipeline.
params = {
    "classifier__max_depth": [2, 3, 5, 7],
    "classifier__min_samples_leaf": [1, 2, 3, 5],
}

# Evaluate every combination with 5-fold cross-validation on the training
# split only, then report the winning combination.
grid_search = GridSearchCV(pipeline, params, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Score the best pipeline found by the search on the held-out test split.
best_pipeline = grid_search.best_estimator_
yhat = best_pipeline.predict(X_test)
accuracy_score(y_test, yhat)



{'classifier__max_depth': 3, 'classifier__min_samples_leaf': 3}


0.9333333333333333

# 4 Feature Engineering and Model Selection Pipeline

In [28]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Compare two classifiers (k-nearest neighbours and a decision tree) on the
# OpenML "titanic" dataset. Both share the same preprocessing and each is
# tuned with its own grid search.
data = fetch_openml(name="titanic", version=1, as_frame=True)
X, y = data.data, data.target

# Restrict the features to the five columns used in this example and hold
# out 20% of the rows for the final comparison (seeded split).
X = X[['pclass', 'sex', 'age', 'fare', 'embarked']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, shuffle=True)

numerical_cols = ['age', 'fare']
categorical_cols = ['pclass', 'sex', 'embarked']

# Numerical columns: mean imputation followed by standardization.
pipeline_num = Pipeline([
                       ('imputer', SimpleImputer(strategy="mean")),
                       ('scaler', StandardScaler())
                     ])

# Categorical columns: most-frequent imputation followed by one-hot encoding.
pipeline_cat = Pipeline([
                       ('imputer', SimpleImputer(strategy="most_frequent")),
                       ('encoder', OneHotEncoder())
                     ])

# Route each group of columns through its own preprocessing pipeline.
preprocessing = ColumnTransformer([
                     ('num', pipeline_num, numerical_cols),
                     ('cat', pipeline_cat, categorical_cols)
                     ])

# One full pipeline per candidate model; both start with the same
# preprocessing step and differ only in the final classifier.
pipeline_knn = Pipeline([
                    ('preprocess', preprocessing),
                    ('classifier', KNeighborsClassifier())
                ])
pipeline_tree = Pipeline([
                    ('preprocess', preprocessing),
                    ('classifier', DecisionTreeClassifier())
                ])

# Search grids, keyed as "<step name>__<parameter>" so each value reaches
# the 'classifier' step of the corresponding pipeline.
params_knn = {'classifier__n_neighbors': [3,5,7,11,15,17,31,23,25]}
params_tree = {'classifier__max_depth': [2,3,4,5,6,7,8,9,10],
               'classifier__min_samples_leaf': [1,2,3,4,5,6,7,8,10]}

# Tune and evaluate the KNN pipeline (5-fold CV on the training split,
# final score on the held-out test split).
grid_search_knn = GridSearchCV(pipeline_knn, params_knn, cv=5)
grid_search_knn.fit(X_train, y_train)
yhat_knn = grid_search_knn.best_estimator_.predict(X_test)

print(f"best knn parameters: {grid_search_knn.best_params_}")
print(f"accuracy of knn: {accuracy_score(y_test, yhat_knn):.2f}")

# Tune and evaluate the decision-tree pipeline in the same way.
grid_search_tree = GridSearchCV(pipeline_tree, params_tree, cv=5)
grid_search_tree.fit(X_train, y_train)
yhat_tree = grid_search_tree.best_estimator_.predict(X_test)

# These lines report the decision-tree results; the labels previously said
# "knn" (copy-paste slip), which mislabelled the second pair of outputs.
print(f"best tree parameters: {grid_search_tree.best_params_}")
print(f"accuracy of tree: {accuracy_score(y_test, yhat_tree):.2f}")


  warn(


best knn parameters: {'classifier__n_neighbors': 15}
accuracy of knn: 0.78
best knn parameters: {'classifier__max_depth': 7, 'classifier__min_samples_leaf': 7}
accuracy of knn: 0.79
