# How-To Pipeline in scikit-learn

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

## Load data

In [135]:
import sys
import os
from dotenv import load_dotenv

#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)

In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils.dataset import get_data 
df = get_data()
df.head(20)

Loading data from wines: 8000it [00:00, 23853.82it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [137]:
#import pipeline from scikit
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[])

categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=[np.number]).columns

In [138]:
from sklearn.preprocessing import FunctionTransformer

def outlier_detection(df):
    return df

outlier_detection = FunctionTransformer(outlier_detection)

pipeline.steps.append(('outlier_detection', outlier_detection))

In [139]:
# simpleimputer for numerical features
from sklearn.impute import SimpleImputer

categorical_imputer = SimpleImputer(strategy="most_frequent")

numerical_imputer = SimpleImputer(strategy="mean")
#create a copy of df containing only missing values
df_missing = df.copy()
#keep only values with missing values
df_missing = df_missing[df_missing.isnull().any(axis=1)]
df_missing



Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1334,Chardonnay,6.4,0.13,,1.6,96.924983,647.75,303.424983,206.5,0.092,40.0,158.0,0.9928,3.21,0.36,9.8,6
1715,Chardonnay,7.2,0.35,0.25,,,,,,,,,0.99334,2.93,0.66,10.3,7
1777,Pinot noir,6.4,0.32,,10.7,47.973891,725.24,286.333891,238.36,0.047,57.0,206.0,0.9968,3.08,0.6,9.4,5
2024,Chardonnay,6.5,0.23,0.25,,,,,,,,,0.99828,3.15,0.42,9.2,6
2157,Pinot noir,6.7,0.24,,8.7,46.794182,694.52,249.224182,202.43,0.036,29.0,148.0,0.9952,3.22,0.62,9.9,6
2649,Cabernet Sauvignon,7.7,0.39,0.28,,,,,,,,,0.9918,3.19,0.58,12.2,7
3781,Chardonnay,5.8,0.28,0.3,,,,,,,,,0.98952,3.32,0.6,12.5,7
5334,Chardonnay,6.4,0.13,,1.6,96.924983,647.75,303.424983,206.5,0.092,40.0,158.0,0.9928,3.21,0.36,9.8,6
5715,Chardonnay,7.2,0.35,0.25,,,,,,,,,0.99334,2.93,0.66,10.3,7
5777,Pinot noir,6.4,0.32,,10.7,47.973891,725.24,286.333891,238.36,0.047,57.0,206.0,0.9968,3.08,0.6,9.4,5


In [140]:
#impiute missing values
df_missing[numerical_features] = numerical_imputer.fit_transform(df_missing[numerical_features])
df_missing

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1334,Chardonnay,6.4,0.13,0.27,1.6,96.924983,647.75,303.424983,206.5,0.092,40.0,158.0,0.9928,3.21,0.36,9.8,6.0
1715,Chardonnay,7.2,0.35,0.25,7.0,63.897685,689.17,279.661019,215.763333,0.058333,42.0,170.666667,0.99334,2.93,0.66,10.3,7.0
1777,Pinot noir,6.4,0.32,0.27,10.7,47.973891,725.24,286.333891,238.36,0.047,57.0,206.0,0.9968,3.08,0.6,9.4,5.0
2024,Chardonnay,6.5,0.23,0.25,7.0,63.897685,689.17,279.661019,215.763333,0.058333,42.0,170.666667,0.99828,3.15,0.42,9.2,6.0
2157,Pinot noir,6.7,0.24,0.27,8.7,46.794182,694.52,249.224182,202.43,0.036,29.0,148.0,0.9952,3.22,0.62,9.9,6.0
2649,Cabernet Sauvignon,7.7,0.39,0.28,7.0,63.897685,689.17,279.661019,215.763333,0.058333,42.0,170.666667,0.9918,3.19,0.58,12.2,7.0
3781,Chardonnay,5.8,0.28,0.3,7.0,63.897685,689.17,279.661019,215.763333,0.058333,42.0,170.666667,0.98952,3.32,0.6,12.5,7.0
5334,Chardonnay,6.4,0.13,0.27,1.6,96.924983,647.75,303.424983,206.5,0.092,40.0,158.0,0.9928,3.21,0.36,9.8,6.0
5715,Chardonnay,7.2,0.35,0.25,7.0,63.897685,689.17,279.661019,215.763333,0.058333,42.0,170.666667,0.99334,2.93,0.66,10.3,7.0
5777,Pinot noir,6.4,0.32,0.27,10.7,47.973891,725.24,286.333891,238.36,0.047,57.0,206.0,0.9968,3.08,0.6,9.4,5.0


In [141]:
from sklearn.preprocessing import OneHotEncoder

#pipeline for categorical features
categorical_pipeline = Pipeline(steps=[])

categorical_pipeline.steps.append(('imputer', categorical_imputer))
categorical_pipeline.steps.append(('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)))

#pipeline for numerical features
numeric_pipeline = Pipeline(steps=[])
numeric_pipeline.steps.append(('imputer', numerical_imputer))

# let pipeline know which pipeline to use for which features
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]).set_output(transform="pandas")

pipeline.steps.append(('preprocessor', preprocessor))

In [142]:
pipeline

In [143]:
# custom function in pipeline
from sklearn.base import BaseEstimator, TransformerMixin

def test_feature_selection(df):
    #get column which contains name "flavanoids"
    #df = df.filter(regex='flavanoids')
    return df

custom_function = FunctionTransformer(test_feature_selection)
#add custom function to pipeline
pipeline.steps.append(('test_feature_selection', custom_function))
pipeline


In [144]:
end_data = pipeline.fit_transform(df)
end_data

Unnamed: 0,num__fixed acidity,num__volatile acidity,num__citric acid,num__residual sugar,num__magnesium,num__flavanoids,num__minerals,num__calcium,num__chlorides,num__free sulfur dioxide,...,num__density,num__pH,num__sulphates,num__alcohol,num__quality,cat__wine type_Cabernet Sauvignon,cat__wine type_Chardonnay,cat__wine type_Gamay,cat__wine type_Merlot,cat__wine type_Pinot noir
0,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,...,0.99290,3.19,0.48,9.2,5.0,0.0,0.0,0.0,0.0,1.0
1,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,...,0.99163,2.96,0.52,11.1,6.0,0.0,0.0,0.0,1.0,0.0
2,6.7,0.21,0.34,1.5,85.193710,789.82,304.703710,219.51,0.035,45.0,...,0.98949,3.24,0.36,12.6,7.0,0.0,1.0,0.0,0.0,0.0
3,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,...,1.00014,3.02,0.56,9.1,6.0,0.0,0.0,0.0,1.0,0.0
4,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.80,0.041,62.0,...,0.99508,3.23,0.37,10.0,6.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,6.4,0.23,0.37,7.9,92.701914,1143.32,318.791914,226.09,0.050,60.0,...,0.99488,2.86,0.49,9.3,6.0,1.0,0.0,0.0,0.0,0.0
7996,7.0,0.22,0.26,9.2,94.807955,863.32,322.107955,227.30,0.027,37.0,...,0.99228,3.06,0.34,12.5,8.0,1.0,0.0,0.0,0.0,0.0
7997,7.5,0.26,0.30,4.6,50.112474,831.67,360.872474,310.76,0.027,29.0,...,0.99085,3.15,0.38,12.0,7.0,0.0,0.0,0.0,1.0,0.0
7998,6.3,0.43,0.32,8.8,84.805688,1300.32,328.915688,244.11,0.042,18.0,...,0.99172,3.28,0.33,12.9,7.0,0.0,1.0,0.0,0.0,0.0
