# Data Pipeline Template

In [288]:
import sys
import os
from dotenv import load_dotenv

#1. load environment variables and data

# load environment variables
load_dotenv()

#add working directory to sys path to execute utils/dataset.py
working_dir = os.environ.get("WORKING_DIRECTORY")
sys.path.insert(0, working_dir)

In [289]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils.dataset import get_data 
df = get_data()
df.head(20)

Loading data from wines: 8000it [00:00, 22665.70it/s]


Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Pinot noir,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,98.0,0.9929,3.19,0.48,9.2,5
1,Merlot,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,68.0,0.99163,2.96,0.52,11.1,6
2,Chardonnay,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,123.0,0.98949,3.24,0.36,12.6,7
3,Merlot,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,253.0,1.00014,3.02,0.56,9.1,6
4,Merlot,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,150.0,0.99508,3.23,0.37,10.0,6
5,Merlot,7.3,0.34,0.3,1.3,22.403749,1044.95,289.523749,267.12,0.057,25.0,173.0,0.9948,3.26,0.51,9.1,6
6,Merlot,7.6,0.21,0.49,2.5,23.875866,888.61,133.545866,109.67,0.047,20.0,130.0,0.99178,3.15,0.48,11.1,5
7,Chardonnay,6.0,0.25,0.4,5.7,23.309699,1381.79,266.529699,243.22,0.052,56.0,152.0,0.99398,3.16,0.88,10.5,6
8,Cabernet Sauvignon,6.7,0.18,0.19,4.7,49.165745,1456.41,269.915745,220.75,0.046,57.0,161.0,0.9946,3.32,0.66,10.5,6
9,Gamay,7.7,0.28,0.39,8.9,54.450579,929.44,377.690579,323.24,0.036,8.0,117.0,0.9935,3.06,0.38,12.0,2


In [290]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,8000.0,8000.0,7994.0,7992.0,7992.0,7992.0,7992.0,7992.0,7992.0,7992.0,7992.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6.856625,0.278486,0.334929,6.406219,49.324446,934.43747,285.652011,236.327565,0.045875,35.191567,138.605856,0.994046,3.18727,0.49052,10.507502,6.56175
std,0.841727,0.100296,0.120249,5.104317,29.086962,270.488368,76.881459,71.298176,0.02234,17.079654,42.704772,0.003024,0.150369,0.114318,1.227373,8.757213
min,3.8,0.08,0.0,0.6,0.035118,93.0,16.045445,3.0,0.009,2.0,9.0,0.98713,2.72,0.22,8.0,1.0
25%,6.3,0.21,0.27,1.7,24.208416,747.295,233.481329,187.495,0.036,23.0,108.0,0.991727,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,49.078666,931.025,284.649716,235.215,0.043,34.0,135.0,0.99373,3.18,0.48,10.4,6.0
75%,7.3,0.32,0.39,9.9,74.582793,1118.3725,337.955361,284.5425,0.05,46.0,168.0,0.996128,3.28,0.55,11.4,6.0
max,11.8,1.1,1.23,65.8,99.998181,1974.85,576.761262,514.52,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,99.0


In [291]:
df.dtypes

wine type                object
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
magnesium               float64
flavanoids              float64
minerals                float64
calcium                 float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [292]:
#df = df.dropna()

## Pipeline Preparation

In [293]:
numeric_features = df.select_dtypes(include=[np.number])
numeric_features.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'magnesium', 'flavanoids', 'minerals', 'calcium', 'chlorides',
       'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
       'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [294]:
categorical_features = df.select_dtypes(include="object")
categorical_features.columns

Index(['wine type'], dtype='object')

## Data Preprocessing
### Missing Value Handling

In [403]:
from typing import Literal
from sklearn.base import BaseEstimator, TransformerMixin
class CustomSimpleImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean"):
        self.strategy = strategy
    def fit(self, X, y=None):
        if self.strategy == "mean":
            self.fill = X.mean()
        elif self.strategy == "median":
            self.fill = X.median()
        elif self.strategy == "mode":
            self.fill = X.mode()
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
    def set_output(self, *, transform: Literal['default', 'pandas'] | None = None) -> BaseEstimator:
        pass

In [404]:
from sklearn.impute import SimpleImputer

simple_imputer = SimpleImputer(strategy='median')
simple_imputer.fit_transform(df.select_dtypes(include=np.number))
imputed_df = pd.DataFrame(simple_imputer.fit_transform(df.select_dtypes(include=np.number)), columns=df.select_dtypes(include=np.number).columns)
imputed_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,magnesium,flavanoids,minerals,calcium,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6.856625,0.278486,0.334918,6.405012,49.3242,934.434057,285.651009,236.326453,0.045872,35.190375,138.60225,0.994046,3.18727,0.49052,10.507502,6.56175
std,0.841727,0.100296,0.120204,5.101906,29.072414,270.353094,76.84301,71.262522,0.022329,17.071153,42.683564,0.003024,0.150369,0.114318,1.227373,8.757213
min,3.8,0.08,0.0,0.6,0.035118,93.0,16.045445,3.0,0.009,2.0,9.0,0.98713,2.72,0.22,8.0,1.0
25%,6.3,0.21,0.27,1.7,24.233913,747.6425,233.535461,187.59,0.036,23.0,108.0,0.991727,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,49.078666,931.025,284.649716,235.215,0.043,34.0,135.0,0.99373,3.18,0.48,10.4,6.0
75%,7.3,0.32,0.39,9.8625,74.5572,1118.1775,337.930146,284.48,0.05,46.0,168.0,0.996128,3.28,0.55,11.4,6.0
max,11.8,1.1,1.23,65.8,99.998181,1974.85,576.761262,514.52,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,99.0


In [405]:
df.isnull().sum()

wine type               0
fixed acidity           0
volatile acidity        0
citric acid             6
residual sugar          8
magnesium               8
flavanoids              8
minerals                8
calcium                 8
chlorides               8
free sulfur dioxide     8
total sulfur dioxide    8
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Outlier Detection - Z-Score

Funktioniert nur, wenn keines der numerischen Features fehlt. 
Frage: Muss man also erst imputen oder .dropna() machen?
- Idee: Man könnte ja erst mit dem Median imputen und dann die Z-Scores berechnen -> Danach die Outlier entfernen

In [406]:
import scipy.stats as stats
class ZScoreOutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=3):
        self.threshold = threshold
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = pd.DataFrame(X)
        z = np.abs(stats.zscore(X[X.select_dtypes(include=[np.number]).columns]))
        new_X = X[(z < self.threshold).all(axis=1)]
        new_X.columns = X.columns
        return new_X

### (Scaling)

### (Feature and Instance Selection)

In [407]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

## Pipeline Building

### Feature Pipelines

Create a data pipeline for categorical features and a data pipeline for numerical features.

In [408]:
from sklearn.pipeline import Pipeline

numeric_pipeline = Pipeline(steps=[])
categorical_pipeline = Pipeline(steps=[])

In [409]:
numeric_pipeline.steps.append(('imputer', CustomSimpleImputer(strategy='median')))

In [410]:
def one_hot_encoder(df):
    new_df = df.copy()
    new_df = pd.get_dummies(columns=df.select_dtypes(include="object").columns, data=df, prefix=df.columns)
    return new_df
test = one_hot_encoder(df.select_dtypes(include="object"))

categorical_pipeline.steps.append(("pd_one_hot_encoder", FunctionTransformer(one_hot_encoder)))

In [411]:
column_transformer = ColumnTransformer([
    ("numeric_pipeline", numeric_pipeline, numeric_features.columns),
    ("categorical_pipeline", categorical_pipeline, categorical_features.columns)
]).set_output(transform="pandas")

preprocessing_pipeline_with_outlier_detection = Pipeline([
    ("column_transformer", column_transformer),
    ("outlier_detector", ZScoreOutlierDetector()),
])

preprocessing_pipeline_without_outlier_detection = Pipeline([
    ("column_transformer", column_transformer),
])

preprocessing_pipeline_with_outlier_detection = preprocessing_pipeline_with_outlier_detection.fit(df)
preprocessing_pipeline_without_outlier_detection = preprocessing_pipeline_without_outlier_detection.fit(df)



In [414]:
preprocessing_pipeline_with_outlier_detection

In [415]:
preprocessing_pipeline_without_outlier_detection

In [416]:
data_without_outliers = preprocessing_pipeline_with_outlier_detection.transform(df)
data_without_outliers.head()

Unnamed: 0,numeric_pipeline__fixed acidity,numeric_pipeline__volatile acidity,numeric_pipeline__citric acid,numeric_pipeline__residual sugar,numeric_pipeline__magnesium,numeric_pipeline__flavanoids,numeric_pipeline__minerals,numeric_pipeline__calcium,numeric_pipeline__chlorides,numeric_pipeline__free sulfur dioxide,...,numeric_pipeline__density,numeric_pipeline__pH,numeric_pipeline__sulphates,numeric_pipeline__alcohol,numeric_pipeline__quality,categorical_pipeline__wine type_Cabernet Sauvignon,categorical_pipeline__wine type_Chardonnay,categorical_pipeline__wine type_Gamay,categorical_pipeline__wine type_Merlot,categorical_pipeline__wine type_Pinot noir
0,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,...,0.9929,3.19,0.48,9.2,5,0,0,0,0,1
1,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,...,0.99163,2.96,0.52,11.1,6,0,0,0,1,0
2,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,...,0.98949,3.24,0.36,12.6,7,0,1,0,0,0
3,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,...,1.00014,3.02,0.56,9.1,6,0,0,0,1,0
4,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,...,0.99508,3.23,0.37,10.0,6,0,0,0,1,0


In [417]:
preprocessed_data_with_outliers = preprocessing_pipeline_without_outlier_detection.transform(df)
data_with_outliers = pd.DataFrame(preprocessed_data_with_outliers)
data_with_outliers.head()

Unnamed: 0,numeric_pipeline__fixed acidity,numeric_pipeline__volatile acidity,numeric_pipeline__citric acid,numeric_pipeline__residual sugar,numeric_pipeline__magnesium,numeric_pipeline__flavanoids,numeric_pipeline__minerals,numeric_pipeline__calcium,numeric_pipeline__chlorides,numeric_pipeline__free sulfur dioxide,...,numeric_pipeline__density,numeric_pipeline__pH,numeric_pipeline__sulphates,numeric_pipeline__alcohol,numeric_pipeline__quality,categorical_pipeline__wine type_Cabernet Sauvignon,categorical_pipeline__wine type_Chardonnay,categorical_pipeline__wine type_Gamay,categorical_pipeline__wine type_Merlot,categorical_pipeline__wine type_Pinot noir
0,5.8,0.15,0.49,1.1,76.729301,894.94,186.639301,109.91,0.048,21.0,...,0.9929,3.19,0.48,9.2,5,0,0,0,0,1
1,6.6,0.25,0.32,5.6,4.795712,1160.95,251.875712,247.08,0.039,15.0,...,0.99163,2.96,0.52,11.1,6,0,0,0,1,0
2,6.7,0.21,0.34,1.5,85.19371,789.82,304.70371,219.51,0.035,45.0,...,0.98949,3.24,0.36,12.6,7,0,1,0,0,0
3,8.3,0.28,0.27,17.5,11.976525,777.86,237.586525,225.61,0.045,48.0,...,1.00014,3.02,0.56,9.1,6,0,0,0,1,0
4,7.5,0.42,0.19,6.9,5.599673,785.72,95.399673,89.8,0.041,62.0,...,0.99508,3.23,0.37,10.0,6,0,0,0,1,0


## Pipeline Scoring

In [423]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X_without_outliers = data_without_outliers.drop(columns=["numeric_pipeline__quality"])
y_without_outliers = data_without_outliers["numeric_pipeline__quality"]

X_with_outliers = data_with_outliers.drop(columns=["numeric_pipeline__quality"])
y_with_outliers = data_with_outliers["numeric_pipeline__quality"]

X_train_without_outliers, X_test_without_outliers, y_train_without_outliers, y_test_without_outliers = train_test_split(X_without_outliers, y_without_outliers, test_size=0.2, random_state=42)
X_train_with_outliers, X_test_with_outliers, y_train_with_outliers, y_test_with_outliers = train_test_split(X_with_outliers, y_with_outliers, test_size=0.2, random_state=42)


pipeline = Pipeline([
    ("svc", SVC())
])
pipeline_1 = pipeline.fit(X_train_without_outliers, y_train_without_outliers)
pipeline_2 = pipeline.fit(X_train_with_outliers, y_train_with_outliers)

score_without_outliers = pipeline_1.score(X_test_without_outliers, y_test_without_outliers)
score_with_outliers = pipeline_2.score(X_test_with_outliers, y_test_with_outliers)

print(f"score without outliers: {score_without_outliers}")
print(f"score with outliers: {score_with_outliers}")
if score_with_outliers > score_without_outliers:
    print("model works better with outliers")
else:
    print("model works better without outliers")


score without outliers: 0.46785714285714286
score with outliers: 0.431875
model works better without outliers


# Fragen

1. Ist es, in Betracht zu ihrem Test, besser, die Outlier-Detection in eine Pipeline zu integrieren? Macht das grundsätzlich Sinn?
2. Gibt es ein Best-Practice, wie man den Erfolg einer Pipeline misst?
    - Es gibt ja pipeline.score(), sofern ein Modell am Ende der Pipeline ist. Sollte man das dafür nutzen?
3. Macht eine Pipeline, die alle Schritte verarbeitet mehr Sinn als mehrere Pipelines, die jeweils einen Schritt verarbeiten (z.B. eine Pipeline für Imputation, eine für Outlier-Detection, eine für Scaling, eine für Feature-Selection, eine für Instance-Selection)?
4. Wie macht man eine Pipeline, deren schlussendlicher DataFrame auch die Spaltennamen beibehält?