In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import FunctionTransformer 

In [2]:
# Load seaborn's "iris" example dataset and preview its first five rows.
df = sns.load_dataset(name="iris")
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Display the frame loaded earlier into `df` rather than calling
# sns.load_dataset("iris") again: the data is identical, and this avoids a
# second dataset read (and a possible network fetch).
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
# Element-wise natural log of every numeric column; np.exp is registered as
# the inverse so the transformation can be undone with inverse_transform.
ft = FunctionTransformer(np.log, inverse_func=np.exp)
ft.transform(df.select_dtypes(include="number"))

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,1.629241,1.252763,0.336472,-1.609438
1,1.589235,1.098612,0.336472,-1.609438
2,1.547563,1.163151,0.262364,-1.609438
3,1.526056,1.131402,0.405465,-1.609438
4,1.609438,1.280934,0.336472,-1.609438
...,...,...,...,...
145,1.902108,1.098612,1.648659,0.832909
146,1.840550,0.916291,1.609438,0.641854
147,1.871802,1.098612,1.648659,0.693147
148,1.824549,1.223775,1.686399,0.832909


# Custom Transformers

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

# Ask scikit-learn to display estimators as diagrams when they are the
# output of a notebook cell.
from sklearn import set_config
set_config(display="diagram")

In [6]:
class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Minimal re-implementation of a standard scaler (centre, then scale).

    Parameters
    ----------
    with_mean : bool, default=True
        When True, ``transform`` subtracts the per-feature mean learned in
        ``fit`` before dividing by the per-feature standard deviation.

    Attributes
    ----------
    mean_ : ndarray of shape (n_features,)
        Per-feature mean computed in ``fit``.
    scale_ : ndarray of shape (n_features,)
        Per-feature standard deviation computed in ``fit``; zero entries are
        replaced by 1.0 so constant features do not cause division by zero.
    n_features_in_ : int
        Number of columns seen in ``fit``.
    feature_names_in_ : ndarray of object
        Column names seen in ``fit``. Only set when ``fit`` received a
        DataFrame whose column names are all strings.
    features_names_in_ : list
        Same column names as a plain list, under the original (misspelled)
        attribute name; kept so existing code reading it keeps working.
        Only set when ``fit`` received a DataFrame.
    """

    def __init__(self, with_mean=True):
        # Store the hyperparameter exactly as passed: hard-coding True here
        # made with_mean=False a silent no-op and broke get_params/clone.
        self.with_mean = with_mean

    def fit(self, X, y=None):
        """Learn the per-feature mean and standard deviation of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; validated with ``check_array``.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : StandardScalerClone
            The fitted estimator.
        """
        # Drop names left over from a previous fit so that refitting on a
        # plain array does not keep stale column names around.
        for stale in ("feature_names_in_", "features_names_in_"):
            if hasattr(self, stale):
                delattr(self, stale)
        if isinstance(X, pd.DataFrame):
            self.features_names_in_ = X.columns.to_list()
            if all(isinstance(col, str) for col in X.columns):
                self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        X = check_array(X)
        # All learned attributes must end with a trailing underscore.
        self.mean_ = X.mean(axis=0)
        scale = X.std(axis=0)
        # A constant feature has zero spread; scale it by 1 instead of 0.
        scale[scale == 0] = 1.0
        self.scale_ = scale
        self.n_features_in_ = X.shape[1]
        return self   # always return self

    def transform(self, X):
        """Centre (optionally) and scale ``X`` with the learned statistics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to scale; must have as many columns as the data seen in
            ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            The scaled data.

        Raises
        ------
        ValueError
            If the number of columns differs from ``n_features_in_``.
        """
        check_is_fitted(self)
        X = check_array(X)
        # Without this check a single-column input would silently broadcast
        # against the learned statistics instead of failing.
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but StandardScalerClone "
                f"is expecting {self.n_features_in_} features as input."
            )
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_

In [63]:
# Fit the custom scaler on the numeric columns only; fit() returns the
# estimator itself, which becomes the cell output.
sc = StandardScalerClone()
sc.fit(df.select_dtypes(include="number"))

StandardScalerClone()

In [65]:
# Inspect what fit() recorded: the input column names and the column count.
sc.features_names_in_, sc.n_features_in_

(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], 4)

In [None]:
# Transformation Pipelines

In [81]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [69]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize each column.
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)

In [71]:
# Display the pipeline object as the cell output.
num_pipeline

In [75]:
# Fit the numeric pipeline and transform the numeric columns in one call,
# then peek at the first two rows of the resulting array.
imputed_sc = num_pipeline.fit_transform(df.select_dtypes(include="number"))
imputed_sc[:2]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ]])

In [76]:
# Output column names produced by the fitted numeric pipeline.
num_pipeline.get_feature_names_out()

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
      dtype=object)

In [77]:
# Wrap the transformed array back into a DataFrame, labelling the columns
# with the names reported by the pipeline, and preview the first five rows.
df_sc = pd.DataFrame(
    data=imputed_sc,
    columns=num_pipeline.get_feature_names_out(),
)
df_sc.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [78]:
# Index into the pipeline to get its first step (the "impute" SimpleImputer).
num_pipeline[0]

In [79]:
# Second step of the pipeline (the "standardize" StandardScaler).
num_pipeline[1]

In [80]:
from sklearn.compose import ColumnTransformer, make_column_transformer

In [85]:
# Split the column names by dtype: numeric columns go through num_pipeline,
# object (string) columns through cat_pipeline.
num_feature = df.select_dtypes(include="number").columns.to_list()
cat_feature = df.select_dtypes(include="object").columns.to_list()

# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder()),
    ]
)

In [86]:
# Route each group of columns to its own pipeline; the names "num" and "cat"
# become the prefixes of the output feature names.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_feature),
        ("cat", cat_pipeline, cat_feature),
    ]
)

In [87]:
# Fit the column transformer on the full frame and return the transformed data.
preprocessing.fit_transform(df)

array([[-0.90068117,  1.01900435, -1.34022653, ...,  1.        ,
         0.        ,  0.        ],
       [-1.14301691, -0.13197948, -1.34022653, ...,  1.        ,
         0.        ,  0.        ],
       [-1.38535265,  0.32841405, -1.39706395, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.79566902, -0.13197948,  0.8195957 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.4321654 ,  0.78880759,  0.93327055, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.06866179, -0.13197948,  0.76275827, ...,  0.        ,
         0.        ,  1.        ]])

In [88]:
# Output column names, prefixed with the name of the transformer that produced them.
preprocessing.get_feature_names_out()

array(['num__sepal_length', 'num__sepal_width', 'num__petal_length',
       'num__petal_width', 'cat__species_setosa',
       'cat__species_versicolor', 'cat__species_virginica'], dtype=object)

In [89]:
# Input column names the column transformer saw during fit.
preprocessing.feature_names_in_

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'], dtype=object)

In [91]:
# Refit and transform, then label the resulting columns with the
# transformer's output feature names for a readable DataFrame.
pd.DataFrame(
    data=preprocessing.fit_transform(df),
    columns=preprocessing.get_feature_names_out(),
)

Unnamed: 0,num__sepal_length,num__sepal_width,num__petal_length,num__petal_width,cat__species_setosa,cat__species_versicolor,cat__species_virginica
0,-0.900681,1.019004,-1.340227,-1.315444,1.0,0.0,0.0
1,-1.143017,-0.131979,-1.340227,-1.315444,1.0,0.0,0.0
2,-1.385353,0.328414,-1.397064,-1.315444,1.0,0.0,0.0
3,-1.506521,0.098217,-1.283389,-1.315444,1.0,0.0,0.0
4,-1.021849,1.249201,-1.340227,-1.315444,1.0,0.0,0.0
...,...,...,...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832,0.0,0.0,1.0
146,0.553333,-1.282963,0.705921,0.922303,0.0,0.0,1.0
147,0.795669,-0.131979,0.819596,1.053935,0.0,0.0,1.0
148,0.432165,0.788808,0.933271,1.448832,0.0,0.0,1.0
