In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# ## class template for building a pipeline component ##

# class class_name(BaseEstimator, TransformerMixin):
#     def __init__(self):
        
#     def fit(self, X, y=None):
#         return self
    
#     def predict(self, X):
#         result = ...
#         return result

#     def transform(self, X):
#         result = ...
#         return result

In [5]:
class numeric_filtering(BaseEstimator, TransformerMixin):
    """Drop uninformative numeric columns, addressed by positional index.

    During ``fit`` two kinds of columns are flagged for removal:

    * ``constant_col`` -- columns whose standard deviation is exactly 0.
    * ``id_col`` -- columns whose consecutive differences take a single
      value (e.g. a running row id 1, 2, 3, ...). A constant column also
      satisfies this test (all differences are 0), so it appears in both
      lists.

    Attributes set by ``fit``:
        constant_col: indices of zero-variance columns.
        id_col: indices of columns with a single distinct consecutive difference.
        rm_cols: sorted, de-duplicated union of the two lists above.
        final_cols: indices of the columns kept by ``transform``, in order.

    Caveat: with exactly two rows every column has a single difference and
    is therefore flagged as an id column.
    """

    def fit(self, X, y=None):
        """Learn which column indices to keep.

        Args:
            X: 2-D array-like of numeric values, shape (n_rows, n_columns).
            y: ignored; accepted for pipeline compatibility.

        Returns:
            self
        """
        # Coerce so positional slicing works for DataFrames / nested lists too.
        X = np.asarray(X)
        n_cols = X.shape[1]

        self.constant_col = [i for i in range(n_cols) if X[:, i].std() == 0]
        self.id_col = [
            i for i in range(n_cols)
            if len(np.unique(np.diff(X[:, i]))) == 1
        ]

        # A set removes the overlap between the two lists and gives O(1) lookups.
        dropped = set(self.constant_col) | set(self.id_col)
        self.rm_cols = sorted(dropped)
        self.final_cols = [i for i in range(n_cols) if i not in dropped]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Args:
            X: 2-D array-like with the same column layout seen in ``fit``.

        Returns:
            numpy array of shape (n_rows, len(final_cols)).
        """
        return np.asarray(X)[:, self.final_cols]

In [6]:
class categorical_filtering(BaseEstimator, TransformerMixin):
    """Drop uninformative categorical columns, addressed by positional index.

    During ``fit`` the number of distinct values is counted per column and
    three kinds of columns are flagged for removal:

    * ``constant_col`` -- exactly one distinct value.
    * ``id_col`` -- as many distinct values as there are rows.
    * ``cardinality`` -- more than ``max_cardinality`` distinct values.

    Args:
        max_cardinality: largest number of distinct values a column may have
            and still be kept. Defaults to 50.

    Attributes set by ``fit``:
        constant_col, id_col, cardinality: the index lists described above.
        rm_cols: sorted, de-duplicated union of the three lists.
        final_cols: indices of the columns kept by ``transform``, in order.
    """

    def __init__(self, max_cardinality=50):
        # Stored under the same name as the argument so BaseEstimator's
        # get_params / set_params / clone can round-trip it.
        self.max_cardinality = max_cardinality

    def fit(self, X, y=None):
        """Learn which column indices to keep.

        Args:
            X: 2-D array-like of category values, shape (n_rows, n_columns).
            y: ignored; accepted for pipeline compatibility.

        Returns:
            self
        """
        # Coerce so positional slicing works for DataFrames / nested lists too.
        X = np.asarray(X)
        n_rows, n_cols = X.shape

        # Count distinct values once per column instead of once per rule.
        n_distinct = [len(np.unique(X[:, i])) for i in range(n_cols)]

        self.constant_col = [i for i, k in enumerate(n_distinct) if k == 1]
        self.id_col = [i for i, k in enumerate(n_distinct) if k == n_rows]
        self.cardinality = [
            i for i, k in enumerate(n_distinct) if k > self.max_cardinality
        ]

        # The rules overlap (an id column on a large table is also
        # high-cardinality), so merge through a set to avoid duplicates.
        dropped = set(self.constant_col) | set(self.id_col) | set(self.cardinality)
        self.rm_cols = sorted(dropped)
        self.final_cols = [i for i in range(n_cols) if i not in dropped]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Args:
            X: 2-D array-like with the same column layout seen in ``fit``.

        Returns:
            numpy array of shape (n_rows, len(final_cols)).
        """
        return np.asarray(X)[:, self.final_cols]

In [7]:
import numpy as np
import pandas as pd
import pickle as pkl

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from catboost.datasets import titanic


In [8]:
## numeric branch: mean-impute -> drop constant/id columns -> standardize ##

pipe1 = Pipeline(
    steps=[
        ('num_step1', SimpleImputer(strategy="mean")),
        ('num_step2', numeric_filtering()),
        ('num_step3', StandardScaler()),
    ]
)

In [9]:
## categorical branch: mode-impute -> drop constant/id/high-cardinality columns -> one-hot ##

pipe2 = Pipeline([
    ('cat_step1', SimpleImputer(strategy="most_frequent") ),
    ('cat_step2', categorical_filtering() ),
    # handle_unknown="ignore": a category that was not present at fit time is
    # encoded as all zeros instead of raising when the saved pipeline is used
    # to predict on new data.
    ('cat_step3', OneHotEncoder(handle_unknown="ignore")  ),
])

In [10]:
## Column transformer: route each column to a branch by dtype ##
# Numeric dtypes go through pipe1; every other dtype goes through pipe2.

transform = ColumnTransformer(
    transformers=[
        ('num', pipe1, make_column_selector(dtype_include=np.number)),
        ('cat', pipe2, make_column_selector(dtype_exclude=np.number)),
    ]
)

In [11]:
## model pipeline: preprocessing followed by the classifier ##

pipe0 = Pipeline([
    ('transform',  transform     ),
    # Fixed seed so that refitting (e.g. Restart & Run All) reproduces the
    # same forest and therefore the same predictions.
    ('model',      RandomForestClassifier(random_state=42)    )
])

In [15]:
# Load the training table; the path is resolved relative to the current working directory.
# NOTE(review): presumably the Kaggle Titanic training set (a "Survived" column is used as the target below) -- confirm.
train = pd.read_csv("../train.csv")

In [16]:
# Target column name, and every other column of `train` as a feature.
ycol = "Survived"
xcol = [name for name in train.columns if name != ycol]

In [17]:
# Fit the whole pipeline (preprocessing and classifier) on the feature columns against the target column.
pipe0.fit(train[xcol],train[ycol])

In [18]:
# Predict on the same rows the pipeline was fitted on: these are in-sample
# predictions, so they do not measure performance on unseen data.
pipe0.predict(train[xcol])

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [19]:
# Persist the fitted pipeline to ml_pipeline.pkl in the current working directory.
# NOTE: pickle stores classes by reference, so numeric_filtering and
# categorical_filtering must be importable under the same module path wherever
# this file is loaded. Only unpickle files that come from a trusted source.
with open("ml_pipeline.pkl","wb") as f:
    pkl.dump(pipe0,f)