## Preprocessing pipelines

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
import pandas as pd
import os
import sys
import importlib
import joblib
sys.path.insert(0, os.path.abspath(os.path.join("..", "transformers")))
from column_drop import *

## Preprocessing pipelines

In this part we build the preprocessing steps and chain them into a single pipeline:
* dropping irrelevant columns
* imputing missing values
* encoding categorical features
* scaling


In [2]:
# Stage 1 - column dropping.
# The three Drop* transformers are not defined in this notebook; presumably
# they come from the star-import of `column_drop` in the imports cell
# (TODO confirm).
drop_cols_pipeline = Pipeline(
    steps=[
        ('missing_drop', DropMissing()),
        ('drop_low_variance', DropLowVarianceCategorical()),
        ('drop_cardinality', DropHighCardinality()),
    ]
)

# Stage 2a - categorical columns.
# Ordinal encoding is used (rather than one-hot) because the categorical
# features have many unique values and one-hot encoding would greatly
# increase dimensionality. Categories not seen during fit are encoded as -1.
# Imputation runs *after* encoding, so the most-frequent fill operates on the
# encoded values.
categorical_pipeline = Pipeline(
    steps=[
        ('encoding', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
    ]
)

# Stage 2b - numeric columns: median imputation only.
numeric_pipeline = Pipeline(
    steps=[
        ('numeric_imputer', SimpleImputer(strategy='median')),
    ]
)

# Route columns to the matching sub-pipeline by dtype: `object` columns go to
# the categorical branch, `number` columns to the numeric branch. Columns
# matched by neither selector are dropped (remainder='drop').
transformer = ColumnTransformer(
    transformers=[
        ('cat_pipe', categorical_pipeline, make_column_selector(dtype_include='object')),
        ('num_pipe', numeric_pipeline, make_column_selector(dtype_include='number')),
    ],
    remainder='drop',
    n_jobs=-1,
)

# Final pipeline for X: drop columns -> per-dtype transform -> min-max scale.
# The scaler is applied to everything the column transformer outputs,
# including the ordinal codes of the categorical features.
preprocessing_pipeline = Pipeline(
    steps=[
        ('drop', drop_cols_pipeline),
        ('column_transform', transformer),
        ('scale', MinMaxScaler()),
    ]
)

## Saving pipeline

In [3]:
# Target location for the serialized pipeline. The file path is derived from
# `pipeline_folder` so the folder that gets created and the folder that gets
# written to cannot drift apart.
pipeline_folder = '../pipelines'
pipeline_path = f"{pipeline_folder}/preprocessing_pipeline.joblib"

# Report only when the folder is newly created. `exist_ok=True` keeps the
# creation from raising if the folder appears between the check and the call.
if not os.path.exists(pipeline_folder):
    os.makedirs(pipeline_folder, exist_ok=True)
    print(f"Folder '{pipeline_folder}' został utworzony.")

# NOTE(review): the pipeline is dumped as constructed above; no `fit` call is
# visible in this notebook, so it is presumably fitted after loading - confirm.
# Loading the file will also need the custom Drop* classes to be importable.
joblib.dump(preprocessing_pipeline, pipeline_path)

Folder '../pipelines' został utworzony.


['../pipelines/preprocessing_pipeline.joblib']