## Preprocessing pipelines

In [1]:
import os
import sys
import joblib
import pandas as pd

from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.compose import ColumnTransformer

# Have every scikit-learn transformer return pandas DataFrames instead of
# NumPy arrays.
set_config(transform_output="pandas")

# Put the `../transformers` directory (resolved against the current working
# directory) at the front of the import path so that the project-local
# `column_drop` module can be imported.
sys.path.insert(0, os.path.abspath(os.path.join("..", "transformers")))
# NOTE(review): the star import presumably supplies DropMissing,
# DropLowVarianceCategorical and DropHighCardinality used further down --
# confirm, and consider importing those names explicitly.
from column_drop import *

In [2]:
# Dataset locations, expressed relative to the current working directory.
data_dir = os.path.join(".", "..", "data")
train_dir, test_dir = (os.path.join(data_dir, split) for split in ("train", "test"))

# Serialized pipelines are written here; the folder is created if it is missing.
pipeline_dir = os.path.join(".", "..", "pipelines")
os.makedirs(pipeline_dir, exist_ok=True)

## Preprocessing pipelines

In this part we build the preprocessing pipeline, which performs the following steps:
* dropping irrelevant columns
* imputing missing values
* encoding categorical features
* scaling


In [3]:
# Stage 1: the three project-local column-dropping transformers, chained in
# this order.
drop_cols_pipeline = Pipeline(
    steps=[
        ("missing_drop", DropMissing()),
        ("drop_low_variance", DropLowVarianceCategorical()),
        ("drop_cardinality", DropHighCardinality()),
    ]
)

# Numeric columns: fill missing values with the column median.
median_imputer = SimpleImputer(strategy="median")
numeric_pipeline = Pipeline(steps=[("numeric_imputer", median_imputer)])

# Object columns: integer-encode the categories first, then fill the values
# the encoder leaves missing with the most frequent code. Ordinal codes keep
# one output column per feature; the stated rationale is that the many unique
# categorical values would otherwise greatly increase the dimensionality.
# Categories that were not seen during fit are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
mode_imputer = SimpleImputer(strategy="most_frequent")
categorical_pipeline = Pipeline(
    steps=[
        ("encoding", ordinal_encoder),
        ("categorical_imputer", mode_imputer),
    ]
)

# Stage 2: route columns to the matching sub-pipeline by dtype. Columns that
# are neither object nor numeric are discarded (remainder="drop").
object_columns = make_column_selector(dtype_include="object")
number_columns = make_column_selector(dtype_include="number")
transformer = ColumnTransformer(
    transformers=[
        ("cat_pipe", categorical_pipeline, object_columns),
        ("num_pipe", numeric_pipeline, number_columns),
    ],
    remainder="drop",
    n_jobs=-1,
)

# Full pipeline for X: drop columns -> impute/encode -> min-max scale.
preprocessing_pipeline = Pipeline(
    steps=[
        ("drop", drop_cols_pipeline),
        ("column_transform", transformer),
        ("scale", MinMaxScaler()),
    ]
)

## Saving pipeline

In [4]:
# Write the pipeline to disk before any fitting happens in this notebook, so
# that it can later be reloaded from `pipeline_path`.
pipeline_path = os.path.join(pipeline_dir, "preprocessing_pipeline_raw.joblib")
joblib.dump(preprocessing_pipeline, pipeline_path)

['./../pipelines/preprocessing_pipeline_raw.joblib']

## Fitting pipelines

In [5]:
# Fit one pipeline per dataset (indices 0-4): transform the train and test
# splits, write the processed frames next to the input CSVs, and persist the
# fitted pipeline under its dataset index.
for i in range(5):
    print(f"Running for dataset {i}")

    # Start every iteration from the pipeline stored at `pipeline_path`
    # instead of reusing the object fitted in the previous iteration.
    preprocessing_pipeline = joblib.load(pipeline_path)

    # Train split: the only data the pipeline is fitted on.
    train_csv = os.path.join(train_dir, f"X{i}_train.csv")
    X_train = pd.read_csv(train_csv)
    X_train_processed = preprocessing_pipeline.fit_transform(X_train)
    print("\tFitted pipeline.")
    train_processed_csv = os.path.join(train_dir, f"X{i}_train_processed.csv")
    X_train_processed.to_csv(train_processed_csv, index=False)
    print("\tSaved X_train.")

    # Test split: transformed with the already-fitted pipeline, never refitted.
    test_csv = os.path.join(test_dir, f"X{i}_test.csv")
    X_test = pd.read_csv(test_csv)
    X_test_processed = preprocessing_pipeline.transform(X_test)
    test_processed_csv = os.path.join(test_dir, f"X{i}_test_processed.csv")
    X_test_processed.to_csv(test_processed_csv, index=False)
    print("\tSaved X_test.")

    # Keep the fitted pipeline for this dataset alongside the raw one.
    fitted_pipeline_path = os.path.join(
        pipeline_dir, f"preprocessing_pipeline_{i}.joblib"
    )
    joblib.dump(preprocessing_pipeline, fitted_pipeline_path)
    print("\tSaved pipeline.")

Running for dataset 0
	Fitted pipeline.
	Saved X_train.
	Saved X_test.
	Saved pipeline.
Running for dataset 1
	Fitted pipeline.
	Saved X_train.
	Saved X_test.
	Saved pipeline.
Running for dataset 2
	Fitted pipeline.
	Saved X_train.
	Saved X_test.
	Saved pipeline.
Running for dataset 3
	Fitted pipeline.
	Saved X_train.
	Saved X_test.
	Saved pipeline.
Running for dataset 4
	Fitted pipeline.
	Saved X_train.
	Saved X_test.
	Saved pipeline.
