# Data-centric ML pipeline demo
For source code and notebooks, check the [repository](https://github.com/hongsupshin/data_centric_preprocessing).

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import joblib
import time

sys.path.append('../')

from data_centric_preprocessing.create_schema import *
from data_centric_preprocessing.data_schema import *
from data_centric_preprocessing.util import *
from data_centric_preprocessing.config import *

## 1. No mismatch

In [2]:
# Toy training frame: one column per kind of value exercised below
# (boolean, numeric, string, list-valued, entirely missing). The first row
# is missing in every column, and dtype=object keeps the raw Python values.
X_train = pd.DataFrame(
    dict(
        col_bool=[np.nan, True, False],
        col_num=[np.nan, 0.5, 10],
        col_str=[np.nan, "A", "B"],
        col_array=[np.nan, ["A"], ["B"]],
        col_null=[np.nan, np.nan, np.nan],
    ),
    dtype=object,
)

# "No mismatch" scenario: the serving frame starts as a copy of the training frame.
X_serve = X_train.copy(deep=True)

Preprocessing functions for training and serving

In [3]:
def preprocess_train(X_train):
    """Preprocess the training data and fit a preprocessor on it.

    Steps, in order: drop features via ``drop_non_informative_features``
    (seed pattern ``"SEED$"``), build a schema from the remaining data,
    cast the data according to that schema, build a preprocessor from the
    schema, and fit-transform the cast data with it.

    Parameters
    ----------
    X_train
        Training features (the notebook passes a pandas DataFrame).

    Returns
    -------
    tuple
        ``(X_train_transformed, schema_train, preprocessor)``: the
        transformed training data, the schema built from the training
        data, and the fitted preprocessor.
    """
    informative = drop_non_informative_features(X_train, seed_patterns=["SEED$"])

    # Schema-building options come from the project configuration.
    schema_options = dict(
        numpy_dtype_map=numpy_dtype_map,
        custom_dtype_map=custom_dtype_map,
        nominal_num_patterns=nominal_num_patterns,
        catch_invariant_with_missing=True,
    )
    schema_train = build_schema(informative, **schema_options)
    typed = cast_numpy_dtype(informative, schema_train)

    fitted = build_preprocessor(
        schema_train,
        transformer_map=transformer_map,
        preprocessor_steps=preprocessor_steps,
    )
    return fitted.fit_transform(typed), schema_train, fitted

def preprocess_serve(X_serve, schema_train, preprocessor=None):
    """Preprocess serving data so it can go through the fitted preprocessor.

    Steps, in order: drop features via ``drop_non_informative_features``
    (seed pattern ``"SEED$"``), build a schema from the serving data, cast
    the data according to that serving schema, match the columns to
    ``schema_train.index``, apply ``replace_numpy_dtype_mismatch`` using
    both schemas, and finally call ``preprocessor.transform``.

    Parameters
    ----------
    X_serve
        Serving features (the notebook passes a pandas DataFrame).
    schema_train
        Schema returned by ``preprocess_train``; its ``index`` gives the
        columns the serving data is matched to.
    preprocessor : optional
        Fitted preprocessor to apply. If omitted, the module-level name
        ``preprocessor`` is used (as bound by the training cell), which
        keeps existing two-argument calls working. Passing it explicitly
        is preferred because it avoids depending on hidden notebook state.

    Returns
    -------
    tuple
        ``(X_serve_transformed, schema_serve)``. ``schema_serve`` is built
        before the columns are matched to the training schema, so it
        describes the serving data as received.

    Raises
    ------
    NameError
        If ``preprocessor`` is not passed and no module-level
        ``preprocessor`` exists.
    """
    if preprocessor is None:
        # Backward-compatible fallback: the original body read the global
        # `preprocessor` implicitly; look it up explicitly and fail early.
        preprocessor = globals().get("preprocessor")
        if preprocessor is None:
            raise NameError(
                "name 'preprocessor' is not defined; run preprocess_train "
                "first or pass `preprocessor` explicitly"
            )

    X_serve = drop_non_informative_features(X_serve, seed_patterns=["SEED$"])
    schema_serve = build_schema(
        X_serve,
        numpy_dtype_map=numpy_dtype_map,
        custom_dtype_map=custom_dtype_map,
        nominal_num_patterns=nominal_num_patterns,
        catch_invariant_with_missing=True
    )
    X_serve = cast_numpy_dtype(X_serve, schema_serve)
    # Align the serving columns with the columns seen at training time.
    X_serve = match_cols(X_serve, schema_train.index)
    # Handle columns whose serving dtype differs from the training schema.
    X_serve = replace_numpy_dtype_mismatch(schema_train, schema_serve, X_serve)

    X_serve_transformed = preprocessor.transform(X_serve)

    return X_serve_transformed, schema_serve

Training

In [4]:
# Run the training-side preprocessing. The returned `preprocessor` is bound
# at module level here, which is where preprocess_serve looks it up.
X_train_transformed, schema_train, preprocessor = preprocess_train(X_train)
print(X_train_transformed)

[[ 2.    2.    0.5   5.25]
 [ 0.    0.    1.    0.5 ]
 [ 1.    1.    0.   10.  ]]


In [5]:
# Display the schema built from the training data.
schema_train

Unnamed: 0,pandas_dtype,numpy_dtype,custom_dtype
col_bool,boolean,<class 'float'>,numeric
col_num,mixed-integer-float,<class 'float'>,numeric
col_str,string,<class 'str'>,nominal_str
col_array,mixed,<class 'str'>,arrays


Serving

In [6]:
# Serving frame is an unmodified copy of the training frame.
X_serve_transformed, schema_serve = preprocess_serve(X_serve, schema_train)
print(X_serve_transformed)

[[ 2.    2.    0.5   5.25]
 [ 0.    0.    1.    0.5 ]
 [ 1.    1.    0.   10.  ]]


## 2. Shape mismatch

In [7]:
# Shape mismatch: the serving frame carries a column that training never saw.
X_serve = X_train.assign(col_extra=[np.nan, "C", "D"])

In [8]:
# Serve with the extra column present; columns are matched to schema_train.index inside preprocess_serve.
X_serve_transformed, schema_serve = preprocess_serve(X_serve, schema_train)
print(X_serve_transformed)

Columns ['col_extra'] were dropped because they are not in X.


[[ 2.    2.    0.5   5.25]
 [ 0.    0.    1.    0.5 ]
 [ 1.    1.    0.   10.  ]]


In [9]:
# The serving schema is built before columns are matched to training, so it still lists col_extra.
schema_serve

Unnamed: 0,pandas_dtype,numpy_dtype,custom_dtype
col_bool,boolean,<class 'float'>,numeric
col_num,mixed-integer-float,<class 'float'>,numeric
col_str,string,<class 'str'>,nominal_str
col_array,mixed,<class 'str'>,arrays
col_extra,string,<class 'str'>,nominal_str


## 3. Type mismatch

In [10]:
# Type mismatch: col_bool holds strings at serving time instead of booleans.
X_serve = X_train.assign(col_bool=[np.nan, "A", "B"])

In [11]:
# Serve with the string-valued col_bool; preprocess_serve passes both schemas to replace_numpy_dtype_mismatch.
X_serve_transformed, schema_serve = preprocess_serve(X_serve, schema_train)
print(X_serve_transformed)

[[ 2.    2.    0.5   5.25]
 [ 0.    0.    0.5   0.5 ]
 [ 1.    1.    0.5  10.  ]]




In [12]:
# Display the schema built from the serving data (col_bool now holds strings).
schema_serve

Unnamed: 0,pandas_dtype,numpy_dtype,custom_dtype
col_bool,string,<class 'str'>,nominal_str
col_num,mixed-integer-float,<class 'float'>,numeric
col_str,string,<class 'str'>,nominal_str
col_array,mixed,<class 'str'>,arrays


## 4. Shape and type mismatches

In [13]:
# Both mismatches at once: an unseen extra column plus a string-valued col_bool.
X_serve = X_train.assign(
    col_extra=[np.nan, "C", "D"],
    col_bool=[np.nan, "A", "B"],
)

In [14]:
# Serve with both the extra column and the string-valued col_bool present.
X_serve_transformed, schema_serve = preprocess_serve(X_serve, schema_train)
print(X_serve_transformed)

Columns ['col_extra'] were dropped because they are not in X.


[[ 2.    2.    0.5   5.25]
 [ 0.    0.    0.5   0.5 ]
 [ 1.    1.    0.5  10.  ]]




In [15]:
# Display the schema built from the serving data as received (before column matching).
schema_serve

Unnamed: 0,pandas_dtype,numpy_dtype,custom_dtype
col_bool,string,<class 'str'>,nominal_str
col_num,mixed-integer-float,<class 'float'>,numeric
col_str,string,<class 'str'>,nominal_str
col_array,mixed,<class 'str'>,arrays
col_extra,string,<class 'str'>,nominal_str
