In [None]:
from sklearn.preprocessing import  OneHotEncoder
import pandas as pd
import numpy as np
import joblib

# Placeholder string used later to represent a missing categorical value.
CAT_NA_VALUE = "__MISSING__"
# Show every row when printing long Series (one row per feature below).
pd.set_option('display.max_rows', None)


# Read only the first 1000 rows to discover column names and dtypes cheaply.
# NOTE(review): dtypes inferred from this sample are forced onto the full file
# below; a column that looks integer here but contains NaN further down would
# make the chunked read fail -- confirm against the full dataset.
df = pd.read_csv("data/bank_data_train.csv", nrows=1000)

dtypes_dict      = df.dtypes.to_dict()
categorical_cols = df.select_dtypes(exclude="number").columns.to_list()
# Numeric feature columns, leaving out the "ID" and "TARGET" columns.
numerical_cols   = [
    c
    for c in df.select_dtypes(include='number').columns.to_list()
    if c not in ("ID", "TARGET")
]

# Running per-column NaN tally, one float slot for every column in the file.
nan_counts       = pd.Series(0.0, index=df.columns.to_list())

del df

# Stream the full file in 100k-row chunks so it never has to fit in memory.
samples = 0
chunks = pd.read_csv(
    "data/bank_data_train.csv",
    chunksize=100_000,
    dtype=dtypes_dict,
)

for chunk in chunks:
    samples    += len(chunk)
    nan_counts += chunk.isna().sum()

# Turn the raw counts into percentages of the total row count.
nan_counts = nan_counts.div(samples).mul(100)
# Keep only the columns whose missing share is at most 40 percent.
kept_cols  = nan_counts.loc[nan_counts.le(40)]
print("NaN percentage of each feature")
print(nan_counts)

NaN percentage of each feature
ID                              0.000000
CR_PROD_CNT_IL                  0.000000
AMOUNT_RUB_CLO_PRC             10.789437
PRC_ACCEPTS_A_EMAIL_LINK       56.315493
APP_REGISTR_RGN_CODE           82.952786
PRC_ACCEPTS_A_POS              56.315493
PRC_ACCEPTS_A_TK               56.315493
TURNOVER_DYNAMIC_IL_1M          0.000000
CNT_TRAN_AUT_TENDENCY1M        78.289929
SUM_TRAN_AUT_TENDENCY1M        78.289929
AMOUNT_RUB_SUP_PRC             10.789437
PRC_ACCEPTS_A_AMOBILE          56.315493
SUM_TRAN_AUT_TENDENCY3M        68.734480
CLNT_TRUST_RELATION            80.455249
PRC_ACCEPTS_TK                 56.315493
PRC_ACCEPTS_A_MTP              56.315493
REST_DYNAMIC_FDEP_1M            0.000000
CNT_TRAN_AUT_TENDENCY3M        68.734480
CNT_ACCEPTS_TK                 56.315493
APP_MARITAL_STATUS             80.789437
REST_DYNAMIC_SAVE_3M            0.000000
CR_PROD_CNT_VCU                 0.000000
REST_AVG_CUR                    0.000000
CNT_TRAN_MED_TENDENCY1M   

In [None]:
# Build and persist the preprocessing artifact: per-column mean/std for the
# kept numerical features and one (unfitted) OneHotEncoder per kept
# categorical feature. Relies on `kept_cols`, `numerical_cols`,
# `categorical_cols`, `dtypes_dict`, `samples` and `CAT_NA_VALUE` defined by
# the previous cell.

# Restrict the feature lists to columns that passed the missing-value filter.
# A set gives O(1) membership tests instead of rebuilding a list per column.
kept_names    = set(kept_cols.index)
kept_num_cols = [n for n in numerical_cols if n in kept_names]
kept_cat_cols = [n for n in categorical_cols if n in kept_names]
print("Kept feature")
print(kept_cols)

# `kept_cols` is deliberately not deleted: it is a tiny Series, and deleting
# it made this cell raise NameError when executed a second time.

chunks = pd.read_csv(
    "data/bank_data_train.csv",
    dtype=dtypes_dict,
    chunksize=100_000,
)

# Every categorical column always gets the missing-value marker as a category.
unique_categories = {cat: {CAT_NA_VALUE} for cat in kept_cat_cols}

# Streaming accumulators for sum(x) and sum(x^2), one slot per numeric column.
sum_x   = pd.Series(0.0, index=kept_num_cols)
sum_x2  = pd.Series(0.0, index=kept_num_cols)
for chunk in chunks:
    # ---- categorical stats ----
    for col in kept_cat_cols:
        unique_categories[col].update(
            chunk[col].fillna(CAT_NA_VALUE).astype(str).unique()
        )

    # ---- numerical stats ----
    # Missing numeric values are counted as 0.0, so the statistics below
    # describe the zero-imputed data. The chunk itself is left unmodified.
    num_block = chunk[kept_num_cols].fillna(0.0)
    sum_x  += num_block.sum(axis=0)
    sum_x2 += num_block.pow(2).sum(axis=0)


# Population mean / variance from the accumulated sums; `samples` is the total
# row count computed by the previous cell.
mean     = sum_x / samples
# E[x^2] - E[x]^2 can come out marginally negative through floating-point
# round-off for (near-)constant columns; clip so sqrt never yields NaN.
variance = ((sum_x2 / samples) - (mean ** 2)).clip(lower=0.0)
std_dev  = np.sqrt(variance)
# NOTE(review): a constant column gives std 0 -- whoever divides by "std"
# downstream needs to guard against that.

# ---- encoders ----
# OneHotEncoder expects `categories` to be a list holding one array-like per
# input feature, not a bare set. Each encoder handles a single column, so the
# (sorted, hence deterministic) category list is wrapped in a one-element list.
# NOTE(review): handle_unknown is left at its default; categories unseen in
# this file will raise at transform time -- confirm that is intended.
encoders = [
    OneHotEncoder(categories=[sorted(unique_categories[cat])])
    for cat in kept_cat_cols
]


# Artifact layout is unchanged: "mean" is a NumPy array while "std" remains a
# pandas Series indexed by column name (kept as-is for existing consumers).
preprocessor = {
    "mean": mean.to_numpy(),
    "std": std_dev,
    "encoders": encoders,
    "kept_num_cols": kept_num_cols,
    "kept_cat_cols": kept_cat_cols,
    "dtypes": dtypes_dict,
}

joblib.dump(preprocessor, "preprocessor.joblib")
print("preprocessing saved!")

Kept feature
ID                           0.000000
CR_PROD_CNT_IL               0.000000
AMOUNT_RUB_CLO_PRC          10.789437
TURNOVER_DYNAMIC_IL_1M       0.000000
AMOUNT_RUB_SUP_PRC          10.789437
REST_DYNAMIC_FDEP_1M         0.000000
REST_DYNAMIC_SAVE_3M         0.000000
CR_PROD_CNT_VCU              0.000000
REST_AVG_CUR                 0.000000
AMOUNT_RUB_NAS_PRC          10.789437
TRANS_COUNT_SUP_PRC         10.789437
TRANS_COUNT_NAS_PRC         10.789437
CR_PROD_CNT_TOVR             0.000000
CR_PROD_CNT_PIL              0.000000
TURNOVER_CC                  0.000000
TRANS_COUNT_ATM_PRC         10.789437
AMOUNT_RUB_ATM_PRC          10.789437
TURNOVER_PAYM                0.000000
AGE                          0.000000
CR_PROD_CNT_CC               0.000000
REST_DYNAMIC_FDEP_3M         0.000000
REST_DYNAMIC_IL_1M           0.000000
CR_PROD_CNT_CCFP             0.000000
REST_DYNAMIC_CUR_1M          0.000000
REST_AVG_PAYM                0.000000
LDEAL_GRACE_DAYS_PCT_MED     0.000000