# Data Preparation

## Import Data

In [2]:
import numpy as np
import pandas as pd
# Show every column and the full content of each cell when displaying DataFrames.
# The fully-qualified option names are used on purpose: `set_option` treats its
# argument as a pattern, and a short pattern such as "max_columns" can match more
# than one option in newer pandas releases, which raises an OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [7]:
def import_dataset(filename):
    """Read the bank marketing CSV and return a cleaned, typed DataFrame.

    Parameters
    ----------
    filename : str, path object or file-like object
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        De-duplicated data with a fresh ``RangeIndex`` where

        * ``"unknown"`` and ``"nonexistent"`` are read as missing values,
        * ``"yes"``/``"success"`` are read as ``True`` and ``"no"``/``"failure"``
          as ``False``,
        * ``pdays == 999`` is treated as missing,
        * ``age``, ``duration``, ``campaign``, ``pdays`` and ``previous`` use the
          nullable ``Int64`` dtype,
        * ``default``, ``housing``, ``loan``, ``poutcome`` and ``y`` use the
          nullable ``boolean`` dtype,
        * ``job``, ``marital``, ``education``, ``contact``, ``month`` and
          ``day_of_week`` are categoricals, the last-mentioned three being
          ordered (``education`` by level, ``month`` and ``day_of_week``
          chronologically).

    Raises
    ------
    ValueError
        If the categories present in ``education``, ``month`` or
        ``day_of_week`` are not exactly the expected ones (raised by
        ``reorder_categories``).
    """
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # Give the ordinal columns an explicit, meaningful category order.
    education_order = ["illiterate", "basic.4y", "basic.6y", "basic.9y",
                       "high.school", "professional.course", "university.degree"]
    # Calendar order: "may" must sit between "apr" and "jun", otherwise every
    # ordered comparison / category code involving may, jun or jul is wrong.
    month_order = ["mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    day_order = ["mon", "tue", "wed", "thu", "fri"]
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(education_order, ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(month_order, ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(day_order, ordered=True)
    return bank_mkt

In [8]:
# Load and clean the raw campaign data; the path is relative to the notebook's working directory.
bank_mkt = import_dataset("../data/BankMarketing.csv")

## Partition

We need to split the dataset into a training set and a test set; we then train models on the training set and use the test set only for final validation. However, simple random sampling may lead to an unrepresentative partition, given that our dataset is imbalanced and clients have different features. Luckily, `scikit-learn` provides a useful class, `StratifiedShuffleSplit`, to select representative data as test data.

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

In [10]:
# A single stratified 80/20 split with a fixed seed so the partition is reproducible.
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# `split` yields integer *positions*, so rows are selected with `.iloc`; label-based
# `.loc` would only be correct while the frame's index happens to be 0..n-1.
# With n_splits=1 the loop body runs exactly once.
for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.iloc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.iloc[test_index].reset_index(drop=True)

## Encoding and Scaling

Even though `sklearn` can process a `pd.DataFrame`, more complicated data transformations only work well when the dataframe is encoded as numbers. Missing values will be encoded as `-1`. All boolean and categorical values need to be converted to integers first. For ordinal data like `education`, `month` and `day_of_week`, we can say that `basic.6y` is more "advanced" than `basic.4y`, for example. Therefore, we should encode `education` into ordinal values or transform it into years of education. The same logic also applies to `month` and `day_of_week`.

As discussed above, some clients do not have `pdays` but do have `poutcome`, which implies that they may have been contacted before but `pdays` is not included. We use `pdays=999` for such clients. `pdays` also needs to be cut into different categories, a process known as discretization.

We create a function `cat_encode()` to perform these transformations, and then apply one-hot encoding and standardization to the dataset using pipelines. We first create a `basic_preprocessor` pipeline which doesn't do any feature engineering or ordinal encoding.

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [7]:
def cat_encode(X, education=None, pdays=None):
    """Encode categorical and boolean columns as integers; missing values become -1.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain the categorical columns ``job``, ``education``,
        ``marital``, ``contact``, ``day_of_week`` and ``month``, the
        boolean columns ``default``, ``housing``, ``loan`` and ``poutcome``,
        and ``pdays``. The input is not modified.
    education : str, optional
        If ``"year"``, the ``education`` codes are converted into years of
        education (this relies on ``education`` being an ordered categorical
        running from ``illiterate`` to ``university.degree``).
    pdays : str, optional
        If ``"cut"``, rows with a missing ``pdays`` but a known ``poutcome``
        get ``pdays = 999`` and ``pdays`` is then discretized into the bins
        ``[0, 5], (5, 10], (10, 15], (15, 30], (30, 1000]`` labelled 1 to 5.

    Returns
    -------
    pd.DataFrame
        An encoded copy of ``X``.

    Raises
    ------
    ValueError
        If ``month`` contains a missing value or a month outside ``mar``..``dec``
        (the integer cast fails).
    """
    X = X.copy()
    # If pdays is "cut", pdays column will be feature engineered and discretized.
    if pdays == "cut":
        # A known previous outcome means the client was contacted before, even
        # though the number of days since that contact was not recorded.
        contacted_without_pdays = X["pdays"].isna() & X["poutcome"].notna()
        X.loc[contacted_without_pdays, "pdays"] = 999
        X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding calendar number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    # Map on plain objects rather than calling `replace` on the categorical:
    # replacing categories with values outside the category set is deprecated
    # in recent pandas versions.
    X["month"] = X["month"].astype(object).map(month_map).astype("int")
    # Other categorical features will be coded as its order in pandas categorical index
    cat_features = ["job", "education", "marital", "contact", "day_of_week"]
    bool_features = ["default", "housing", "loan", "poutcome"]
    # Assign column by column so each column is replaced outright (new dtype)
    # instead of being written into the existing categorical/boolean column.
    for column in cat_features:
        X[column] = X[column].cat.codes.astype("Int64")
    for column in bool_features:
        X[column] = X[column].astype("Int64")
    # Fill missing values as -1
    X = X.fillna(-1)
    # If education is "year", education column will be encoded into years of education.
    if education == "year":
        education_map = { 0: 0, # illiterate
                          1: 4, # basic.4y
                          2: 6, # basic.6y
                          3: 9, # basic.9y
                          4: 12, # high.school
                          5: 15, # professional course
                          6: 16} # university
        # The missing-value code -1 is not in the map and is therefore kept.
        X["education"] = X["education"].replace(education_map)
    return X

# Step 1: integer-encode the frame with `cat_encode` using its defaults
# (no education-in-years conversion, no pdays discretization).
basic_encoder = FunctionTransformer(func=cat_encode)

# Columns that are one-hot encoded by the basic pipeline.
cat_features = [
    "job", "marital", "education", "default", "housing",
    "loan", "month", "day_of_week", "pdays", "poutcome",
]

# Step 2: one-hot encode the columns above; every other column passes through unchanged.
basic_transformer = ColumnTransformer(
    transformers=[
        ("one_hot_encoder", OneHotEncoder(drop="first", sparse=False), cat_features),
    ],
    remainder="passthrough",
)

# Step 3: standardize all resulting columns.
basic_preprocessor = Pipeline(
    steps=[
        ("basic_encoder", basic_encoder),
        ("basic_transformer", basic_transformer),
        ("scaler", StandardScaler()),
    ]
)

basic_preprocessor

## Handling Missing Data


We have several strategies to handle the missing values. For categorical data, we can treat missing value as a different category (-1), which is done by `cat_encode`.

`SimpleImputer` allows us to use simple strategies, such as the mean value, to fill in the missing values. In our case, because most of our missing values are categorical, it makes more sense to use the most frequent value. This strategy and the feature engineering on `pdays` are applied in `freq_preprocessor`.

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

In [9]:
# Encoded columns that receive categorical treatment in this pipeline.
cat_features = [
    "job", "marital", "education", "default",
    "housing", "loan", "pdays", "poutcome",
]

# Nominal columns: impute the most frequent code, then one-hot encode.
freq_nom_features = ["job", "marital", "default", "housing", "loan"]

# Ordinal columns: impute the most frequent code and keep the integer code.
freq_ord_features = ["education"]

# The rest of `cat_features` is one-hot encoded directly, without imputation.
one_hot_features = [
    feature
    for feature in cat_features
    if feature not in freq_nom_features + freq_ord_features
]

# Integer-encode the frame and discretize `pdays`.
cut_encoder = FunctionTransformer(func=cat_encode, kw_args={"pdays": "cut"})

freq_nom_imputer = Pipeline(
    steps=[
        ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(drop="first", sparse=False)),
    ]
)

freq_ord_imputer = Pipeline(
    steps=[
        ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent")),
    ]
)

freq_transformer = ColumnTransformer(
    transformers=[
        ("freq_nom_imputer", freq_nom_imputer, freq_nom_features),
        ("freq_ord_imputer", freq_ord_imputer, freq_ord_features),
        ("one_hot_encoder", OneHotEncoder(drop="first", sparse=False), one_hot_features),
    ],
    remainder="passthrough",
)

# Encode -> impute / one-hot -> standardize.
freq_preprocessor = Pipeline(
    steps=[
        ("cut_encoder", cut_encoder),
        ("freq_transformer", freq_transformer),
        ("scaler", StandardScaler()),
    ]
)

freq_preprocessor

We can also try using machine learning algorithms to impute the missing values. In `sklearn`, this can be achieved with the experimental `IterativeImputer`. Here we assume that client data is independent of contact and previous campaign data, so we only use basic client data to impute missing values. This method, along with the feature engineering on `pdays`, is demonstrated by `ite_preprocessor`.

In [10]:
# Integer-encode the frame and discretize `pdays`.
cut_encoder = FunctionTransformer(func=cat_encode, kw_args={"pdays": "cut"})

# Client columns that the iterative imputer sees (and imputes).
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan"]

# Column positions in the array produced by `ite_transformer`, which places
# `ite_features` first and the passthrough columns after them.
# NOTE(review): these positions depend on the column order of the input frame;
# verify them if the input columns change.
one_hot_features = [1, 2, 11, 13] # ["job", "marital", "pdays", "poutcome"]

# Impute the -1 codes, then round the estimates back to whole numbers.
ite_imputer = Pipeline(
    steps=[
        (
            "ite_imputer",
            IterativeImputer(
                max_iter=100,
                missing_values=-1,
                initial_strategy="most_frequent",
                random_state=42,
            ),
        ),
        ("ite_round", FunctionTransformer(func=np.round)),
    ]
)

ite_transformer = ColumnTransformer(
    transformers=[("ite_imputer", ite_imputer, ite_features)],
    remainder="passthrough",
)

one_hot_encoder = ColumnTransformer(
    transformers=[
        ("one_hot_encoder", OneHotEncoder(drop="first", sparse=False), one_hot_features),
    ],
    remainder="passthrough",
)

# Encode -> impute -> one-hot -> standardize.
ite_preprocessor = Pipeline(
    steps=[
        ("cut_encoder", cut_encoder),
        ("ite_transformer", ite_transformer),
        ("one_hot_encoder", one_hot_encoder),
        ("scaler", StandardScaler()),
    ]
)

ite_preprocessor

In [11]:
# Feature frames: everything except the target `y` and the `duration` column.
X_train = bank_train_set.drop(columns=["duration", "y"])
X_test = bank_test_set.drop(columns=["duration", "y"])
X_train.to_pickle("../data/X_train.pkl")
X_test.to_pickle("../data/X_test.pkl")

# Targets as plain integer NumPy arrays.
y_train = bank_train_set["y"].astype("int").to_numpy()
y_test = bank_test_set["y"].astype("int").to_numpy()
np.save("../data/y_train.npy", y_train)
np.save("../data/y_test.npy", y_test)

names = ["basic", "freq", "ite"]
preprocessors = [basic_preprocessor, freq_preprocessor, ite_preprocessor]

# Fit every preprocessor on the training features only, apply it to both
# splits, and store the resulting arrays under the preprocessor's name.
for name, preprocessor in zip(names, preprocessors):
    X_train_pre = preprocessor.fit_transform(X_train)
    X_test_pre = preprocessor.transform(X_test)
    np.save(f"../data/X_train_{name}.npy", X_train_pre)
    np.save(f"../data/X_test_{name}.npy", X_test_pre)