# Data Preparation

## Import Data

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option("max_colwidth", None)

In [2]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [3]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

## Partition

We need to split the dataset into trainning set and test set, then we train models on the trainning set and only use test set for final validation purposes. However, simply sampling the dataset may lead to unrepresenatative partition given that our dataset is imbalanced and clients have different features. Luckily, `scikit-learn` provides a useful function to select representative data as test data.

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

## Handling Missing Data


We have several strategies to handle the missing values. For categorical data, we can either treat missing value as a different category or impute them as the most frequent value.

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
cat_features = ["job", "marital", "education"]
X = bank_train_set.drop(["duration", "y"], axis=1)
X_cat = X[cat_features]
freq_imp = SimpleImputer(strategy="most_frequent")
freq_imp.fit_transform(X_cat)

array([['blue-collar', 'married', 'basic.9y'],
       ['entrepreneur', 'married', 'university.degree'],
       ['retired', 'married', 'basic.4y'],
       ...,
       ['admin.', 'married', 'basic.9y'],
       ['admin.', 'married', 'university.degree'],
       ['admin.', 'married', 'university.degree']], dtype=object)

In [8]:
X_cat = X[cat_features]
fill_imp = SimpleImputer(strategy="constant", fill_value="unknown")
fill_imp.fit_transform(X_cat)

array([['blue-collar', 'married', 'basic.9y'],
       ['entrepreneur', 'married', 'university.degree'],
       ['retired', 'married', 'basic.4y'],
       ...,
       ['admin.', 'married', 'basic.9y'],
       ['admin.', 'married', 'university.degree'],
       ['admin.', 'married', 'university.degree']], dtype=object)

Missing values in boolean data is more tricky and requires `pandas` to transform the data first because `SimpleImputer` can not fill nullable boolean data.

In [9]:
bool_features=["default", "housing", "loan"]
X_bool = X[bool_features].astype("category")
freq_imp.fit_transform(X_bool)

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       ...,
       [False, False, True],
       [False, False, False],
       [False, False, True]], dtype=object)

In [10]:
X_bool = X[bool_features].astype("category")
fill_imp.fit_transform(X_bool)

array([['unknown', False, False],
       [False, False, False],
       [False, False, False],
       ...,
       [False, False, True],
       [False, False, False],
       [False, False, True]], dtype=object)

As discussed above, some clients do not have `pdays` but have `poutcome`, which implies that they may have been contacted before but the `pdays` is more than 30 days therefore not inluded. `pdays` can also be cut into different categories which is known as the discretization process.

In [11]:
X_pdays = X["pdays"]
X_pdays[X["pdays"].isna() & X["poutcome"].notna()] = 999
pd.cut(X_pdays, [0, 5, 10, 15, 30, 1000], labels=["pdays<=5", "pdays<=10", "pdays<=15", "pdays<=30", "pdays>30"], include_lowest=True)

0             NaN
1        pdays>30
2             NaN
3             NaN
4             NaN
           ...   
32945         NaN
32946    pdays>30
32947         NaN
32948         NaN
32949         NaN
Name: pdays, Length: 32950, dtype: category
Categories (5, object): ['pdays<=5' < 'pdays<=10' < 'pdays<=15' < 'pdays<=30' < 'pdays>30']

## Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

`education`, `month` and `day_of_week` are ordinal data. We can say `basic.6y` is more "advanced" than `basic.4y` for example. Therefore, we should encode `education` into ordinal values or transform them into years of `education`. The same logic also goes for `month` and `day_of_week`. Even though `sklearn` has its own `OrdinalEncoder`, it is using alphabatical order therefore we use pandas instead.

In [13]:
ord_features = ["education", "month", "day_of_week"]
X_ord = X[ord_features]
X_ord.apply(lambda x: x.cat.codes)

Unnamed: 0,education,month,day_of_week
0,3,8,2
1,6,8,2
2,1,3,0
3,6,4,0
4,6,2,1
...,...,...,...
32945,4,3,1
32946,5,8,4
32947,3,3,0
32948,6,4,4


We will also need `OneHotEncoder` to transform categorical data into multiple binary data.

In [14]:
one_hot_features = ["job", "marital", "default", "housing", "loan"]
one_hot_encoder = OneHotEncoder(drop="first")
X_one_hot = X[one_hot_features].astype("category")
X_one_hot = freq_imp.fit_transform(X_one_hot)
one_hot_encoder.fit_transform(X_one_hot)
one_hot_encoder.get_feature_names(one_hot_features)

array(['job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_married', 'marital_single', 'default_True',
       'housing_True', 'loan_True'], dtype=object)

This can also be done in `pandas`. The advantage of doing one hot encoding in `pandas` is that `pd.get_dummies()` can keep missing values as a row of `0`.

## Transformation Pipeline

We can then wrap all our transformations above into pipeline.

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [16]:
def pdays_transformation(X):
    """Feature Engineering `pdays`."""
    X = X.copy()
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=["<=5", "<=10", "<=15", "<=30", ">30"], include_lowest=True)
    return X

def ordinal_transformation(X, education=None):
    """Encode ordinal labels.

    education: if education is "year", education column will be encoded into years of eductaion.
    """
    X = X.copy()
    ordinal_features = ["education", "month", "day_of_week"]
    X[ordinal_features] = X[ordinal_features].apply(lambda x: x.cat.codes)
    if education=="year":
        education_map = { 0: 0, # illiterate
                          1: 4, # basic.4y
                          2: 6, # basic.6y
                          3: 9, # basic.9y
                          4: 12, # high.school
                          5: 15, # professional course
                          6: 16} # university
        X["education"] = X["education"].replace(education_map)
    return X

def bool_transformation(X):
    """Transform boolean data into categorical data."""
    X = X.copy()
    bool_features = ["default", "housing", "loan", "poutcome"]
    X[bool_features] = X[bool_features].astype("category")
    X[bool_features] = X[bool_features].replace({True: "true", False: "false"})
    return X

cut_transformer = FunctionTransformer(pdays_transformation)

ordinal_transformer = FunctionTransformer(ordinal_transformation)

bool_transformer = FunctionTransformer(bool_transformation)

In [17]:
freq_features = ["job", "marital", "education"]

fill_features = ["housing", "loan", "default", "pdays", "poutcome"]

one_hot_features = ["contact"]

freq_transformer = Pipeline([
    ("freq_imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoder", OneHotEncoder(drop="first", handle_unknown="error"))
])

fill_transformer = Pipeline([
    ("freq_imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("one_hot_encoder", OneHotEncoder(drop="first", handle_unknown="error"))
])

cat_transformer = ColumnTransformer([
    ("freq_imputer", freq_transformer, freq_features),
    ("fill_imputer", fill_transformer, fill_features),
    ("one_hot_encoder", OneHotEncoder(drop="first", handle_unknown="error"), one_hot_features)
], remainder="passthrough")

preprocessor = Pipeline([
    ("bool_transformer", bool_transformer),
    ("cut_transformer", cut_transformer),
    ("ordinal_transformer", ordinal_transformer),
    ("cat_transformer", cat_transformer),
    ("scaler", StandardScaler())
])

In [18]:
X_train = preprocessor.fit_transform(bank_train_set.drop(["duration", "y"], axis=1))
y_train = bank_train_set["y"].astype("int").to_numpy()

## Baseline Benchmark

In [19]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [20]:
scoring = ["f1", "precision", "recall", "roc_auc"]
# Initialize Model
nb_model = GaussianNB()
logit_model = LogisticRegression(class_weight="balanced")
knn_model = KNeighborsClassifier(n_neighbors=5)
# Train model and get CV results 
nb_cv = cross_validate(nb_model, X_train, y_train, scoring=scoring, cv = 5)
logit_cv = cross_validate(logit_model, X_train, y_train, scoring=scoring, cv = 5)
knn_cv = cross_validate(knn_model, X_train, y_train, scoring=scoring, cv = 5)
# Calculate CV result mean
nb_result = pd.DataFrame(nb_cv).mean().rename("Naive Bayes")
logit_result = pd.DataFrame(logit_cv).mean().rename("Logistic Regression")
knn_result = pd.DataFrame(knn_cv).mean().rename("KNN")
# Store and output result
result = pd.concat([nb_result, logit_result, knn_result], axis=1)
result

Unnamed: 0,Naive Bayes,Logistic Regression,KNN
fit_time,0.013109,0.086483,0.60494
score_time,0.011415,0.007786,10.609435
test_f1,0.311337,0.419526,0.335483
test_precision,0.199409,0.312006,0.507897
test_recall,0.771852,0.640625,0.25054
test_roc_auc,0.750778,0.781366,0.706039


In [21]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [22]:
X_test = preprocessor.transform(bank_test_set.drop(["duration", "y"], axis=1))
y_test = bank_test_set["y"].astype("int").to_numpy()
# Initialize and fit Model
dummy_model = DummyClassifier(strategy="prior").fit(X_train, y_train)
nb_model = GaussianNB().fit(X_train, y_train)
logit_model = LogisticRegression(class_weight="balanced").fit(X_train, y_train)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Predict and calculate score
dummy_predict = dummy_model.predict(X_test)
dummy_f1 = f1_score(y_test, dummy_predict)
dummy_precision = precision_score(y_test, dummy_predict)
dummy_recall = recall_score(y_test, dummy_predict)
dummy_roc_auc = roc_auc_score(y_test, dummy_predict)
nb_predict = nb_model.predict(X_test)
nb_f1 = f1_score(y_test, nb_predict)
nb_precision = precision_score(y_test, nb_predict)
nb_recall = recall_score(y_test, nb_predict)
nb_roc_auc = roc_auc_score(y_test, nb_predict)
logit_predict = logit_model.predict(X_test)
logit_f1 = f1_score(y_test, logit_predict)
logit_precision = precision_score(y_test, logit_predict)
logit_recall = recall_score(y_test, logit_predict)
logit_roc_auc = roc_auc_score(y_test, logit_predict)
knn_predict = knn_model.predict(X_test)
knn_f1 = f1_score(y_test, knn_predict)
knn_precision = precision_score(y_test, knn_predict)
knn_recall = recall_score(y_test, knn_predict)
knn_roc_auc = roc_auc_score(y_test, knn_predict)
# Store and output result
result = pd.DataFrame(data={"Dummy Classifier": [dummy_f1, dummy_precision, dummy_recall, dummy_roc_auc],
                            "Naive Bayes": [nb_f1, nb_precision, nb_recall, nb_roc_auc],
                            "Logistic Regression": [logit_f1, logit_precision, logit_recall, logit_roc_auc],
                            "KNN": [knn_f1, knn_precision, knn_recall, knn_roc_auc]},
                       index=["F1 Score", "Precision Score", "Recall Score", "ROC AUC Score"])
result

Unnamed: 0,Dummy Classifier,Naive Bayes,Logistic Regression,KNN
F1 Score,0.0,0.345568,0.44007,0.372325
Precision Score,0.0,0.224894,0.326552,0.550633
Recall Score,0.0,0.74569,0.674569,0.28125
ROC AUC Score,0.5,0.709712,0.748981,0.626056
