In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, np.nan)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    return bank_mkt

In [4]:
def min_max_scale(df):
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler.fit_transform(df.select_dtypes("number"))

In [5]:
def baseline_benchmark(df):
    """
    Feed transfromed dataframe with no missing values.
    """
    shuffled_df = df.sample(frac=1).reset_index(drop=True)
    bank_x = shuffled_df.drop(["duration", "y"], axis=1)
    min_max_scale(bank_x)
    bank_x = pd.get_dummies(bank_x, drop_first=True).values
    bank_y = shuffled_df["y"].astype(int).values
    nb_model = GaussianNB()
    tree_clf = tree.DecisionTreeClassifier(class_weight="balanced")
    rf_clf = RandomForestClassifier(class_weight="balanced")
    scoring = ["f1", "accuracy"]
    scores = cross_validate(nb_model, bank_x, bank_y, scoring=scoring, cv=5, return_train_score=True)
    result = pd.DataFrame(scores)
    return result.mean()

## Transform education

In [6]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
bank_mkt["education"].value_counts()

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: education, dtype: int64

In [7]:
# Treat education as grades
def trans_edu(df):
    education_grade = {"university.degree": 16,
                   "high.school": 12,
                   "professional.course": 12,
                   "basic.9y": 9,
                   "basic.4y": 4,
                   "basic.6y": 6,
                   "illiterate": 0}
    df["education"] = df["education"].replace(education_grade)

## Fill Missing Values as Unknown

In [8]:
def fill_unknown(df, feature_list):
    df = df.copy()
    for feature in feature_list:
        if pd.api.types.is_categorical_dtype(df[feature]):
            df[feature] = df[feature].cat.add_categories("unknown")
            df[feature] = df[feature].fillna("unknown")
        elif pd.api.types.is_bool_dtype(bank_mkt[feature]):
            df[feature] = df[feature].astype("category")
            df[feature] = df[feature].cat.add_categories("unknown")
            df[feature] = df[feature].fillna("unknown")
    return df

In [9]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
category_list = ["marital", "education", "default", "job", "housing", "loan"]
bank_mkt = fill_unknown(bank_mkt, category_list)
bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
bank_mkt["poutcome"] = bank_mkt["poutcome"].fillna(False)
baseline_benchmark(bank_mkt)

fit_time          0.059572
score_time        0.016776
test_f1           0.429805
train_f1          0.430581
test_accuracy     0.870788
train_accuracy    0.870897
dtype: float64

In [10]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
category_list = ["marital", "education", "default", "job", "housing", "loan"]
bank_mkt = fill_unknown(bank_mkt, category_list)
bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
bank_mkt["poutcome"] = bank_mkt["poutcome"].fillna(False)
education_grade = {"university.degree": 16,
                   "professional.course": 16,
                   "high.school": 12,
                   "unknown": 10,
                   "basic.9y": 9,
                   "basic.6y": 6,
                   "basic.4y": 4,
                   "illiterate": 0}
bank_mkt["education"] = bank_mkt["education"].replace(education_grade).astype("Int64")
baseline_benchmark(bank_mkt)

fit_time          0.050785
score_time        0.013010
test_f1           0.430620
train_f1          0.430084
test_accuracy     0.870618
train_accuracy    0.870521
dtype: float64

## Fill Missing Values by Imputation

### Most Frequent

In [11]:
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [12]:
bank_mkt[["default", "housing", "loan", "poutcome"]].describe()

Unnamed: 0,default,housing,loan,poutcome
count,41188,41188,41188,41188
unique,3,3,3,2
top,False,True,False,False
freq,32588,21576,33950,39815


In [13]:
bank_mkt["pdays"].describe()

count    41188.000000
mean       962.475454
std        186.910907
min          0.000000
25%        999.000000
50%        999.000000
75%        999.000000
max        999.000000
Name: pdays, dtype: float64

In [14]:
def fill_freq(bank_mkt):
    # Add new type
#     bank_mkt["ptype"] = pd.NA
#     bank_mkt.loc[bank_mkt["pdays"].isna() & bank_mkt["poutcome"].isna(), "ptype"] = "new"
#     bank_mkt.loc[bank_mkt["pdays"].isna() & bank_mkt["poutcome"].notna(), "ptype"] = "lost"
#     bank_mkt.loc[bank_mkt["pdays"].notna() & bank_mkt["poutcome"].notna(), "ptype"] = "previous"
    # Fill with most frequent data
    bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
    bank_mkt[["default", "loan", "poutcome"]] = bank_mkt[["default", "loan", "poutcome"]].fillna(False)
    bank_mkt["housing"] = bank_mkt["housing"].fillna(True)
    df = bank_mkt[["job", "marital", "education"]]
    imp = SimpleImputer(strategy="most_frequent")
    bank_mkt[["job", "marital", "education"]] = imp.fit_transform(df)

In [19]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
bank_mkt[bank_mkt["job"].isna()].age

29       55
35       55
73       57
91       57
144      38
         ..
40370    59
40428    64
40656    67
41005    63
41108    64
Name: age, Length: 330, dtype: Int64

In [26]:
bank_mkt[bank_mkt["job"] == "retired"].age.describe()

count    1720.000000
mean       62.027326
std        10.493293
min        23.000000
25%        56.000000
50%        59.000000
75%        69.000000
max        98.000000
Name: age, dtype: float64

In [16]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
fill_freq(bank_mkt)
baseline_benchmark(bank_mkt)

fit_time          0.051946
score_time        0.014076
test_f1           0.430063
train_f1          0.430596
test_accuracy     0.872002
train_accuracy    0.872269
dtype: float64

In [90]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
fill_freq(bank_mkt)
education_grade = {"university.degree": 6,
                   "professional.course": 6,
                   "high.school": 5,
                   "basic.9y": 4,
                   "basic.6y": 3,
                   "basic.4y": 2,
                   "illiterate": 0}
bank_mkt["education"] = bank_mkt["education"].replace(education_grade).astype("Int64")
baseline_benchmark(bank_mkt)

fit_time          2.180267
score_time        0.124848
test_f1           0.373589
train_f1          0.971932
test_accuracy     0.891400
train_accuracy    0.993511
dtype: float64

## Mix Unknown and Freq

In [91]:
def fill_mix(bank_mkt):
    # Fill with most frequent data
    bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
    bank_mkt[["default", "loan", "poutcome"]] = bank_mkt[["default", "loan", "poutcome"]].fillna(False)
    bank_mkt["housing"] = bank_mkt["housing"].fillna(True)
    df = bank_mkt[["job", "education"]]
    imp = SimpleImputer(strategy="most_frequent")
    bank_mkt[["job", "education"]] = imp.fit_transform(df)
    # Fill marital as unknown
    bank_mkt["marital"] = bank_mkt["marital"].cat.add_categories("unknown")
    bank_mkt["marital"] = bank_mkt["marital"].fillna("unknown")

In [100]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
fill_mix(bank_mkt)
baseline_benchmark(bank_mkt)

fit_time          2.203759
score_time        0.125677
test_f1           0.369754
train_f1          0.972594
test_accuracy     0.891546
train_accuracy    0.993669
dtype: float64

In [99]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
fill_mix(bank_mkt)
education_grade = {"university.degree": 6,
                   "professional.course": 6,
                   "high.school": 5,
                   "basic.9y": 4,
                   "basic.6y": 3,
                   "basic.4y": 2,
                   "illiterate": 0}
bank_mkt["education"] = bank_mkt["education"].replace(education_grade).astype("Int64")
baseline_benchmark(bank_mkt)

fit_time          2.205433
score_time        0.124501
test_f1           0.371182
train_f1          0.972479
test_accuracy     0.890866
train_accuracy    0.993645
dtype: float64

## Iterative Imputer

In [148]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge

bank_mkt = import_dataset("../data/BankMarketing.csv")
bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
bank_mkt["poutcome"] = bank_mkt["poutcome"].fillna(False)
bank_mkt["job"] = bank_mkt["job"].cat.codes
bank_mkt["marital"] = bank_mkt["marital"].cat.codes
bank_mkt["default"] = bank_mkt["default"].astype("float")
bank_mkt["housing"] = bank_mkt["housing"].astype("float")
bank_mkt["loan"] = bank_mkt["loan"].astype("float")
education_grade = {"university.degree": 6,
                   "professional.course": 6,
                   "high.school": 5,
                   "basic.9y": 4,
                   "basic.6y": 3,
                   "basic.4y": 2,
                   "illiterate": 0}
bank_mkt["education"] = bank_mkt["education"].replace(education_grade).astype("float")
X = bank_mkt[["age", "job", "education", "default", "housing", "loan"]].fillna(np.nan)
imputer = IterativeImputer(max_iter=100, estimator=DecisionTreeRegressor())
bank_mkt[["age", "job", "education", "default", "housing", "loan"]] = imputer.fit_transform(X)
bank_mkt = bank_mkt.astype(dtype={"job": "category",
                                  "marital": "category",
                                  "education": "category"})
baseline_benchmark(bank_mkt)

fit_time          0.483089
score_time        0.115229
test_f1           0.202894
train_f1          0.206718
test_accuracy     0.130475
train_accuracy    0.136308
dtype: float64

In [129]:
bank_mkt = import_dataset("../data/BankMarketing.csv")
bank_mkt["pdays"] = bank_mkt["pdays"].fillna(999)
bank_mkt["poutcome"] = bank_mkt["poutcome"].fillna(False)
bank_mkt["job"] = bank_mkt["job"].cat.codes
bank_mkt["marital"] = bank_mkt["marital"].cat.codes
bank_mkt["default"] = bank_mkt["default"].astype("float")
bank_mkt["housing"] = bank_mkt["housing"].astype("float")
bank_mkt["loan"] = bank_mkt["loan"].astype("float")
education_grade = {"university.degree": 6,
                   "professional.course": 6,
                   "high.school": 5,
                   "basic.9y": 4,
                   "basic.6y": 3,
                   "basic.4y": 2,
                   "illiterate": 0}
bank_mkt["education"] = bank_mkt["education"].replace(education_grade).astype("float")

In [134]:
from sklearn.impute import KNNImputer
X = bank_mkt[["age", "job", "education", "default", "housing", "loan"]].fillna(np.nan)
min_max_scale(X)
imputer = KNNImputer(missing_values=np.nan, n_neighbors=5, weights="uniform")
bank_mkt[["age", "job", "education", "default", "housing", "loan"]] = imputer.fit_transform(X).astype("int")
bank_mkt.astype(dtype={"job": "category",
                       "marital": "category",
                       "education": "category"})

In [141]:
baseline_benchmark(bank_mkt)

fit_time          0.036999
score_time        0.009882
test_f1           0.420050
train_f1          0.419619
test_accuracy     0.881834
train_accuracy    0.881762
dtype: float64