In [3]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

def import_dataset(filename):
    """Read the bank marketing CSV and cast its columns to explicit dtypes.

    While parsing, the strings "unknown" and "nonexistent" are read as
    missing values, "yes"/"success" as True and "no"/"failure" as False.
    A ``pdays`` value of 999 is then replaced by a missing value before the
    columns are cast.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed data with nullable integer ("Int64"), "category" and
        nullable "boolean" columns.
    """
    integer_columns = ("age", "duration", "campaign", "pdays", "previous")
    category_columns = ("job", "marital", "education",
                        "contact", "month", "day_of_week")
    boolean_columns = ("default", "housing", "loan", "poutcome", "y")

    # "Int64" (capital I) is the nullable integer dtype, so columns that
    # contain missing values can still hold integers.
    column_types = {
        **dict.fromkeys(integer_columns, "Int64"),
        **dict.fromkeys(category_columns, "category"),
        **dict.fromkeys(boolean_columns, "boolean"),
    }

    raw = pd.read_csv(
        filename,
        na_values=["unknown", "nonexistent"],
        true_values=["yes", "success"],
        false_values=["no", "failure"],
    )
    # pdays == 999 is treated as a missing value rather than a real count.
    raw["pdays"] = raw["pdays"].replace(999, np.nan)
    return raw.astype(dtype=column_types)

def fill_unknown(df, feature_list):
    """Return a copy of ``df`` with missing values labelled "unknown".

    For every name in ``feature_list``:

    * a categorical column gets an extra "unknown" category and its missing
      values are filled with it;
    * a boolean column is first converted to categorical and then treated
      the same way;
    * a column of any other dtype is left untouched.

    The input frame is not modified. Calling the function again on its own
    output is safe: the "unknown" category is only added when absent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``feature_list``.
    feature_list : iterable of str
        Column names to process.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the processed columns are categorical.
    """
    df = df.copy()
    for feature in feature_list:
        column = df[feature]
        # Check categorical first: is_bool_dtype may also report True for a
        # categorical whose categories are booleans.
        if not isinstance(column.dtype, pd.CategoricalDtype):
            # Inspect the frame passed in, not a module-level variable.
            if not pd.api.types.is_bool_dtype(column):
                continue
            column = column.astype("category")
        if "unknown" not in column.cat.categories.tolist():
            column = column.cat.add_categories("unknown")
        df[feature] = column.fillna("unknown")
    return df

def min_max_scale(df):
    """Return a copy of ``df`` with its numeric columns min-max scaled.

    Numeric columns (as selected by ``select_dtypes("number")``) are passed
    through scikit-learn's ``MinMaxScaler``; all other columns are kept as
    they are. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    scaled_df = df.copy()
    numeric_columns = scaled_df.select_dtypes("number").columns
    if len(numeric_columns) == 0:
        # Nothing to scale; MinMaxScaler rejects input with zero features.
        return scaled_df
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_values = min_max_scaler.fit_transform(scaled_df[numeric_columns])
    # Assign column by column so each column is replaced outright instead of
    # being cast back into its original (possibly integer) dtype.
    for position, column in enumerate(numeric_columns):
        scaled_df[column] = scaled_values[:, position]
    return scaled_df


def baseline_benchmark(df, model=None):
    """
    Cross-validate a baseline classifier on a transformed dataframe.

    Feed a transformed dataframe with no missing values. The rows are
    shuffled, the "duration" and "y" columns are removed from the features,
    numeric features are min-max scaled, and categorical features are
    one-hot encoded (dropping the first level). The "y" column, cast to
    int, is the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing at least the "duration" and "y" columns.
    model : estimator, optional
        Any scikit-learn compatible classifier. Defaults to ``GaussianNB()``
        when omitted.

    Returns
    -------
    pandas.Series
        Mean over the 5 folds of every entry returned by ``cross_validate``
        (fit/score times plus train and test F1 and accuracy).
    """
    if model is None:
        model = GaussianNB()
    shuffled_df = df.sample(frac=1).reset_index(drop=True)
    bank_x = shuffled_df.drop(["duration", "y"], axis=1)
    # Keep the scaled frame; the scaler returns a new object.
    # NOTE(review): the scaler is fitted on all rows before the folds are
    # split; confirm this is acceptable for a baseline.
    bank_x = min_max_scale(bank_x)
    bank_x = pd.get_dummies(bank_x, drop_first=True).values
    bank_y = shuffled_df["y"].astype(int).values
    scoring = ["f1", "accuracy"]
    scores = cross_validate(model, bank_x, bank_y, scoring=scoring, cv=5,
                            return_train_score=True)
    result = pd.DataFrame(scores)
    return result.mean()

# Columns whose missing values get an explicit "unknown" label.
category_list = ["marital", "education", "default", "job", "housing", "loan"]

# Load the data first, then label the missing values in the listed columns.
bank_mkt = import_dataset("../data/BankMarketing.csv")
bank_mkt = fill_unknown(bank_mkt, category_list)

# Fill the remaining gaps: pdays gets 999 and poutcome gets False.
bank_mkt = bank_mkt.assign(
    pdays=bank_mkt["pdays"].fillna(999),
    poutcome=bank_mkt["poutcome"].fillna(False),
)

# Run the baseline benchmark; the bare expression displays the result when
# run as a notebook cell.
result = baseline_benchmark(bank_mkt)
result

fit_time          0.059709
score_time        0.015532
test_f1           0.428933
train_f1          0.430663
test_accuracy     0.870788
train_accuracy    0.870891
dtype: float64