# Data Preparation and Feature Engineering

In [14]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.fixes import loguniform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Import

In [15]:
def import_dataset(filename):
    """
    Import the dataset from the path.

    Parameters
    ----------
        filename : str
            filename with path

    Returns
    -------
        data : DataFrame

    Examples
    --------
        bank_mkt = import_dataset("../data/BankMarketing.csv")
    """
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [16]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

## Partition

In [17]:
def split_dataset(data, preprocessor, random_state=82):
    """
    Split dataset into train, test and validation sets using preprocessor.
    Because the random state of validation set is not specified, the validation set will be different each time when the function is called.

    Parameters
    ----------
        data : DataFrame

        preprocessor : Pipeline

    Returns
    -------
        datasets : tuple

    Examples
    --------
        from sklearn.preprocessing import OrdinalEncoder
        data = import_dataset("../data/BankMarketing.csv").interpolate(method="pad").loc[:, ["job", "education", "y"]]
        # To unpack all train, test, and validation sets 
        X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack train and test sets.
        X_train, y_train, X_test, y_test, *other_sets = split_dataset(data, OrdinalEncoder())
        # To unpack test and validation set
        *other_sets, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack only train set.
        X_train, y_train, *other_sets = split_dataset(data, OneHotEncoder())
    """
    train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
    for train_index, test_index in train_test_split.split(data.drop("y", axis=1), data["y"]):
        train_set = data.loc[train_index].reset_index(drop=True)
        test_set = data.loc[test_index].reset_index(drop=True)

    y_train = train_set["y"].astype("int").to_numpy()
    y_test = test_set["y"].astype("int").to_numpy()
    X_train = preprocessor.fit_transform(train_set, y_train)
    X_test = preprocessor.transform(test_set)
        
    train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    for ttrain_index, validate_index in train_validate_split.split(X_train, y_train):
        ttrain_set = train_set.loc[ttrain_index].reset_index(drop=True)
        validate_set = train_set.loc[validate_index].reset_index(drop=True)
    
    y_ttrain = ttrain_set["y"].astype("int").to_numpy()
    y_validate = validate_set["y"].astype("int").to_numpy()
    X_ttrain = preprocessor.fit_transform(ttrain_set, y_ttrain)
    X_validate = preprocessor.transform(validate_set)
    
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        X_ttrain = X_ttrain.to_numpy()
        X_validate = X_validate.to_numpy()

    return (X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate)

## Baseline

In [18]:
def benchmark(data, preprocessor, clf):
    """
    Benchmark preprocessor and clf's performance on train, validation and test sets. 
    All the data transformation should be handled by preprocessor and estimation should be handled by clf.
    
    Parameters
    ----------
        data : DataFrame
        
        preprocessor : Pipeline
        
        clf : estimator
        
        name : str, default = None
        
        compare_to: DataFrame, default = None
        
    """
    X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, preprocessor)
    X_sets = [X_ttrain, X_validate, X_test]
    y_sets = [y_ttrain, y_validate, y_test]
    
    metric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "AP"]
    set_names = ["Train", "Validate", "Test"]
    metric_df = pd.DataFrame(index=metric_names, columns=set_names)
    
    try:
        clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
    except (ValueError, TypeError):
        clf.fit(X_ttrain, y_ttrain)
        
    for name, X, y in zip(set_names, X_sets, y_sets):
        # Re-fit model on train set before test set evaluation except CatBoost
        if name == "Test":
            try:
                clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
            except (ValueError, TypeError):
                clf.fit(X_train, y_train)
                
        y_pred = clf.predict(X)
        
        try:
            y_score = clf.decision_function(X)
        except AttributeError:
            y_score = clf.predict_proba(X)[:, 1]
            
        metrics = [recall_score(y, y_pred, pos_label=0),
                   recall_score(y, y_pred),
                   balanced_accuracy_score(y, y_pred),
                   roc_auc_score(y, y_score),
                   recall_score(y, y_pred),
                   precision_score(y, y_pred),
                   average_precision_score(y, y_score)]
        metric_df[name] = metrics
        
    return metric_df

In [19]:
def cat_encode(X,
               drop=["duration", "y"],
               cut=None,
               cyclic=None,
               target=None):
    """
    Encode and transform categorical data into numerical values.
    
    Parameters
    ----------
        X : DataFrame
        
        drop : list, default = ["duration", "y"]
        
        cut : list
        
        cyclic : list
        
        target : list
    
    Returns
    -------
        X : DataFrame

    Examples
    --------
    bank_mkt = import_dataset("../data/BankMarketing.csv")
    X = cat_encode(bank_mkt)
    """
    X = X.copy()
    
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("Int64")
    
    if cut != None:
        if "pdays" in cut:
            X["pdays"] = X["pdays"].fillna(-1)
            # Clients who have been contacted but do not have pdays record
            X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
            # Cut pdays into categories
            X["pdays"] = pd.cut(X["pdays"], [0, 3, 5, 10, 15, 30, 1000], labels=[3, 5, 10, 15, 30, 1000], include_lowest=True).astype("Int64")
    else:
        # Fill missing values in pdays as 999
        X["pdays"] = X["pdays"].fillna(999)
    
    if cyclic != None:
        if "month" in cyclic:
            X['month_sin'] = np.sin(2 * np.pi * X["month"]/12)
            X['month_cos'] = np.cos(2 * np.pi * X["month"]/12)
            X = X.drop("month", axis=1)
        if "day_of_week" in cyclic:
            X["day_of_week"] = X["day_of_week"].cat.codes
            X['day_sin'] = np.sin(2 * np.pi * X["day_of_week"]/5)
            X['day_cos'] = np.cos(2 * np.pi * X["day_of_week"]/5)
            X = X.drop("day_of_week", axis=1)
    
    # Transform target encoded feature as str
    if target != None:
        X[target] = X[target].astype("str")
        
    # Drop features
    X = X.drop(drop, axis=1)
    
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    
    # Fill missing values as -1
    X = X.fillna(-1)
    
    return X

In [20]:
cat_encoder = FunctionTransformer(cat_encode)
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_baseline = benchmark(bank_mkt, cat_encoder, cat_clf)
cat_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.864431,0.862128,0.861795
TPR,0.669923,0.630728,0.635776
bACC,0.767177,0.746428,0.748786
ROC,0.857805,0.79851,0.79842
REC,0.669923,0.630728,0.635776
PRE,0.38554,0.367347,0.36875
AP,0.543347,0.476788,0.447053


In [21]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features =  ["age",
                 "campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cat_transformer = make_pipeline(cat_encoder, hot_scaler)

In [22]:
logit_clf = LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000)
logit_baseline = benchmark(bank_mkt, cat_transformer, logit_clf)
logit_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.807253,0.805166,0.807608
TPR,0.658134,0.672507,0.665948
bACC,0.732694,0.738836,0.736778
ROC,0.786998,0.784342,0.781964
REC,0.658134,0.672507,0.665948
PRE,0.30243,0.30464,0.305336
AP,0.442225,0.449944,0.425815


In [23]:
rbf_clf = Pipeline([
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_baseline = benchmark(bank_mkt, cat_transformer, rbf_clf)
rbf_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.796348,0.793192,0.797209
TPR,0.672617,0.661725,0.678879
bACC,0.734482,0.727458,0.738044
ROC,0.791338,0.775648,0.78522
REC,0.672617,0.661725,0.678879
PRE,0.295458,0.288824,0.298295
AP,0.45309,0.427039,0.42464


In [24]:
sum_baseline = pd.concat([cat_baseline, logit_baseline, rbf_baseline], keys=["CatBoost", "Logistic", "RBF SVM"], axis=1)
sum_baseline

Unnamed: 0_level_0,CatBoost,CatBoost,CatBoost,Logistic,Logistic,Logistic,RBF SVM,RBF SVM,RBF SVM
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.807253,0.805166,0.807608,0.796348,0.793192,0.797209
TPR,0.669923,0.630728,0.635776,0.658134,0.672507,0.665948,0.672617,0.661725,0.678879
bACC,0.767177,0.746428,0.748786,0.732694,0.738836,0.736778,0.734482,0.727458,0.738044
ROC,0.857805,0.79851,0.79842,0.786998,0.784342,0.781964,0.791338,0.775648,0.78522
REC,0.669923,0.630728,0.635776,0.658134,0.672507,0.665948,0.672617,0.661725,0.678879
PRE,0.38554,0.367347,0.36875,0.30243,0.30464,0.305336,0.295458,0.288824,0.298295
AP,0.543347,0.476788,0.447053,0.442225,0.449944,0.425815,0.45309,0.427039,0.42464


## Drop Client Data

In [25]:
drop_features = ["age",
                 "job",
                 "marital",
                 "education",
                 "housing",
                 "loan",
                 "default",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [26]:
cat_drop = benchmark(bank_mkt, drop_encoder, cat_clf)
pd.concat([cat_baseline, cat_drop], keys=["Catboost Baseline", "Catboost Drop"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Drop,Catboost Drop,Catboost Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.864346,0.862812,0.860016
TPR,0.669923,0.630728,0.635776,0.655776,0.668464,0.640086
bACC,0.767177,0.746428,0.748786,0.760061,0.765638,0.750051
ROC,0.857805,0.79851,0.79842,0.833984,0.817493,0.807777
REC,0.669923,0.630728,0.635776,0.655776,0.668464,0.640086
PRE,0.38554,0.367347,0.36875,0.380348,0.382126,0.367347
AP,0.543347,0.476788,0.447053,0.523444,0.490449,0.45801


In [27]:
cat_features = ["poutcome"]

num_features =  ["campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

drop_transformer = make_pipeline(drop_encoder, hot_scaler)

In [28]:
logit_drop = benchmark(bank_mkt, drop_transformer, logit_clf)
pd.concat([logit_baseline, logit_drop], keys=["Logistic Baseline", "Logistic Drop"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Drop,Logistic Drop,Logistic Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.792841,0.787718,0.795977
TPR,0.658134,0.672507,0.665948,0.668238,0.638814,0.664871
bACC,0.732694,0.738836,0.736778,0.73054,0.713266,0.730424
ROC,0.786998,0.784342,0.781964,0.780918,0.778248,0.783053
REC,0.658134,0.672507,0.665948,0.668238,0.638814,0.664871
PRE,0.30243,0.30464,0.305336,0.290568,0.276385,0.292694
AP,0.442225,0.449944,0.425815,0.44237,0.427636,0.417135


In [29]:
rbf_drop = benchmark(bank_mkt, drop_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_drop], keys=["RBF Baseline", "RBF Drop"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Drop,RBF Drop,RBF Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.781422,0.774718,0.782978
TPR,0.672617,0.661725,0.678879,0.685079,0.642857,0.677802
bACC,0.734482,0.727458,0.738044,0.733251,0.708787,0.73039
ROC,0.791338,0.775648,0.78522,0.793232,0.772558,0.78497
REC,0.672617,0.661725,0.678879,0.685079,0.642857,0.677802
PRE,0.295458,0.288824,0.298295,0.284675,0.265886,0.283973
AP,0.45309,0.427039,0.42464,0.441357,0.445666,0.412465


## Drop Everything Except Client Data

In [30]:
drop_features = ["contact",
                 "month",
                 "day_of_week",
                 "duration",
                 "campaign",
                 "pdays",
                 "previous",
                 "poutcome",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "duration",
                 "y"]

client_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [31]:
cat_client = benchmark(bank_mkt, client_encoder, cat_clf)
pd.concat([cat_baseline, cat_client], keys=["Catboost Baseline", "Catboost Client"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Client,Catboost Client,Catboost Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.754736,0.764796,0.75821
TPR,0.669923,0.630728,0.635776,0.68811,0.707547,0.700431
bACC,0.767177,0.746428,0.748786,0.721423,0.736172,0.729321
ROC,0.857805,0.79851,0.79842,0.783119,0.800174,0.777165
REC,0.669923,0.630728,0.635776,0.68811,0.707547,0.700431
PRE,0.38554,0.367347,0.36875,0.262664,0.276316,0.268928
AP,0.543347,0.476788,0.447053,0.38452,0.388863,0.372864


In [32]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan"]

num_features =  ["age"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

client_transformer = make_pipeline(client_encoder, hot_scaler)


In [33]:
logit_client = benchmark(bank_mkt, client_transformer, logit_clf)
pd.concat([logit_baseline, logit_client], keys=["Logistic Baseline", "Logistic Client"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Client,Logistic Client,Logistic Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.53201,0.523435,0.55104
TPR,0.658134,0.672507,0.665948,0.693499,0.691375,0.685345
bACC,0.732694,0.738836,0.736778,0.612755,0.607405,0.618192
ROC,0.786998,0.784342,0.781964,0.665946,0.645527,0.664488
REC,0.658134,0.672507,0.665948,0.693499,0.691375,0.685345
PRE,0.30243,0.30464,0.305336,0.15836,0.155502,0.162369
AP,0.442225,0.449944,0.425815,0.211972,0.195418,0.22809


In [34]:
rbf_client = benchmark(bank_mkt, client_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_client], keys=["RBF Baseline", "RBF Client"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Client,RBF Client,RBF Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.807809,0.803969,0.808019
TPR,0.672617,0.661725,0.678879,0.650051,0.668464,0.654095
bACC,0.734482,0.727458,0.738044,0.72893,0.736216,0.731057
ROC,0.791338,0.775648,0.78522,0.782253,0.780218,0.775276
REC,0.672617,0.661725,0.678879,0.650051,0.668464,0.654095
PRE,0.295458,0.288824,0.298295,0.300436,0.302071,0.30199
AP,0.45309,0.427039,0.42464,0.371627,0.387286,0.356361


## Impute Missing Values

### Use Most Frequent Values

In [35]:
freq_features = ["job", "marital", "education", "default", "housing", "loan"]

freq_imputer = ColumnTransformer([
    ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent"), freq_features)
], remainder="passthrough")

freq_encoder = make_pipeline(cat_encoder, freq_imputer)

In [36]:
cat_freq = benchmark(bank_mkt, freq_encoder, cat_clf)
pd.concat([cat_baseline, cat_freq], keys=["Catboost Baseline", "Catboost Frequent"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Frequent,Catboost Frequent,Catboost Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.858744,0.85597,0.857827
TPR,0.669923,0.630728,0.635776,0.647356,0.660377,0.643319
bACC,0.767177,0.746428,0.748786,0.75305,0.758174,0.750573
ROC,0.857805,0.79851,0.79842,0.821809,0.815234,0.797938
REC,0.669923,0.630728,0.635776,0.647356,0.660377,0.643319
PRE,0.38554,0.367347,0.36875,0.367847,0.367868,0.364914
AP,0.543347,0.476788,0.447053,0.504159,0.483151,0.444807


In [37]:
# Select "job", "marital", "education"
cat_features = [0,1,2]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [5,10,11,12,14,15,16,17,18]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

freq_transformer = make_pipeline(cat_encoder, freq_imputer, hot_scaler)


In [38]:
logit_freq = benchmark(bank_mkt, freq_transformer, logit_clf)
pd.concat([logit_baseline, logit_freq], keys=["Logistic Baseline", "Logistic Frequent"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Frequent,Logistic Frequent,Logistic Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.810076,0.811837,0.805008
TPR,0.658134,0.672507,0.665948,0.659818,0.644205,0.659483
bACC,0.732694,0.738836,0.736778,0.734947,0.728021,0.732245
ROC,0.786998,0.784342,0.781964,0.786966,0.777684,0.78252
REC,0.658134,0.672507,0.665948,0.659818,0.644205,0.659483
PRE,0.30243,0.30464,0.305336,0.306094,0.302915,0.300442
AP,0.442225,0.449944,0.425815,0.437241,0.435037,0.417261


In [39]:
rbf_freq = benchmark(bank_mkt, freq_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_freq], keys=["RBF Baseline", "RBF Frequent"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Frequent,RBF Frequent,RBF Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.810118,0.816969,0.810208
TPR,0.672617,0.661725,0.678879,0.660155,0.656334,0.670259
bACC,0.734482,0.727458,0.738044,0.735137,0.736652,0.740233
ROC,0.791338,0.775648,0.78522,0.786766,0.785308,0.782977
REC,0.672617,0.661725,0.678879,0.660155,0.656334,0.670259
PRE,0.295458,0.288824,0.298295,0.30625,0.312781,0.309607
AP,0.45309,0.427039,0.42464,0.438847,0.449438,0.416148


### Estimate Missing Values 

In [40]:
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "campaign", "month", "day_of_week", "pdays", "previous"]

ite_imputer = ColumnTransformer([
    ("ite_imputer",
     make_pipeline(
         IterativeImputer(max_iter=100, missing_values=-1, initial_strategy="most_frequent", random_state=42),
         FunctionTransformer(np.round)
     ),
     ite_features),
], remainder="passthrough")

ite_encoder = make_pipeline(cat_encoder, ite_imputer)

In [41]:
cat_ite = benchmark(bank_mkt, ite_encoder, cat_clf)
pd.concat([cat_baseline, cat_ite], keys=["Catboost Baseline", "Catboost Iterative"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Iterative,Catboost Iterative,Catboost Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.867211,0.861273,0.862753
TPR,0.669923,0.630728,0.635776,0.666218,0.633423,0.639009
bACC,0.767177,0.746428,0.748786,0.766714,0.747348,0.750881
ROC,0.857805,0.79851,0.79842,0.857959,0.805253,0.797889
REC,0.669923,0.630728,0.635776,0.666218,0.633423,0.639009
PRE,0.38554,0.367347,0.36875,0.38914,0.366901,0.371554
AP,0.543347,0.476788,0.447053,0.551589,0.456234,0.448618


In [42]:
# Select "job", "marital", "education", "default", "housing", "loan", "poutcome"
cat_features = [1,2,3,4,5,13]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [0,8,9,10,12,14,15,16,17,18]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

ite_transformer = make_pipeline(cat_encoder, ite_imputer, hot_scaler)

In [43]:
logit_ite = benchmark(bank_mkt, ite_transformer, logit_clf)
pd.concat([logit_baseline, logit_ite], keys=["Logistic Baseline", "Logistic Iterative"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Iterative,Logistic Iterative,Logistic Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.799427,0.790284,0.801177
TPR,0.658134,0.672507,0.665948,0.662176,0.668464,0.661638
bACC,0.732694,0.738836,0.736778,0.730801,0.729374,0.731407
ROC,0.786998,0.784342,0.781964,0.783451,0.787418,0.782912
REC,0.658134,0.672507,0.665948,0.662176,0.668464,0.661638
PRE,0.30243,0.30464,0.305336,0.295373,0.288037,0.297049
AP,0.442225,0.449944,0.425815,0.434081,0.473632,0.427194


In [44]:
rbf_ite = benchmark(bank_mkt, ite_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Iterative"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Iterative,RBF Iterative,RBF Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.787239,0.788231,0.791325
TPR,0.672617,0.661725,0.678879,0.675312,0.661725,0.681034
bACC,0.734482,0.727458,0.738044,0.731275,0.724978,0.73618
ROC,0.791338,0.775648,0.78522,0.783933,0.787756,0.783194
REC,0.672617,0.661725,0.678879,0.675312,0.661725,0.681034
PRE,0.295458,0.288824,0.298295,0.287249,0.283979,0.293
AP,0.45309,0.427039,0.42464,0.43868,0.433345,0.429148


## Feature Engineering `pdays`

In [45]:
cut_encoder = FunctionTransformer(cat_encode, kw_args={"cut": ["pdays"]})

In [46]:
cat_cut = benchmark(bank_mkt, cut_encoder, cat_clf)
pd.concat([cat_baseline, cat_cut], keys=["Catboost Baseline", "Catboost Cut"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Cut,Catboost Cut,Catboost Cut
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.860796,0.864865,0.85988
TPR,0.669923,0.630728,0.635776,0.668575,0.621294,0.641164
bACC,0.767177,0.746428,0.748786,0.764686,0.743079,0.750522
ROC,0.857805,0.79851,0.79842,0.847252,0.796195,0.799227
REC,0.669923,0.630728,0.635776,0.668575,0.621294,0.641164
PRE,0.38554,0.367347,0.36875,0.378817,0.368505,0.367511
AP,0.543347,0.476788,0.447053,0.536839,0.459985,0.446577


In [47]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome",
                "pdays"]

num_features = ["age", "campaign", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cut_transformer = make_pipeline(cut_encoder, hot_scaler)

In [48]:
logit_cut = benchmark(bank_mkt, cut_transformer, logit_clf)
pd.concat([logit_baseline, logit_cut], keys=["Logistic Baseline", "Logistic Cut"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Cut,Logistic Cut,Logistic Cut
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.802934,0.801916,0.807608
TPR,0.658134,0.672507,0.665948,0.667902,0.640162,0.665948
bACC,0.732694,0.738836,0.736778,0.735418,0.721039,0.736778
ROC,0.786998,0.784342,0.781964,0.789413,0.773151,0.782022
REC,0.658134,0.672507,0.665948,0.667902,0.640162,0.665948
PRE,0.30243,0.30464,0.305336,0.300865,0.290876,0.305336
AP,0.442225,0.449944,0.425815,0.444065,0.446452,0.42797


In [49]:
rbf_ite = benchmark(bank_mkt, ite_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Iterative"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Iterative,RBF Iterative,RBF Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.784202,0.790797,0.791325
TPR,0.672617,0.661725,0.678879,0.679353,0.644205,0.681034
bACC,0.734482,0.727458,0.738044,0.731778,0.717501,0.73618
ROC,0.791338,0.775648,0.78522,0.788353,0.767367,0.783194
REC,0.672617,0.661725,0.678879,0.679353,0.644205,0.681034
PRE,0.295458,0.288824,0.298295,0.285573,0.281011,0.293
AP,0.45309,0.427039,0.42464,0.448493,0.395437,0.429148


## Cyclic Encoding

In [50]:
cyclic_encoder = FunctionTransformer(cat_encode, kw_args={"cyclic":["month", "day_of_week"]})

In [51]:
cat_cyclic = benchmark(bank_mkt, cyclic_encoder, cat_clf)
pd.concat([cat_baseline, cat_cyclic], keys=["Catboost Baseline", "Catboost Cyclic"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Cyclic,Catboost Cyclic,Catboost Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.864645,0.868286,0.862069
TPR,0.669923,0.630728,0.635776,0.663523,0.622642,0.631466
bACC,0.767177,0.746428,0.748786,0.764084,0.745464,0.746767
ROC,0.857805,0.79851,0.79842,0.847148,0.794554,0.797962
REC,0.669923,0.630728,0.635776,0.663523,0.622642,0.631466
PRE,0.38554,0.367347,0.36875,0.383642,0.375,0.367629
AP,0.543347,0.476788,0.447053,0.539321,0.471629,0.450189


In [52]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features = ["age", "campaign", "pdays", "previous",
                "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
                "month_sin", "month_cos", "day_sin", "day_cos"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cyclic_transformer = make_pipeline(cyclic_encoder, hot_scaler)

In [53]:
logit_ite = benchmark(bank_mkt, cyclic_transformer, logit_clf)
pd.concat([logit_baseline, logit_ite], keys=["Logistic Baseline", "Logistic Cyclic"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Cyclic,Logistic Cyclic,Logistic Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.813112,0.811666,0.80884
TPR,0.658134,0.672507,0.665948,0.65645,0.650943,0.663793
bACC,0.732694,0.738836,0.736778,0.734781,0.731305,0.736316
ROC,0.786998,0.784342,0.781964,0.788496,0.782984,0.782896
REC,0.658134,0.672507,0.665948,0.65645,0.650943,0.663793
PRE,0.30243,0.30464,0.305336,0.308435,0.304924,0.306011
AP,0.442225,0.449944,0.425815,0.446636,0.42974,0.431768


In [54]:
rbf_ite = benchmark(bank_mkt, cyclic_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Cyclic"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Cyclic,RBF Cyclic,RBF Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.802977,0.804824,0.805829
TPR,0.672617,0.661725,0.678879,0.67127,0.642857,0.676724
bACC,0.734482,0.727458,0.738044,0.737123,0.72384,0.741277
ROC,0.791338,0.775648,0.78522,0.789986,0.774783,0.785208
REC,0.672617,0.661725,0.678879,0.67127,0.642857,0.676724
PRE,0.295458,0.288824,0.298295,0.30197,0.294808,0.30679
AP,0.45309,0.427039,0.42464,0.444024,0.426224,0.424327


## Mean Encoding

In [55]:
target_features = ["job", "marital", "education", "month", "day_of_week"] 

target_encoder = make_pipeline(FunctionTransformer(cat_encode, kw_args={"target": target_features}), TargetEncoder(cols=target_features))

In [56]:
cat_target = benchmark(bank_mkt, target_encoder, cat_clf)
pd.concat([cat_baseline, cat_target], keys=["Catboost Baseline", "Catboost Target"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Target,Catboost Target,Catboost Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.864431,0.862128,0.861795,0.86101,0.861615,0.860701
TPR,0.669923,0.630728,0.635776,0.646009,0.65903,0.641164
bACC,0.767177,0.746428,0.748786,0.753509,0.760322,0.750932
ROC,0.857805,0.79851,0.79842,0.830373,0.82082,0.798341
REC,0.669923,0.630728,0.635776,0.646009,0.65903,0.641164
PRE,0.38554,0.367347,0.36875,0.37113,0.376733,0.368878
AP,0.543347,0.476788,0.447053,0.512449,0.499875,0.44934


In [57]:
num_features = [0,1,2,3,8,9,11,14,15,16,17,18]

target_scaler = ColumnTransformer([
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

target_transformer = make_pipeline(target_encoder, target_scaler)

In [58]:
logit_target = benchmark(bank_mkt, target_transformer, logit_clf)
pd.concat([logit_baseline, logit_target], keys=["Logistic Baseline", "Logistic Target"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Target,Logistic Target,Logistic Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.807253,0.805166,0.807608,0.838772,0.837154,0.83046
TPR,0.658134,0.672507,0.665948,0.642977,0.633423,0.649784
bACC,0.732694,0.738836,0.736778,0.740875,0.735288,0.740122
ROC,0.786998,0.784342,0.781964,0.791805,0.790774,0.785802
REC,0.658134,0.672507,0.665948,0.642977,0.633423,0.649784
PRE,0.30243,0.30464,0.305336,0.336151,0.33052,0.327362
AP,0.442225,0.449944,0.425815,0.444018,0.425354,0.418058


In [59]:
rbf_target = benchmark(bank_mkt, target_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_target], keys=["RBF Baseline", "RBF Target"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Target,RBF Target,RBF Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.796348,0.793192,0.797209,0.801736,0.806192,0.810345
TPR,0.672617,0.661725,0.678879,0.668575,0.695418,0.674569
bACC,0.734482,0.727458,0.738044,0.735156,0.750805,0.742457
ROC,0.791338,0.775648,0.78522,0.788897,0.800629,0.78636
REC,0.672617,0.661725,0.678879,0.668575,0.695418,0.674569
PRE,0.295458,0.288824,0.298295,0.299804,0.312917,0.311133
AP,0.45309,0.427039,0.42464,0.42881,0.459276,0.422143


## Entity Embeddings

## Outside Infomation

## Summary