# Feature Engineering

## Data Preparation

In [1]:
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.fixes import loguniform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [3]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [4]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

In [5]:
for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

## Baseline

In [81]:
def cat_encode(X,
               drop=["duration", "y"],
               cut=None,
               cyclic=None,
               target=None):
    """
    Encode categorical data into numerical values.
    """
    X = X.copy()
    
    if not cut:
        # Fill missing values in pdays as 999
        X["pdays"] = X["pdays"].fillna(999)
    else:
        if "pdays" in cut:
            # Cut pdays into categories
            bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
            X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("Int64")
    
    if not cyclic:
        pass
    else:
        if "month" in cyclic:
            X['month_sin'] = np.sin(2 * np.pi * X["month"]/12)
            X['month_cos'] = np.cos(2 * np.pi * X["month"]/12)
            X = X.drop("month", axis=1)
        if "day_of_week" in cyclic:
            X["day_of_week"] = X["day_of_week"].cat.codes
            X['day_sin'] = np.sin(2 * np.pi * X["day_of_week"]/5)
            X['day_cos'] = np.cos(2 * np.pi * X["day_of_week"]/5)
            X = X.drop("day_of_week", axis=1)
    
    # Transfor target encoded feature as str
    X[target] = X[target].astype("str")
        
    # Drop features
    X = X.drop(drop, axis=1)
    
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

In [8]:
def transform_dataset(train_set, test_set, preprocessor, to_numpy=False, is_numpy=False):
    y_train = train_set["y"].astype("int")
    X_train = preprocessor.fit_transform(train_set, y_train)
    y_test = test_set["y"].astype("int")
    X_test = preprocessor.transform(test_set)
    for train_index, validate_index in train_validate_split.split(X_train, y_train):
        if is_numpy:
                X_ttrain = X_train[train_index]
                y_ttrain = y_train[train_index]
                X_validate = X_train[validate_index]
                y_validate = y_train[validate_index]
        else:
                X_ttrain = X_train.iloc[train_index]
                y_ttrain = y_train.iloc[train_index]
                X_validate = X_train.iloc[validate_index]
                y_validate = y_train.iloc[validate_index]

    if to_numpy:
            y_train = y_train.to_numpy()
            X_train = X_train.to_numpy()
            y_test = y_test.to_numpy()
            X_test = X_test.to_numpy()
            X_ttrain = X_ttrain.to_numpy()
            y_ttrain = y_ttrain.to_numpy()
            X_validate = X_validate.to_numpy()
            y_validate = y_validate.to_numpy()
        
    return (X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

In [9]:
def benchmark_dataset(clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test):
    metric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
    set_names = ["Train", "Validate", "Test"]

    X_sets = [X_ttrain, X_validate, X_test]
    y_sets = [y_ttrain, y_validate, y_test]

    metric_df = pd.DataFrame(index=metric_names, columns=set_names)

    for name, X, y in zip(set_names, X_sets, y_sets):
        y_pred = clf.predict(X)
        if hasattr(clf, "decision_function"):
            y_score = clf.decision_function(X)
        else:
            y_score = clf.predict_proba(X)[:, 1]
        metrics = [recall_score(y, y_pred, pos_label=0),
                   recall_score(y, y_pred),
                   balanced_accuracy_score(y, y_pred),
                   roc_auc_score(y, y_score),
                   recall_score(y, y_pred),
                   precision_score(y, y_pred),
                   f1_score(y, y_pred),
                   average_precision_score(y, y_score)]
        metric_df[name] = metrics
    return metric_df

In [72]:
basic_encoder = FunctionTransformer(cat_encode)

In [7]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features =  ["age",
                 "campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

cat_transformer = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [10]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=basic_encoder, to_numpy=True)
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.862721,0.85768,0.86549
TPR,0.665544,0.62938,0.645474
bACC,0.764132,0.74353,0.755482
ROC,0.85362,0.795319,0.811488
REC,0.665544,0.62938,0.645474
PRE,0.381026,0.359507,0.378635
F1,0.484611,0.457619,0.477291
AP,0.541778,0.444935,0.473949


In [11]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=basic_encoder)
logit_clf = make_pipeline(cat_transformer, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.799598,0.797982,0.803093
TPR,0.665881,0.673854,0.65625
bACC,0.732739,0.735918,0.729671
ROC,0.785015,0.783269,0.782275
REC,0.665881,0.673854,0.65625
PRE,0.296713,0.297442,0.297363
F1,0.410507,0.412712,0.409274
AP,0.442165,0.434741,0.441087


In [12]:
rbf_clf = Pipeline([
    ("cat_transformer", cat_transformer),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.784929,0.785836,0.788998
TPR,0.67868,0.681941,0.670259
bACC,0.731804,0.733889,0.729628
ROC,0.787357,0.787405,0.78525
REC,0.67868,0.681941,0.670259
PRE,0.286059,0.287827,0.287431
F1,0.402477,0.4048,0.402329
AP,0.443953,0.443074,0.442188


## Drop Client Data

In [13]:
drop_features = ["age",
                 "job",
                 "marital",
                 "education",
                 "housing",
                 "loan",
                 "default",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [14]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder, to_numpy=True)

In [15]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.854168,0.848443,0.857964
TPR,0.660492,0.6469,0.636853
bACC,0.75733,0.747672,0.747409
ROC,0.825793,0.803788,0.814312
REC,0.660492,0.6469,0.636853
PRE,0.365109,0.351391,0.362799
F1,0.470264,0.455408,0.46226
AP,0.513784,0.447797,0.480871


In [16]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder)

In [17]:
cat_features = ["poutcome"]

num_features =  ["campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

drop_transformer = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [18]:
logit_clf = make_pipeline(drop_transformer, LogisticRegression(penalty="none", class_weight="balanced"))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.785229,0.785323,0.787493
TPR,0.669249,0.679245,0.659483
bACC,0.727239,0.732284,0.723488
ROC,0.780602,0.776179,0.781611
REC,0.669249,0.679245,0.659483
PRE,0.283493,0.286526,0.282679
F1,0.398276,0.403039,0.395732
AP,0.433102,0.437963,0.434768


In [19]:
rbf_clf = Pipeline([
    ("drop_transformer", drop_transformer),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.767694,0.771126,0.772989
TPR,0.682048,0.690027,0.674569
bACC,0.724871,0.730576,0.723779
ROC,0.788101,0.789536,0.78835
REC,0.682048,0.690027,0.674569
PRE,0.271557,0.276757,0.273961
F1,0.388452,0.395062,0.389667
AP,0.435232,0.43748,0.445355


## Drop Everything Except Client Data

In [20]:
drop_features = ["contact",
                 "month",
                 "day_of_week",
                 "duration",
                 "campaign",
                 "pdays",
                 "previous",
                 "poutcome",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [21]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=drop_encoder,
                                                                                                 to_numpy=True)

In [22]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.75495,0.757441,0.753421
TPR,0.699225,0.699461,0.677802
bACC,0.727088,0.728451,0.715611
ROC,0.78616,0.783439,0.779924
REC,0.699225,0.699461,0.677802
PRE,0.265949,0.26794,0.258741
F1,0.385336,0.387458,0.374516
AP,0.387902,0.364424,0.379107


In [23]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder)

In [24]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan"]

num_features =  ["age"]

drop_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [25]:
logit_clf = make_pipeline(drop_encoder, LogisticRegression(penalty="none", class_weight="balanced"))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.551726,0.548238,0.545156
TPR,0.676659,0.669811,0.689655
bACC,0.614192,0.609025,0.617406
ROC,0.664866,0.662443,0.664856
REC,0.676659,0.669811,0.689655
PRE,0.160836,0.158381,0.161453
F1,0.259897,0.256186,0.261652
AP,0.217875,0.202102,0.219732


In [26]:
rbf_clf = Pipeline([
    ("drop_encoder", drop_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.795364,0.797639,0.799398
TPR,0.667228,0.660377,0.649784
bACC,0.731296,0.729008,0.724591
ROC,0.781569,0.780268,0.778715
REC,0.667228,0.660377,0.649784
PRE,0.292787,0.292887,0.291445
F1,0.406985,0.405797,0.402402
AP,0.370085,0.357364,0.37646


## Impute Missing Values

### Use Most Frequent Values

In [27]:
freq_features = ["job", "marital", "education", "default", "housing", "loan"]

freq_imputer = ColumnTransformer([
    ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent"), freq_features)
], remainder="passthrough")

freq_transformer = make_pipeline(basic_encoder, freq_imputer)

In [28]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=freq_transformer,
                                                                                                 is_numpy=True)

In [29]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.863063,0.860075,0.868637
TPR,0.661502,0.632075,0.634698
bACC,0.762283,0.746075,0.751668
ROC,0.843613,0.80007,0.810083
REC,0.661502,0.632075,0.634698
PRE,0.380178,0.364413,0.380245
F1,0.482852,0.462297,0.475575
AP,0.534078,0.449086,0.477569


In [30]:
# Select "job", "marital", "education"
cat_features = [0,1,2]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [5,10,11,12,14,15,16,17,18]

freq_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [31]:
logit_clf = make_pipeline(freq_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.796262,0.794218,0.802135
TPR,0.666554,0.669811,0.659483
bACC,0.731408,0.732015,0.730809
ROC,0.783825,0.781039,0.781439
REC,0.666554,0.669811,0.659483
PRE,0.29349,0.292353,0.297376
F1,0.407537,0.407043,0.409913
AP,0.432163,0.435141,0.434259


In [32]:
rbf_clf = Pipeline([
    ("freq_encoder", freq_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.795279,0.796271,0.800219
TPR,0.67228,0.672507,0.668103
bACC,0.733779,0.734389,0.734161
ROC,0.784915,0.785655,0.78391
REC,0.67228,0.672507,0.668103
PRE,0.294265,0.295266,0.298077
F1,0.409352,0.410362,0.412234
AP,0.431096,0.432811,0.435974


### Estimate Missing Values 

In [33]:
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "campaign", "month", "day_of_week", "pdays", "previous", "poutcome"]

ite_imputer = ColumnTransformer([
    ("ite_imputer",
     make_pipeline(
         IterativeImputer(max_iter=100, missing_values=-1, initial_strategy="most_frequent", random_state=42),
         FunctionTransformer(np.round)
     ),
     ite_features),
], remainder="passthrough")

ite_transformer = make_pipeline(basic_encoder, ite_imputer)

In [34]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=ite_transformer,
                                                                                                 is_numpy=True)

In [35]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.862421,0.859904,0.868227
TPR,0.659144,0.636119,0.635776
bACC,0.760783,0.748011,0.752001
ROC,0.844864,0.798758,0.811029
REC,0.659144,0.636119,0.635776
PRE,0.378237,0.365608,0.37991
F1,0.480658,0.464338,0.475615
AP,0.533133,0.442818,0.474651


In [36]:
# Select "job", "marital", "education"
cat_features = [1,2,3]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [0,8,9,10,12,14,15,16,17,18]

ite_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [37]:
logit_clf = make_pipeline(ite_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.790403,0.791139,0.797893
TPR,0.668575,0.673854,0.657328
bACC,0.729489,0.732497,0.72761
ROC,0.784198,0.781738,0.78071
REC,0.668575,0.673854,0.657328
PRE,0.288266,0.290529,0.292286
F1,0.402841,0.406009,0.404643
AP,0.435411,0.428329,0.438375


In [38]:
rbf_clf = Pipeline([
    ("ite_encoder", ite_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.781593,0.783613,0.786809
TPR,0.676996,0.685984,0.668103
bACC,0.729295,0.734798,0.727456
ROC,0.783282,0.785732,0.781442
REC,0.676996,0.685984,0.668103
PRE,0.282422,0.286922,0.284665
F1,0.398572,0.40461,0.399227
AP,0.432351,0.429585,0.440479


## Feature Engineering `pdays`

In [39]:
cut_encoder = FunctionTransformer(cat_encode, kw_args={"cut": ["pdays"]})
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=cut_encoder,
                                                                                                 to_numpy=True)

In [40]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.842877,0.841088,0.846333
TPR,0.662176,0.637466,0.647629
bACC,0.752527,0.739277,0.746981
ROC,0.812115,0.797791,0.800994
REC,0.662176,0.637466,0.647629
PRE,0.348582,0.337375,0.348608
F1,0.456731,0.441231,0.453243
AP,0.490078,0.441374,0.46777


In [41]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=cut_encoder)

In [42]:
cat_features = ["job", "marital", "education", "pdays"]

num_features = ["age", "campaign", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

pdays_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [43]:
logit_clf = make_pipeline(pdays_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.799427,0.797297,0.802819
TPR,0.668238,0.671159,0.658405
bACC,0.733833,0.734228,0.730612
ROC,0.784564,0.783235,0.781876
REC,0.668238,0.671159,0.658405
PRE,0.297273,0.2959,0.297758
F1,0.41149,0.410722,0.410067
AP,0.43504,0.43849,0.436233


In [44]:
rbf_clf = Pipeline([
    ("pdays_encoder", pdays_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.75726,0.761204,0.7578
TPR,0.697204,0.699461,0.686422
bACC,0.727232,0.730333,0.722111
ROC,0.781716,0.785262,0.781191
REC,0.697204,0.699461,0.686422
PRE,0.267235,0.271018,0.264645
F1,0.386374,0.390666,0.382009
AP,0.424904,0.409433,0.41944


## Cyclic Encoding

In [62]:
cyclic_encoder = FunctionTransformer(cat_encode, kw_args={"cyclic":["month", "day_of_week"]})
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=cyclic_encoder,
                                                                                                 to_numpy=True)

In [63]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.861823,0.858536,0.865353
TPR,0.66386,0.637466,0.636853
bACC,0.762841,0.748001,0.751103
ROC,0.845273,0.799287,0.80876
REC,0.66386,0.637466,0.636853
PRE,0.378893,0.363846,0.375238
F1,0.482438,0.463271,0.472233
AP,0.535995,0.451774,0.477055


In [65]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=cyclic_encoder)

In [67]:
cat_features = ["job", "marital", "education"]

num_features = ["age", "campaign", "pdays", "previous",
                "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
                "month_sin", "month_cos", "day_sin", "day_cos"]

pdays_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [68]:
logit_clf = make_pipeline(pdays_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.801394,0.798153,0.805829
TPR,0.665881,0.665768,0.653017
bACC,0.733637,0.73196,0.729423
ROC,0.786115,0.786068,0.784021
REC,0.665881,0.665768,0.653017
PRE,0.298595,0.295102,0.299259
F1,0.412304,0.40894,0.41043
AP,0.434329,0.433529,0.435278


In [69]:
rbf_clf = Pipeline([
    ("pdays_encoder", pdays_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.7919,0.788231,0.792967
TPR,0.677332,0.681941,0.665948
bACC,0.734616,0.735086,0.729457
ROC,0.787342,0.790733,0.783596
REC,0.677332,0.681941,0.665948
PRE,0.292424,0.290138,0.290005
F1,0.408491,0.40708,0.404054
AP,0.434689,0.42678,0.432883


## Mean Encoding

In [86]:
target_features = ["job", "marital", "education", "month", "day_of_week"]

target_encoder = FunctionTransformer(cat_encode, kw_args={"target": target_features})

target_transformer = make_pipeline(target_encoder, TargetEncoder(cols=target_features, return_df=False))

X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=target_transformer,
                                                                                                 is_numpy=True)

  elif pd.api.types.is_categorical(cols):


In [87]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.854339,0.85272,0.859743
TPR,0.65544,0.633423,0.642241
bACC,0.754889,0.743071,0.750992
ROC,0.819805,0.798304,0.802987
REC,0.65544,0.633423,0.642241
PRE,0.363602,0.353118,0.367674
F1,0.467732,0.453449,0.467634
AP,0.504355,0.447364,0.471247


In [89]:
X_train[0]

array([0.0780065005417118, 0.06855983772819473, 0.1014456379881538,
       0.10635213494992093, 0, 1, 1, 1, 0.1055359246171967,
       0.10932475884244373, 1, 999, 0, -1, 1.4, 94.465, -41.8, 4.959,
       5228.1], dtype=object)

In [90]:
num_features = [0,1,2,3,8,9,11,14,15,16,17,18]

target_encoder = ColumnTransformer([
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [91]:
logit_clf = make_pipeline(target_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.826241,0.82843,0.83306
TPR,0.658808,0.654987,0.648707
bACC,0.742524,0.741708,0.740883
ROC,0.790166,0.792526,0.791826
REC,0.658808,0.654987,0.648707
PRE,0.324971,0.326394,0.330406
F1,0.435247,0.435679,0.437818
AP,0.436779,0.437489,0.436174


In [93]:
rbf_clf = Pipeline([
    ("target_encoder", target_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.802805,0.801403,0.806924
TPR,0.678343,0.67655,0.667026
bACC,0.740574,0.738976,0.736975
ROC,0.789954,0.794211,0.789741
REC,0.678343,0.67655,0.667026
PRE,0.304,0.301864,0.304926
F1,0.419846,0.417464,0.418526
AP,0.43477,0.424359,0.434415
