# Feature Engineering

## Data Preparation

In [132]:
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.fixes import loguniform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [133]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [134]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [135]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

In [136]:
for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

## Baseline

In [137]:
def cat_encode(X, drop=["duration", "y"]):
    """
    Encode categorical data into numerical values.
    """
    X = X.copy()
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("int")
    # Drop features
    X = X.drop(drop, axis=1)
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

basic_encoder = FunctionTransformer(cat_encode)

In [138]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features =  ["age",
                 "campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

cat_transformer = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [139]:
def transform_dataset(train_set, test_set, preprocessor, to_numpy=False, is_numpy=False):
    y_train = train_set["y"].astype("int")
    X_train = preprocessor.fit_transform(train_set, y_train)
    y_test = test_set["y"].astype("int")
    X_test = preprocessor.transform(test_set)
    for train_index, validate_index in train_validate_split.split(X_train, y_train):
        if is_numpy:
                X_ttrain = X_train[train_index]
                y_ttrain = y_train[train_index]
                X_validate = X_train[validate_index]
                y_validate = y_train[validate_index]
        else:
                X_ttrain = X_train.iloc[train_index]
                y_ttrain = y_train.iloc[train_index]
                X_validate = X_train.iloc[validate_index]
                y_validate = y_train.iloc[validate_index]

    if to_numpy:
            y_train = y_train.to_numpy()
            X_train = X_train.to_numpy()
            y_test = y_test.to_numpy()
            X_test = X_test.to_numpy()
            X_ttrain = X_ttrain.to_numpy()
            y_ttrain = y_ttrain.to_numpy()
            X_validate = X_validate.to_numpy()
            y_validate = y_validate.to_numpy()
        
    return (X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

In [140]:
def benchmark_dataset(clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test):
    metric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
    set_names = ["Train", "Validate", "Test"]

    X_sets = [X_ttrain, X_validate, X_test]
    y_sets = [y_ttrain, y_validate, y_test]

    metric_df = pd.DataFrame(index=metric_names, columns=set_names)

    for name, X, y in zip(set_names, X_sets, y_sets):
        y_pred = clf.predict(X)
        if hasattr(clf, "decision_function"):
            y_score = clf.decision_function(X)
        else:
            y_score = clf.predict_proba(X)[:, 1]
        metrics = [recall_score(y, y_pred, pos_label=0),
                   recall_score(y, y_pred),
                   balanced_accuracy_score(y, y_pred),
                   roc_auc_score(y, y_score),
                   recall_score(y, y_pred),
                   precision_score(y, y_pred),
                   f1_score(y, y_pred),
                   average_precision_score(y, y_score)]
        metric_df[name] = metrics
    return metric_df

In [141]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=basic_encoder, to_numpy=True)
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.860967,0.856654,0.862069
TPR,0.666891,0.634771,0.634698
bACC,0.763929,0.745713,0.748384
ROC,0.853671,0.79867,0.809506
REC,0.666891,0.634771,0.634698
PRE,0.378513,0.359817,0.368817
F1,0.482927,0.459288,0.466535
AP,0.53965,0.441568,0.473421


In [144]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=basic_encoder)
logit_clf = make_pipeline(cat_transformer, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.799512,0.797639,0.802545
TPR,0.666891,0.672507,0.657328
bACC,0.733202,0.735073,0.729936
ROC,0.784841,0.782818,0.782162
REC,0.666891,0.672507,0.657328
PRE,0.296941,0.296671,0.297126
F1,0.410916,0.411716,0.409259
AP,0.441001,0.431291,0.439847


In [145]:
rbf_clf = Pipeline([
    ("cat_transformer", cat_transformer),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.782791,0.783442,0.787083
TPR,0.682721,0.683288,0.673491
bACC,0.732756,0.733365,0.730287
ROC,0.786836,0.786245,0.785123
REC,0.682721,0.683288,0.673491
PRE,0.285252,0.285956,0.286566
F1,0.402382,0.403181,0.402059
AP,0.439252,0.432701,0.445124


## Drop Client Data

In [146]:
drop_features = ["age",
                 "job",
                 "marital",
                 "education",
                 "housing",
                 "loan",
                 "default",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [147]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder, to_numpy=True)

In [148]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.857461,0.856141,0.861658
TPR,0.654092,0.641509,0.643319
bACC,0.755776,0.748825,0.752489
ROC,0.821104,0.803869,0.813389
REC,0.654092,0.641509,0.643319
PRE,0.368152,0.361427,0.371269
F1,0.471131,0.46236,0.47082
AP,0.510775,0.451903,0.4889


In [149]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder)

In [150]:
cat_features = ["poutcome"]

num_features =  ["campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

drop_transformer = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [151]:
logit_clf = make_pipeline(drop_transformer, LogisticRegression(penalty="none", class_weight="balanced"))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.784502,0.785152,0.787219
TPR,0.669923,0.679245,0.659483
bACC,0.727212,0.732199,0.723351
ROC,0.780472,0.775708,0.781589
REC,0.669923,0.679245,0.659483
PRE,0.283011,0.286364,0.282418
F1,0.397919,0.402878,0.395477
AP,0.432919,0.432338,0.43287


In [152]:
rbf_clf = Pipeline([
    ("drop_transformer", drop_transformer),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.763332,0.767191,0.767926
TPR,0.684069,0.69407,0.676724
bACC,0.723701,0.730631,0.722325
ROC,0.787776,0.788972,0.787742
REC,0.684069,0.69407,0.676724
PRE,0.268473,0.27452,0.270224
F1,0.385609,0.39343,0.386224
AP,0.434689,0.436209,0.442937


## Drop Everything Except Client Data

In [153]:
drop_features = ["contact",
                 "month",
                 "day_of_week",
                 "duration",
                 "campaign",
                 "pdays",
                 "previous",
                 "poutcome",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [156]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=drop_encoder,
                                                                                                 to_numpy=True)

In [157]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.75495,0.757441,0.753421
TPR,0.699225,0.699461,0.677802
bACC,0.727088,0.728451,0.715611
ROC,0.78616,0.783439,0.779924
REC,0.699225,0.699461,0.677802
PRE,0.265949,0.26794,0.258741
F1,0.385336,0.387458,0.374516
AP,0.387902,0.364424,0.379107


In [158]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set, bank_test_set, preprocessor=drop_encoder)

In [161]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan"]

num_features =  ["age"]

drop_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [162]:
logit_clf = make_pipeline(drop_encoder, LogisticRegression(penalty="none", class_weight="balanced"))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.551726,0.548238,0.545156
TPR,0.676659,0.669811,0.689655
bACC,0.614192,0.609025,0.617406
ROC,0.664866,0.662443,0.664856
REC,0.676659,0.669811,0.689655
PRE,0.160836,0.158381,0.161453
F1,0.259897,0.256186,0.261652
AP,0.217875,0.202102,0.219732


In [168]:
rbf_clf = Pipeline([
    ("drop_encoder", drop_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.795364,0.797639,0.799398
TPR,0.667228,0.660377,0.649784
bACC,0.731296,0.729008,0.724591
ROC,0.781569,0.780268,0.778715
REC,0.667228,0.660377,0.649784
PRE,0.292787,0.292887,0.291445
F1,0.406985,0.405797,0.402402
AP,0.370085,0.357364,0.37646


## Impute Missing Values

### Use Most Frequent Values

In [183]:
freq_features = ["job", "marital", "education", "default", "housing", "loan"]

freq_imputer = ColumnTransformer([
    ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent"), freq_features)
], remainder="passthrough")

freq_transformer = make_pipeline(basic_encoder, freq_imputer)

In [184]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=freq_transformer,
                                                                                                 is_numpy=True)

In [185]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.853868,0.853062,0.858922
TPR,0.648366,0.634771,0.637931
bACC,0.751117,0.743916,0.748426
ROC,0.81378,0.800544,0.803428
REC,0.648366,0.634771,0.637931
PRE,0.360352,0.354135,0.364757
F1,0.463241,0.454633,0.464132
AP,0.492063,0.447945,0.470189


In [186]:
# Select "job", "marital", "education"
cat_features = [0,1,2]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [5,10,11,12,14,15,16,17,18]

freq_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [187]:
logit_clf = make_pipeline(freq_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.785058,0.784297,0.791325
TPR,0.673964,0.673854,0.662716
bACC,0.729511,0.729076,0.72702
ROC,0.781464,0.779181,0.779768
REC,0.673964,0.673854,0.662716
PRE,0.284759,0.28393,0.287383
F1,0.40036,0.399521,0.400913
AP,0.418721,0.412101,0.427253


In [188]:
rbf_clf = Pipeline([
    ("freq_encoder", freq_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.786383,0.788915,0.790914
TPR,0.680701,0.67655,0.667026
bACC,0.733542,0.732733,0.72897
ROC,0.781372,0.78313,0.782486
REC,0.680701,0.67655,0.667026
PRE,0.288056,0.289171,0.288309
F1,0.404807,0.405165,0.402602
AP,0.426665,0.415455,0.43093


### Estimate Missing Values 

In [191]:
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "campaign", "pdays", "previous", "poutcome"]

ite_imputer = ColumnTransformer([
    ("ite_imputer",
     make_pipeline(
         IterativeImputer(max_iter=100, missing_values=-1, initial_strategy="most_frequent", random_state=42),
         FunctionTransformer(np.round)
     ),
     ite_features),
], remainder="passthrough")

ite_transformer = make_pipeline(basic_encoder, ite_imputer)

In [192]:
X_train, y_train, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test = transform_dataset(bank_train_set,
                                                                                                 bank_test_set,
                                                                                                 preprocessor=ite_transformer,
                                                                                                 is_numpy=True)

In [193]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
benchmark_dataset(cat_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.863148,0.859904,0.867406
TPR,0.660155,0.626685,0.632543
bACC,0.761652,0.743294,0.749974
ROC,0.847201,0.800226,0.810601
REC,0.660155,0.626685,0.632543
PRE,0.379845,0.36215,0.377249
F1,0.482224,0.459033,0.472625
AP,0.53641,0.44335,0.475045


In [198]:
bank_train_set.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [197]:
X_ttrain[0]

array([ 5.7000e+01,  0.0000e+00,  1.0000e+00,  6.0000e+00,  0.0000e+00,
        1.0000e+00,  0.0000e+00,  0.0000e+00,  3.0000e+00,  1.0000e+01,
        0.0000e+00,  0.0000e+00,  8.0000e+00,  2.0000e+00,  1.4000e+00,
        9.3444e+01, -3.6100e+01,  4.9640e+00,  5.2281e+03])

In [206]:
# Select "job", "marital", "education"
cat_features = [1,2,3]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [0,10,11,12,14,15,16,17,18]

ite_encoder = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

In [207]:
ite_encoder.fit_transform(X_ttrain)[0]

array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.62021194, -0.35212997, -0.1876231 ,
        0.68445666,  0.84058086, -0.22565417,  0.95114048,  0.77595774,
        0.84715918,  0.        ,  1.        ,  0.        ,  0.        ,
        3.        , 10.        ,  2.        ])

In [208]:
logit_clf = make_pipeline(ite_encoder, LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000))
logit_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(logit_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

ValueError: Found unknown categories [11.0] in column 0 during transform

In [123]:
rbf_clf = Pipeline([
    ("freq_encoder", freq_encoder),
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_clf.fit(X_ttrain, y_ttrain)
benchmark_dataset(rbf_clf, X_ttrain, y_ttrain, X_validate, y_validate, X_test, y_test)

Unnamed: 0,Train,Validate,Test
TNR,0.770688,0.772152,0.773125
TPR,0.687774,0.690027,0.673491
bACC,0.729231,0.731089,0.723308
ROC,0.783815,0.785399,0.7836
REC,0.687774,0.690027,0.673491
PRE,0.275797,0.277657,0.273763
F1,0.393714,0.395978,0.389287
AP,0.431321,0.424879,0.429063


## Feature Engineering `pdays`

## Mean Encoding

## Cyclic Encoding