# Data Preparation and Feature Engineering

In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.fixes import loguniform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Import

In [4]:
def import_dataset(filename):
    """
    Import the dataset from the path.

    Parameters
    ----------
        filename : str
            filename with path

    Returns
    -------
        data : DataFrame

    Examples
    --------
        bank_mkt = import_dataset("../data/BankMarketing.csv")
    """
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    bank_mkt["month"] = bank_mkt["month"].replace(month_map)
    # `day_of_week` will be encoded to the corresponding number, e.g. "wed" -> 3
    dow_map = {"mon":1,
               "tue":2,
               "wed":3,
               "thu":4,
               "fri":5}
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].replace(dow_map)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "Int64",
                                      "day_of_week": "Int64",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop 12 duplicated rows
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder ordinal categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    return bank_mkt

In [5]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

## Partition

In [7]:
def split_dataset(data, preprocessor, random_state=82):
    """
    Split dataset into train, test and validation sets using preprocessor.
    Because the random state of validation set is not specified, the validation set will be different each time when the function is called.

    Parameters
    ----------
        data : DataFrame

        preprocessor : Pipeline

    Returns
    -------
        datasets : tuple

    Examples
    --------
        from sklearn.preprocessing import OrdinalEncoder
        data = import_dataset("../data/BankMarketing.csv").interpolate(method="pad").loc[:, ["job", "education", "y"]]
        # To unpack all train, test, and validation sets 
        X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack train and test sets.
        X_train, y_train, X_test, y_test, *other_sets = split_dataset(data, OrdinalEncoder())
        # To unpack test and validation set
        *other_sets, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack only train set.
        X_train, y_train, *other_sets = split_dataset(data, OneHotEncoder())
    """
    train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
    for train_index, test_index in train_test_split.split(data.drop("y", axis=1), data["y"]):
        train_set = data.loc[train_index]
        test_set = data.loc[test_index]

    y_train = train_set["y"].astype("int").to_numpy()
    y_test = test_set["y"].astype("int").to_numpy()
    X_train = preprocessor.fit_transform(train_set, y_train)
    X_test = preprocessor.transform(test_set)
        
    train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    for ttrain_index, validate_index in train_validate_split.split(X_train, y_train):
        ttrain_set = train_set.loc[ttrain_index]
        validate_set = train_set.loc[validate_index]
    
    y_ttrain = ttrain_set["y"].astype("int").to_numpy()
    y_validate = validate_set["y"].astype("int").to_numpy()
    X_ttrain = preprocessor.fit_transform(ttrain_set, y_ttrain)
    X_validate = preprocessor.transform(validate_set)
    
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        X_ttrain = X_ttrain.to_numpy()
        X_validate = X_validate.to_numpy()

    return (X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate)

## Baseline

In [8]:
def benchmark(data, preprocessor, clf):
    """
    Benchmark preprocessor and clf's performance on train, validation and test sets. 
    All the data transformation should be handled by preprocessor and estimation should be handled by clf.
    
    Parameters
    ----------
        data : DataFrame
        
        preprocessor : Pipeline
        
        clf : estimator
        
        name : str, default = None
        
        compare_to: DataFrame, default = None
        
    """
    X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, preprocessor)
    X_sets = [X_ttrain, X_validate, X_test]
    y_sets = [y_ttrain, y_validate, y_test]
    
    metric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "AP"]
    set_names = ["Train", "Validate", "Test"]
    metric_df = pd.DataFrame(index=metric_names, columns=set_names)
    
    try:
        clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
    except (ValueError, TypeError):
        clf.fit(X_ttrain, y_ttrain)
        
    for name, X, y in zip(set_names, X_sets, y_sets):
        # Re-fit model on train set before test set evaluation except CatBoost
        if name == "Test":
            try:
                clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
            except (ValueError, TypeError):
                clf.fit(X_train, y_train)
                
        y_pred = clf.predict(X)
        
        try:
            y_score = clf.decision_function(X)
        except AttributeError:
            y_score = clf.predict_proba(X)[:, 1]
            
        metrics = [recall_score(y, y_pred, pos_label=0),
                   recall_score(y, y_pred),
                   balanced_accuracy_score(y, y_pred),
                   roc_auc_score(y, y_score),
                   recall_score(y, y_pred),
                   precision_score(y, y_pred),
                   average_precision_score(y, y_score)]
        metric_df[name] = metrics
        
    return metric_df

In [47]:
def cat_encode(X,
               drop=["duration", "y"],
               cut=None,
               cyclic=None,
               target=None,
               external=None):
    """
    Encode and transform categorical data into numerical values.
    
    Parameters
    ----------
        X : DataFrame
        
        drop : list, default = ["duration", "y"]
        
        cut : list
        
        cyclic : list
        
        target : list
        
        external : list
    
    Returns
    -------
        X : DataFrame

    Examples
    --------
    bank_mkt = import_dataset("../data/BankMarketing.csv")
    X = cat_encode(bank_mkt)
    """
    X = X.copy()
    
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("Int64")
    
    if cut != None:
        if "pdays" in cut:
            X["pdays"] = X["pdays"].fillna(-1)
            # Clients who have been contacted but do not have pdays record
            X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
            # Cut pdays into categories
            X["pdays"] = pd.cut(X["pdays"], [0, 3, 5, 10, 15, 30, 1000], labels=[3, 5, 10, 15, 30, 1000], include_lowest=True).astype("Int64")
    else:
        # Fill missing values in pdays as 999
        X["pdays"] = X["pdays"].fillna(999)
    
    if cyclic != None:
        if "month" in cyclic:
            X['month_sin'] = np.sin(2 * np.pi * X["month"]/12)
            X['month_cos'] = np.cos(2 * np.pi * X["month"]/12)
            X = X.drop("month", axis=1)
        if "day_of_week" in cyclic:
            X["day_of_week"] = X["day_of_week"].cat.codes
            X['day_sin'] = np.sin(2 * np.pi * X["day_of_week"]/5)
            X['day_cos'] = np.cos(2 * np.pi * X["day_of_week"]/5)
            X = X.drop("day_of_week", axis=1)
    
    if external != None:
        if "year" in external:
            X["year"]=2008
            X.loc[27682:39118, "year"] = 2009
            X.loc[39118:, "year"] = 2010
        if "date" in external:
            X["year"]=2008
            X.loc[27682:39118, "year"] = 2009
            X.loc[39118:, "year"] = 2010
            X["date"] = pd.to_datetime(X[["month", "year"]].assign(day=1)).dt.strftime("%m/%Y").astype("str")
            
    # Transform target encoded feature as str
    if target != None:
        X[target] = X[target].astype("str")
        
    # Drop features
    X = X.drop(drop, axis=1)
    
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    
    # Fill missing values as -1
    X = X.fillna(-1)
    
    return X

In [20]:
cat_encoder = FunctionTransformer(cat_encode)
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_baseline = benchmark(bank_mkt, cat_encoder, cat_clf)
cat_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.865201,0.862641,0.861932
TPR,0.657797,0.630728,0.639009
bACC,0.761499,0.746684,0.75047
ROC,0.845568,0.797931,0.801313
REC,0.657797,0.630728,0.639009
PRE,0.382566,0.368214,0.370162
AP,0.532276,0.467858,0.44578


In [11]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features =  ["age",
                 "campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cat_transformer = make_pipeline(cat_encoder, hot_scaler)

In [12]:
logit_clf = LogisticRegression(penalty="none", class_weight="balanced", max_iter=1000)
logit_baseline = benchmark(bank_mkt, cat_transformer, logit_clf)
logit_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.802036,0.795758,0.807608
TPR,0.66386,0.663073,0.665948
bACC,0.732948,0.729415,0.736778
ROC,0.7855,0.784058,0.78197
REC,0.66386,0.663073,0.665948
PRE,0.298636,0.291815,0.305336
AP,0.438799,0.458944,0.425859


In [13]:
rbf_clf = Pipeline([
    ("rbf", RBFSampler(gamma=0.001, random_state=42)),
    ("svm", LinearSVC(C=1, loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000))
])
rbf_baseline = benchmark(bank_mkt, cat_transformer, rbf_clf)
rbf_baseline

Unnamed: 0,Train,Validate,Test
TNR,0.792627,0.795074,0.797345
TPR,0.67127,0.671159,0.678879
bACC,0.731948,0.733116,0.738112
ROC,0.787844,0.792044,0.785194
REC,0.67127,0.671159,0.678879
PRE,0.291289,0.293632,0.298437
AP,0.446463,0.45179,0.425436


In [14]:
sum_baseline = pd.concat([cat_baseline, logit_baseline, rbf_baseline], keys=["CatBoost", "Logistic", "RBF SVM"], axis=1)
sum_baseline

Unnamed: 0_level_0,CatBoost,CatBoost,CatBoost,Logistic,Logistic,Logistic,RBF SVM,RBF SVM,RBF SVM
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test,Train,Validate,Test
TNR,0.854381,0.857338,0.852901,0.802036,0.795758,0.807608,0.792627,0.795074,0.797345
TPR,0.661165,0.65903,0.649784,0.66386,0.663073,0.665948,0.67127,0.671159,0.678879
bACC,0.757773,0.758184,0.751343,0.732948,0.729415,0.736778,0.731948,0.733116,0.738112
ROC,0.836404,0.812978,0.802966,0.7855,0.784058,0.78197,0.787844,0.792044,0.785194
REC,0.661165,0.65903,0.649784,0.66386,0.663073,0.665948,0.67127,0.671159,0.678879
PRE,0.365686,0.369615,0.359356,0.298636,0.291815,0.305336,0.291289,0.293632,0.298437
AP,0.52885,0.483317,0.445645,0.438799,0.458944,0.425859,0.446463,0.45179,0.425436


## Drop Client Data

In [100]:
drop_features = ["age",
                 "job",
                 "marital",
                 "education",
                 "housing",
                 "loan",
                 "default",
                 "duration",
                 "y"]

drop_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [101]:
cat_drop = benchmark(bank_mkt, drop_encoder, cat_clf)
pd.concat([cat_baseline, cat_drop], keys=["Catboost Baseline", "Catboost Drop"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Drop,Catboost Drop,Catboost Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.946782,0.943256,0.903633,0.945525,0.94893,0.956293
TPR,0.327976,0.302381,0.333596,0.3125,0.314286,0.192202
bACC,0.637379,0.622818,0.618615,0.629013,0.631608,0.574247
ROC,0.74952,0.680977,0.680787,0.723606,0.70185,0.657389
REC,0.327976,0.302381,0.333596,0.3125,0.314286,0.192202
PRE,0.295601,0.266247,0.606734,0.280899,0.295302,0.662144
AP,0.267692,0.216556,0.499409,0.245246,0.244555,0.48096


In [102]:
cat_features = ["poutcome"]

num_features =  ["campaign",
                 "pdays",
                 "previous",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

drop_transformer = make_pipeline(drop_encoder, hot_scaler)

In [103]:
logit_drop = benchmark(bank_mkt, drop_transformer, logit_clf)
pd.concat([logit_baseline, logit_drop], keys=["Logistic Baseline", "Logistic Drop"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Drop,Logistic Drop,Logistic Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.749838,0.750649,0.555731,0.774238,0.780318,0.502019
TPR,0.507143,0.466667,0.315479,0.463095,0.480952,0.32887
bACC,0.62849,0.608658,0.435605,0.618667,0.630635,0.415444
ROC,0.68428,0.67155,0.388754,0.671071,0.692944,0.38884
REC,0.507143,0.466667,0.315479,0.463095,0.480952,0.32887
PRE,0.121298,0.113033,0.240396,0.122558,0.129737,0.227397
AP,0.195585,0.184663,0.305101,0.17914,0.185115,0.303314


In [104]:
rbf_drop = benchmark(bank_mkt, drop_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_drop], keys=["RBF Baseline", "RBF Drop"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Drop,RBF Drop,RBF Drop
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.731355,0.727626,0.48271,0.699376,0.696984,0.446551
TPR,0.519048,0.495238,0.433635,0.573214,0.495238,0.459236
bACC,0.625202,0.611432,0.458173,0.636295,0.596111,0.452893
ROC,0.682524,0.657794,0.457365,0.680859,0.650337,0.480152
REC,0.519048,0.495238,0.433635,0.573214,0.495238,0.459236
PRE,0.116267,0.110169,0.271986,0.114916,0.100144,0.26997
AP,0.190419,0.185811,0.297581,0.182002,0.162034,0.3493


## Drop Everything Except Client Data

In [105]:
drop_features = ["contact",
                 "month",
                 "day_of_week",
                 "duration",
                 "campaign",
                 "pdays",
                 "previous",
                 "poutcome",
                 "emp.var.rate",
                 "cons.price.idx",
                 "cons.conf.idx",
                 "euribor3m",
                 "duration",
                 "y"]

client_encoder = FunctionTransformer(cat_encode, kw_args={"drop": drop_features})

In [106]:
cat_client = benchmark(bank_mkt, client_encoder, cat_clf)
pd.concat([cat_baseline, cat_client], keys=["Catboost Baseline", "Catboost Client"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Client,Catboost Client,Catboost Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.946782,0.943256,0.903633,0.924206,0.9238,0.262243
TPR,0.327976,0.302381,0.333596,0.284524,0.32619,0.865301
bACC,0.637379,0.622818,0.618615,0.604365,0.624995,0.563772
ROC,0.74952,0.680977,0.680787,0.69606,0.68097,0.594448
REC,0.327976,0.302381,0.333596,0.284524,0.32619,0.865301
PRE,0.295601,0.266247,0.606734,0.203578,0.2257,0.343281
AP,0.267692,0.216556,0.499409,0.177429,0.184081,0.378059


In [107]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan"]

num_features =  ["age"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

client_transformer = make_pipeline(client_encoder, hot_scaler)


In [108]:
logit_client = benchmark(bank_mkt, client_transformer, logit_clf)
pd.concat([logit_baseline, logit_client], keys=["Logistic Baseline", "Logistic Client"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Client,Logistic Client,Logistic Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.749838,0.750649,0.555731,0.495379,0.481031,0.339652
TPR,0.507143,0.466667,0.315479,0.6375,0.628571,0.780228
bACC,0.62849,0.608658,0.435605,0.56644,0.554801,0.55994
ROC,0.68428,0.67155,0.388754,0.591552,0.58295,0.574904
REC,0.507143,0.466667,0.315479,0.6375,0.628571,0.780228
PRE,0.121298,0.113033,0.240396,0.07921,0.07619,0.344942
AP,0.195585,0.184663,0.305101,0.089746,0.088462,0.345087


In [109]:
rbf_client = benchmark(bank_mkt, client_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_client], keys=["RBF Baseline", "RBF Client"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Client,RBF Client,RBF Client
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.731355,0.727626,0.48271,0.783358,0.781615,0.32052
TPR,0.519048,0.495238,0.433635,0.454762,0.447619,0.386373
bACC,0.625202,0.611432,0.458173,0.61906,0.614617,0.353446
ROC,0.682524,0.657794,0.457365,0.666659,0.653124,0.323596
REC,0.519048,0.495238,0.433635,0.454762,0.447619,0.386373
PRE,0.116267,0.110169,0.271986,0.125061,0.122476,0.202185
AP,0.190419,0.185811,0.297581,0.144859,0.139881,0.225061


## Impute Missing Values

### Use Most Frequent Values

In [110]:
freq_features = ["job", "marital", "education", "default", "housing", "loan"]

freq_imputer = ColumnTransformer([
    ("freq_imputer", SimpleImputer(missing_values=-1, strategy="most_frequent"), freq_features)
], remainder="passthrough")

freq_encoder = make_pipeline(cat_encoder, freq_imputer)

In [111]:
cat_freq = benchmark(bank_mkt, freq_encoder, cat_clf)
pd.concat([cat_baseline, cat_freq], keys=["Catboost Baseline", "Catboost Frequent"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Frequent,Catboost Frequent,Catboost Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.946782,0.943256,0.903633,0.953753,0.952497,0.873793
TPR,0.327976,0.302381,0.333596,0.32619,0.290476,0.348169
bACC,0.637379,0.622818,0.618615,0.639972,0.621486,0.610981
ROC,0.74952,0.680977,0.680787,0.764044,0.697451,0.693952
REC,0.327976,0.302381,0.333596,0.32619,0.290476,0.348169
PRE,0.295601,0.266247,0.606734,0.324452,0.293976,0.551466
AP,0.267692,0.216556,0.499409,0.287825,0.204977,0.497987


In [112]:
# Select "job", "marital", "education"
cat_features = [0,1,2]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [5,10,11,12,14,15,16,17,18]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

freq_transformer = make_pipeline(cat_encoder, freq_imputer, hot_scaler)


In [113]:
logit_freq = benchmark(bank_mkt, freq_transformer, logit_clf)
pd.concat([logit_baseline, logit_freq], keys=["Logistic Baseline", "Logistic Frequent"], axis=1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Frequent,Logistic Frequent,Logistic Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.749838,0.750649,0.555731,0.769374,0.776265,0.541689
TPR,0.507143,0.466667,0.315479,0.484524,0.478571,0.317448
bACC,0.62849,0.608658,0.435605,0.626949,0.627418,0.429568
ROC,0.68428,0.67155,0.388754,0.6811,0.685639,0.388679
REC,0.507143,0.466667,0.315479,0.484524,0.478571,0.317448
PRE,0.121298,0.113033,0.240396,0.125154,0.127135,0.235879
AP,0.195585,0.184663,0.305101,0.190328,0.190995,0.304329


In [114]:
rbf_freq = benchmark(bank_mkt, freq_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_freq], keys=["RBF Baseline", "RBF Frequent"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Frequent,RBF Frequent,RBF Frequent
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.731355,0.727626,0.48271,0.74088,0.736057,0.39161
TPR,0.519048,0.495238,0.433635,0.499405,0.538095,0.581331
bACC,0.625202,0.611432,0.458173,0.620143,0.637076,0.48647
ROC,0.682524,0.657794,0.457365,0.67282,0.696074,0.527953
REC,0.519048,0.495238,0.433635,0.499405,0.538095,0.581331
PRE,0.116267,0.110169,0.271986,0.116012,0.121899,0.298665
AP,0.190419,0.185811,0.297581,0.185224,0.19978,0.399867


### Estimate Missing Values 

In [115]:
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan", "contact", "campaign", "month", "day_of_week", "pdays", "previous"]

ite_imputer = ColumnTransformer([
    ("ite_imputer",
     make_pipeline(
         IterativeImputer(max_iter=100, missing_values=-1, initial_strategy="most_frequent", random_state=42),
         FunctionTransformer(np.round)
     ),
     ite_features),
], remainder="passthrough")

ite_encoder = make_pipeline(cat_encoder, ite_imputer)

In [116]:
cat_ite = benchmark(bank_mkt, ite_encoder, cat_clf)
pd.concat([cat_baseline, cat_ite], keys=["Catboost Baseline", "Catboost Iterative"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Iterative,Catboost Iterative,Catboost Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.946782,0.943256,0.903633,0.950186,0.945687,0.725996
TPR,0.327976,0.302381,0.333596,0.295238,0.359524,0.554943
bACC,0.637379,0.622818,0.618615,0.622712,0.652606,0.64047
ROC,0.74952,0.680977,0.680787,0.721813,0.721429,0.707384
REC,0.327976,0.302381,0.333596,0.295238,0.359524,0.554943
PRE,0.295601,0.266247,0.606734,0.287536,0.3107,0.474411
AP,0.267692,0.216556,0.499409,0.245008,0.263722,0.490075


In [117]:
# Select "job", "marital", "education", "default", "housing", "loan", "poutcome"
cat_features = [1,2,3,4,5,13]

# Select "age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"
num_features = [0,8,9,10,12,14,15,16,17,18]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

ite_transformer = make_pipeline(cat_encoder, ite_imputer, hot_scaler)

In [118]:
logit_ite = benchmark(bank_mkt, ite_transformer, logit_clf)
pd.concat([logit_baseline, logit_ite], keys=["Logistic Baseline", "Logistic Iterative"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Iterative,Logistic Iterative,Logistic Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.749838,0.750649,0.555731,0.712751,0.700713,0.525364
TPR,0.507143,0.466667,0.315479,0.542857,0.545238,0.315085
bACC,0.62849,0.608658,0.435605,0.627804,0.622976,0.420224
ROC,0.68428,0.67155,0.388754,0.681547,0.670371,0.387802
REC,0.507143,0.466667,0.315479,0.542857,0.545238,0.315085
PRE,0.121298,0.113033,0.240396,0.114014,0.110361,0.228311
AP,0.195585,0.184663,0.305101,0.187467,0.189898,0.301043


In [119]:
rbf_ite = benchmark(bank_mkt, ite_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Iterative"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Iterative,RBF Iterative,RBF Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.731355,0.727626,0.48271,0.712468,0.712873,0.464279
TPR,0.519048,0.495238,0.433635,0.519048,0.564286,0.517133
bACC,0.625202,0.611432,0.458173,0.615758,0.638579,0.490706
ROC,0.682524,0.657794,0.457365,0.668423,0.690162,0.528076
REC,0.519048,0.495238,0.433635,0.519048,0.564286,0.517133
PRE,0.116267,0.110169,0.271986,0.109465,0.118028,0.300802
AP,0.190419,0.185811,0.297581,0.177775,0.233388,0.396487


## Feature Engineering `pdays`

In [32]:
cut_encoder = FunctionTransformer(cat_encode, kw_args={"cut": ["pdays"]})

In [33]:
cat_cut = benchmark(bank_mkt, cut_encoder, cat_clf)
pd.concat([cat_baseline, cat_cut], keys=["Catboost Baseline", "Catboost Cut"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Cut,Catboost Cut,Catboost Cut
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.888612,0.891576,0.020305,0.883267,0.887498,0.113706
TPR,0.554154,0.520337,0.992544,0.535226,0.535764,0.966449
bACC,0.721383,0.705957,0.506424,0.709247,0.711631,0.540077
ROC,0.822153,0.767514,0.704615,0.777547,0.771101,0.709818
REC,0.554154,0.520337,0.992544,0.535226,0.535764,0.966449
PRE,0.332912,0.324869,0.524631,0.31504,0.323181,0.542932
AP,0.417707,0.361904,0.730569,0.368836,0.347587,0.717697


In [34]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome",
                "pdays"]

num_features = ["age", "campaign", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cut_transformer = make_pipeline(cut_encoder, hot_scaler)

In [35]:
logit_cut = benchmark(bank_mkt, cut_transformer, logit_clf)
pd.concat([logit_baseline, logit_cut], keys=["Logistic Baseline", "Logistic Cut"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Cut,Logistic Cut,Logistic Cut
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.761401,0.764309,0.463959,0.763405,0.770356,0.498477
TPR,0.621101,0.615708,0.811743,0.617596,0.614306,0.795899
bACC,0.691251,0.690009,0.637851,0.6905,0.692331,0.647188
ROC,0.746054,0.733595,0.718543,0.745403,0.74158,0.71848
REC,0.621101,0.615708,0.811743,0.617596,0.614306,0.795899
PRE,0.207058,0.207565,0.622588,0.207514,0.211492,0.633531
AP,0.321821,0.312369,0.716084,0.321846,0.317145,0.715623


In [36]:
rbf_ite = benchmark(bank_mkt, ite_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Iterative"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Iterative,RBF Iterative,RBF Iterative
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.779157,0.777247,0.160406,0.770824,0.756856,0.165482
TPR,0.608833,0.586255,0.954334,0.609884,0.615708,0.962721
bACC,0.693995,0.681751,0.55737,0.690354,0.686282,0.564102
ROC,0.750144,0.735048,0.728434,0.746573,0.741733,0.720813
REC,0.608833,0.586255,0.954334,0.609884,0.615708,0.962721
PRE,0.216638,0.208791,0.553214,0.210705,0.202491,0.556873
AP,0.321448,0.299921,0.731069,0.312602,0.31922,0.697359


## Cyclic Encoding

In [37]:
cyclic_encoder = FunctionTransformer(cat_encode, kw_args={"cyclic":["month", "day_of_week"]})

In [38]:
cat_cyclic = benchmark(bank_mkt, cyclic_encoder, cat_clf)
pd.concat([cat_baseline, cat_cyclic], keys=["Catboost Baseline", "Catboost Cyclic"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Cyclic,Catboost Cyclic,Catboost Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.888612,0.891576,0.020305,0.884216,0.887639,0.017259
TPR,0.554154,0.520337,0.992544,0.547844,0.521739,0.996272
bACC,0.721383,0.705957,0.506424,0.71603,0.704689,0.506766
ROC,0.822153,0.767514,0.704615,0.791883,0.74853,0.732023
REC,0.554154,0.520337,0.992544,0.547844,0.521739,0.996272
PRE,0.332912,0.324869,0.524631,0.32187,0.317677,0.524791
AP,0.417707,0.361904,0.730569,0.387561,0.338399,0.724835


In [39]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features = ["age", "campaign", "pdays", "previous",
                "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
                "month_sin", "month_cos", "day_sin", "day_cos"]

hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

cyclic_transformer = make_pipeline(cyclic_encoder, hot_scaler)

In [40]:
logit_ite = benchmark(bank_mkt, cyclic_transformer, logit_clf)
pd.concat([logit_baseline, logit_ite], keys=["Logistic Baseline", "Logistic Cyclic"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Cyclic,Logistic Cyclic,Logistic Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.761401,0.764309,0.463959,0.775078,0.777809,0.007107
TPR,0.621101,0.615708,0.811743,0.616544,0.612903,0.996272
bACC,0.691251,0.690009,0.637851,0.695811,0.695356,0.501689
ROC,0.746054,0.733595,0.718543,0.749987,0.739776,0.632209
REC,0.621101,0.615708,0.811743,0.616544,0.612903,0.996272
PRE,0.207058,0.207565,0.622588,0.215669,0.216658,0.522228
AP,0.321821,0.312369,0.716084,0.322118,0.303164,0.652721


In [41]:
rbf_ite = benchmark(bank_mkt, cyclic_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_ite], keys=["RBF Baseline", "RBF Cyclic"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Cyclic,RBF Cyclic,RBF Cyclic
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.779157,0.777247,0.160406,0.777293,0.769793,0.0
TPR,0.608833,0.586255,0.954334,0.607431,0.638149,0.998136
bACC,0.693995,0.681751,0.55737,0.692362,0.703971,0.499068
ROC,0.750144,0.735048,0.728434,0.748003,0.747419,0.652762
REC,0.608833,0.586255,0.954334,0.607431,0.638149,0.998136
PRE,0.216638,0.208791,0.553214,0.214826,0.217495,0.520914
AP,0.321448,0.299921,0.731069,0.311819,0.344888,0.660708


## Mean Encoding

In [42]:
target_features = ["job", "marital", "education", "month", "day_of_week"] 

target_encoder = make_pipeline(FunctionTransformer(cat_encode, kw_args={"target": target_features}), TargetEncoder(cols=target_features))

In [43]:
cat_target = benchmark(bank_mkt, target_encoder, cat_clf)
pd.concat([cat_baseline, cat_target], keys=["Catboost Baseline", "Catboost Target"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Target,Catboost Target,Catboost Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.888612,0.891576,0.020305,0.889385,0.881733,0.207107
TPR,0.554154,0.520337,0.992544,0.558009,0.568022,0.941286
bACC,0.721383,0.705957,0.506424,0.723697,0.724877,0.574196
ROC,0.822153,0.767514,0.704615,0.823417,0.772176,0.704155
REC,0.554154,0.520337,0.992544,0.558009,0.568022,0.941286
PRE,0.332912,0.324869,0.524631,0.336007,0.32504,0.563931
AP,0.417707,0.361904,0.730569,0.424982,0.350502,0.712325


In [44]:
num_features = [0,1,2,3,8,9,11,14,15,16,17,18]

target_scaler = ColumnTransformer([
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

target_transformer = make_pipeline(target_encoder, target_scaler)

In [45]:
logit_target = benchmark(bank_mkt, target_transformer, logit_clf)
pd.concat([logit_baseline, logit_target], keys=["Logistic Baseline", "Logistic Target"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Target,Logistic Target,Logistic Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.761401,0.764309,0.463959,0.785451,0.779918,0.205076
TPR,0.621101,0.615708,0.811743,0.62075,0.58906,0.940354
bACC,0.691251,0.690009,0.637851,0.7031,0.684489,0.572715
ROC,0.746054,0.733595,0.718543,0.752667,0.728992,0.701907
REC,0.621101,0.615708,0.811743,0.62075,0.58906,0.940354
PRE,0.207058,0.207565,0.622588,0.224946,0.211587,0.563058
AP,0.321821,0.312369,0.716084,0.326746,0.314612,0.69027


In [46]:
rbf_target = benchmark(bank_mkt, target_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_target], keys=["RBF Baseline", "RBF Target"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Target,RBF Target,RBF Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.779157,0.777247,0.160406,0.795436,0.790184,0.118782
TPR,0.608833,0.586255,0.954334,0.609884,0.587658,0.972973
bACC,0.693995,0.681751,0.55737,0.70266,0.688921,0.545877
ROC,0.750144,0.735048,0.728434,0.755447,0.733983,0.642283
REC,0.608833,0.586255,0.954334,0.609884,0.587658,0.972973
PRE,0.216638,0.208791,0.553214,0.23022,0.219257,0.546025
AP,0.321448,0.299921,0.731069,0.328199,0.294682,0.622557


## Entity Embeddings

## External Infomation

CPI: http://www.ine.pt

In [128]:
def cat_encode(X,
               drop=["duration", "y"],
               cut=None,
               cyclic=None,
               target=None,
               external=None,
               fillna=True):
    """
    Encode and transform categorical data into numerical values.
    
    Parameters
    ----------
        X : DataFrame
        
        drop : list, default = ["duration", "y"]
        
        cut : list
        
        cyclic : list
        
        target : list
        
        external : list
        
        fillna : bool, default = True
    
    Returns
    -------
        X : DataFrame

    Examples
    --------
    bank_mkt = import_dataset("../data/BankMarketing.csv")
    X = cat_encode(bank_mkt)
    """
    X = X.copy()
    
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("Int64")
    
    if cut != None:
        if "pdays" in cut:
            X["pdays"] = X["pdays"].fillna(-1)
            # Clients who have been contacted but do not have pdays record
            X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
            # Cut pdays into categories
            X["pdays"] = pd.cut(X["pdays"], [0, 3, 5, 10, 15, 30, 1000], labels=[3, 5, 10, 15, 30, 1000], include_lowest=True).astype("Int64")
    else:
        # Fill missing values in pdays as 999
        X["pdays"] = X["pdays"].fillna(999)
    
    if cyclic != None:
        if "month" in cyclic:
            X['month_sin'] = np.sin(2 * np.pi * X["month"]/12)
            X['month_cos'] = np.cos(2 * np.pi * X["month"]/12)
            X = X.drop("month", axis=1)
        if "day_of_week" in cyclic:
            X["day_of_week"] = X["day_of_week"].cat.codes
            X['day_sin'] = np.sin(2 * np.pi * X["day_of_week"]/5)
            X['day_cos'] = np.cos(2 * np.pi * X["day_of_week"]/5)
            X = X.drop("day_of_week", axis=1)
    
    if external != None:
        if "year" in external:
            X["year"]=2008
            X.loc[27682:39118, "year"] = 2009
            X.loc[39118:, "year"] = 2010
        elif "date" in external:
            X["year"]=2008
            X.loc[27682:39118, "year"] = 2009
            X.loc[39118:, "year"] = 2010
            X["date"] = pd.to_datetime(X[["month", "year"]].assign(day=1)).dt.strftime("%m/%Y").astype("category")
            
    # Transform target encoded feature as str
    if target != None:
        X[target] = X[target].astype("str")
        
    # Drop features
    X = X.drop(drop, axis=1)
    
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    
    # Fill missing values as -1
    if fillna:
        X = X.fillna(-1)
    else:
        X = X.replace(pd.NA, np.nan)
    
    return X

In [163]:
drop_features = [
                 "marital",
                 "education",
                 "default",
                 "housing",
                 "loan",
                 "duration",
                 "y"]

target_features = ["age", "job", "pdays", "poutcome", "year", "date"] 

target_scaler = ColumnTransformer([
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

year_encoder = FunctionTransformer(cat_encode, kw_args= {"drop": drop_features,
                                                         "target": target_features,
                                                         "external":["date"],
                                                         "fillna": True})

target_encoder = make_pipeline(year_encoder, TargetEncoder(cols=target_features))

In [164]:
target_encoder.fit_transform(bank_mkt, bank_mkt["y"])

Unnamed: 0,age,job,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,year,date
0,0.113636,0.100000,1,5,1,1,0.092585,0,0.088324,1.1,93.994,-36.4,4.857,5191.0,0.048371,0.030920
1,0.095975,0.081422,1,5,1,1,0.092585,0,0.088324,1.1,93.994,-36.4,4.857,5191.0,0.048371,0.030920
2,0.092881,0.081422,1,5,1,1,0.092585,0,0.088324,1.1,93.994,-36.4,4.857,5191.0,0.048371,0.030920
3,0.072351,0.129667,1,5,1,1,0.092585,0,0.088324,1.1,93.994,-36.4,4.857,5191.0,0.048371,0.030920
4,0.113636,0.081422,1,5,1,1,0.092585,0,0.088324,1.1,93.994,-36.4,4.857,5191.0,0.048371,0.030920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41171,0.382353,0.252619,0,11,5,1,0.092585,0,0.088324,-1.1,94.767,-50.8,1.028,4963.6,0.521380,0.453125
41172,0.076699,0.068951,0,11,5,1,0.092585,0,0.088324,-1.1,94.767,-50.8,1.028,4963.6,0.521380,0.453125
41173,0.113636,0.252619,0,11,5,2,0.092585,0,0.088324,-1.1,94.767,-50.8,1.028,4963.6,0.521380,0.453125
41174,0.076162,0.108325,0,11,5,1,0.092585,0,0.088324,-1.1,94.767,-50.8,1.028,4963.6,0.521380,0.453125


In [200]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1,8])
cat_year = benchmark(bank_mkt, target_encoder, cat_clf)
pd.concat([cat_baseline, cat_year], keys=["Catboost Baseline", "Catboost Date"], axis=1)

Unnamed: 0_level_0,Catboost Baseline,Catboost Baseline,Catboost Baseline,Catboost Date,Catboost Date,Catboost Date
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.865201,0.862641,0.861932,0.867126,0.872562,0.862069
TPR,0.657797,0.630728,0.639009,0.646009,0.630728,0.634698
bACC,0.761499,0.746684,0.75047,0.756567,0.751645,0.748384
ROC,0.845568,0.797931,0.801313,0.824377,0.801862,0.799998
REC,0.657797,0.630728,0.639009,0.646009,0.630728,0.634698
PRE,0.382566,0.368214,0.370162,0.381692,0.38582,0.368817
AP,0.532276,0.467858,0.44578,0.510141,0.486803,0.448976


In [96]:
cat_features = ["job",
                "marital",
                "education",
                "default",
                "housing",
                "loan",
                "poutcome"]

num_features = ["age",
                "campaign", 
                "pdays",
                "previous",
                "emp.var.rate",
                "cons.price.idx",
                "cons.conf.idx",
                "euribor3m",
                "nr.employed"]
target_scaler = ColumnTransformer([
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")
                
hot_scaler = ColumnTransformer([
    ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
    ("scaler", StandardScaler(), num_features)
], remainder="passthrough")

target_transformer = make_pipeline(year_encoder, TargetEncoder(cols=target_features), hot_scaler)

In [97]:
logit_target = benchmark(bank_mkt, target_transformer, logit_clf)
pd.concat([logit_baseline, logit_target], keys=["Logistic Baseline", "Logistic Target"], axis=1)

Unnamed: 0_level_0,Logistic Baseline,Logistic Baseline,Logistic Baseline,Logistic Target,Logistic Target,Logistic Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.802036,0.795758,0.807608,0.802506,0.795587,0.807745
TPR,0.66386,0.663073,0.665948,0.666554,0.650943,0.665948
bACC,0.732948,0.729415,0.736778,0.73453,0.723265,0.736847
ROC,0.7855,0.784058,0.78197,0.788252,0.775852,0.781969
REC,0.66386,0.663073,0.665948,0.666554,0.650943,0.665948
PRE,0.298636,0.291815,0.305336,0.299985,0.287843,0.305487
AP,0.438799,0.458944,0.425859,0.445736,0.430392,0.425832


In [99]:
rbf_target = benchmark(bank_mkt, target_transformer, rbf_clf)
pd.concat([rbf_baseline, rbf_target], keys=["RBF Baseline", "RBF Target"], axis=1)

Unnamed: 0_level_0,RBF Baseline,RBF Baseline,RBF Baseline,RBF Target,RBF Target,RBF Target
Unnamed: 0_level_1,Train,Validate,Test,Train,Validate,Test
TNR,0.792627,0.795074,0.797345,0.790788,0.792508,0.799398
TPR,0.67127,0.671159,0.678879,0.67228,0.671159,0.676724
bACC,0.731948,0.733116,0.738112,0.731534,0.731833,0.738061
ROC,0.787844,0.792044,0.785194,0.791481,0.779615,0.781676
REC,0.67127,0.671159,0.678879,0.67228,0.671159,0.676724
PRE,0.291289,0.293632,0.298437,0.289779,0.291058,0.299904
AP,0.446463,0.45179,0.425436,0.452065,0.418888,0.425548


## Summary