# Ensemble

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and later
# removed the old names; use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Figure defaults: standard size at high resolution, bold left-aligned titles.
plt.rcParams.update({
    "figure.figsize": (6.4, 4.8),
    "figure.dpi": 300,
    "figure.titleweight": "bold",
    "axes.titleweight": "bold",
    "axes.titlepad": 10.0,
    "axes.titlelocation": "left",
})
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("svg")
import seaborn as sns
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Data Preparation

In [15]:
def import_dataset(filename):
    """
    Load the bank marketing CSV into a typed, de-duplicated DataFrame.

    The strings "unknown" and "nonexistent" are read as missing values,
    "yes"/"success" as True and "no"/"failure" as False. A `pdays` value of
    999 is treated as missing. Columns are then cast to nullable integer,
    nullable boolean or categorical dtypes, duplicate rows are dropped, and
    `education`, `month` and `day_of_week` become ordered categoricals.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed to `pandas.read_csv`.

    Returns
    -------
    pandas.DataFrame
        The cleaned data with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by `reorder_categories` when the categories observed in a
        column are not exactly the expected ones.
    """
    # Category orders, lowest to highest.
    education_order = ["illiterate", "basic.4y", "basic.6y", "basic.9y",
                       "high.school", "professional.course", "university.degree"]
    # Calendar order; "may" must sit between "apr" and "jun" so that ordered
    # comparisons and category codes follow the calendar.
    month_order = ["mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    day_order = ["mon", "tue", "wed", "thu", "fri"]

    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # Reorder categorical data into meaningful, ordered categories
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(education_order, ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(month_order, ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(day_order, ordered=True)
    return bank_mkt

In [16]:
# Load and clean the raw data; the path is relative to the notebook's working directory.
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [110]:
# Hold out 20% of the rows as a test set, stratified on the target `y`.
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]))
# split() returns row positions, so select with .iloc; .loc would only be
# equivalent while bank_mkt happens to carry a default 0..n-1 index.
bank_train_set = bank_mkt.iloc[train_index].reset_index(drop=True)
bank_test_set = bank_mkt.iloc[test_index].reset_index(drop=True)

In [180]:
# Features meant to be joined into a single interaction feature.
# Only commented-out code in tree_encode below refers to them, so they are currently unused.
combine_features = ["job", "education"]

# Name of the combined column: "job.education"
combine_name = ".".join(combine_features)

# Features with missing values that should be imputed with most freq value
# (consumed by freq_imputer below, which is not a step of tree_preprocessor)
freq_features = ["default", "housing", "loan"]

# Features with missing values that should be imputed by IterativeImputer
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan"]

# Features that should be mean encoded, given as column positions rather than names.
# NOTE(review): presumably positions in the array produced by ite_transformer
# (ite_features first, then the passthrough columns) — confirm which columns these are.
# target_features = ["month", "day_of_week"]
target_features = [1,2,3,8,9,10]


def tree_encode(X):
    """
    Encode the bank-marketing features as integers for tree-based models.

    Works on a copy of `X`; the input frame is left untouched.

    - `pdays`: rows with no `pdays` value but a known `poutcome` get 999.
    - `month`: mapped to its calendar number, e.g. "mar" -> 3.
    - `job`, `education`, `marital`, `day_of_week`, `contact`: replaced by
      their pandas category codes (a missing value has code -1).
    - `default`, `housing`, `loan`, `poutcome`: converted to nullable integers.
    - Every remaining missing value is filled with -1.

    Binning of `pdays` and `age` and a job/education interaction feature were
    tried earlier and are not applied.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed above, with the
        categorical and boolean dtypes the encoding steps rely on.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of `X`.

    Raises
    ------
    ValueError
        If `month` contains a value that is not one of "mar" .. "dec".
    """
    X = X.copy()
    # Previous contact with a known outcome but no recorded gap: use the sentinel 999.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    unknown_months = set(X["month"].dropna().unique()) - set(month_map)
    if unknown_months:
        raise ValueError("Unexpected month value(s): " + ", ".join(sorted(map(str, unknown_months))))
    # Map via object dtype: Series.replace on a categorical column is deprecated
    # in pandas, while a plain dictionary lookup behaves the same on every version.
    X["month"] = X["month"].astype(object).map(month_map).astype("Int64")
    # Other categorical features will be coded as their order in the pandas categorical index
    cat_features = ["job", "education", "marital", "day_of_week", "contact"]
    bool_features = ["default", "housing", "loan", "poutcome"]
    for feature in cat_features:
        X[feature] = X[feature].cat.codes.astype("Int64")
    X[bool_features] = X[bool_features].astype("Int64")
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

# Wrap tree_encode so it can serve as the first pipeline step.
tree_encoder = FunctionTransformer(func=tree_encode)

# freq_features = [1, 2, 3, 4, 5, 6]
# Most-frequent imputation for the freq_features columns (-1 marks a missing value);
# all other columns pass through. Built here but not used in tree_preprocessor.
freq_imputer = ColumnTransformer(
    transformers=[
        ("freq_imputer",
         SimpleImputer(missing_values=-1, strategy="most_frequent"),
         freq_features),
    ],
    remainder="passthrough",
)

# Model-based imputation of -1 values, followed by rounding of the imputed output.
ite_imputer = Pipeline(steps=[
    ("ite_imputer",
     IterativeImputer(max_iter=100,
                      missing_values=-1,
                      initial_strategy="most_frequent",
                      random_state=42)),
    ("ite_round", FunctionTransformer(func=np.round)),
])

# Apply the iterative imputer to ite_features only; the remaining columns pass through.
ite_transformer = ColumnTransformer(
    transformers=[("ite_imputer", ite_imputer, ite_features)],
    remainder="passthrough",
)

target_encoder = TargetEncoder(cols=target_features, return_df=False)

# Full preprocessing chain: integer encoding -> imputation -> target encoding.
tree_preprocessor = Pipeline(steps=[
    ("tree_encoder", tree_encoder),
    ("ite_transformer", ite_transformer),
    ("target_encoder", target_encoder),
])

# Training matrices. `duration` and `y` are removed from the features.
# NOTE(review): the call duration, not y, is handed to fit_transform as the target
# the pipeline is fitted against — confirm this is intended.
y_train = bank_train_set["y"].astype("int").to_numpy()
X_duration = bank_train_set["duration"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(
    bank_train_set.drop(columns=["duration", "y"]),
    X_duration,
)
# Test matrices, transformed with the preprocessor fitted on the training set.
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set.drop(columns=["duration", "y"]))

  elif pd.api.types.is_categorical(cols):


In [167]:
# NOTE(review): neither `rbf_sgd_clf` nor `clf_metric` is defined anywhere in the
# visible notebook, so this cell fails after Restart & Run All — confirm where they
# come from. Presumably `rbf_sgd_clf` is a pipeline with steps named "rbf" and "svm".
rbf_sgd_tuned = rbf_sgd_clf.set_params(rbf__gamma=0.0008, svm__alpha=1e-6)
clf_metric(rbf_sgd_tuned)

Unnamed: 0,Train,Validation,Test
ACC,0.696395,0.698057,0.884896
bACC,0.709088,0.710713,0.698071
ROC,0.787826,0.783831,0.793473
REC,0.725474,0.727043,0.456897
PRE,0.235691,0.238153,0.488479
F1,0.353708,0.356394,0.47216
AP,0.43835,0.431853,0.445343


## XGBoost

In [181]:
# Score a class-weighted XGBoost model: 5-fold cross-validation on the training
# set (Train / Validation columns) plus the hold-out test set (Test column).
matric_names = ["ACC", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
cv_metric = ["accuracy", "balanced_accuracy", "roc_auc", "recall", "precision", "f1", "average_precision"]
set_names = ["Train", "Validation", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

xgb_clf = XGBClassifier(max_depth=2, gamma=1, min_child_weight=1, scale_pos_weight=8)
# Fit on the full training set; this fitted model produces the test-set predictions below.
fit = xgb_clf.fit(X_train, y_train)
cv_score = cross_validate(xgb_clf, X_train, y_train, scoring=cv_metric, cv=5, n_jobs=-1, return_train_score=True)
cv_mean = pd.DataFrame(cv_score).mean()
# Result keys are "train_<scorer>" / "test_<scorer>", in the same order as matric_names.
train_mean = cv_mean[["train_" + metric for metric in cv_metric]].to_numpy()
validation_mean = cv_mean[["test_" + metric for metric in cv_metric]].to_numpy()

y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)[:,1]
# Every test metric is computed against y_test. The accuracy previously used a bare
# `y`, which is undefined on a fresh kernel unless it leaks in from another cell.
test_score = [accuracy_score(y_test, y_pred),
              balanced_accuracy_score(y_test, y_pred),
              roc_auc_score(y_test, y_score),
              recall_score(y_test, y_pred),
              precision_score(y_test, y_pred),
              f1_score(y_test, y_pred),
              average_precision_score(y_test, y_score)]

matric_df.loc[:, "Train"] = train_mean
matric_df.loc[:, "Validation"] = validation_mean
matric_df.loc[:, "Test"] = test_score

matric_df

Unnamed: 0,Train,Validation,Test
ACC,0.836164,0.83391,0.833536
bACC,0.75453,0.749264,0.749564
ROC,0.820916,0.802531,0.803782
REC,0.649151,0.639997,0.641164
PRE,0.370499,0.364803,0.36436
F1,0.471709,0.464696,0.464662
AP,0.492627,0.468931,0.463528


## From Scratch

In [442]:
# Re-split with a different seed: 20% of the rows held out as a test set, stratified on `y`.
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]))
# split() returns row positions, so select with .iloc; .loc would only be
# equivalent while bank_mkt happens to carry a default 0..n-1 index.
bank_train_set = bank_mkt.iloc[train_index].reset_index(drop=True)
bank_test_set = bank_mkt.iloc[test_index].reset_index(drop=True)

In [460]:
# Features meant to be joined into a single interaction feature.
# Only commented-out code in tree_encode below refers to them, so they are currently unused.
combine_features = ["job", "education"]

# Name of the combined column: "job.education"
combine_name = ".".join(combine_features)

# Features with missing values that should be imputed with most freq value
# NOTE(review): disabled here, yet freq_imputer further down in this cell still reads
# `freq_features`, so it relies on the value assigned in the earlier cell.
# freq_features = ["loan"]

# Features that should be mean encoded
# (disabled alternatives; the pipeline below names the encoded columns directly)
# target_features = ["month", "day_of_week"]
# target_features = [7]

# Columns removed by tree_encode before modelling, including the target `y`.
drop_features = ["age", "job", "marital", "education", "housing", "loan", "default", "duration", "y"]
#drop_features = ["default", "duration", "y"]


def tree_encode(X):
    """
    Encode categorical and boolean columns as integers and drop unused columns.

    Categorical columns are replaced by their pandas category codes (a missing
    value has code -1), boolean columns are converted to nullable integers and
    all other columns are kept as they are. The columns named in the
    module-level `drop_features` list are then removed and every remaining
    missing value is filled with -1.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column listed in `drop_features`.

    Returns
    -------
    pandas.DataFrame
        New encoded frame; `X` itself is not modified.
    """
    def _encode_column(column):
        # An isinstance check on the dtype replaces is_categorical_dtype,
        # which pandas has deprecated.
        if isinstance(column.dtype, pd.CategoricalDtype):
            return column.cat.codes
        if pd.api.types.is_bool_dtype(column):
            return column.astype("Int64")
        return column

    X = X.copy()
    # Categorical and boolean features will be coded as number
    X = X.apply(_encode_column)
    # Drop features
    X = X.drop(drop_features, axis=1)
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

# Wrap tree_encode so it can serve as the first pipeline step.
tree_encoder = FunctionTransformer(func=tree_encode)

# Most-frequent imputation for the freq_features columns (-1 marks a missing value);
# all other columns pass through. Built here but not used in tree_preprocessor.
# `freq_features` is not assigned in this cell; it comes from the earlier cell.
freq_imputer = ColumnTransformer(
    transformers=[
        ("freq_imputer",
         SimpleImputer(missing_values=-1, strategy="most_frequent"),
         freq_features),
    ],
    remainder="passthrough",
)

# target_encoder = TargetEncoder(cols=target_features, return_df=False)

# Integer-encode and drop columns, then target-encode month and day_of_week.
tree_preprocessor = Pipeline(steps=[
    ("tree_encoder", tree_encoder),
    ("target_encoder", TargetEncoder(cols=["month", "day_of_week"], return_df=False)),
])

# tree_encode drops `y` and the other excluded columns itself, so the whole
# train/test frames are passed to the pipeline.
y_train = bank_train_set["y"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set, y_train)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set)

  elif pd.api.types.is_categorical(cols):


In [461]:
# Score a class-weighted XGBoost model: 5-fold cross-validation on the training
# set (Train / Validation columns) plus the hold-out test set (Test column).
# NOTE(review): X_train was target-encoded with all of y_train before this
# cross-validation, so the validation folds are not fully unseen — confirm acceptable.
metric_names = ["ACC", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
cv_metric = ["accuracy", "balanced_accuracy", "roc_auc", "recall", "precision", "f1", "average_precision"]
set_names = ["Train", "Validation", "Test"]
metric_df = pd.DataFrame(index=metric_names, columns=set_names)

xgb_clf = XGBClassifier(max_depth=2, gamma=1, min_child_weight=1, scale_pos_weight=8)
# Fit on the full training set; this fitted model produces the test-set predictions below.
fit = xgb_clf.fit(X_train, y_train)
cv_score = cross_validate(xgb_clf, X_train, y_train, scoring=cv_metric, cv=5, n_jobs=-1, return_train_score=True)
cv_mean = pd.DataFrame(cv_score).mean()
# Result keys are "train_<scorer>" / "test_<scorer>", in the same order as metric_names.
train_mean = cv_mean[["train_" + metric for metric in cv_metric]].to_numpy()
validation_mean = cv_mean[["test_" + metric for metric in cv_metric]].to_numpy()

y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)[:,1]
# Every test metric is computed against y_test. The accuracy previously used a bare
# `y`, which is undefined on a fresh kernel unless it leaks in from another cell.
test_score = [accuracy_score(y_test, y_pred),
              balanced_accuracy_score(y_test, y_pred),
              roc_auc_score(y_test, y_score),
              recall_score(y_test, y_pred),
              precision_score(y_test, y_pred),
              f1_score(y_test, y_pred),
              average_precision_score(y_test, y_score)]

metric_df.loc[:, "Train"] = train_mean
metric_df.loc[:, "Validation"] = validation_mean
metric_df.loc[:, "Test"] = test_score

metric_df

Unnamed: 0,Train,Validation,Test
ACC,0.832665,0.83133,0.830257
bACC,0.752058,0.747689,0.747246
ROC,0.817946,0.806021,0.797986
REC,0.648006,0.63972,0.640086
PRE,0.363912,0.360215,0.358263
F1,0.466019,0.460804,0.459397
AP,0.493324,0.471017,0.466147


In [462]:
# Class-weighted CatBoost model, scored on the training and test sets.
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
# NOTE(review): the test split doubles as eval_set here. If CatBoost picks its best
# iteration from eval_set, the Test column below is optimistically biased — confirm.
cat_fit = cat_clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

X_sets = [X_train, X_test]
y_sets = [y_train, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    # Keyed by row label; TPR and REC are both the positive-class recall.
    scores = {
        "TNR": recall_score(y, y_pred, pos_label=0),
        "TPR": recall_score(y, y_pred),
        "bACC": balanced_accuracy_score(y, y_pred),
        "ROC": roc_auc_score(y, y_score),
        "REC": recall_score(y, y_pred),
        "PRE": precision_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "AP": average_precision_score(y, y_score),
    }
    # Line the values up with the row order of matric_df.
    matrics = [scores[label] for label in matric_names]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Test
TNR,0.864415,0.865764
TPR,0.665858,0.631466
bACC,0.765137,0.748615
ROC,0.838656,0.801122
REC,0.665858,0.631466
PRE,0.384053,0.373963
F1,0.487137,0.469739
AP,0.533638,0.467263


In [348]:
# drop_features = ["age", "job", "marital", "education", "default", "duration", "y"]
# CV AP 0.468420

In [450]:
# Re-run of the XGBoost evaluation. `cv_metric` and `metric_df` are not created
# here; this cell reuses the ones defined by the earlier XGBoost cell.
xgb_clf = XGBClassifier(max_depth=2, gamma=1, min_child_weight=1, scale_pos_weight=8)
# Fit on the full training set; this fitted model produces the test-set predictions below.
fit = xgb_clf.fit(X_train, y_train)
cv_score = cross_validate(xgb_clf, X_train, y_train, scoring=cv_metric, cv=5, n_jobs=-1, return_train_score=True)
cv_mean = pd.DataFrame(cv_score).mean()
# Result keys are "train_<scorer>" / "test_<scorer>", one per entry of cv_metric.
train_mean = cv_mean[["train_" + metric for metric in cv_metric]].to_numpy()
validation_mean = cv_mean[["test_" + metric for metric in cv_metric]].to_numpy()

y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)[:,1]
# Every test metric is computed against y_test. The accuracy previously used a bare
# `y`, which only exists if it leaked from the loop of a CatBoost cell.
test_score = [accuracy_score(y_test, y_pred),
              balanced_accuracy_score(y_test, y_pred),
              roc_auc_score(y_test, y_score),
              recall_score(y_test, y_pred),
              precision_score(y_test, y_pred),
              f1_score(y_test, y_pred),
              average_precision_score(y_test, y_score)]

metric_df.loc[:, "Train"] = train_mean
metric_df.loc[:, "Validation"] = validation_mean
metric_df.loc[:, "Test"] = test_score

metric_df

Unnamed: 0,Train,Validation,Test
ACC,0.827209,0.825258,0.830743
bACC,0.753512,0.74909,0.746579
ROC,0.817498,0.806086,0.799046
REC,0.658381,0.650767,0.637931
PRE,0.355887,0.351342,0.358788
F1,0.461982,0.456289,0.459271
AP,0.488878,0.470275,0.462833


In [451]:
# Class-weighted CatBoost model, scored on the training and test sets.
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
# NOTE(review): the test split doubles as eval_set here. If CatBoost picks its best
# iteration from eval_set, the Test column below is optimistically biased — confirm.
cat_fit = cat_clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

X_sets = [X_train, X_test]
y_sets = [y_train, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    # Keyed by row label; TPR and REC are both the positive-class recall.
    scores = {
        "TNR": recall_score(y, y_pred, pos_label=0),
        "TPR": recall_score(y, y_pred),
        "bACC": balanced_accuracy_score(y, y_pred),
        "ROC": roc_auc_score(y, y_score),
        "REC": recall_score(y, y_pred),
        "PRE": precision_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "AP": average_precision_score(y, y_score),
    }
    # Line the values up with the row order of matric_df.
    matrics = [scores[label] for label in matric_names]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Test
TNR,0.853296,0.854543
TPR,0.660469,0.640086
bACC,0.756883,0.747315
ROC,0.82706,0.802356
REC,0.660469,0.640086
PRE,0.363704,0.358479
F1,0.469091,0.459574
AP,0.508959,0.463079
