# Ensemble

In [720]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("seaborn")
plt.rcParams["figure.figsize"] = (6.4, 4.8)
plt.rcParams["figure.dpi"] = 300
plt.rcParams["figure.titleweight"] = "bold"
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams["axes.titlepad"] = 10.0
plt.rcParams["axes.titlelocation"] = "left"
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("svg")
import seaborn as sns
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Data Preparation

In [176]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [204]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [814]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

In [941]:
# Features that should be encoded as their target mean value
combine_features = ["job", "education"]

combine_name = ".".join(combine_features)

# Features with missing values that should be imputed by IterativeImputer
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan"]

# Features that should be mean encoded
# target_features = ["age", "job", "marital", "education", "month", "day_of_week", "pdays", combine_name]
target_features = ["age", "job", "marital", "education", "month", "day_of_week", "pdays", combine_name]

def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    # X["pdays"] = pd.qcut(X["pdays"], [0, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 1], duplicates="drop")
    X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 20, 25, 30, 1000], labels=[1, 2, 3, 4, 5, 6, 7], include_lowest=True).astype("Int64")
    # Cut age into age groups
    X["age"] = pd.cut(X["age"], [0, 18, 24, 30, 40, 50,60, 70, 100], labels=[18, 24, 30, 40, 50, 60, 70, 100], include_lowest=True).astype("Int64")
    # Feature interactions
    # combine_features = ["education", "job"]
    # combine_name = "_".join(combine_features)
    X[combine_name] = X[combine_features].astype("str").apply(lambda x: ".".join(x), axis=1)
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("Int64")
    # Other categorical features will be coded as its order in pandas categorical index
    cat_features = ["job", "education", "marital", "day_of_week", "contact"]
    bool_features = ["default", "housing", "loan", "poutcome"]
    X[cat_features] = X[cat_features].apply(lambda x: x.cat.codes).astype("Int64")
    X[bool_features] = X[bool_features].astype("Int64")
    X[target_features] = X[target_features].astype("str")
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

tree_encoder = FunctionTransformer(tree_encode)

# Features with missing values that should be imputed with most freq value
freq_features = ["job", "education", "marital", "default", "housing", "loan"]
# freq_features = [1, 2, 3, 4, 5, 6]
# freq_imputer will impute missing values in columns specified by freq_features
freq_imputer = ColumnTransformer([
    ("freq_imputer",
     SimpleImputer(missing_values=-1, strategy="most_frequent"),
     freq_features)],
    remainder="passthrough")

#ite_features = [0, 1, 2, 3, 4, 5, 6]

ite_imputer = Pipeline([
    ("ite_imputer", IterativeImputer(max_iter=1000, missing_values=-1, initial_strategy="most_frequent", random_state=42)),
    ("ite_round", FunctionTransformer(np.round))
])

ite_transformer = ColumnTransformer([("ite_imputer", ite_imputer, ite_features)], remainder="passthrough")

target_encoder = TargetEncoder(cols=target_features, return_df=True)

tree_preprocessor = Pipeline([
    ("tree_encoder", tree_encoder),
    ("target_encoder", target_encoder),
    ("ite_transformer", ite_transformer)
])

In [946]:
y_train = bank_train_set["y"].astype("int").to_numpy()
X_duration = bank_train_set["duration"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set.drop(["duration", "y"], axis=1), X_duration)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set.drop(["duration", "y"], axis=1))

  elif pd.api.types.is_categorical(cols):


In [947]:
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in train_test_split.split(X_train, y_train):
    X_ttrain = X_train[train_index]
    y_ttrain = y_train[train_index]
    X_validate = X_train[test_index]
    y_validate = y_train[test_index]

## Methods

In [948]:
xgb_clf = XGBClassifier(max_depth=3, scale_pos_weight=8)
xgb_fit = xgb_clf.fit(X_ttrain, y_ttrain)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validation", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = xgb_clf.predict(X)
    y_score = xgb_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validation,Test
TNR,0.864602,0.866746,0.870142
TPR,0.653419,0.640162,0.658405
bACC,0.759011,0.753454,0.764274
ROC,0.835416,0.813876,0.809554
REC,0.653419,0.640162,0.658405
PRE,0.379945,0.378788,0.391667
F1,0.480495,0.475952,0.491158
AP,0.526257,0.475124,0.480411


In [949]:
cat_clf = CatBoostClassifier(eval_metric="AUC", verbose=False)
cat_fit = cat_clf.fit(X_ttrain, y_ttrain)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validation", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = xgb_clf.predict(X)
    y_score = xgb_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validation,Test
TNR,0.864602,0.866746,0.870142
TPR,0.653419,0.640162,0.658405
bACC,0.759011,0.753454,0.764274
ROC,0.835416,0.813876,0.809554
REC,0.653419,0.640162,0.658405
PRE,0.379945,0.378788,0.391667
F1,0.480495,0.475952,0.491158
AP,0.526257,0.475124,0.480411


In [940]:
xgb_clf = XGBClassifier()
param_grid = [
    {"max_depth": [6, 5, 4, 3, 2],
     "scale_pos_weight": [8]}
    ]
grid_search = GridSearchCV(xgb_clf,
                           param_grid,
                           scoring="average_precision",
                           # scoring="roc_auc",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)

grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best mean test score: {grid_best_score}, for {grid_best_params}.")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")

best mean test score: 0.46760831263553626, for {'max_depth': 3, 'scale_pos_weight': 8}.
mean test score: 0.4273262687987603, mean train score: 0.4273262687987603, for {'max_depth': 6, 'scale_pos_weight': 8}.
mean test score: 0.4460181827046707, mean train score: 0.4460181827046707, for {'max_depth': 5, 'scale_pos_weight': 8}.
mean test score: 0.46325870536142977, mean train score: 0.46325870536142977, for {'max_depth': 4, 'scale_pos_weight': 8}.
mean test score: 0.46760831263553626, mean train score: 0.46760831263553626, for {'max_depth': 3, 'scale_pos_weight': 8}.
mean test score: 0.4645482149350781, mean train score: 0.4645482149350781, for {'max_depth': 2, 'scale_pos_weight': 8}.
