# Decision Trees

In [69]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("seaborn")
plt.rcParams["figure.figsize"] = (6.4, 4.8)
plt.rcParams["figure.dpi"] = 300
plt.rcParams["figure.titleweight"] = "bold"
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams["axes.titlepad"] = 10.0
plt.rcParams["axes.titlelocation"] = "left"
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("svg")
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix

## Data Preparation

In [19]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [64]:
def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "jun": 5,
                 "jul": 6,
                 "may": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    X["month"] = X["month"].replace(month_map).astype("int")
    # Other categorical features will be coded as its order in pandas categorical index
    cat_features = ["job", "education", "marital", "contact", "day_of_week"]
    bool_features = ["default", "housing", "loan", "poutcome"]
    X[cat_features] = X[cat_features].apply(lambda x: x.cat.codes).astype("Int64")
    X[bool_features] = X[bool_features].astype("Int64")
    # Fill missing values as -1
    X = X.fillna(-1)
    return X

tree_encoder = FunctionTransformer(tree_encode)

# Features with missing values that should be imputed with most freq value
freq_features = ["job", "education", "marital", "default", "housing", "loan"]

# tree_imputer will impute missing values in columns specified by freq_features
tree_imputer = ColumnTransformer([
    ("freq_imputer",
     SimpleImputer(missing_values=-1,strategy="most_frequent"),
     freq_features)],
    remainder="passthrough")

# Wrap tree_encoder and tree_imputer in one pipeline
tree_preprocessor = Pipeline([
    ("basic_encoder", tree_encoder),
    ("tree_imputer", tree_imputer)
])

In [65]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [66]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

In [67]:
X_train = tree_preprocessor.fit_transform(bank_train_set.drop(["duration", "y"], axis=1))
y_train = bank_train_set["y"].astype("int").to_numpy()

## Methods

In [68]:
decision_tree = DecisionTreeClassifier(class_weight="balanced")
param_grid = [
    {"criterion": ["gini", "entropy"],
     "max_depth": [100000, 50000, 30000, 20000, 10000, 5000],
     "max_leaf_nodes": [200, 100, 10, 1]}
    ]
grid_search = GridSearchCV(decision_tree,
                           param_grid,
                           scoring="average_precision",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)
grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_

grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best mean test score: {grid_best_score}, for {grid_best_params}.")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")

best mean test score: 0.4245309871344455, for {'criterion': 'gini', 'max_depth': 100000, 'max_leaf_nodes': 100}.
mean test score: 0.4183440906792913, mean train score: 0.4183440906792913, for {'criterion': 'gini', 'max_depth': 100000, 'max_leaf_nodes': 200}.
mean test score: 0.4245309871344455, mean train score: 0.4245309871344455, for {'criterion': 'gini', 'max_depth': 100000, 'max_leaf_nodes': 100}.
mean test score: 0.38292052914651087, mean train score: 0.38292052914651087, for {'criterion': 'gini', 'max_depth': 100000, 'max_leaf_nodes': 10}.
mean test score: nan, mean train score: nan, for {'criterion': 'gini', 'max_depth': 100000, 'max_leaf_nodes': 1}.
mean test score: 0.4185390899908906, mean train score: 0.4185390899908906, for {'criterion': 'gini', 'max_depth': 50000, 'max_leaf_nodes': 200}.
mean test score: 0.4245309871344455, mean train score: 0.4245309871344455, for {'criterion': 'gini', 'max_depth': 50000, 'max_leaf_nodes': 100}.
mean test score: 0.38292052914651087, mean t

In [74]:
random_forest = RandomForestClassifier(class_weight="balanced")
param_grid = [
    {"n_estimators": [200, 100, 50, 10],
     "criterion": ["gini", "entropy"],
     "max_depth": [10000000, 1000000, 100000, 10000, 1000, 100]}
    ]
grid_search = GridSearchCV(random_forest,
                           param_grid,
                           scoring="average_precision",
                           return_train_score=True,
                           cv=5,
                           n_jobs=-1)
grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_

grid_fit = grid_search.fit(X_train, y_train)
grid_results = grid_search.cv_results_
grid_best_params = grid_search.best_params_
grid_best_score = grid_search.best_score_

print(f"best mean test score: {grid_best_score}, for {grid_best_params}.")

for test_score, train_score, params in zip(grid_results["mean_test_score"],
                                           grid_results["mean_test_score"],
                                           grid_results["params"]):
    print(f"mean test score: {test_score}, mean train score: {train_score}, for {params}.")

best mean test score: 0.41694694623482675, for {'criterion': 'entropy', 'max_depth': 100, 'n_estimators': 200}.
mean test score: 0.41570447538107097, mean train score: 0.41570447538107097, for {'criterion': 'gini', 'max_depth': 10000000, 'n_estimators': 200}.
mean test score: 0.4099963169867573, mean train score: 0.4099963169867573, for {'criterion': 'gini', 'max_depth': 10000000, 'n_estimators': 100}.
mean test score: 0.4031650927662446, mean train score: 0.4031650927662446, for {'criterion': 'gini', 'max_depth': 10000000, 'n_estimators': 50}.
mean test score: 0.3498652554255627, mean train score: 0.3498652554255627, for {'criterion': 'gini', 'max_depth': 10000000, 'n_estimators': 10}.
mean test score: 0.4131428037638877, mean train score: 0.4131428037638877, for {'criterion': 'gini', 'max_depth': 1000000, 'n_estimators': 200}.
mean test score: 0.40840738166188084, mean train score: 0.40840738166188084, for {'criterion': 'gini', 'max_depth': 1000000, 'n_estimators': 100}.
mean test sc