# Ensemble

In [41]:
import numpy as np
import pandas as pd
# Show every column and the full content of each cell when a DataFrame is displayed.
# Fully-qualified option names are used because the bare pattern "max_columns"
# matches more than one option key in recent pandas versions and raises OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_validate, cross_val_predict, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import resample
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from bank_marketing import *

## Data Preparation

In [129]:
# Load the raw bank-marketing data used by every experiment below.
# NOTE(review): `import_dataset` is not defined in this notebook — presumably it is
# provided by `from bank_marketing import *`; confirm.
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [130]:
def split_dataset(data, preprocessor=None, random_state=42, validate_random_state=None):
    """
    Split dataset into train, test and validation sets using preprocessor.

    The data is first split 80/20 into train and test sets (stratified on "y"),
    then the train set is split 80/20 again into "ttrain" and validation sets.
    With the default `validate_random_state=None` the validation split is
    different each time the function is called; pass an integer to make it
    reproducible.

    Parameters
    ----------
        data : DataFrame
            Must contain the target column "y". A "duration" column, when
            present, is removed from every feature set.

        preprocessor : Pipeline, default = None
            Optional transformer. It is fitted on the train set to transform the
            train and test sets, then re-fitted on the ttrain set to transform
            the ttrain and validation sets.

        random_state : int, default = 42
            Seed of the train/test split.

        validate_random_state : int, default = None
            Seed of the ttrain/validation split.

    Returns
    -------
        datasets : tuple
            (X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate)

    Examples
    --------
        from sklearn.preprocessing import OrdinalEncoder
        data = import_dataset("../data/BankMarketing.csv").interpolate(method="pad").loc[:, ["job", "education", "y"]]
        # To unpack all train, test, and validation sets 
        X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack train and test sets.
        X_train, y_train, X_test, y_test, *other_sets = split_dataset(data, OrdinalEncoder())
        # To unpack test and validation set
        *other_sets, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, OrdinalEncoder())
        # To unpack only train set.
        X_train, y_train, *other_sets = split_dataset(data, OneHotEncoder())
    """
    def _features_and_target(frame):
        # Separate features from the integer-encoded target. errors="ignore" lets
        # frames without a "duration" column (as in the example above) through.
        features = frame.drop(columns=["duration", "y"], errors="ignore")
        target = frame["y"].astype("int").to_numpy()
        return features, target

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    outer_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
    train_index, test_index = next(outer_splitter.split(data.drop("y", axis=1), data["y"]))
    train_set = data.iloc[train_index]
    test_set = data.iloc[test_index]

    X_train, y_train = _features_and_target(train_set)
    X_test, y_test = _features_and_target(test_set)

    inner_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=validate_random_state)
    ttrain_index, validate_index = next(inner_splitter.split(X_train, y_train))
    ttrain_set = train_set.iloc[ttrain_index]
    validate_set = train_set.iloc[validate_index]

    X_ttrain, y_ttrain = _features_and_target(ttrain_set)
    X_validate, y_validate = _features_and_target(validate_set)

    if preprocessor is not None:
        # Fit on the full train set for the train/test pair ...
        X_train = preprocessor.fit_transform(X_train, y_train)
        X_test = preprocessor.transform(X_test)
        # ... then re-fit on ttrain only, so the validation set is never seen during fitting.
        X_ttrain = preprocessor.fit_transform(X_ttrain, y_ttrain)
        X_validate = preprocessor.transform(X_validate)

    return (X_train, y_train, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate)

In [131]:
def benchmark(data, preprocessor=None, clf=None):
    """
    Score a preprocessor/classifier combination on the train, validation and test sets.

    The data is split with `split_dataset`; every transformation is expected to
    live in `preprocessor` and every estimation step in `clf`. The classifier is
    fitted on the ttrain set (with the validation set passed as `eval_set` when
    the estimator accepts it) and, unless it is a bare CatBoostClassifier,
    re-fitted on the full train set before the test set is scored.

    Parameters
    ----------
        data : DataFrame

        preprocessor : Pipeline, default = None

        clf : estimator, default = None

    Returns
    -------
        metric_df : DataFrame
            Rows are the metrics TNR, TPR, bACC, ROC, REC, PRE and AP;
            columns are "Train", "Validate" and "Test".
    """
    (X_train, y_train, X_test, y_test,
     X_ttrain, y_ttrain, X_validate, y_validate) = split_dataset(data, preprocessor)

    # "Train" is scored on the ttrain subset the model was first fitted on.
    evaluation_sets = {
        "Train": (X_ttrain, y_ttrain),
        "Validate": (X_validate, y_validate),
        "Test": (X_test, y_test),
    }
    metric_df = pd.DataFrame(index=["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "AP"],
                             columns=list(evaluation_sets))

    # Estimators whose fit() rejects eval_set/verbose fall back to a plain fit.
    try:
        clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)
    except (ValueError, TypeError):
        clf.fit(X_ttrain, y_ttrain)

    for set_name, (X, y) in evaluation_sets.items():
        # Re-fit model on train set before test set evaluation except CatBoost
        refit_on_train = set_name == "Test" and not isinstance(clf, CatBoostClassifier)
        if refit_on_train:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X)
        # Prefer the decision function for ranking metrics; otherwise use the
        # predicted probability of the positive class.
        try:
            y_score = clf.decision_function(X)
        except AttributeError:
            y_score = clf.predict_proba(X)[:, 1]

        specificity = recall_score(y, y_pred, pos_label=0)
        sensitivity = recall_score(y, y_pred)
        metric_df[set_name] = [
            specificity,                          # TNR
            sensitivity,                          # TPR
            balanced_accuracy_score(y, y_pred),   # bACC
            roc_auc_score(y, y_score),            # ROC
            sensitivity,                          # REC (same value as TPR)
            precision_score(y, y_pred),           # PRE
            average_precision_score(y, y_score),  # AP
        ]

    return metric_df

In [133]:
def dftransform(X,
                drop=None,
                cut=None,
                gen=None,
                cyclic=None,
                target=None,
                fillna=True,
                to_numpy=False):
    """
    Encode, transform, and generate features in the dataframe.

    Parameters
    ----------
        X : DataFrame
            Input features. The frame is copied; the caller's object is not modified.

        drop : list, default = None
            Columns removed at the very end, after every other step has run.

        cut : list, default = None
            If it contains "pdays", `pdays` is binned and each bin is labelled
            with its upper edge (3, 5, 10, 15, 30, 1000).

        gen : list, default = None
            Features to generate. Recognised values: "year", "days",
            "has_previous", "has_default", "has_marital". Requesting "days"
            also creates the "year" column it is derived from.

        cyclic : list, default = None
            "month" and/or "day_of_week": the column is replaced by a sine/cosine pair.

        target : str or list, default = None
            Columns cast to `str` (intended for a downstream target encoder).

        fillna : boolean, default = True
            If True, missing `pdays` become 999 for clients that have a
            `poutcome` record and -999 otherwise, and every other missing value
            becomes -1 (requires the "pdays" and "poutcome" columns).
            If False, the frame is cast to float instead.

        to_numpy : boolean, default = False
            Return a numpy array instead of a DataFrame.

    Returns
    -------
        X : DataFrame, or ndarray when `to_numpy` is True

    Examples
    --------
    bank_mkt = import_dataset("../data/BankMarketing.csv")
    X = dftransform(bank_mkt)
    """
    X = X.copy()

    # Record which clients have a previous-campaign outcome BEFORE any encoding:
    # categorical codes use -1 for missing values and the `str` cast turns them
    # into "nan", so `notna()` on the encoded column would be True for every row.
    contacted_before = X["poutcome"].notna() if fillna else None

    if gen is not None:
        if "year" in gen or "days" in gen:
            # Row-index boundaries between calendar years.
            # NOTE(review): presumably relies on the dataset being in chronological order — confirm.
            X.loc[X.index < 27682, "year"] = 2008
            X.loc[(27682<=X.index) & (X.index<39118), "year"] = 2009
            X.loc[39118<=X.index, "year"] = 2010
            X["year"] = X["year"].astype("int")
        if "days" in gen:
            # Days between the first day of the contact month and 2008-09-15.
            # The datetime operands are kept as locals, not columns, so that no
            # non-numeric column ends up in the returned feature matrix.
            first_of_month = pd.to_datetime(X[["month", "year"]].assign(day=1))
            lehman = pd.to_datetime("2008-09-15")
            X["days"] = (first_of_month - lehman).dt.days
        if "has_previous" in gen:
            X["has_previous"] = X["previous"] > 0
        if "has_default" in gen:
            X["has_default"] = X["default"].notna()
        if "has_marital" in gen:
            X["has_marital"] = X["marital"].notna()

    if cut is not None:
        if "pdays" in cut:
            # Cut pdays into categories
            X["pdays"] = pd.cut(X["pdays"], [0, 3, 5, 10, 15, 30, 1000], labels=[3, 5, 10, 15, 30, 1000], include_lowest=True).astype("Int64")

    if cyclic is not None:
        if "month" in cyclic:
            X['month_sin'] = np.sin(2 * np.pi * X["month"]/12)
            X['month_cos'] = np.cos(2 * np.pi * X["month"]/12)
            X = X.drop("month", axis=1)
        if "day_of_week" in cyclic:
            X['day_sin'] = np.sin(2 * np.pi * X["day_of_week"]/5)
            X['day_cos'] = np.cos(2 * np.pi * X["day_of_week"]/5)
            X = X.drop("day_of_week", axis=1)

    # Transform target encoded feature as str
    if target is not None:
        X[target] = X[target].astype("str")

    def _encode(column):
        # Categorical columns become their category codes (missing -> -1),
        # boolean columns become nullable integers, everything else is untouched.
        if isinstance(column.dtype, pd.CategoricalDtype):
            return column.cat.codes
        if pd.api.types.is_bool_dtype(column):
            return column.astype("Int64")
        return column

    X = X.apply(_encode)

    if fillna:
        # Clients who have been contacted but do not have pdays record should be encoded as 999
        # Clients who have not been contacted should be encoded as -999
        X.loc[X["pdays"].isna() & contacted_before, "pdays"] = 999
        X["pdays"] = X["pdays"].fillna(-999)
        # Fill other missing values as -1
        X = X.fillna(-1)
    else:
        # NOTE(review): missing categorical values were already encoded as -1 above,
        # so they do not come out as NaN here — confirm this is intended.
        X = X.astype("float")

    if drop is not None:
        # Drop features
        X = X.drop(drop, axis=1)

    if to_numpy:
        X = X.to_numpy()

    return X

In [135]:
# Train/test split transformed by `dftransform` into plain numpy arrays.
X_train, y_train, X_test, y_test, *other_sets = split_dataset(
    bank_mkt,
    FunctionTransformer(dftransform, kw_args={"to_numpy":True}),
)

# Baseline XGBoost model, benchmarked with a fresh transformer of the same configuration.
xgb_clf = XGBClassifier(max_depth=3, scale_pos_weight=8)
xgb_result = benchmark(
    bank_mkt,
    FunctionTransformer(dftransform, kw_args={"to_numpy":True}),
    xgb_clf,
)

In [None]:
# fit() needs the training data; use the transformed train split unpacked in the previous cell.
xgb_clf.fit(X_train, y_train)

## Trained on Different Features

In [55]:
# CatBoost as a single-step pipeline; the preprocessing is handed to `benchmark` separately.
# NOTE(review): `preprocessor` is not defined in this notebook — presumably it is
# provided by `from bank_marketing import *`; confirm.
cat_clf = make_pipeline(
    CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8], verbose=False)
)
benchmark(bank_mkt, FunctionTransformer(preprocessor), cat_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.886028,0.871536,0.865353
TPR,0.774672,0.597035,0.615302
bACC,0.83035,0.734286,0.740327
ROC,0.92245,0.782709,0.788783
REC,0.774672,0.597035,0.615302
PRE,0.463243,0.371022,0.367203
AP,0.665119,0.450498,0.44152


In [56]:
# Same CatBoost model, but with the preprocessing as the first pipeline step,
# so `benchmark` receives no separate preprocessor.
cat_clf = make_pipeline(
    FunctionTransformer(preprocessor),
    CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8], verbose=False),
)
benchmark(bank_mkt, None, cat_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.888295,0.871878,0.865353
TPR,0.769956,0.617251,0.615302
bACC,0.829126,0.744564,0.740327
ROC,0.922014,0.792597,0.788783
REC,0.769956,0.617251,0.615302
PRE,0.466721,0.379453,0.367203
AP,0.662793,0.449308,0.44152


In [83]:
# `drop_features` is defined here because, in notebook order, this is the first
# cell that uses it; without the definition a fresh "Restart & Run All" stops
# with a NameError.
# NOTE(review): the values mirror the definition in the following cell — confirm
# this is the list the recorded output was produced with.
drop_features = ["age",
                 "job",
                 "marital",
                 "education",
                 "housing",
                 "loan",
                 "default"]

# XGBoost on the reduced feature set plus generated features.
xgb_clf = make_pipeline(FunctionTransformer(preprocessor, kw_args= {"drop": drop_features,
                                                                     "gen": ["days",
                                                                             "has_previous",
                                                                             "has_default",
                                                                             "has_marital"]}),
                        XGBClassifier(max_depth=3, gamma=1, min_child_weight=1, scale_pos_weight=8))

benchmark(bank_mkt, None, xgb_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.862336,0.862641,0.858101
TPR,0.660155,0.625337,0.646552
bACC,0.761245,0.743989,0.752326
ROC,0.830029,0.796904,0.809499
REC,0.660155,0.625337,0.646552
PRE,0.378451,0.366219,0.366524
AP,0.527896,0.472036,0.462625


In [85]:
# Features removed before fitting the "drop" models.
drop_features = ["age", "job", "marital", "education", "housing", "loan", "default"]

# CatBoost trained without the dropped features.
drop_cat = make_pipeline(
    FunctionTransformer(preprocessor, kw_args={"drop": drop_features}),
    CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8], verbose=False),
)
benchmark(bank_mkt, None, drop_cat)

Unnamed: 0,Train,Validate,Test
TNR,0.869264,0.868457,0.861795
TPR,0.67228,0.636119,0.62931
bACC,0.770772,0.752288,0.745553
ROC,0.85098,0.809011,0.800525
REC,0.67228,0.636119,0.62931
PRE,0.395013,0.380338,0.366374
AP,0.56974,0.474639,0.455604


In [82]:
# XGBoost trained without the features listed in `drop_features`.
drop_xgb = make_pipeline(
    FunctionTransformer(preprocessor, kw_args={"drop": drop_features}),
    XGBClassifier(max_depth=3, gamma=1, min_child_weight=1, scale_pos_weight=8),
)
benchmark(bank_mkt, None, drop_xgb)

Unnamed: 0,Train,Validate,Test
TNR,0.850746,0.843996,0.860153
TPR,0.662176,0.65903,0.636853
bACC,0.756461,0.751513,0.748503
ROC,0.826109,0.809678,0.808397
REC,0.662176,0.65903,0.636853
PRE,0.360337,0.349036,0.366398
AP,0.515664,0.494759,0.458896


In [62]:
# Columns that are one-hot encoded (first level of each dropped).
cat_features = ["job", "marital", "default", "housing", "loan", "poutcome"]

# Columns that are standardised.
num_features = [
    "age",
    "campaign",
    "pdays",
    "previous",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]

# Any column not listed above passes through unchanged.
hot_scaler = ColumnTransformer(
    [
        ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
        ("scaler", StandardScaler(), num_features),
    ],
    remainder="passthrough",
)

In [64]:
# Class-weighted logistic regression on one-hot encoded and scaled features.
lr_clf = make_pipeline(
    FunctionTransformer(preprocessor, kw_args={"fillna": True}),
    hot_scaler,
    LogisticRegression(class_weight="balanced", max_iter=1000),
)
benchmark(bank_mkt, None, lr_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.799384,0.796784,0.807061
TPR,0.66487,0.660377,0.667026
bACC,0.732127,0.728581,0.737043
ROC,0.785378,0.790635,0.782101
REC,0.66487,0.660377,0.667026
PRE,0.296174,0.292014,0.305076
AP,0.436104,0.475031,0.42662


In [65]:
# Linear SVM on an RBFSampler feature map of the one-hot encoded and scaled features.
rbf_clf = make_pipeline(
    FunctionTransformer(preprocessor, kw_args={"fillna": True}),
    hot_scaler,
    RBFSampler(gamma=0.001, random_state=42),
    LinearSVC(loss="squared_hinge", dual=False, class_weight="balanced", max_iter=1000),
)
benchmark(bank_mkt, None, rbf_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.797759,0.794218,0.800766
TPR,0.669923,0.66442,0.674569
bACC,0.733841,0.729319,0.737668
ROC,0.792383,0.775301,0.785526
REC,0.669923,0.66442,0.674569
PRE,0.29607,0.290684,0.300672
AP,0.450046,0.437103,0.422932


In [88]:
# Display the dataset for reference (the rendered output shows a truncated view).
bank_mkt

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,False,False,False,telephone,5,1,261,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
1,57,services,married,high.school,,False,False,telephone,5,1,149,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
2,37,services,married,high.school,False,True,False,telephone,5,1,226,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
3,40,admin.,married,basic.6y,False,False,False,telephone,5,1,151,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
4,56,services,married,high.school,False,False,True,telephone,5,1,307,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41171,73,retired,married,professional.course,False,True,False,cellular,11,5,334,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,True
41172,46,blue-collar,married,professional.course,False,False,False,cellular,11,5,383,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,False
41173,56,retired,married,university.degree,False,True,False,cellular,11,5,189,2,,0,,-1.1,94.767,-50.8,1.028,4963.6,False
41174,44,technician,married,professional.course,False,False,False,cellular,11,5,442,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,True


In [102]:
# Narrower redefinition of the encoder/scaler: `poutcome` is no longer one-hot
# encoded and the columns from `emp.var.rate` to `nr.employed` are no longer
# scaled — they now fall under remainder="passthrough".
cat_features = ["job", "marital", "default", "housing", "loan"]

num_features = ["age", "campaign", "pdays", "previous"]

hot_scaler = ColumnTransformer(
    [
        ("one_hot_encoder", OneHotEncoder(drop="first"), cat_features),
        ("scaler", StandardScaler(), num_features),
    ],
    remainder="passthrough",
)

In [112]:
# Display the dataset again for reference (the rendered output shows a truncated view).
bank_mkt

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,False,False,False,telephone,5,1,261,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
1,57,services,married,high.school,,False,False,telephone,5,1,149,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
2,37,services,married,high.school,False,True,False,telephone,5,1,226,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
3,40,admin.,married,basic.6y,False,False,False,telephone,5,1,151,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
4,56,services,married,high.school,False,False,True,telephone,5,1,307,1,,0,,1.1,93.994,-36.4,4.857,5191.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41171,73,retired,married,professional.course,False,True,False,cellular,11,5,334,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,True
41172,46,blue-collar,married,professional.course,False,False,False,cellular,11,5,383,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,False
41173,56,retired,married,university.degree,False,True,False,cellular,11,5,189,2,,0,,-1.1,94.767,-50.8,1.028,4963.6,False
41174,44,technician,married,professional.course,False,False,False,cellular,11,5,442,1,,0,,-1.1,94.767,-50.8,1.028,4963.6,True


In [125]:
# Rebinds `drop_features` to a longer list for the KNN model.
# Pipelines built earlier keep the list object they were given.
drop_features = [
    "age", "job", "marital", "education", "housing", "loan", "default",
    "contact", "month", "day_of_week", "campaign", "pdays", "previous", "poutcome",
]

# KNN on the remaining columns plus the generated "year" and "days" features, standardised.
knn_clf = make_pipeline(
    FunctionTransformer(
        preprocessor,
        kw_args={"drop": drop_features, "fillna":True, "gen":["year", "days"]},
    ),
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=8),
)
benchmark(bank_mkt, None, knn_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.981012,0.979815,0.980569
TPR,0.224318,0.204852,0.172414
bACC,0.602665,0.592334,0.576492
ROC,0.759406,0.752768,0.744048
REC,0.224318,0.204852,0.172414
PRE,0.6,0.562963,0.529801
AP,0.373362,0.345359,0.329592


In [128]:
# Soft-voting ensemble of the CatBoost, KNN and reduced-feature XGBoost pipelines.
base_estimators = [("cat", cat_clf), ("knn", knn_clf), ("drop_xgb", drop_xgb)]
voting_clf = VotingClassifier(estimators=base_estimators, voting="soft", n_jobs=-1)
benchmark(bank_mkt, None, voting_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.919642,0.908827,0.907909
TPR,0.600202,0.557951,0.572198
bACC,0.759922,0.733389,0.740054
ROC,0.88088,0.802681,0.800868
REC,0.600202,0.557951,0.572198
PRE,0.486752,0.43717,0.44103
AP,0.580858,0.430137,0.444582


In [127]:
# Stacking ensemble over the same three pipelines. No final_estimator is passed,
# so scikit-learn's default meta-learner is used.
stacking_clf = StackingClassifier(
    estimators=[
        ("cat", cat_clf),
        ("knn", knn_clf),
        ("drop_xgb", drop_xgb),
    ]
)
benchmark(bank_mkt, None, stacking_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.985246,0.977934,0.978517
TPR,0.346918,0.297844,0.28125
bACC,0.666082,0.637889,0.629883
ROC,0.874184,0.801059,0.806147
REC,0.346918,0.297844,0.28125
PRE,0.749091,0.631429,0.624402
AP,0.582316,0.46105,0.462739
