# Discussions on Data Analytics Project

## Data Preparation

In [18]:
import numpy as np
import pandas as pd
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.utils.fixes import loguniform
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [19]:
def import_dataset(filename):
    bank_mkt = pd.read_csv(filename,
                           na_values=["unknown", "nonexistent"],
                           true_values=["yes", "success"],
                           false_values=["no", "failure"])
    # Treat pdays = 999 as missing values
    bank_mkt["pdays"] = bank_mkt["pdays"].replace(999, pd.NA)
    # Convert types, "Int64" is nullable integer data type in pandas
    bank_mkt = bank_mkt.astype(dtype={"age": "Int64",
                                      "job": "category",
                                      "marital": "category",
                                      "education": "category",
                                      "default": "boolean",
                                      "housing": "boolean",
                                      "loan": "boolean",
                                      "contact": "category",
                                      "month": "category",
                                      "day_of_week": "category",
                                      "duration": "Int64",
                                      "campaign": "Int64",
                                      "pdays": "Int64",
                                      "previous": "Int64",
                                      "poutcome": "boolean",
                                      "y": "boolean"})
    # Drop duplicates
    bank_mkt = bank_mkt.drop_duplicates().reset_index(drop=True)
    # reorder categorical data
    bank_mkt["education"] = bank_mkt["education"].cat.reorder_categories(["illiterate", "basic.4y", "basic.6y", "basic.9y", "high.school", "professional.course", "university.degree"], ordered=True)
    bank_mkt["month"] = bank_mkt["month"].cat.reorder_categories(["mar", "apr", "jun", "jul", "may", "aug", "sep", "oct", "nov", "dec"], ordered=True)
    bank_mkt["day_of_week"] = bank_mkt["day_of_week"].cat.reorder_categories(["mon", "tue", "wed", "thu", "fri"], ordered=True)
    return bank_mkt

In [20]:
bank_mkt = import_dataset("../data/BankMarketing.csv")

In [57]:
train_test_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in train_test_split.split(bank_mkt.drop("y", axis=1), bank_mkt["y"]):
    bank_train_set = bank_mkt.loc[train_index].reset_index(drop=True)
    bank_test_set = bank_mkt.loc[test_index].reset_index(drop=True)

## Dropping Features May or May Not Change Results
- Dropping some features may improve the results.
- Test and Validation metrics give different opionions.
- Ensemble models with different dropping strategies?

### Drop Nothing

In [36]:
# Features with missing values that should be imputed with most freq value
freq_features = ["default", "housing", "loan"]

# Features with missing values that should be imputed by IterativeImputer
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan"]

# Features that should be mean encoded
# target_features = ["month", "day_of_week"]
# target_features = [1,2,3,8,9,10]

# drop_features = ["age", "job", "marital", "education", "housing", "loan", "default", "duration", "y"]
drop_features = ["y"]

def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    #X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Cut age into age groups
    # X["age"] = pd.cut(X["age"], [0, 18, 24, 30, 35, 40, 45, 50, 55, 60, 100], labels=[18, 24, 30, 35, 40, 45, 50, 55, 60, 100], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    # Drop features
    X = X.drop(drop_features, axis=1)
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    # X[target_features] = X[target_features].astype("str")
    # Fill missing values as -1
     # X = X.fillna(-1).to_numpy()
    X = X.fillna(-1)
    return X

tree_encoder = FunctionTransformer(tree_encode)

# freq_imputer will impute missing values in columns specified by freq_features
freq_imputer = ColumnTransformer([
    ("freq_imputer",
     SimpleImputer(missing_values=-1, strategy="most_frequent"),
     freq_features)],
    remainder="passthrough")

tree_preprocessor = Pipeline([
    ("tree_encoder", tree_encoder),
    ("freq_imputer", freq_imputer)
])

y_train = bank_train_set["y"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set)

In [37]:
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

for train_index, validate_index in train_validate_split.split(X_train, y_train):
    X_ttrain = X_train[train_index]
    y_ttrain = y_train[train_index]
    X_validate = X_train[validate_index]
    y_validate = y_train[validate_index]

In [38]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_fit = cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validate", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validate,Test
TNR,0.875422,0.86897,0.864122
TPR,0.963287,0.921833,0.921336
bACC,0.919355,0.895402,0.892729
ROC,0.968406,0.947962,0.950264
REC,0.963287,0.921833,0.921336
PRE,0.49541,0.471724,0.462662
F1,0.654313,0.624088,0.615994
AP,0.780295,0.65188,0.671839


### Drop Duration

In [58]:
# Features with missing values that should be imputed with most freq value
freq_features = ["default", "housing", "loan"]

# Features with missing values that should be imputed by IterativeImputer
ite_features = ["age", "job", "marital", "education", "default", "housing", "loan"]

# Features that should be mean encoded
# target_features = ["month", "day_of_week"]
# target_features = [1,2,3,8,9,10]

# drop_features = ["age", "job", "marital", "education", "housing", "loan", "default", "duration", "y"]
drop_features = ["duration", "y"]

def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    #X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Cut age into age groups
    # X["age"] = pd.cut(X["age"], [0, 18, 24, 30, 35, 40, 45, 50, 55, 60, 100], labels=[18, 24, 30, 35, 40, 45, 50, 55, 60, 100], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    # Drop features
    X = X.drop(drop_features, axis=1)
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    # X[target_features] = X[target_features].astype("str")
    # Fill missing values as -1
     # X = X.fillna(-1).to_numpy()
    X = X.fillna(-1)
    return X

tree_encoder = FunctionTransformer(tree_encode)

# freq_imputer will impute missing values in columns specified by freq_features
freq_imputer = ColumnTransformer([
    ("freq_imputer",
     SimpleImputer(missing_values=-1, strategy="most_frequent"),
     freq_features)],
    remainder="passthrough")

tree_preprocessor = Pipeline([
    ("tree_encoder", tree_encoder),
    ("freq_imputer", freq_imputer)
])

y_train = bank_train_set["y"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set)

In [59]:
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

for train_index, validate_index in train_validate_split.split(X_train, y_train):
    X_ttrain = X_train[train_index]
    y_ttrain = y_train[train_index]
    X_validate = X_train[validate_index]
    y_validate = y_train[validate_index]

In [60]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_fit = cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validate", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validate,Test
TNR,0.860839,0.859562,0.862343
TPR,0.656787,0.614555,0.66056
bACC,0.758813,0.737059,0.761451
ROC,0.84424,0.797126,0.809376
REC,0.656787,0.614555,0.66056
PRE,0.374712,0.357087,0.378629
F1,0.477181,0.451709,0.481351
AP,0.527415,0.470583,0.476541


### Drop Almost Everything

In [54]:
drop_features = ["age", "job", "marital", "education", "housing", "loan", "default", "y"]

def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    #X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Cut age into age groups
    # X["age"] = pd.cut(X["age"], [0, 18, 24, 30, 35, 40, 45, 50, 55, 60, 100], labels=[18, 24, 30, 35, 40, 45, 50, 55, 60, 100], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    # Drop features
    X = X.drop(drop_features, axis=1)
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    # X[target_features] = X[target_features].astype("str")
    # Fill missing values as -1
    X = X.fillna(-1).to_numpy()
    return X

tree_encoder = FunctionTransformer(tree_encode)

tree_preprocessor = Pipeline([
    ("tree_encoder", tree_encoder)
])

y_train = bank_train_set["y"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set)

In [55]:
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

for train_index, validate_index in train_validate_split.split(X_train, y_train):
    X_ttrain = X_train[train_index]
    y_ttrain = y_train[train_index]
    X_validate = X_train[validate_index]
    y_validate = y_train[validate_index]

In [56]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_fit = cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validate", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validate,Test
TNR,0.864987,0.858536,0.857553
TPR,0.957561,0.913747,0.931034
bACC,0.911274,0.886141,0.894294
ROC,0.962667,0.948381,0.948905
REC,0.957561,0.913747,0.931034
PRE,0.473833,0.450498,0.453543
F1,0.633961,0.603471,0.609954
AP,0.746284,0.66188,0.665315


In [51]:
drop_features = ["age", "job", "marital", "education", "housing", "loan", "default", "duration", "y"]

def tree_encode(X):
    """
    Encode categorical data into numerical values.
    pdays column will be feature engineered and discretized.
    """
    X = X.copy()
    # pdays column will be feature engineered and discretized.
    X.loc[X["pdays"].isna() & X["poutcome"].notna(), "pdays"] = 999
    #X["pdays"] = pd.cut(X["pdays"], [0, 5, 10, 15, 30, 1000], labels=[1, 2, 3, 4, 5], include_lowest=True).astype("Int64")
    # Cut age into age groups
    # X["age"] = pd.cut(X["age"], [0, 18, 24, 30, 35, 40, 45, 50, 55, 60, 100], labels=[18, 24, 30, 35, 40, 45, 50, 55, 60, 100], include_lowest=True).astype("Int64")
    # Encode nominal and ordinal features
    # `month` will be encoded to the corresponding number, e.g. "mar" -> 3.
    month_map = {"mar": 3,
                 "apr": 4,
                 "may": 5,
                 "jun": 6,
                 "jul": 7,
                 "aug": 8,
                 "sep": 9,
                 "oct": 10,
                 "nov": 11,
                 "dec": 12}
    # Drop features
    X = X.drop(drop_features, axis=1)
    # Other categorical features will be coded as its order in pandas categorical index
    X = X.apply(lambda x: x.cat.codes if pd.api.types.is_categorical_dtype(x) else (x.astype("Int64") if pd.api.types.is_bool_dtype(x) else x))
    # X[target_features] = X[target_features].astype("str")
    # Fill missing values as -1
    X = X.fillna(-1).to_numpy()
    return X

tree_encoder = FunctionTransformer(tree_encode)

tree_preprocessor = Pipeline([
    ("tree_encoder", tree_encoder)
])

y_train = bank_train_set["y"].astype("int").to_numpy()
X_train = tree_preprocessor.fit_transform(bank_train_set)
y_test = bank_test_set["y"].astype("int").to_numpy()
X_test = tree_preprocessor.transform(bank_test_set)

In [52]:
train_validate_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=62)

for train_index, validate_index in train_validate_split.split(X_train, y_train):
    X_ttrain = X_train[train_index]
    y_ttrain = y_train[train_index]
    X_validate = X_train[validate_index]
    y_validate = y_train[validate_index]

In [53]:
cat_clf = CatBoostClassifier(eval_metric="AUC", class_weights=[1, 8])
cat_fit = cat_clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), verbose=False)

X_sets = [X_ttrain, X_validate, X_test]
y_sets = [y_ttrain, y_validate, y_test]

matric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "F1", "AP"]
set_names = ["Train", "Validate", "Test"]
matric_df = pd.DataFrame(index=matric_names, columns=set_names)

for name, X, y in zip(set_names, X_sets, y_sets):
    y_pred = cat_clf.predict(X)
    y_score = cat_clf.predict_proba(X)[:,1]
    matrics = [recall_score(y, y_pred, pos_label=0),
               recall_score(y, y_pred),
               balanced_accuracy_score(y, y_pred),
               roc_auc_score(y, y_score),
               recall_score(y, y_pred),
               precision_score(y, y_pred),
               f1_score(y, y_pred),
               average_precision_score(y, y_score)]
    matric_df[name] = matrics

matric_df

Unnamed: 0,Train,Validate,Test
TNR,0.867254,0.866062,0.865764
TPR,0.639272,0.61186,0.615302
bACC,0.753263,0.738961,0.740533
ROC,0.816888,0.793971,0.804984
REC,0.639272,0.61186,0.615302
PRE,0.379448,0.367017,0.367912
F1,0.476226,0.458818,0.460484
AP,0.499583,0.460966,0.458497


## Standardization on Dummy Variables

In [None]:
svm_preprocessor = Pipeline([
    ("tree_encoder", svm_encoder),
    ("ite_transformer", ite_transformer),
    ("one_hot_encoder", one_hot_encoder), # Standardize Dummy Variables?
    ("scaler", StandardScaler())
])

## Encode Validation Set

In [None]:
svm_preprocessor = Pipeline([
    ("svm_encoder", svm_encoder),
    ("freq_imputer", freq_imputer), # Iterative Imputer?
    ("one_hot_encoder", one_hot_encoder), # Target Encoding? Frequency Encoding?
    ("scaler", StandardScaler()),
    ("rbf", RBFSampler(random_state=42)), # Kernel Approxmation or Use less data?
    ("svm", SGDClassifier(class_weight="balanced"))
])

![](grid_search_cross_validation.png)

## Categorical Encoding Methods

- Mean encoding
- Frequency encoding
- Cyclic encoding
- Entity Embedding, new ways to work with categorical data.

## Ensemble Ideas

- What's the starting point for ensembles?

- KNN + Decision Tress

## Incorporate More Infomation

- Gold price? 
- Currency rate?