# 2. Bank - Modeling

We completed the Exploratory Data Analysis in the [previous notebook](1_Bank_EDA.ipynb); now it's time to start the machine learning part.

## 2.1 Base Estimators

In [1]:
import warnings
warnings.filterwarnings(
    "ignore", message=r".*A worker stopped while some jobs were given to the executor.*",
    category=UserWarning, module=r"joblib[.]externals[.]loky[.]process_executor"
)
warnings.filterwarnings(
    "ignore",
    message=r".*Using `tqdm.autonotebook.tqdm` in notebook mode.*",
    module=r"tqdm_joblib"
)

import os
import pandas as pd
import joblib
from tqdm.notebook import tqdm
from tqdm_joblib import tqdm_joblib
from sklearn.ensemble import StackingClassifier
from sklearn import set_config
from bank_functions import *
from bank_feat_engineer import *
from bank_models import *
from sklearn.model_selection import StratifiedKFold, cross_val_predict
# --- Notebook-wide configuration ---
# NOTE(review): `SEED` and `np` are used further down but are never imported
# explicitly in this cell; presumably they arrive through the star imports
# above -- confirm.

# Ask scikit-learn transformers to return pandas objects.
set_config(transform_output="pandas")

# Number of splits used when the StratifiedKFold splitter is built later on.
N_FOLDS = 3

# Root directory for every artifact this notebook writes.
result_folder = "Results"
os.makedirs(name=result_folder, exist_ok=True)

# Load data
df_train = pd.read_csv(r"Data/train.csv")
df_test = pd.read_csv(r"Data/test.csv")

# Name of the label column and the two feature groups used by every model.
target = "y"
X_cat = ["job","month","poutcome","education","contact","marital","loan","housing","default"]
X_num = ["balance","duration","pdays","age","campaign","previous","day"]

# Cast categorical columns with ONE shared dtype per column.
# Casting train and test independently lets each frame infer its own category
# list, so the same label can end up with a different integer code in the two
# frames whenever their observed values differ. Building the dtype from the
# values of both frames keeps the category list (and therefore the codes)
# identical; when both frames already contain the same values the result is
# the same as a plain astype('category').
for col in X_cat:
    shared_categories = (
        pd.concat([df_train[col], df_test[col]], ignore_index=True)
        .astype("category")
        .cat.categories
    )
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    df_train[col] = df_train[col].astype(shared_dtype)
    df_test[col] = df_test[col].astype(shared_dtype)

# Training design matrix (categorical columns first, then numeric) and labels.
X = df_train[X_cat + X_num]
y = df_train[target]

The goal is to maximize the area under the ROC curve (AUC). We are going to test algorithms from several model families. Each algorithm gets its own preprocessor, because some of them need missing values imputed, categorical variables encoded, or numeric variables scaled.

In [2]:
# Preprocessor variants, keyed by the short tag the model configs look up.
# Every variant is built from the same numeric/categorical column lists; only
# the keyword options differ.
# NOTE(review): "imp_dense" and "dense" are built with identical arguments,
# and only "imp" passes impute=True although several other keys carry the
# "imp" prefix -- confirm against make_preprocessor's defaults.
def _build_pre(**options):
    """Build one preprocessor for the notebook's columns with the given options."""
    return make_preprocessor(X_num, X_cat, **options)


# Options shared by the dense one-hot variants.
_dense_onehot = {"onehot": True, "sparse_output": False}

pre = {
    "base": _build_pre(),
    "imp": _build_pre(impute=True),
    "imp_onehot": _build_pre(onehot=True, sparse_output=True, scaler=True),
    "imp_dense": _build_pre(**_dense_onehot),
    "dense": _build_pre(**_dense_onehot),
    "imp_dense_sc": _build_pre(**_dense_onehot, scaler=True),
    "imp_ordinal": _build_pre(ordinal=True),
}

We use a diverse set of estimators to capture as many different patterns in the data as possible. Each pipeline is tuned with cross-validation and a search strategy suited to the model. The model families are:
- Fast, strong gradient-boosting baselines: **LightGBM, XGBoost, CatBoost and HistGradientBoosting**
- Bagging / randomized trees: **Random Forest and ExtraTrees**
- Purely additive boosting: **Explainable Boosting Machine**
- Linear and distance/margin models: **ElasticNet, k-NN and SVC**
- Neural tabular models: **Shallow MLP and TabNet**

In [3]:
# One entry per base model: (short key, runner function, keyword arguments).
# The key names the cache file on disk and the entry in `models`.
configs = [
    ("lgbm", run_lightGBM, {"preprocessor": pre["base"], "n_jobs": 16}),
    ("xgb", run_XGBoost, {"preprocessor": pre["base"]}),
    ("cat", run_CatBoost, {"preprocessor": pre["base"], "X_cat": X_cat, "n_jobs": 2}),
    ("hgb", run_HGB, {"preprocessor": pre["imp_dense"]}),
    ("rf", run_randomForest, {"preprocessor": pre["dense"]}),
    ("et", run_extraTrees, {"preprocessor": pre["imp_dense"]}),
    ("ebm", run_EBM, {"preprocessor": pre["imp"]}),
    ("lr", run_LogReg, {"preprocessor": pre["imp_onehot"]}),
    ("knn", run_KNN, {"preprocessor": pre["imp_dense_sc"], "n_jobs": 4}),
    ("svc", run_SVC, {"preprocessor": pre["imp_dense_sc"], "n_jobs": 16}),
    ("nn", run_NeuralNetwork, {"preprocessor": pre["imp_dense_sc"]}),
    (
        "tab",
        run_tabnet,
        {"X_num": X_num, "X_cat": X_cat, "preprocessor": pre["imp_ordinal"], "n_candidates": 20},
    ),
]

# Fitted search objects are cached under <result_folder>/models.
model_folder = os.path.join(result_folder, "models")
os.makedirs(model_folder, exist_ok=True)

# Set to True to refit every model even when a cached file exists.
overwrite_model = False
models = {}

for name, runner, runner_kwargs in configs:
    cache_file = os.path.join(model_folder, f"models_{name}.joblib")

    # Reuse the cached result unless a refit was requested.
    # joblib.load unpickles the file: only load artifacts you produced yourself.
    if os.path.exists(cache_file) and not overwrite_model:
        print(f"Loading {name}")
        models[name] = joblib.load(cache_file)
        continue

    # No usable cache: run the model's search, then persist the result.
    print(f"Running {name}")
    fitted_search = runner(X=X, y=y, **runner_kwargs)
    joblib.dump(fitted_search, cache_file, compress=3)
    models[name] = fitted_search

Loading lgbm
Loading xgb
Loading cat
Loading hgb
Loading rf
Loading et
Loading ebm
Loading lr
Loading knn
Loading svc
Loading nn
Loading tab


In [None]:
# Write Predictions to submit to Kaggle
# One CSV per base model is written under <result_folder>/predictions; files
# that already exist are kept unless `overwrite_pred` is True.
pred_folder = os.path.join(result_folder, "predictions")
os.makedirs(pred_folder, exist_ok=True)
overwrite_pred = False

for model_name, search in models.items():
    pred_path = os.path.join(pred_folder, f"predictions_{model_name}.csv")
    if overwrite_pred or not os.path.exists(pred_path):
        print(f"Predicting {model_name}")
        # Suppress UserWarnings only while this prediction is written.
        # Installing the filter globally (as before) silenced every later
        # UserWarning in the session, and only did so when a prediction file
        # happened to be missing, so later output depended on cache state.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            write_predictions(
                df=df_test,
                model=search.best_estimator_,
                features=X_cat + X_num,
                target=target,
                path=pred_path,
            )

In [12]:
# Compute cross-validation OOF predictions for analysis in the final notebook
# Scores are cached per model as .npy files under <result_folder>/cross_validation.
cv_folder = os.path.join(result_folder, "cross_validation")
os.makedirs(cv_folder, exist_ok=True)

# Set to True to recompute scores even when a cached file exists.
overwrite_cv = False
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
oof_list = []

for model_name, search in models.items():
    scores_path = os.path.join(cv_folder, f"cv_{model_name}_scores.npy")

    if overwrite_cv or not os.path.exists(scores_path):
        # Keep only column 1 of the predict_proba output.
        y_score = cross_val_predict(
            search.best_estimator_, X, y, cv=cv, n_jobs=3, method="predict_proba"
        )[:, 1]
        np.save(scores_path, y_score)
        print(f"{model_name} OOF Computed")
    else:
        # The cache holds a plain numeric array, so pickle support is not
        # needed; leaving it disabled avoids unpickling arbitrary objects.
        y_score = np.load(scores_path)
        print(f"{model_name} OOF Loaded")

    # A cached file from a different training set would silently misalign
    # with `y`; fail loudly instead.
    if y_score.shape != (len(X),):
        raise ValueError(
            f"OOF scores for {model_name} have shape {y_score.shape}, "
            f"expected ({len(X)},); delete {scores_path} or set overwrite_cv=True"
        )

    oof_list.append(y_score)

# One column per model, one row per training sample.
df_oof = pd.DataFrame(np.vstack(oof_list).T, columns=list(models.keys()))
# df_oof is written to cv_oof.csv further down, once the stack column is added.

lgbm OOF Loaded
xgb OOF Loaded
cat OOF Loaded
hgb OOF Loaded
rf OOF Loaded
et OOF Loaded
ebm OOF Loaded
lr OOF Loaded
knn OOF Loaded
svc OOF Loaded
nn OOF Loaded
tab OOF Computed


## 2.2 Stacking Classifier

After identifying strong individual learners, we build a **StackingClassifier** whose final estimator is LightGBM, since this allows the ensemble to:
- Weight the different mistakes of the single models
- Blend high-bias and high-variance learners to optimize AUC without over-smoothing

In [None]:
# Fit the stacking ensemble, or load it from the model cache.
stack_path = os.path.join(model_folder, "models_stack.joblib")
# True forces a refit on every run, even when a cached stack exists.
overwrite_stack = True

if os.path.exists(stack_path) and not overwrite_stack:
    # joblib.load unpickles the file: only load artifacts you produced yourself.
    stack = joblib.load(stack_path)
else:
    # Base learners are the best estimators found by each model's search.
    base_learners = [(name, tuned.best_estimator_) for name, tuned in models.items()]

    # NOTE(review): the keyword is spelled `n_estimator` (singular) -- confirm
    # it matches the parameter name lgb_default expects.
    stack = StackingClassifier(
        estimators=base_learners,
        final_estimator=lgb_default(seed=SEED, n_estimator=2500),
        cv=cv,
        n_jobs=1,
        verbose=2,
    )

    # NOTE(review): the bar is labelled "GridSearch" although no grid search
    # is visible in this cell -- confirm the label is intended.
    fit_bar = tqdm(total=2 * len(models), desc="GridSearch Stacking")
    with tqdm_joblib(fit_bar):
        stack.fit(X, y)

    joblib.dump(stack, stack_path, compress=3)

In [None]:
# Write the stack's test-set predictions next to the per-model prediction files.
stack_pred_path = os.path.join(pred_folder, "predictions_stack.csv")
write_predictions(
    df=df_test,
    model=stack,
    features=X_cat + X_num,
    target=target,
    path=stack_pred_path,
)

#### Stack Cross Validation

In [None]:
# Out-of-fold probabilities for the stack, cached as a .npy file.
stack_cv_path = os.path.join(cv_folder, "cv_stack.npy")
# True forces recomputation on every run, even when a cached file exists.
overwrite_stack_cv = True

if os.path.exists(stack_cv_path) and not overwrite_stack_cv:
    oof_stack = np.load(stack_cv_path)
else:
    fold_bar = tqdm(total=cv.get_n_splits(), desc="GridSearch Cross Val Predict")
    with tqdm_joblib(fold_bar):
        # The full predict_proba output is kept; a column is selected later.
        oof_stack = cross_val_predict(
            stack, X, y, cv=cv, n_jobs=2, method="predict_proba", verbose=10
        )
        np.save(stack_cv_path, oof_stack)

In [None]:
# Add the stack's column 1 of its OOF probabilities to the per-model OOF
# table, then persist the complete table as CSV.
oof_csv_path = os.path.join(result_folder, "cv_oof.csv")
df_oof["stack"] = oof_stack[:, 1]
df_oof.to_csv(oof_csv_path, index=False)

## 2.3 Compute SHAP Values

In [None]:
# SHAP payloads are cached per model under <result_folder>/Shap.
shap_folder = os.path.join(result_folder, "Shap")
os.makedirs(shap_folder, exist_ok=True)

# Set to True to recompute payloads even when a cached file exists.
overwrite_explainer = False
N_BG = 250        # forwarded to compute_shap_payload as n_bg
N_SAMPLE = 1000   # forwarded positionally, right after X
max_evals = 1024  # forwarded to compute_shap_payload as max_evals

# Iterate over a snapshot of the keys, as `models` itself is handed to the helper.
for model_name in list(models):
    shap_path = os.path.join(shap_folder, f"shap_{model_name}.joblib")

    if os.path.exists(shap_path) and not overwrite_explainer:
        # NOTE(review): the loaded payload is not used again in this notebook;
        # confirm whether loading here is needed.
        print(f"Loading {model_name}")
        sv = joblib.load(shap_path)
        continue

    print(f"Running {model_name}")
    sv = compute_shap_payload(
        models, model_name, X, N_SAMPLE, X_num, X_cat,
        n_bg=N_BG, max_evals=max_evals,
    )
    joblib.dump(sv, shap_path, compress=3)

In this notebook, we established a set of baseline models using only the raw features. This gave us a first benchmark to understand the dataset and evaluate model performance in its simplest form. In the [Next Notebook](3_Bank_Features_Engineering.ipynb), we will move on to feature engineering, where we will transform and enrich the data to give our models a stronger representation and hopefully improve their predictive power.