# Modeling Notebook

This notebook builds and evaluates several classification models that predict whether a client will subscribe to a term deposit, based on the preprocessed data. In the previous notebook, the outputs of the preprocessing step were saved as artifacts. This notebook loads those artifacts and uses them to train and evaluate the models.

In [None]:
# Import libraries
import joblib
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:
# Restore the objects saved by the preprocessing notebook.
# NOTE: joblib.load unpickles its input, so only load artifacts that this
# project produced itself.
(
    pipeline,
    X_train_raw,
    X_test_raw,
    y_train,
    y_test,
) = map(
    joblib.load,
    (
        "artifacts/preprocessing_pipeline.joblib",
        "artifacts/X_train_raw.joblib",
        "artifacts/X_test_raw.joblib",
        "artifacts/y_train.joblib",
        "artifacts/y_test.joblib",
    ),
)

In [None]:
# Factory that wraps a classifier in the full modeling pipeline
def make_pipeline(clf):
    """Return an imblearn pipeline ending in ``clf``.

    The pipeline has three named steps, in order:

    * ``"preprocess"`` - the preprocessing pipeline loaded from the artifacts
      (the module-level ``pipeline`` object),
    * ``"smote"`` - ``SMOTE(random_state=42)`` oversampling,
    * ``"classifier"`` - the estimator passed in as ``clf``.

    Args:
        clf: Estimator instance to use as the final step.

    Returns:
        An unfitted ``ImbPipeline``.
    """
    steps = [
        ("preprocess", pipeline),
        ("smote", SMOTE(random_state=42)),
        ("classifier", clf),
    ]
    return ImbPipeline(steps)

In [None]:
# Candidate classifiers, keyed by the display name used in the results.
# Every estimator is seeded with random_state=42 except KNeighborsClassifier,
# which is constructed with its defaults.
models = {
    # linear models
    "Perceptron": Perceptron(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    # instance-based / kernel models
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(
        random_state=42,
        probability=True,
    ),
    # neural network
    "Neural Network": MLPClassifier(
        random_state=42,
        max_iter=1000,
    ),
    # tree ensembles
    "XGBoost": XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss",
    ),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

In [None]:
# Scorer names passed to cross_validate; each one yields a "test_<name>" entry
# in the cross-validation results.
scoring = [
    "accuracy",
    "precision",
    "recall",
    "f1",
    "roc_auc",
]

In [None]:
# Cross-validate each model on the training split.
#   full_results: model name -> per-fold fit/score times and test scores
#   avg_results:  model name -> mean ("<metric>") and standard deviation
#                 ("std_<metric>") of every test metric across the folds
avg_results = {}
full_results = {}

# Keys of the cross_validate output that are kept per fold: the two timing
# arrays plus one "test_<metric>" array for every requested scorer.
per_fold_keys = ["fit_time", "score_time"] + [f"test_{metric}" for metric in scoring]

for name, clf in models.items():
    pipe = make_pipeline(clf)
    # The training labels are loaded from the artifacts as `y_train`; the
    # previous `y_train_raw` was never defined and raised a NameError.
    cv_results = cross_validate(
        pipe,
        X_train_raw,
        y_train,
        cv=5,
        scoring=scoring,
        n_jobs=-1,
    )

    full_results[name] = {key: cv_results[key] for key in per_fold_keys}

    # Means first, then standard deviations, so the column order of any table
    # built from avg_results stays the same as before.
    avg_results[name] = {
        metric: np.mean(cv_results[f"test_{metric}"]) for metric in scoring
    }
    avg_results[name].update(
        {f"std_{metric}": np.std(cv_results[f"test_{metric}"]) for metric in scoring}
    )