# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **_Your Name_** | _Your Email_ | **_Project Name_** |

# II. Notebook Target Definition

_Insert Text Here_

# III. Notebook Setup

## III.A. Import Libraries

In [None]:
from datetime import datetime
import hashlib
import json
import pickle

from interpret import set_visualize_provider, show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.provider import InlineProvider
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV
from tqdm import tqdm

# Passing None removes pandas' column/row display limits so frames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the inline provider for interpret visualizations.
set_visualize_provider(InlineProvider())

## III.B. Import Data

In [None]:
# Load the four processed splits in one pass; the tuple order matches the
# unpacking order on the left-hand side.
# NOTE(review): pickle files execute code on load, so these paths must only
# ever point at artifacts produced by this project's own pipeline.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    (
        '../../data/processed/X_train_woe.pkl',
        '../../data/processed/X_test_woe.pkl',
        '../../data/processed/y_train.pkl',
        '../../data/processed/y_test.pkl',
    ),
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()

In [None]:
# Preview the first rows of the training target.
y_train.head()

In [None]:
# Preview the first rows of the test target.
y_test.head()

# IV. Models Training and Evaluation

## IV.A. Data Shape Inspection

In [None]:
# Shapes of the train and test feature sets, shown side by side.
X_train.shape, X_test.shape

In [None]:
# Shapes of the train and test targets, shown side by side.
y_train.shape, y_test.shape

## IV.B. Data Information Inspection

In [None]:
# Print the pandas info summary for the training features.
X_train.info()

In [None]:
# Print the pandas info summary for the test features.
X_test.info()

In [None]:
# Print the pandas info summary for the training target.
y_train.info()

In [None]:
# Print the pandas info summary for the test target.
y_test.info()

## IV.C. Training Log

In [None]:
def time_stamp():
    """Return the current local date and time as a naive ``datetime``."""
    current_moment = datetime.now()
    return current_moment


def create_logger():
    """Build an empty training-log record.

    Returns:
        A dict mapping every log field name to its own fresh, empty list.
        One value is appended to each list per trained model.
    """
    field_names = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "performance",
        "f1_score_avg",
        "data_configurations",
    )
    # A comprehension gives each field a distinct list object.
    return {field: [] for field in field_names}


def training_log_updater(current_log, log_path):
    """Append one log record to the JSON log file and return the full log.

    Args:
        current_log: JSON-serializable record to append.
        log_path: Path of the JSON file holding a list of records.

    Returns:
        The complete list of records, including ``current_log``, exactly as
        written back to ``log_path``.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    import os

    try:
        with open(log_path, 'r') as file:
            last_log = json.load(file)
    except FileNotFoundError:
        # First run: start from an empty history. The same exception is raised
        # when the parent directory is missing, so create it before writing.
        last_log = []
        parent_directory = os.path.dirname(log_path)
        if parent_directory:
            os.makedirs(parent_directory, exist_ok=True)
    last_log.append(current_log)
    with open(log_path, 'w') as file:
        json.dump(last_log, file)
    return last_log


def model_training_and_evaluation(models_list, model_prefix, X_train, y_train, X_test, y_test, data_configuration, log_path):
    """Fit every model, score it on the test split and persist a log record.

    Args:
        models_list: List of dicts with ``"model_name"`` and ``"model_object"``
            keys. Each dict's ``"model_uid"`` is overwritten in place.
        model_prefix: Text joined with ``"-"`` in front of each logged name.
        X_train, y_train: Data passed to each model's ``fit``.
        X_test, y_test: Data used for ``predict`` and the classification report.
        data_configuration: Label stored with every record of this run.
        log_path: JSON log file handed to ``training_log_updater``.

    Returns:
        A tuple ``(training_log, models_list)``: the full log returned by
        ``training_log_updater`` and the same ``models_list`` object that was
        passed in, now holding fitted models and their uids.
    """
    run_log = create_logger()
    for entry in tqdm(models_list):
        logged_name = "-".join((model_prefix, entry["model_name"]))
        estimator = entry["model_object"]

        # Only the fit call sits between the two timestamps.
        fit_started = time_stamp()
        estimator.fit(X_train, y_train)
        fit_finished = time_stamp()

        report = classification_report(
            y_test, estimator.predict(X_test), output_dict=True)

        # The uid is the MD5 hex digest of the two timestamps concatenated.
        uid_seed = str(fit_started) + str(fit_finished)
        uid = hashlib.md5(uid_seed.encode()).hexdigest()
        entry["model_uid"] = uid

        record = {
            "model_name": logged_name,
            "model_uid": uid,
            "training_time": (fit_finished - fit_started).total_seconds(),
            "training_date": str(fit_started),
            "performance": report,
            "f1_score_avg": report["macro avg"]["f1-score"],
            "data_configurations": data_configuration,
        }
        for field, value in record.items():
            run_log[field].append(value)
    training_log = training_log_updater(run_log, log_path)
    return training_log, models_list


def training_log_to_df_converter(training_log):
    """Flatten the training log into one DataFrame ranked best-first.

    Args:
        training_log: List of log records, each a dict of equally long lists
            as produced by ``create_logger``.

    Returns:
        A DataFrame with one row per trained model. The nested
        ``"performance"`` dicts are expanded into columns by
        ``pd.json_normalize``. Rows are sorted by ``f1_score_avg`` descending,
        then ``training_time`` ascending, with a fresh 0..n-1 index. An empty
        DataFrame is returned when ``training_log`` is empty.
    """
    per_run_frames = []
    for log in tqdm(training_log):
        individual_log_df = pd.DataFrame(log)
        performance_df = pd.json_normalize(individual_log_df["performance"])
        per_run_frames.append(pd.concat(
            [individual_log_df.drop("performance", axis=1), performance_df],
            axis=1))
    if not per_run_frames:
        # Nothing logged yet: there are no columns to sort on.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    all_training_logs_df = pd.concat(per_run_frames)
    all_training_logs_df.sort_values(
        ["f1_score_avg", "training_time"],
        ascending=[False, True],
        inplace=True)
    all_training_logs_df.reset_index(inplace=True, drop=True)
    return all_training_logs_df


def best_model_finder(all_training_logs_df, models_list):
    """Return the model object whose uid matches the top row of the log table.

    Args:
        all_training_logs_df: Log table already sorted best-first; only its
            first row is consulted.
        models_list: Dict mapping a configuration name to a list of model
            dicts carrying ``"model_uid"`` and ``"model_object"``.

    Returns:
        The ``"model_object"`` of the first entry whose ``"model_uid"`` equals
        the ``"model_uid"`` of the first row.

    Raises:
        RuntimeError: If no entry in ``models_list`` carries that uid.
    """
    best_model_info = all_training_logs_df.iloc[0]
    for configuration_models in models_list.values():
        for model_data in configuration_models:
            if model_data["model_uid"] == best_model_info["model_uid"]:
                # Return immediately: this avoids comparing the model with
                # ``== None``, which array-like objects cannot answer.
                return model_data["model_object"]
    raise RuntimeError("The best model not found in your list of model.")

## IV.D. Baseline Models

In [None]:
# Baseline estimators with default hyperparameters; both use the same fixed seed.
log_reg_baseline = LogisticRegression(random_state=777)
ebm_baseline = ExplainableBoostingClassifier(random_state=777)

In [None]:
from sklearn.base import clone

# Registry of models per data configuration. Each entry records the estimator's
# class name, the estimator itself and a uid that is filled in after training.
# The "smote" entries hold unfitted clones so that training one configuration
# does not refit the estimator objects stored under the other.
models_list = {
    "vanilla": [
        {"model_name": log_reg_baseline.__class__.__name__,
            "model_object": log_reg_baseline, "model_uid": ""},
        {"model_name": ebm_baseline.__class__.__name__,
            "model_object": ebm_baseline, "model_uid": ""}
    ],
    "smote": [
        {"model_name": log_reg_baseline.__class__.__name__,
            "model_object": clone(log_reg_baseline), "model_uid": ""},
        {"model_name": ebm_baseline.__class__.__name__,
            "model_object": clone(ebm_baseline), "model_uid": ""},
    ],
}

In [None]:
# Display the registry before any training; every model_uid is still "".
models_list

### IV.D.1. Vanilla Models

In [None]:
# Train and log the "vanilla" models. The second return value is the same list
# object as models_list["vanilla"], updated in place.
training_log, models_list_vanilla = model_training_and_evaluation(
    models_list=models_list["vanilla"],
    model_prefix="baseline_model",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data_configuration="vanilla",
    log_path='../../models/logs/training_log.json',
)

In [None]:
# Display the registry after the vanilla training run.
models_list

### IV.D.2. Sampling Models

In [None]:
# Train and log the "smote" models.
# NOTE(review): this run receives the same X_train/y_train as the vanilla run;
# no resampling step is visible in this notebook -- confirm the intended data.
training_log, models_list_smote = model_training_and_evaluation(
    models_list=models_list["smote"],
    model_prefix="smote_model",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data_configuration="smote",
    log_path='../../models/logs/training_log.json',
)

In [None]:
# Display the registry after the smote training run.
models_list

## IV.E. Models Selection

### IV.E.1. Benchmark Performance Review

In [None]:
# Accuracy a model would achieve if it always predicted the most common label.
# value_counts sorts by frequency in descending order, so the first position
# holds the share of the most frequent label whatever that label's value is.
benchmark = y_train.value_counts(normalize=True).iloc[0]
benchmark

### IV.E.2. Baseline Best Model Performance Review

In [None]:
# Flatten the full training log (every record stored in the log file) into a
# table ranked best-first, then display it.
all_training_logs_df = training_log_to_df_converter(training_log)
all_training_logs_df

In [None]:
# Fetch the model object matching the top-ranked log row; best_model_finder
# raises RuntimeError when that uid is not present in models_list.
baseline_best_model = best_model_finder(all_training_logs_df, models_list)
baseline_best_model

In [None]:
# Predict on both splits with the best baseline model (train first, then test).
baseline_train_prediction, baseline_test_prediction = (
    baseline_best_model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Draw the train and test confusion matrices side by side, one per axis.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
baseline_panels = (
    (y_train, baseline_train_prediction, "Baseline Train Confusion Matrix"),
    (y_test, baseline_test_prediction, "Baseline Test Confusion Matrix"),
)
for panel_axis, (panel_truth, panel_prediction, panel_title) in zip(ax, baseline_panels):
    ConfusionMatrixDisplay.from_predictions(
        panel_truth, panel_prediction, ax=panel_axis)
    panel_axis.set_title(panel_title)
plt.show()

In [None]:
def get_prediction_metrics(y_true, y_pred):
    """Summarize prediction quality in a flat dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted values compared against ``y_true``. The same values
            are also passed to ``roc_auc_score``.

    Returns:
        A dict with the weighted-average ``"precision"``, ``"recall"`` and
        ``"f1-score"`` from the classification report, plus ``"accuracy"``
        and ``"auc_roc"``, in that key order.
    """
    weighted_average = classification_report(
        y_true, y_pred, output_dict=True)["weighted avg"]
    metrics = {
        name: weighted_average[name]
        for name in ("precision", "recall", "f1-score")
    }
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["auc_roc"] = roc_auc_score(y_true, y_pred)
    return metrics

In [None]:
# One metrics row per split; "dataset" is added last so it stays the final column.
baseline_train_metrics = {
    **get_prediction_metrics(y_train, baseline_train_prediction),
    "dataset": "Train",
}
baseline_test_metrics = {
    **get_prediction_metrics(y_test, baseline_test_prediction),
    "dataset": "Test",
}
baseline_metrics_df = pd.DataFrame(
    [baseline_train_metrics, baseline_test_metrics])
baseline_metrics_df

In [None]:
# A ROC curve needs continuous scores: hard class predictions collapse it to a
# single operating point. Use the predicted probability of the positive class.
# NOTE(review): assumes a binary target whose positive class is the second
# column of predict_proba -- confirm against the model's classes_.
baseline_test_scores = baseline_best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, baseline_test_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr,
         tpr,
         color='darkorange',
         lw=2,
         label=f"ROC curve (area = {roc_auc:0.2f})")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1],
         [0, 1],
         color='navy',
         lw=2,
         linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc='lower right')
plt.show()

### IV.E.3. Export Baseline Best Model

In [None]:
# Serialize the best baseline model to disk in binary mode.
with open('../../models/baseline_best_model.pkl', 'wb') as file:
    pickle.dump(baseline_best_model, file)

## IV.F. Hyperparameters Tuning

### IV.F.1. Hyperparameters List

In [None]:
# Search space for logistic regression; the grid is the full cross product of
# the four lists below.
# NOTE(review): not every penalty/solver pair is supported by
# LogisticRegression, and the 'none' string may be rejected by newer
# scikit-learn releases -- confirm against the installed version.
log_reg_hyperparams = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200, 300, 400, 500]
}

In [None]:
# Grid search over log_reg_hyperparams using all available cores (n_jobs=-1).
log_reg_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_hyperparams,
    n_jobs=-1,
    verbose=420,
)

In [None]:
# Register the grid search under "fine-tuned"; its name is
# "<search class>-<wrapped estimator class>".
models_list["fine-tuned"] = [
    {
        "model_name": "-".join((
            type(log_reg_grid_search).__name__,
            type(log_reg_grid_search.estimator).__name__,
        )),
        "model_object": log_reg_grid_search,
        "model_uid": "",
    },
]

### IV.F.2. Best Model Hyperparameter Retraining

In [None]:
# Run the grid search through the shared training/logging routine. The second
# return value is the same list object as models_list["fine-tuned"].
training_log, models_list_tuned = model_training_and_evaluation(
    models_list=models_list["fine-tuned"],
    model_prefix="tuned_model",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data_configuration="tuned",
    log_path='../../models/logs/training_log.json',
)

In [None]:
# Display the registry after the tuning run, including the "fine-tuned" entry.
models_list

### IV.F.3. Hyperparameter-tuned Model Performance Review

In [None]:
# Rebuild the ranked log table now that the tuned run has been appended; it
# covers every record in the log, not only the tuned one.
all_training_logs_df_tuned = training_log_to_df_converter(training_log)
all_training_logs_df_tuned

In [None]:
models_dict_tuned = {"fine-tuned": models_list_tuned}


# NOTE(review): tuned_model_finder was called here without being defined
# anywhere in the visible notebook. This definition is reconstructed from the
# call site (a list of model dicts plus a name fragment) -- confirm it matches
# the intended lookup.
def tuned_model_finder(models, name_fragment):
    """Return the model object of the first entry whose name contains ``name_fragment``.

    Args:
        models: List of model dicts with ``"model_name"`` and ``"model_object"``.
        name_fragment: Text that must occur in the entry's ``"model_name"``.

    Raises:
        RuntimeError: If no entry name contains ``name_fragment``.
    """
    for model_data in models:
        if name_fragment in model_data["model_name"]:
            return model_data["model_object"]
    raise RuntimeError(
        "No model whose name contains '" + name_fragment + "' was found.")


tuned_best_model = tuned_model_finder(
    models_dict_tuned["fine-tuned"], "GridSearchCV")
tuned_best_model

In [None]:
# Predict on both splits with the tuned model (train first, then test).
tuned_train_prediction, tuned_test_prediction = (
    tuned_best_model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Draw the train and test confusion matrices side by side, one per axis.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
tuned_panels = (
    (y_train, tuned_train_prediction, "Fine-Tuned Train Confusion Matrix"),
    (y_test, tuned_test_prediction, "Fine-Tuned Test Confusion Matrix"),
)
for panel_axis, (panel_truth, panel_prediction, panel_title) in zip(ax, tuned_panels):
    ConfusionMatrixDisplay.from_predictions(
        panel_truth, panel_prediction, ax=panel_axis)
    panel_axis.set_title(panel_title)
plt.show()

In [None]:
# One metrics row per split; "dataset" is added last so it stays the final column.
tuned_train_metrics = {
    **get_prediction_metrics(y_train, tuned_train_prediction),
    "dataset": "Train",
}
tuned_test_metrics = {
    **get_prediction_metrics(y_test, tuned_test_prediction),
    "dataset": "Test",
}
tuned_metrics_df = pd.DataFrame([tuned_train_metrics, tuned_test_metrics])
tuned_metrics_df

In [None]:
# A ROC curve needs continuous scores: hard class predictions collapse it to a
# single operating point. Use the predicted probability of the positive class.
# NOTE(review): assumes a binary target whose positive class is the second
# column of predict_proba -- confirm against the model's classes_.
tuned_test_scores = tuned_best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, tuned_test_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr,
         tpr,
         color='darkorange',
         lw=2,
         label=f"ROC curve (area = {roc_auc:0.2f})")
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1],
         [0, 1],
         color='navy',
         lw=2,
         linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc='lower right')
plt.show()

### IV.F.4. Export Hyperparameter-tuned Best Model

In [None]:
# Serialize the tuned model to disk in binary mode.
with open('../../models/tuned_best_model.pkl', 'wb') as file:
    pickle.dump(tuned_best_model, file)