# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **Hardefa Rogonondo** | hardefarogonondo@gmail.com | **IBRD Credit Scorecard Predictive Engine** |

# II. Notebook Target Definition

This notebook covers the model training and evaluation stage of the IBRD Credit Scorecard Predictive Engine project. Here, we train and test predictive models on our preprocessed and feature-engineered loan data. We use metrics such as the confusion matrix, ROC-AUC, and F1 score to gauge model performance. The trade-offs between the different types of prediction errors are evaluated to choose the most suitable model, in line with our business needs. The result is a robust model capable of effectively predicting which loans are most likely to be cancelled or terminated, supporting more informed loan management decisions.

# III. Notebook Setup

## III.A. Import Libraries

In [None]:
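# TODO: re-check this import list; several names (e.g. cross_val_score, roc_auc_score, np) are not used by the cells shown here.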
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier
import hashlib
import json
import numpy as np
import pandas as pd
import pickle

# Lift pandas' display limits so that DataFrames rendered in this notebook
# show every column and every row instead of a truncated view.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## III.B. Import Data

In [None]:
# Load the train/test feature matrices and targets from the processed-data
# directory (paths are relative to this notebook's location).
# The "_ohe" suffix presumably means the features are one-hot encoded;
# confirm against the preprocessing notebook.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only load files produced by this project's own pipeline.
X_train = pd.read_pickle('../../data/processed/X_train_ohe.pkl')
X_test = pd.read_pickle('../../data/processed/X_test_ohe.pkl')
y_train = pd.read_pickle('../../data/processed/y_train.pkl')
y_test = pd.read_pickle('../../data/processed/y_test.pkl')

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()

In [None]:
# Preview the first rows of the training target.
y_train.head()

In [None]:
# Preview the first rows of the test target.
y_test.head()

# IV. Models Training

## IV.A. Data Shape Inspection

In [None]:
# Shapes of the train and test feature sets, shown side by side.
X_train.shape, X_test.shape

In [None]:
# Shapes of the train and test targets, shown side by side.
y_train.shape, y_test.shape

## IV.B. Data Information Inspection

In [None]:
# Print the pandas info() summary for the training features.
X_train.info()

In [None]:
# Print the pandas info() summary for the test features.
X_test.info()

In [None]:
# Print the pandas info() summary for the training target.
y_train.info()

In [None]:
# Print the pandas info() summary for the test target.
y_test.info()

## IV.C. Training Log

In [None]:
def time_stamp():
    """Return the current local date and time as a ``datetime`` object.

    No timezone is passed to ``datetime.now()``, so the result is naive.
    """
    current_moment = datetime.now()
    return current_moment

def create_logger():
    """Build an empty, column-oriented training log.

    Returns:
        dict: One key per logged attribute, each mapped to its own fresh
        empty list. The i-th element of every list is meant to describe the
        i-th trained model.
    """
    log_fields = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "performance",
        "f1_score_avg",
        "data_configurations",
    )
    # The comprehension creates a distinct list per field, so appending to
    # one column never leaks into another.
    return {field: [] for field in log_fields}

def training_log_updater(current_log, log_path):
    """Append one training-log record to the JSON log file and return the history.

    The file at ``log_path`` holds a JSON list with one element per training
    session. If the file does not exist yet, the history starts empty and the
    parent directory is created when needed, so a missing log folder cannot
    abort the notebook after the models have already been trained.

    Args:
        current_log: JSON-serialisable record to append (the dict filled in
            by ``model_training_and_evaluation``).
        log_path: Path of the JSON log file.

    Returns:
        list: The full history, including ``current_log`` as its last element.

    Raises:
        json.JSONDecodeError: If an existing log file is not valid JSON. The
            file is left untouched in that case rather than overwritten.
    """
    import os  # local import: the notebook's import cell does not provide os

    try:
        with open(log_path, "r", encoding="utf-8") as file:
            last_log = json.load(file)
    except FileNotFoundError:
        # First run: no history yet. Start from an empty list instead of
        # writing "[]" to disk only to read it straight back.
        last_log = []
        parent_dir = os.path.dirname(log_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    last_log.append(current_log)
    with open(log_path, "w", encoding="utf-8") as file:
        json.dump(last_log, file)
    return last_log

def model_training_and_evaluation(models_list, model_prefix, X_train, y_train, X_test, y_test, data_configuration, log_path):
    """Fit every model in ``models_list``, score it on the test set and log the run.

    Args:
        models_list: List of dicts with the keys ``"model_name"``,
            ``"model_object"`` and ``"model_uid"``. Each dict is updated in
            place with the uid generated for its fit.
        model_prefix: Label prepended to each model name in the log, joined
            with ``"-"``.
        X_train, y_train: Data passed to each model's ``fit``.
        X_test, y_test: Data used for ``predict`` and the classification report.
        data_configuration: Label stored with every record of this run.
        log_path: JSON log file handed to ``training_log_updater``.

    Returns:
        tuple: ``(training_log, models_list)``, where ``training_log`` is
        whatever ``training_log_updater`` returns and ``models_list`` is the
        same list object that was passed in.
    """
    run_log = create_logger()
    for entry in tqdm(models_list):
        estimator = entry["model_object"]
        display_name = "-".join([model_prefix, entry["model_name"]])

        # Only the fit call is timed; prediction and scoring are excluded.
        fit_started = time_stamp()
        estimator.fit(X_train, y_train)
        fit_finished = time_stamp()

        report = classification_report(y_test, estimator.predict(X_test), output_dict=True)

        # The uid is an MD5 digest of the fit's start and end timestamps.
        uid_source = str(fit_started) + str(fit_finished)
        uid = hashlib.md5(uid_source.encode()).hexdigest()
        entry["model_uid"] = uid

        record = {
            "model_name": display_name,
            "model_uid": uid,
            "training_time": (fit_finished - fit_started).total_seconds(),
            "training_date": str(fit_started),
            "performance": report,
            "f1_score_avg": report["macro avg"]["f1-score"],
            "data_configurations": data_configuration,
        }
        for field, value in record.items():
            run_log[field].append(value)

    return training_log_updater(run_log, log_path), models_list

def training_log_to_df_converter(training_log):
    """Flatten the training-log history into one ranked DataFrame.

    Each element of ``training_log`` is a column-oriented dict as produced by
    ``create_logger``. Its ``"performance"`` column (nested classification
    report dicts) is expanded into flat columns via ``pd.json_normalize``.

    The per-session frames are collected in a list and concatenated once.
    Concatenating onto a growing frame inside the loop re-copies all earlier
    rows on every iteration and starts from an empty DataFrame, which newer
    pandas versions warn about.

    Args:
        training_log: Iterable of per-session log dicts.

    Returns:
        pandas.DataFrame: One row per trained model, sorted by
        ``f1_score_avg`` (descending) and then ``training_time`` (ascending),
        with a fresh 0..n-1 index. An empty DataFrame is returned when
        ``training_log`` contains no sessions.
    """
    session_frames = []
    for log in tqdm(training_log):
        individual_log_df = pd.DataFrame(log)
        performance_df = pd.json_normalize(individual_log_df["performance"])
        session_frames.append(
            pd.concat([individual_log_df.drop("performance", axis=1), performance_df], axis=1)
        )

    if not session_frames:
        # Nothing logged: there are no columns to sort by, so return early.
        return pd.DataFrame()

    all_training_logs_df = pd.concat(session_frames)
    all_training_logs_df = all_training_logs_df.sort_values(
        ["f1_score_avg", "training_time"], ascending=[False, True]
    )
    return all_training_logs_df.reset_index(drop=True)

def best_model_finder(all_training_logs_df, models_list):
    """Return the model object whose uid matches the top row of the log table.

    Args:
        all_training_logs_df: Ranked log table; its first row is treated as
            the best model and must have a ``"model_uid"`` column.
        models_list: Mapping of data-configuration name to a list of model
            dicts, each with ``"model_uid"`` and ``"model_object"`` keys.

    Returns:
        The ``"model_object"`` of the first entry whose uid matches.

    Raises:
        RuntimeError: If no entry in ``models_list`` carries the best uid.
    """
    best_model_uid = all_training_logs_df.iloc[0]["model_uid"]
    for configuration_models in models_list.values():
        for model_data in configuration_models:
            if model_data["model_uid"] == best_model_uid:
                # Return straight away: this stops both loops and avoids
                # comparing the model with None via ``==``, which objects
                # overloading equality (e.g. array-likes) can break.
                return model_data["model_object"]
    raise RuntimeError("The best model not found in your list of model.")

## IV.D. Baseline Models

In [None]:
# Baseline candidates, each created with its library-default hyperparameters.
# TODO(review): the original note here only said "check again"; confirm the
# model line-up is final.
# NOTE(review): no random_state is passed, so the tree-based models may give
# slightly different results from run to run.
log_reg_baseline = LogisticRegression()
decision_tree_baseline = DecisionTreeClassifier()
random_forest_baseline = RandomForestClassifier()
xgb_baseline = XGBClassifier()

In [None]:
# Model registry keyed by data configuration. Every entry records the
# estimator's class name, the estimator itself, and an empty uid placeholder
# that model_training_and_evaluation fills in after fitting.
models_list = {
    "vanilla": [
        {"model_name": baseline.__class__.__name__, "model_object": baseline, "model_uid": ""}
        for baseline in (
            log_reg_baseline,
            decision_tree_baseline,
            random_forest_baseline,
            xgb_baseline,
        )
    ]
}

### IV.D.1. Vanilla Models

In [None]:
# Fit and score every baseline model on the "vanilla" data configuration and
# append the run to the JSON training log.
training_log, models_list_vanilla = model_training_and_evaluation(
    models_list=models_list["vanilla"],
    model_prefix="baseline_model",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data_configuration="vanilla",
    log_path='../../models/logs/training_log.json',
)

In [None]:
# model_training_and_evaluation returns the same list object it was given
# (with each model_uid filled in place), so this reassignment changes nothing
# and only makes the update explicit.
models_list["vanilla"] = models_list_vanilla

## IV.E. Models Selection

In [None]:
# Majority-class benchmark: the accuracy a model would achieve if it always
# predicted the most common training label.
# value_counts() sorts by frequency in descending order, so the first entry by
# position is the most common label. Indexing with [0] would instead look up
# the label 0, which is the majority class only by coincidence.
benchmark = y_train.value_counts(normalize = True).iloc[0]
benchmark

In [None]:
# Flatten the log history into one table ranked by macro F1 and training time.
# training_log is the full content of the JSON log file (training_log_updater
# loads the file before appending), so runs already stored there are included.
all_training_logs_df = training_log_to_df_converter(training_log)
all_training_logs_df

In [None]:
# Fetch the fitted estimator behind the top-ranked row of the log table.
# NOTE(review): the table also holds runs stored in the log file earlier; if
# the top row's uid is not in the in-memory models_list, best_model_finder
# raises RuntimeError.
baseline_best_model = best_model_finder(all_training_logs_df, models_list)
baseline_best_model

### IV.E.1. Confusion Matrix Review

In [None]:
# Predict on the test set with the best baseline model and plot the
# resulting confusion matrix.
y_prediction = baseline_best_model.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_prediction)

In [None]:
from sklearn import metrics

In [None]:
# Same confusion matrix as raw counts, built from the baseline predictions.
# The name confusion_matrix now holds the resulting array.
confusion_matrix = metrics.confusion_matrix(y_test, y_prediction)
print(confusion_matrix)

In [None]:
# Class counts in the test target, for comparison with the confusion matrix.
y_test.value_counts()

### IV.E.2. Export Baseline Best Model

In [None]:
# with open("../../models/baseline_best_model.pkl", "wb") as file:
#     pickle.dump(baseline_best_model, file)

## IV.F. Hyperparameter Tuning

### IV.F.1. Hyperparameters List

In [None]:
# Search grid for the XGBoost classifier: 3 * 4 * 3 * 3 * 3 = 324 parameter
# combinations, each of which is refit once per cross-validation fold.
xgb_hyperparams = {
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7, 10],
    "n_estimators": [100, 200, 500],
    "subsample": [0.5, 0.7, 1],
    "colsample_bytree": [0.5, 0.7, 1],
}

In [None]:
# Exhaustive grid search over xgb_hyperparams, using all available cores.
# NOTE(review): no `scoring` is passed, so GridSearchCV uses the estimator's
# default score (accuracy for classifiers), while the training log ranks
# models by macro F1. Confirm this mismatch is intended.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# verify against the installed version.
xgb_grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_hyperparams, n_jobs = -1, verbose = 420)

# Register the search object as one more "vanilla" candidate so it goes
# through the same training and logging routine as the baselines.
models_list["vanilla"].append({
    "model_name": xgb_grid_search.__class__.__name__ + "-" + xgb_grid_search.estimator.__class__.__name__,
    "model_object": xgb_grid_search,
    "model_uid": ""
})

### IV.F.2. Best Model Hyperparameter Retraining

In [None]:
# Run only the grid-search entry (the last item of the "vanilla" list) through
# the training and logging routine; the baselines are not refit.
training_log, models_list_vanilla_tuned = model_training_and_evaluation(
    models_list=[models_list["vanilla"][-1]],
    model_prefix="tuned_model",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    data_configuration="vanilla",
    log_path='../../models/logs/training_log.json',
)

In [None]:
# Inspect the grid-search entry; the training call updates its model_uid in place.
models_list["vanilla"][-1]

In [None]:
# Rebuild the ranked log table, which now also contains the tuned run.
all_training_logs_df_tuned = training_log_to_df_converter(training_log)
all_training_logs_df_tuned

In [None]:
# Fetch the estimator behind the top-ranked row of the updated log table.
# NOTE(review): best_model_finder picks the top row across all logged runs,
# so this is the tuned model only if it actually ranks first. Otherwise a
# baseline model is returned, or RuntimeError is raised when the top row
# comes from a run that is not in models_list.
tuned_best_model = best_model_finder(all_training_logs_df_tuned, models_list)
tuned_best_model

In [None]:
# Predict on the test set with the selected model and plot its confusion
# matrix. This overwrites the y_prediction produced for the baseline model.
y_prediction = tuned_best_model.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_prediction)

In [None]:
# Recompute the matrix from the tuned model's predictions; the existing
# confusion_matrix value was built from the baseline model's predictions.
confusion_matrix = metrics.confusion_matrix(y_test, y_prediction)
confusion_matrix

### IV.F.3. Export Hyperparameter-tuned Best Model

In [None]:
# with open("../../models/tuned_best_model.pkl", "wb") as file:
#     pickle.dump(tuned_best_model, file)