In [1]:
import sys
# Add the parent directory (relative to the current working directory) to the
# import search path. Presumably this is what lets `src.util` be imported in the
# next cell — confirm against the repository layout.
sys.path.append('..') 

In [19]:
import src.util as utils

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score

from datetime import datetime as dt
from tqdm import tqdm
import yaml
import joblib
import json
import copy
import hashlib

In [5]:
# Switch the working directory to the parent of the current one.
# NOTE(review): this is not idempotent — every re-run of this cell climbs one
# more directory level, so it only gives the intended result once per kernel
# session.
os.chdir(os.path.dirname(os.getcwd()))
# Show the resulting working directory as the cell output.
os.getcwd()

'd:\\ML\\PACMANN INTRO PROJECT'

In [8]:
# Load the project configuration through the src.util helper; later cells read
# it with string keys (e.g. config['train_feng_set_path']).
config = utils.load_config()

In [9]:
# Load the training and validation splits from the pickle paths listed in the
# config. For each split, path index 0 is loaded as the features (X) and path
# index 1 as the labels (y).
# Presumably these are the feature-engineered sets written by an earlier
# pipeline step — confirm.
X_train_feng = utils.pickle_load(config['train_feng_set_path'][0])
y_train_feng = utils.pickle_load(config['train_feng_set_path'][1])
X_val_feng = utils.pickle_load(config['val_feng_set_path'][0])
y_val_feng = utils.pickle_load(config['val_feng_set_path'][1])

In [10]:
def time_stamp(to_str = False):
    """Return the current local time.

    Parameters
    ----------
    to_str : bool, default False
        When truthy, return the time formatted as "YYYY-MM-DD HH:MM:SS";
        otherwise return the ``datetime`` object itself.

    Returns
    -------
    datetime.datetime or str
    """
    current = dt.now()
    return current.strftime("%Y-%m-%d %H:%M:%S") if to_str else current

In [11]:
def create_log_template():
    """Return a fresh, empty training-log record.

    Returns
    -------
    dict
        Maps every log field name to its own new empty list, ready to be
        appended to once per trained model.
    """
    field_names = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "performance",
        "f1_score_avg",
        "data_configurations",
    )
    # A comprehension gives every field its own list (no shared references).
    return {field: [] for field in field_names}

In [12]:
def training_log_updater(current_log, log_path):
    """Append one log entry to the JSON training log and return the full log.

    Parameters
    ----------
    current_log : dict
        Entry to append. It is shallow-copied before being stored.
    log_path : str
        Path of the JSON file holding the list of log entries. A missing or
        blank file is treated as an empty history and is (re)written.

    Returns
    -------
    list
        Every entry now stored in the file, with ``current_log`` last.

    Raises
    ------
    json.JSONDecodeError
        If the file exists, is not blank, and does not contain valid JSON.
    """
    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            raw = file.read()
    except FileNotFoundError:
        raw = ""

    # A missing file and a blank file both mean "no history yet"; anything else
    # must parse as JSON, otherwise the error is surfaced rather than silently
    # overwriting an existing log.
    last_log = json.loads(raw) if raw.strip() else []

    last_log.append(current_log)

    # The context manager closes the file; no explicit close() is needed.
    with open(log_path, "w") as file:
        json.dump(last_log, file)

    return last_log

In [24]:
def training_log_to_df(training_log):
    """Flatten the training log into a single DataFrame ranked by model quality.

    Parameters
    ----------
    training_log : list of dict
        Log entries; each one is passed to ``pd.DataFrame`` as a mapping of
        column name to list of values.

    Returns
    -------
    pd.DataFrame
        All entries stacked row-wise, sorted by ``f1_score_avg`` (descending)
        then ``training_time`` (ascending), with a fresh 0..n-1 index.
        An empty DataFrame is returned when ``training_log`` is empty.
    """
    # Build every per-entry frame first and concatenate once: growing a frame
    # inside the loop re-copies all accumulated rows on each iteration.
    frames = [pd.DataFrame(log) for log in tqdm(training_log)]

    # Nothing logged yet: pd.concat([]) would raise and there is nothing to sort.
    if not frames:
        return pd.DataFrame()

    return (
        pd.concat(frames)
        .sort_values(["f1_score_avg", "training_time"], ascending = [False, True])
        .reset_index(drop = True)
    )

In [28]:
# Baseline estimators, left at library defaults apart from the options shown.
# random_state is pinned to 42 on every estimator that accepts it.
lgr_baseline_multinomial_lbfgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
lgr_baseline_multinomial_sag = LogisticRegression(multi_class='multinomial', solver='sag', random_state=42)
lgr_baseline_ovr_lbfgs = LogisticRegression(multi_class='ovr', solver='lbfgs', random_state=42)
lgr_baseline_ovr_sag = LogisticRegression(multi_class='ovr', solver='sag', random_state=42)
dct_baseline = DecisionTreeClassifier(random_state=42)
rfc_baseline = RandomForestClassifier(random_state=42)
# KNeighborsClassifier defines no random_state parameter; passing one raises
# TypeError when the estimator is constructed, so it is built with defaults.
knn_baseline = KNeighborsClassifier()
# num_class is taken from the number of entries in config['encoder_classes'].
xgb_baseline = XGBClassifier(objective='multi:softmax', num_class=len(config['encoder_classes']), random_state=42)

In [43]:
# Registry of candidate models, keyed by data-sampling configuration.
# Every entry holds a display name, the estimator instance and a uid that
# starts out empty.
list_of_model = {
    "default_sampling" : [
        {"model_name": label, "model_object": estimator, "model_uid": ""}
        for label, estimator in (
            # The logistic-regression variants share a class, so they are named explicitly.
            ("lgr_baseline_multinomial_lbfgs", lgr_baseline_multinomial_lbfgs),
            ("lgr_baseline_multinomial_sag", lgr_baseline_multinomial_sag),
            ("lgr_baseline_ovr_lbfgs", lgr_baseline_ovr_lbfgs),
            ("lgr_baseline_ovr_sag", lgr_baseline_ovr_sag),
            # The remaining estimators are named after their class.
            (type(dct_baseline).__name__, dct_baseline),
            (type(rfc_baseline).__name__, rfc_baseline),
            (type(knn_baseline).__name__, knn_baseline),
            (type(xgb_baseline).__name__, xgb_baseline),
        )
    ]
}

In [17]:
def train_eval_model(list_of_model, prefix_model_name, x_train, y_train, data_configuration_name, x_valid, y_valid, log_path):
    """Fit each candidate model, score it on the validation data and log the run.

    Parameters
    ----------
    list_of_model : list of dict
        Entries with "model_name", "model_object" and "model_uid" keys.
        The list is deep-copied, so the caller's objects are not modified.
    prefix_model_name : str
        Prepended (with a "-") to each entry's model name in the log.
    x_train, y_train
        Data passed to each estimator's ``fit``.
    data_configuration_name : str
        Label stored with every log row.
    x_valid, y_valid
        Data used for ``predict`` and the classification report.
    log_path : str
        Log file path forwarded to ``training_log_updater``.

    Returns
    -------
    tuple
        ``(training_log, models)``: the value returned by
        ``training_log_updater`` and the deep-copied model list, now fitted and
        with "model_uid" filled in.
    """
    # Work on a private copy so the estimators passed in stay untouched.
    candidates = copy.deepcopy(list_of_model)
    run_log = create_log_template()

    for entry in tqdm(candidates):
        full_name = prefix_model_name + "-" + entry["model_name"]
        estimator = entry["model_object"]

        fit_started = time_stamp()
        estimator.fit(x_train, y_train)
        fit_finished = time_stamp()

        report = classification_report(y_valid, estimator.predict(x_valid), output_dict = True)

        # The uid is the MD5 hex digest of the start and finish timestamps joined together.
        uid = hashlib.md5((str(fit_started) + str(fit_finished)).encode()).hexdigest()
        entry["model_uid"] = uid

        record = {
            "model_name": full_name,
            "model_uid": uid,
            "training_time": (fit_finished - fit_started).total_seconds(),
            "training_date": str(fit_started),
            "performance": report,
            "f1_score_avg": report["macro avg"]["f1-score"],
            "data_configurations": data_configuration_name,
        }
        for field, value in record.items():
            run_log[field].append(value)

    return training_log_updater(run_log, log_path), candidates

In [44]:
# Fit and evaluate every "default_sampling" baseline on the feature-engineered
# train/validation data, appending the run to the configured training log.
training_log, list_of_model_train = train_eval_model(
    list_of_model=list_of_model["default_sampling"],
    prefix_model_name="baseline_model",
    x_train=X_train_feng,
    y_train=y_train_feng,
    data_configuration_name="default_sampling",
    x_valid=X_val_feng,
    y_valid=y_val_feng,
    log_path=config['models_training_log_path'],
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  5.43it/s]


In [45]:
# Keep an independent copy of the fitted models in the registry.
# NOTE(review): these models were trained with the "default_sampling"
# configuration but are stored under the "undersampling" key — confirm the key
# is intentional.
list_of_model["undersampling"] = copy.deepcopy(list_of_model_train)

In [46]:
# Turn the accumulated training log into one DataFrame, ranked with the highest
# f1_score_avg first.
training_res = training_log_to_df(training_log)

100%|██████████| 1/1 [00:00<00:00, 333.36it/s]


In [47]:
# Display the ranked results table as the cell output.
training_res

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,baseline_model-XGBClassifier,469110b4e0acddc0c78dabcf361ecd2a,0.399496,2023-04-08 23:10:32.084369,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.953901,default_sampling
1,baseline_model-KNeighborsClassifier,c71b7757460c16447a2c75d3a0d012cf,0.003103,2023-04-08 23:10:32.027975,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.951555,default_sampling
2,baseline_model-RandomForestClassifier,c1afe5ce13011c27f36d094612aa4cb3,0.520536,2023-04-08 23:10:31.477917,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.945304,default_sampling
3,baseline_model-DecisionTreeClassifier,cd6a533c9af7f37d5814b387db5f208f,0.008973,2023-04-08 23:10:31.461946,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.945269,default_sampling
4,baseline_model-lgr_baseline_ovr_sag,cf0a3853013902ef720c79e7ff685823,0.059541,2023-04-08 23:10:31.396403,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.839731,default_sampling
5,baseline_model-lgr_baseline_ovr_lbfgs,cd8915b0ea7280a93502e9f14700f35d,0.088689,2023-04-08 23:10:31.300715,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.839731,default_sampling
6,baseline_model-lgr_baseline_multinomial_sag,06a4f72c39c61c6eab84f0d33d883bd0,0.079999,2023-04-08 23:10:31.210190,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.837271,default_sampling
7,baseline_model-lgr_baseline_multinomial_lbfgs,45b7293aefa160d1d3ab281d2cc96d47,0.170586,2023-04-08 23:10:31.026558,"{'0': {'precision': 1.0, 'recall': 1.0, 'f1-sc...",0.837271,default_sampling
