In [None]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pprint import pp

import pandas as pd

# You need to import custom class for loading pickle file to work.
from classes_v1 import DataHandler, MyEval, MyUtil, RegSwitcher

In [None]:
# Notebook-wide switches for the evaluation loop further down.
# IS_PLOT: when True, MyEval.plot_res() is called for every selected fit.
IS_PLOT = False
# SAVE_PLOT: forwarded as `save=` to MyEval.plot_res(); only has an effect when
# IS_PLOT is True.
SAVE_PLOT = False

In [None]:
# Search the current working directory for pickle files.
from os import listdir
from os.path import isfile, join

# Match the real ".pkl" extension: a bare "pkl" suffix would also accept names
# such as "notespkl". The cheap string test runs before the filesystem check.
# The result is sorted because listdir() returns entries in arbitrary order.
onlyfiles = sorted(
    f for f in listdir(".") if f.endswith(".pkl") and isfile(join(".", f))
)
pp(onlyfiles)

In [None]:
# File to analyse. The name is hard-coded; change it here to inspect another run.
# NOTE(review): presumably MyUtil.load_data() unpickles this file -- only load
# files from a trusted source.
filename = "S04_data_2025-05-16_08-05.pkl"
data_load = MyUtil.load_data(filename=filename)

# List the entries available in the loaded object.
pp(list(data_load.keys()))


In [None]:
# Value later forwarded as `dt=` to MyEval.plot_res().
# NOTE(review): presumably a date-time stamp for naming saved plots -- confirm.
dt = MyUtil.get_dt()

In [None]:
# Pull out the two stored entries used by the rest of the notebook:
# the data handler (provides the train/test splits) and the table of fit results.
data_handler, df_fit = data_load["data_handler"], data_load["df_fit"]

In [None]:
# Inspect the fit-result table as loaded, before any sorting or selection.
df_fit

In [None]:
# Order the candidates from best to worst "rank_test_score". A stable sort keeps
# the stored order among tied ranks, so re-running this cell gives the same result.
df_fit = df_fit.sort_values(by="rank_test_score", kind="stable")

# Keep exactly one complete row per ("id_split", "estimator") pair: the
# best-ranked one. GroupBy.first() is deliberately not used here because it
# returns the first non-null value of each column independently, so it can
# combine values from different rows when the best-ranked row contains NaN.
group_keys = ["id_split", "estimator"]
df_fit_select = (
    df_fit.dropna(subset=group_keys)  # groupby() ignores rows with a missing key
    .drop_duplicates(subset=group_keys, keep="first")  # first row == best rank
    .sort_values(by=group_keys)  # same group ordering that groupby() produces
    .reset_index(drop=True)
)
# Put the key columns first, matching the layout of groupby(...).reset_index().
df_fit_select = df_fit_select[
    group_keys + [c for c in df_fit_select.columns if c not in group_keys]
]

display(df_fit_select)

In [None]:
# One estimator wrapper, created without a base model, is reused for every
# selected fit; each iteration reconfigures it through set_params().
reg = RegSwitcher(base=None)

# Collects one evaluation table per selected fit.
df_arr = []
for idx, fit in df_fit_select.iterrows():
    # Re-create the split/scaling this candidate was tuned with.
    param_split = fit["param_split"]
    data_handler.split_and_scale(**param_split)
    X_train, Y_train = data_handler.get_train()
    X_test, Y_test = data_handler.get_test()

    # Apply the stored hyper-parameters, then refit on the training portion.
    params = fit["params"]
    reg.set_params(**params)
    reg.fit(X_train, Y_train)

    Y_train_pred, Y_test_pred = reg.predict(X_train), reg.predict(X_test)

    # Targets and predictions are passed unchanged to both eval and plotting.
    eval_inputs = {
        "Y_train": Y_train,
        "Y_train_pred": Y_train_pred,
        "Y_test": Y_test,
        "Y_test_pred": Y_test_pred,
    }

    df_arr.append(
        MyEval.eval(
            **eval_inputs,
            id_split=fit["id_split"],
            estimator=fit["estimator"],
        )
    )

    # Plotting is optional and controlled by the notebook-level flags.
    if IS_PLOT:
        id_split = fit["id_split"]
        estimator = fit["estimator"]
        MyEval.plot_res(
            **eval_inputs,
            dt=dt,
            save=SAVE_PLOT,
            file_prefix=f"S05-{estimator}-{id_split}",
        )

# Stack the per-fit tables into one frame with a fresh 0..n-1 index.
df_eval = pd.concat(df_arr, ignore_index=True)

In [None]:
# Show the combined evaluation table (one MyEval.eval() result per selected fit).
df_eval

In [None]:
import seaborn as sns

# Box plot of the "MSE Test" column per estimator, one box per value of the
# "Y" column. A title is added so the figure stands on its own.
ax = sns.boxplot(data=df_eval, x="estimator", y="MSE Test", hue="Y")
ax.set_title("MSE Test per estimator, split by Y");  # ';' hides the Text repr

In [None]:
# Box plot of the "R2 Test" column per estimator, one box per value of the
# "Y" column. A title is added so the figure stands on its own.
ax = sns.boxplot(data=df_eval, x="estimator", y="R2 Test", hue="Y")
ax.set_title("R2 Test per estimator, split by Y");  # ';' hides the Text repr