In [None]:
import pickle
from datetime import datetime
from pprint import pp

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [None]:
# When True, the final cells pickle the results to disk and reload them as a check.
SAVE_DATA = True

In [None]:
class MyUtil:
    """Namespace for small helpers: pickle persistence and run timestamps.

    All helpers are static, so they can be called on the class
    (``MyUtil.get_dt()``) or on an instance.
    """

    @staticmethod
    def save_data(filename, data):
        """Serialize ``data`` with pickle and write it to ``filename``.

        Args:
            filename: Path of the file to create or overwrite.
            data: Any picklable object.
        """
        with open(filename, "wb") as file:
            pickle.dump(data, file)

    @staticmethod
    def load_data(filename):
        """Read ``filename`` and return the unpickled object.

        Warning: unpickling can execute arbitrary code. Only load files that
        were produced by a trusted source (e.g. ``save_data`` in this notebook).

        Args:
            filename: Path of an existing pickle file.

        Returns:
            The object stored in the file.
        """
        with open(filename, "rb") as file:
            return pickle.load(file)

    @staticmethod
    def get_dt():
        """Return the current local time formatted as ``YYYY-MM-DD_HH-MM``."""
        return datetime.now().strftime("%Y-%m-%d_%H-%M")

In [None]:
class DataHandler:
    """Keeps the raw feature/target arrays with their scalers and produces scaled splits.

    The four split attributes (``X_train``, ``X_test``, ``Y_train``, ``Y_test``)
    are ``None`` until ``split_and_scale`` has been called; every later call
    overwrites them and refits both scalers.
    """

    def __init__(self, _X, _Y, scalerX, scalerY):
        """Store the unscaled data and the scaler objects; no splitting happens here."""
        self._X, self._Y = _X, _Y
        self.scalerX, self.scalerY = scalerX, scalerY
        # Filled in by split_and_scale().
        self.X_train = self.X_test = None
        self.Y_train = self.Y_test = None

    def split_and_scale(self, test_size, random_state):
        """Split the raw data, then scale features and targets.

        Both arguments are forwarded to ``train_test_split``. Each scaler is
        fitted on the training part only and then applied to the test part.
        """
        raw_X_train, raw_X_test, raw_Y_train, raw_Y_test = train_test_split(
            self._X,
            self._Y,
            test_size=test_size,
            random_state=random_state,
        )

        # Features: fit on train, reuse the fitted scaler for test.
        self.X_train = self.scalerX.fit_transform(raw_X_train)
        self.X_test = self.scalerX.transform(raw_X_test)

        # Targets: same scheme with the target scaler.
        self.Y_train = self.scalerY.fit_transform(raw_Y_train)
        self.Y_test = self.scalerY.transform(raw_Y_test)

    def get_train(self):
        """Return the ``(X_train, Y_train)`` pair from the most recent split."""
        return (self.X_train, self.Y_train)

    def get_test(self):
        """Return the ``(X_test, Y_test)`` pair from the most recent split."""
        return (self.X_test, self.Y_test)

In [None]:
class RegSwitcher(BaseEstimator):
    """Thin estimator wrapper that delegates fitting and prediction to ``base``.

    Because the wrapped model is a constructor parameter, a parameter grid can
    pick the model itself (``{"base": [...]}``) and tune its nested parameters
    (``base__...``) within a single search.
    """

    def __init__(self, base=None):
        # Stored unchanged under the same name as the constructor argument.
        self.base = base

    def fit(self, X, Y):
        """Fit the wrapped ``base`` estimator on ``X``/``Y`` and return ``self``.

        ``base`` must have been set; with the default ``None`` this call fails
        because ``None`` has no ``fit`` method.
        """
        self.base.fit(X, Y)
        # Marker attribute recording that fit() has completed.
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Return the prediction of the wrapped ``base`` estimator for ``X``."""
        return self.base.predict(X)

In [None]:
# Run timestamp (minute resolution); used below to build the output pickle filename.
dt = MyUtil.get_dt()

### Read data

In [None]:
# Load the experiment table from the Excel file; the "exp" column becomes the row index.
df = pd.read_excel("data.xlsx", index_col="exp")
df.head()

In [None]:
# Summary of the loaded frame (column dtypes and non-null counts).
df.info()

### Extract data

In [None]:
# Number of trailing columns of `df` that are regression targets; every column
# before them is treated as an input feature. Keeping it in one place guarantees
# the feature and target slices cannot drift apart.
N_TARGETS = 3

_X = df.iloc[:, :-N_TARGETS].values
_Y = df.iloc[:, -N_TARGETS:].values
print(_X.shape)
print(_Y.shape)

### Initialize DataHandler

In [None]:
# Features and targets each get their own StandardScaler; the scalers are
# (re)fitted inside DataHandler.split_and_scale for every split.
data_handler = DataHandler(
    _X=_X, _Y=_Y, scalerX=StandardScaler(), scalerY=StandardScaler()
)

# Test
# data_handler.split_and_scale(test_size=0.3, random_state=0)
# X_train, Y_train = data_handler.get_train()
# print(X_train.shape)
# print(Y_train.shape)

### Setup hyper-parameter search

#### Splitting parameters

In [None]:
# Train/test split configurations to evaluate. The keys match the keyword
# arguments of DataHandler.split_and_scale, so each expanded entry can be
# passed to it with ** unpacking.
param_grid_split = [{"random_state": [1, 2, 3, 4, 5], "test_size": [0.3]}]
param_list_split = list(ParameterGrid(param_grid_split))
pp(param_list_split)

#### Model hyperparameters

In [None]:
# Candidate models. Each single-output regressor is wrapped in
# MultiOutputRegressor so that it can be fitted on the multi-column target.
base_lr = MultiOutputRegressor(estimator=LinearRegression())
base_svr = MultiOutputRegressor(estimator=SVR())
# The random forest is the only stochastic model here; seed it so repeated
# runs of the notebook give the same scores.
base_rf = MultiOutputRegressor(estimator=RandomForestRegressor(random_state=0))

# This is for testing: shows the nested parameter names (base__estimator__...)
# that the hyper-parameter grid can address.
reg = RegSwitcher(base=base_svr)
pp(reg.get_params())

In [None]:
# One sub-grid per candidate model. "base" selects the wrapped model of
# RegSwitcher; "base__estimator__<name>" addresses parameter <name> of the
# regressor inside that model's MultiOutputRegressor.
param_grid_hyper = [
    {"base": [base_lr]},
    {"base": [base_svr], "base__estimator__C": [0.01, 0.1, 1]},
    {"base": [base_rf], "base__estimator__n_estimators": [10, 50, 200]},
]

In [None]:
# Placeholder estimator: GridSearchCV fills in `base` from param_grid_hyper.
reg = RegSwitcher(base=None)

# Run one grid search per train/test split and collect the per-split result tables.
# NOTE(review): the scalers are fitted on the whole training split before the
# search, so the 3 CV folds inside GridSearchCV share scaling statistics.
df_arr = []
for idx_split, param_split in enumerate(param_list_split):
    data_handler.split_and_scale(**param_split)
    X_train, Y_train = data_handler.get_train()

    # Alternative metric that was tried: scoring="neg_mean_squared_error"
    gs = GridSearchCV(
        estimator=reg,
        param_grid=param_grid_hyper,
        scoring="r2",
        cv=3,
        n_jobs=-1,
    )
    gs.fit(X_train, Y_train)

    # Tag every result row with the split it came from.
    _df = pd.DataFrame(gs.cv_results_)
    _df = _df.assign(
        id_split=idx_split,
        param_split=[param_split] * len(_df),
    )
    df_arr.append(_df)

# Stack all splits; the former per-search row index becomes the "id_gs" column.
df_fit = (
    pd.concat(df_arr)
    .reset_index()
    .rename(columns={"index": "id_gs"})
)

In [None]:
# Inspect the combined grid-search results of all splits.
df_fit

Create a new column called `"estimator"` that contains the class name (as a string) of the estimator used in each row.

1. **The DataFrame (`df_fit`):**
    - Each row represents a different set of hyperparameters tested during grid search.
    - The `"param_base"` column contains `MultiOutputRegressor` objects.

2. **The `.apply()` Method:**
    - `df_fit["param_base"].apply(...)` applies a function to every entry in the `"param_base"` column.

3. **The Lambda Function:**
    - For each entry `x` in `"param_base"`, the lambda function does:
      - `x.estimator`: Accesses the estimator object.
      - `x.estimator.__class__`: Gets the class of the estimator.
      - `x.estimator.__class__.__name__`: Gets the name of the class as a string (e.g., `"RandomForestRegressor"`).

4. **Assigning the Result:**
    - The resulting estimator names are stored in a new column `"estimator"`.


In [None]:
# Label each row with the class name of the regressor wrapped by its
# MultiOutputRegressor (the object stored in the "param_base" column).
df_fit["estimator"] = df_fit["param_base"].apply(
    lambda x: x.estimator.__class__.__name__
)

# Test
# pp(df_fit["param_base"][0].estimator.__class__.__name__)

In [None]:
# Extract only columns that I will use
# Print the full column list first so the selection below can be checked against it.
pp(df_fit.columns)

cols = [
    "id_split",
    "param_split",
    "id_gs",
    "params",
    "estimator",
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]

# Note: "param_base" is not kept, so the cell that derives "estimator" from it
# cannot be re-run after this cell without rebuilding df_fit first.
df_fit = df_fit[cols]
df_fit

### Save data

In [None]:
if SAVE_DATA:
    # File name carries the run timestamp so earlier runs are not overwritten.
    filename = f"S04_data_{dt}.pkl"

    data_save = {
        "desc": "This is the saved data",
        # data_handler holds the split produced by the LAST iteration of the
        # search loop; "param_split" is that same last split configuration.
        "data_handler": data_handler,
        "param_split": param_split,
        # Full list of split configurations that df_fit covers.
        "param_list_split": param_list_split,
        "param_grid_hyper": param_grid_hyper,
        "df_fit": df_fit,
    }

    # Save everything in a single pickle file.
    MyUtil.save_data(filename=filename, data=data_save)

### Test loading data

In [None]:
# Round-trip check: reload the file written by the save cell and print its content.
# `filename` only exists if the save cell ran with SAVE_DATA enabled.
if SAVE_DATA:
    data_load = MyUtil.load_data(filename=filename)

    pp(data_load)