In [2]:
import pickle
from datetime import datetime
from pprint import pp

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split


In [3]:
SAVE_DATA = True

In [4]:
class MyUtil:
    def save_data(filename, data):
        with open(filename, "wb") as file:
            pickle.dump(data, file)

    def load_data(filename):
        with open(filename, "rb") as file:
            data = pickle.load(file)
        return data

    def get_dt():
        return datetime.now().strftime("%Y-%m-%d_%H-%M")

In [5]:
class DataHandler:
    def __init__(self, _X, _Y, scalerX, scalerY):
        self._X = _X
        self._Y = _Y
        self.scalerX = scalerX
        self.scalerY = scalerY
        self.X_train = None
        self.X_test = None
        self.Y_train = None
        self.Y_test = None

    def split_and_scale(self, test_size, random_state):
        _X_train, _X_test, _Y_train, _Y_test = train_test_split(
            self._X, self._Y, test_size=test_size, random_state=random_state
        )
        self.X_train = self.scalerX.fit_transform(_X_train)
        self.X_test = self.scalerX.transform(_X_test)

        self.Y_train = self.scalerY.fit_transform(_Y_train)
        self.Y_test = self.scalerY.transform(_Y_test)

    def get_train(self):
        return self.X_train, self.Y_train

    def get_test(self):
        return self.X_test, self.Y_test

In [6]:
class RegSwitcher(BaseEstimator):
    def __init__(self, base=None):
        self.base = base

    def fit(self, X, Y):
        self.base.fit(X, Y)
        self.is_fitted_ = True
        return self

    def predict(self, X):
        return self.base.predict(X)

In [7]:
dt = MyUtil.get_dt()

### Read data


In [8]:
df = pd.read_excel("data.xlsx", index_col="exp")
df.head()

Unnamed: 0_level_0,m1,m2,m3,s1__autocorrelation__lag_8,s1__autocorrelation__lag_9,s1__autocorrelation__lag_7,s1__autocorrelation__lag_6,s1__autocorrelation__lag_5,s1__autocorrelation__lag_4,s1__longest_strike_above_mean,...,s1__ar_coefficient__coeff_3__k_10,s1__approximate_entropy__m_2__r_0.1,s1__lempel_ziv_complexity__bins_3,s1__partial_autocorrelation__lag_4,"s1__fft_coefficient__attr_""abs""__coeff_7","s1__agg_autocorrelation__f_agg_""var""__maxlag_40",s1__spkt_welch_density__coeff_2,y1,y2,y3
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E001,150.223716,1176.177278,1.142097,-0.305434,-0.519191,-0.074829,0.159896,0.38579,0.590387,14,...,0.183996,0.158567,0.204152,-0.360084,0.293617,0.499488,5.886812e-08,55.460434,1.065917,114.57862
E002,102.534268,1483.654982,1.104716,-0.243785,-0.454262,-0.021002,0.202836,0.416423,0.608972,14,...,0.18437,0.144742,0.203008,-0.344364,6.142373,0.477743,3.643621e-06,50.640306,1.285666,124.651484
E003,119.890549,1254.897451,2.162773,-0.329006,-0.543405,-0.095913,0.142612,0.373002,0.582151,14,...,0.184036,0.144268,0.208163,-0.364611,26.783283,0.506435,0.0001590028,50.832405,1.154859,57.018054
E004,162.830799,1302.043195,1.308283,-0.065152,-0.266498,0.138913,0.337187,0.521401,0.683873,16,...,0.187213,0.137326,0.193662,-0.355441,33.227591,0.460547,0.0007926165,62.476545,1.025161,132.221218
E005,165.720956,1154.482314,1.56683,-0.304881,-0.518177,-0.074836,0.159321,0.384728,0.589003,14,...,0.183978,0.128546,0.19244,-0.357588,11.43947,0.49794,0.0001462831,57.634438,1.043776,92.160269


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, E001 to E100
Data columns (total 50 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   m1                                                  100 non-null    float64
 1   m2                                                  100 non-null    float64
 2   m3                                                  100 non-null    float64
 3   s1__autocorrelation__lag_8                          100 non-null    float64
 4   s1__autocorrelation__lag_9                          100 non-null    float64
 5   s1__autocorrelation__lag_7                          100 non-null    float64
 6   s1__autocorrelation__lag_6                          100 non-null    float64
 7   s1__autocorrelation__lag_5                          100 non-null    float64
 8   s1__autocorrelation__lag_4                          100 non-null    float64
 9   

### Extract data


In [10]:
_X = df.iloc[:, :-3].values
_Y = df.iloc[:, -3:].values
print(_X.shape)
print(_Y.shape)

(100, 47)
(100, 3)


### Initialize DataHandler


In [11]:
from sklearn.preprocessing import StandardScaler

data_handler = DataHandler(
    _X=_X, _Y=_Y, scalerX=StandardScaler(), scalerY=StandardScaler()
)

# Test
# data_handler.split_and_scale(test_size=0.3, random_state=0)
# X_train, Y_train = ds.get_train()
# print(X_train.shape)
# print(Y_train.shape)

### Setup hyper-parameter search


#### Spliting parameters


In [12]:
from sklearn.model_selection import ParameterGrid

param_grid_split = [{"random_state": [1, 2, 3, 4, 5], "test_size": [0.3]}]
param_list_split = list(ParameterGrid(param_grid_split))
pp(param_list_split)

[{'random_state': 1, 'test_size': 0.3},
 {'random_state': 2, 'test_size': 0.3},
 {'random_state': 3, 'test_size': 0.3},
 {'random_state': 4, 'test_size': 0.3},
 {'random_state': 5, 'test_size': 0.3}]


#### Model hyper parameters


In [13]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

base_lr = MultiOutputRegressor(estimator=LinearRegression())
base_svr = MultiOutputRegressor(estimator=SVR())
base_rf = MultiOutputRegressor(estimator=RandomForestRegressor())

# This is for testing
reg = RegSwitcher(base=base_svr)
pp(reg.get_params())

{'base__estimator__C': 1.0,
 'base__estimator__cache_size': 200,
 'base__estimator__coef0': 0.0,
 'base__estimator__degree': 3,
 'base__estimator__epsilon': 0.1,
 'base__estimator__gamma': 'scale',
 'base__estimator__kernel': 'rbf',
 'base__estimator__max_iter': -1,
 'base__estimator__shrinking': True,
 'base__estimator__tol': 0.001,
 'base__estimator__verbose': False,
 'base__estimator': SVR(),
 'base__n_jobs': None,
 'base': MultiOutputRegressor(estimator=SVR())}


In [14]:
param_grid_hyper = [
    {"base": [base_lr]},
    {"base": [base_svr], "base__estimator__C": [0.01, 0.1, 1]},
    {"base": [base_rf], "base__estimator__n_estimators": [10, 50, 200]},
]

In [15]:
# Initialize blank model (optional)
reg = RegSwitcher(base=None)


df_arr = []
for idx_split, param_split in enumerate(param_list_split):
    data_handler.split_and_scale(**param_split)
    X_train, Y_train = data_handler.get_train()

    gs = GridSearchCV(
        estimator=reg,
        param_grid=param_grid_hyper,
        cv=3,
        # scoring="neg_mean_squared_error",
        scoring="r2",
        n_jobs=-1,
    )
    gs.fit(X_train, Y_train)
    _df = pd.DataFrame(gs.cv_results_)
    _df["id_split"] = idx_split
    _df["param_split"] = [param_split for _ in range(_df.shape[0])]
    df_arr.append(_df)

df_fit = pd.concat(df_arr)
df_fit = df_fit.reset_index().rename(columns={"index": "id_gs"})

In [16]:
df_fit

Unnamed: 0,id_gs,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base,param_base__estimator__C,param_base__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,id_split,param_split
0,0,0.005696,0.001312,0.002684,0.000632,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,0.038175,0.752024,-7.906865,-2.372222,3.924419,7,0,"{'random_state': 1, 'test_size': 0.3}"
1,1,0.006367,0.000472,0.002669,0.000474,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,-0.002276,-0.041889,0.024655,-0.006503,0.02733,6,0,"{'random_state': 1, 'test_size': 0.3}"
2,2,0.006357,0.002599,0.00251,0.000408,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.302295,0.30957,0.302825,0.304896,0.003312,5,0,"{'random_state': 1, 'test_size': 0.3}"
3,3,0.004357,0.000625,0.002178,0.000233,MultiOutputRegressor(estimator=SVR()),1.0,,{'base': MultiOutputRegressor(estimator=SVR())...,0.617707,0.814445,0.800537,0.744229,0.089645,4,0,"{'random_state': 1, 'test_size': 0.3}"
4,4,0.06521,0.004865,0.005855,0.000949,MultiOutputRegressor(estimator=RandomForestReg...,,10.0,{'base': MultiOutputRegressor(estimator=Random...,0.706828,0.85099,0.699727,0.752515,0.069693,3,0,"{'random_state': 1, 'test_size': 0.3}"
5,5,0.282355,0.031672,0.014225,0.001709,MultiOutputRegressor(estimator=RandomForestReg...,,50.0,{'base': MultiOutputRegressor(estimator=Random...,0.720217,0.855995,0.738507,0.771573,0.06016,1,0,"{'random_state': 1, 'test_size': 0.3}"
6,6,0.989156,0.017984,0.042846,0.002366,MultiOutputRegressor(estimator=RandomForestReg...,,200.0,{'base': MultiOutputRegressor(estimator=Random...,0.728403,0.847749,0.721851,0.766001,0.057866,2,0,"{'random_state': 1, 'test_size': 0.3}"
7,0,0.003851,0.000473,0.001334,0.000471,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,-0.139373,0.777359,-6.449578,-1.937197,3.212609,7,1,"{'random_state': 2, 'test_size': 0.3}"
8,1,0.00351,0.000713,0.002341,0.000242,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,0.042289,0.032906,-0.041063,0.011377,0.037278,6,1,"{'random_state': 2, 'test_size': 0.3}"
9,2,0.003503,0.000408,0.002018,0.000408,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.328093,0.341163,0.244914,0.304724,0.042627,5,1,"{'random_state': 2, 'test_size': 0.3}"


In [17]:
# Create a new column called `"estimator"` that contains the class name (as a string) of the estimator used in each row.
# 1. **The DataFrame (`df_fit`):**
#    - Each row represents a different set of hyperparameters tested during grid search.
#    - The `"param_base"` column contains objects `MultiOutputRegressor`.
# 2. **The `.apply()` Method:**
#    - `df_fit["param_base"].apply(...)` applies a function to every entry in the `"param_base"` column.
# 3. **The Lambda Function:**
#    - For each entry `x` in `"param_base"`, the lambda function does:
#      - `x.estimator`: Accesses the estimator object.
#      - `x.estimator.__class__`: Gets the class of the estimator.
#      - `x.estimator.__class__.__name__`: Gets the name of the class as a string (e.g., `"RandomForestClassifier"`).
# 4. **Assigning the Result:**
#    - The resulting estimator names are stored in a new column `"estimator"`.
   
df_fit["estimator"] = df_fit["param_base"].apply(
    lambda x: x.estimator.__class__.__name__
)

# Test
# pp(df_eval["param_base"][0].estimator.__class__.__name__)

In [18]:
df_fit

Unnamed: 0,id_gs,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base,param_base__estimator__C,param_base__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,id_split,param_split,estimator
0,0,0.005696,0.001312,0.002684,0.000632,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,0.038175,0.752024,-7.906865,-2.372222,3.924419,7,0,"{'random_state': 1, 'test_size': 0.3}",LinearRegression
1,1,0.006367,0.000472,0.002669,0.000474,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,-0.002276,-0.041889,0.024655,-0.006503,0.02733,6,0,"{'random_state': 1, 'test_size': 0.3}",SVR
2,2,0.006357,0.002599,0.00251,0.000408,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.302295,0.30957,0.302825,0.304896,0.003312,5,0,"{'random_state': 1, 'test_size': 0.3}",SVR
3,3,0.004357,0.000625,0.002178,0.000233,MultiOutputRegressor(estimator=SVR()),1.0,,{'base': MultiOutputRegressor(estimator=SVR())...,0.617707,0.814445,0.800537,0.744229,0.089645,4,0,"{'random_state': 1, 'test_size': 0.3}",SVR
4,4,0.06521,0.004865,0.005855,0.000949,MultiOutputRegressor(estimator=RandomForestReg...,,10.0,{'base': MultiOutputRegressor(estimator=Random...,0.706828,0.85099,0.699727,0.752515,0.069693,3,0,"{'random_state': 1, 'test_size': 0.3}",RandomForestRegressor
5,5,0.282355,0.031672,0.014225,0.001709,MultiOutputRegressor(estimator=RandomForestReg...,,50.0,{'base': MultiOutputRegressor(estimator=Random...,0.720217,0.855995,0.738507,0.771573,0.06016,1,0,"{'random_state': 1, 'test_size': 0.3}",RandomForestRegressor
6,6,0.989156,0.017984,0.042846,0.002366,MultiOutputRegressor(estimator=RandomForestReg...,,200.0,{'base': MultiOutputRegressor(estimator=Random...,0.728403,0.847749,0.721851,0.766001,0.057866,2,0,"{'random_state': 1, 'test_size': 0.3}",RandomForestRegressor
7,0,0.003851,0.000473,0.001334,0.000471,MultiOutputRegressor(estimator=LinearRegressio...,,,{'base': MultiOutputRegressor(estimator=Linear...,-0.139373,0.777359,-6.449578,-1.937197,3.212609,7,1,"{'random_state': 2, 'test_size': 0.3}",LinearRegression
8,1,0.00351,0.000713,0.002341,0.000242,MultiOutputRegressor(estimator=SVR()),0.01,,{'base': MultiOutputRegressor(estimator=SVR())...,0.042289,0.032906,-0.041063,0.011377,0.037278,6,1,"{'random_state': 2, 'test_size': 0.3}",SVR
9,2,0.003503,0.000408,0.002018,0.000408,MultiOutputRegressor(estimator=SVR()),0.1,,{'base': MultiOutputRegressor(estimator=SVR())...,0.328093,0.341163,0.244914,0.304724,0.042627,5,1,"{'random_state': 2, 'test_size': 0.3}",SVR


In [19]:
# This imports Python’s built-in regular expression module, which is used for pattern matching in strings.
import re

# - This regex pattern matches column names like `split0_test_score`, `split1_test_score`, etc.
#     - `split` - literal string
#     - `\d+` - one or more digits
#     - `_test_score` - literal string
pattern = r"split\d+_test_score"

# - Iterates through all column names in `df_fit`
# - Selects columns whose names match the pattern exactly
# - **Result:** `colsSplitTestScore` is a list of all columns like `split0_test_score`, `split1_test_score`, etc.
colsSplitTestScore = [col for col in df_fit.columns if re.fullmatch(pattern, col)]

# - Selects all the split test score columns from `df_fit`
# - For each row, collects the values from these columns into a NumPy array
# - Assigns this array to a new column called `validation_scores`
# - **Result:** Each row in `validation_scores` contains an array of validation scores across all splits for that parameter set
df_fit["validation_scores"] = df_fit[colsSplitTestScore].apply(
    lambda row: row.values, axis=1
)

In [None]:
# Extract only columns that I will use
pp(df_fit.columns)sdfasdf

pattern = r"split\d+_test_score"
colsSplitTestScore = [col for col in df_fit.columns if re.fullmatch(pattern, col)]
pp(colsSplitTestScore)


cols = [
    "id_split",
    "param_split",
    "id_gs",
    "params",
    "estimator",
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
    "validation_scores",
]

df_fit = df_fit[cols]
df_fit

Index(['id_gs', 'mean_fit_time', 'std_fit_time', 'mean_score_time',
       'std_score_time', 'param_base', 'param_base__estimator__C',
       'param_base__estimator__n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'id_split', 'param_split',
       'estimator', 'validation_scores'],
      dtype='object')
['split0_test_score', 'split1_test_score', 'split2_test_score']


Unnamed: 0,id_split,param_split,id_gs,params,estimator,mean_test_score,std_test_score,rank_test_score,validation_scores
0,0,"{'random_state': 1, 'test_size': 0.3}",0,{'base': MultiOutputRegressor(estimator=Linear...,LinearRegression,-2.372222,3.924419,7,"[0.0381745359446262, 0.7520240440765592, -7.90..."
1,0,"{'random_state': 1, 'test_size': 0.3}",1,{'base': MultiOutputRegressor(estimator=SVR())...,SVR,-0.006503,0.02733,6,"[-0.0022755109373579683, -0.04188876497945082,..."
2,0,"{'random_state': 1, 'test_size': 0.3}",2,{'base': MultiOutputRegressor(estimator=SVR())...,SVR,0.304896,0.003312,5,"[0.302294515805052, 0.3095696294549572, 0.3028..."
3,0,"{'random_state': 1, 'test_size': 0.3}",3,{'base': MultiOutputRegressor(estimator=SVR())...,SVR,0.744229,0.089645,4,"[0.6177065038720949, 0.8144446900525925, 0.800..."
4,0,"{'random_state': 1, 'test_size': 0.3}",4,{'base': MultiOutputRegressor(estimator=Random...,RandomForestRegressor,0.752515,0.069693,3,"[0.7068284888375006, 0.8509901556431202, 0.699..."
5,0,"{'random_state': 1, 'test_size': 0.3}",5,{'base': MultiOutputRegressor(estimator=Random...,RandomForestRegressor,0.771573,0.06016,1,"[0.7202168450001457, 0.8559946676769449, 0.738..."
6,0,"{'random_state': 1, 'test_size': 0.3}",6,{'base': MultiOutputRegressor(estimator=Random...,RandomForestRegressor,0.766001,0.057866,2,"[0.7284026574157582, 0.8477485276167961, 0.721..."
7,1,"{'random_state': 2, 'test_size': 0.3}",0,{'base': MultiOutputRegressor(estimator=Linear...,LinearRegression,-1.937197,3.212609,7,"[-0.13937271441371277, 0.777359375151465, -6.4..."
8,1,"{'random_state': 2, 'test_size': 0.3}",1,{'base': MultiOutputRegressor(estimator=SVR())...,SVR,0.011377,0.037278,6,"[0.042289212343515126, 0.032905918683216906, -..."
9,1,"{'random_state': 2, 'test_size': 0.3}",2,{'base': MultiOutputRegressor(estimator=SVR())...,SVR,0.304724,0.042627,5,"[0.3280930724834618, 0.3411633989478616, 0.244..."


### Save data


In [21]:
if SAVE_DATA:
    filename = f"S04_data_{dt}.pkl"

    data_save = {
        "desc": "This is the saved data",
        "data_handler": data_handler,
        "param_split": param_split,
        "param_grid_hyper": param_grid_hyper,
        "df_fit": df_fit,
    }

    # Save the model
    MyUtil.save_data(filename=filename, data=data_save)

### Test loading data


In [22]:
if SAVE_DATA:
    data_load = MyUtil.load_data(filename=filename)

    pp(data_load)

{'desc': 'This is the saved data',
 'data_handler': <__main__.DataHandler object at 0x000001D7C0E3E270>,
 'param_split': {'random_state': 5, 'test_size': 0.3},
 'param_grid_hyper': [{'base': [MultiOutputRegressor(estimator=LinearRegression())]},
                      {'base': [MultiOutputRegressor(estimator=SVR())],
                       'base__estimator__C': [0.01, 0.1, 1]},
                      {'base': [MultiOutputRegressor(estimator=RandomForestRegressor())],
                       'base__estimator__n_estimators': [10, 50, 200]}],
 'df_fit':     id_split                            param_split  id_gs  \
0          0  {'random_state': 1, 'test_size': 0.3}      0   
1          0  {'random_state': 1, 'test_size': 0.3}      1   
2          0  {'random_state': 1, 'test_size': 0.3}      2   
3          0  {'random_state': 1, 'test_size': 0.3}      3   
4          0  {'random_state': 1, 'test_size': 0.3}      4   
5          0  {'random_state': 1, 'test_size': 0.3}      5   
6          0