In [35]:
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.metrics import (
    PredictionErrorDisplay,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split


In [36]:
class MyUtil:
    def save_data(filename, data):
        with open(filename, "wb") as file:
            pickle.dump(data, file)

    def load_data(filename):
        with open(filename, "rb") as file:
            data = pickle.load(file)
        return data

In [37]:
class DataSplitter():
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        self.X_train = None
        self.X_test = None
        self.Y_train = None
        self.Y_test = None

    def split(self, test_size, random_state):
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=test_size, random_state=random_state
        )
        self.random_state = random_state
        self.test_size = test_size
    
    def get_train(self):
        return self.X_train, self.Y_train
    
    def get_test(self):
        return self.X_test, self.Y_test

In [38]:
class MyModel(BaseEstimator):
    def __init__(self, base=None, scalerX=None, scalerY=None):
        self.base = base
        self.scalerX = scalerX
        self.scalerY = scalerY
        self.dt = datetime.now().strftime("%Y-%m-%d_%H-%M")

    def fit(self, X, Y):
        
        X_sc = self.scalerX.fit_transform(X)
        Y_sc = self.scalerY.fit_transform(Y)
        self.base.fit(X_sc, Y_sc)
        self.is_fitted = True
        return self

    def predict(self, X):
        X_sc = self.transformX(X)
        return self.base.predict(X_sc)

    def transformX(self, X):
        if not self.is_fitted:
            raise Exception("Model is not fitted yet")
        X_sc = self.scalerX.transform(X)
        return X_sc

    def transformY(self, Y):
        if not self.is_fitted:
            raise Exception("Model is not fitted yet")
        Y_sc = self.scalerY.transform(Y)
        return Y_sc

    @staticmethod
    def eval_perf(y_true, y_pred):
        mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
        mape = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
        r2 = r2_score(y_true=y_true, y_pred=y_pred)
        return mse, mape, r2

    @staticmethod
    def print_perf(data):
        for k, v in data.items():
            print(k, ":", v)

    def eval(self, X_train, X_test, Y_train, Y_test, param=None, save=False):
        Y_train = self.transformY(Y_train)
        Y_test = self.transformY(Y_test)
        Y_train_pred = self.predict(X_train)
        Y_test_pred = self.predict(X_test)

        data_arr = []
        for i in range(0, Y_train.shape[1]):
            mse_train, mape_train, r2_train = self.eval_perf(
                y_true=Y_train[:, i], y_pred=Y_train_pred[:, i]
            )
            mse_test, mape_test, r2_test = self.eval_perf(
                y_true=Y_test[:, i], y_pred=Y_test_pred[:, i]
            )

            data = {
                "model": self.base.estimator.__class__.__name__,
                "param": param,
                "Y": f"Y-{i + 1}",
                "MSE Train": mse_train,
                "MSE Test": mse_test,
                "MAPE Train": mape_train,
                "MAPE Test": mape_test,
                "R2 Train": r2_train,
                "R2 Test": r2_test,
            }
            # self.print_perf(data)
            data_arr.append(data)

        mse_train, mape_train, r2_train = self.eval_perf(
            y_true=Y_train, y_pred=Y_train_pred
        )
        mse_test, mape_test, r2_test = self.eval_perf(y_true=Y_test, y_pred=Y_test_pred)
        # self.print_perf(data)
        data = {
            "model": self.base.estimator.__class__.__name__,
            "param": param,
            "Y": "Y-All",
            "MSE Train": mse_train,
            "MSE Test": mse_test,
            "MAPE Train": mape_train,
            "MAPE Test": mape_test,
            "R2 Train": r2_train,
            "R2 Test": r2_test,
        }
        data_arr.append(data)
        df_eval = pd.DataFrame.from_dict(data_arr)
        if save:
            filename = f"eval_{self.dt}.xlsx"
            df_eval.to_excel(filename, index=False)
        return df_eval

    def plot_res(self, X_train, X_test, Y_train, Y_test, save=False):
        Y_train = self.transformY(Y_train)
        Y_test = self.transformY(Y_test)
        Y_train_pred = self.predict(X_train)
        Y_test_pred = self.predict(X_test)

        for i in range(0, Y_train.shape[1]):
            fig, axes = plt.subplots(
                nrows=1,
                ncols=2,
                figsize=(10, 5),
                constrained_layout=True,
                sharex=True,
                sharey=True,
            )

            display_train = PredictionErrorDisplay(
                y_true=Y_train[:, i], y_pred=Y_train_pred[:, i]
            )
            display_train.plot(ax=axes[0])
            axes[0].set_title("Train")

            display_train = PredictionErrorDisplay(
                y_true=Y_test[:, i], y_pred=Y_test_pred[:, i]
            )
            display_train.plot(ax=axes[1])
            axes[1].set_title("Test")

            if save:
                filename = f"res_plot_{self.dt}_{i}.png"
                fig.savefig(filename, dpi=300)
            plt.show()


### Read data

In [39]:
df = pd.read_excel("data.xlsx", index_col="exp")
df.head()

Unnamed: 0_level_0,m1,m2,m3,s1__autocorrelation__lag_8,s1__autocorrelation__lag_9,s1__autocorrelation__lag_7,s1__autocorrelation__lag_6,s1__autocorrelation__lag_5,s1__autocorrelation__lag_4,s1__longest_strike_above_mean,...,s1__ar_coefficient__coeff_3__k_10,s1__approximate_entropy__m_2__r_0.1,s1__lempel_ziv_complexity__bins_3,s1__partial_autocorrelation__lag_4,"s1__fft_coefficient__attr_""abs""__coeff_7","s1__agg_autocorrelation__f_agg_""var""__maxlag_40",s1__spkt_welch_density__coeff_2,y1,y2,y3
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E001,150.223716,1176.177278,1.142097,-0.305434,-0.519191,-0.074829,0.159896,0.38579,0.590387,14,...,0.183996,0.158567,0.204152,-0.360084,0.293617,0.499488,5.886812e-08,55.460434,1.065917,114.57862
E002,102.534268,1483.654982,1.104716,-0.243785,-0.454262,-0.021002,0.202836,0.416423,0.608972,14,...,0.18437,0.144742,0.203008,-0.344364,6.142373,0.477743,3.643621e-06,50.640306,1.285666,124.651484
E003,119.890549,1254.897451,2.162773,-0.329006,-0.543405,-0.095913,0.142612,0.373002,0.582151,14,...,0.184036,0.144268,0.208163,-0.364611,26.783283,0.506435,0.0001590028,50.832405,1.154859,57.018054
E004,162.830799,1302.043195,1.308283,-0.065152,-0.266498,0.138913,0.337187,0.521401,0.683873,16,...,0.187213,0.137326,0.193662,-0.355441,33.227591,0.460547,0.0007926165,62.476545,1.025161,132.221218
E005,165.720956,1154.482314,1.56683,-0.304881,-0.518177,-0.074836,0.159321,0.384728,0.589003,14,...,0.183978,0.128546,0.19244,-0.357588,11.43947,0.49794,0.0001462831,57.634438,1.043776,92.160269


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, E001 to E100
Data columns (total 50 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   m1                                                  100 non-null    float64
 1   m2                                                  100 non-null    float64
 2   m3                                                  100 non-null    float64
 3   s1__autocorrelation__lag_8                          100 non-null    float64
 4   s1__autocorrelation__lag_9                          100 non-null    float64
 5   s1__autocorrelation__lag_7                          100 non-null    float64
 6   s1__autocorrelation__lag_6                          100 non-null    float64
 7   s1__autocorrelation__lag_5                          100 non-null    float64
 8   s1__autocorrelation__lag_4                          100 non-null    float64
 9   

### Extract data

In [41]:
_X = df.iloc[:, :-3].values
_Y = df.iloc[:, -3:].values
print(_X.shape)
print(_Y.shape)

(100, 47)
(100, 3)


### Split data

In [42]:
ds = DataSplitter(X=_X, Y=_Y)
ds.split(test_size=0.3, random_state=0)

### # Hyper-Parameter Setting

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

base_lr = MultiOutputRegressor(estimator=LinearRegression())
base_svr = MultiOutputRegressor(estimator=SVR())
base_rf = MultiOutputRegressor(estimator=RandomForestRegressor())

reg = MyModel(base=base_svr, scalerX=StandardScaler(), scalerY=StandardScaler())
reg.get_params()


{'base__estimator__C': 1.0,
 'base__estimator__cache_size': 200,
 'base__estimator__coef0': 0.0,
 'base__estimator__degree': 3,
 'base__estimator__epsilon': 0.1,
 'base__estimator__gamma': 'scale',
 'base__estimator__kernel': 'rbf',
 'base__estimator__max_iter': -1,
 'base__estimator__shrinking': True,
 'base__estimator__tol': 0.001,
 'base__estimator__verbose': False,
 'base__estimator': SVR(),
 'base__n_jobs': None,
 'base': MultiOutputRegressor(estimator=SVR()),
 'scalerX__copy': True,
 'scalerX__with_mean': True,
 'scalerX__with_std': True,
 'scalerX': StandardScaler(),
 'scalerY__copy': True,
 'scalerY__with_mean': True,
 'scalerY__with_std': True,
 'scalerY': StandardScaler()}

In [44]:
from sklearn.model_selection import ParameterGrid

param_grid = [
    {"base": [base_svr], "base__estimator__C": [0.01, 0.1, 1]},
    {"base": [base_lr]},
    {"base": [base_rf], "base__estimator__n_estimators": [10, 50, 200]},
]

param_list = list(ParameterGrid(param_grid))
print(param_list[1])

param_grid_sp = [
    {"random_state": [1,2,3,4,5],
     "test_size": [0.3]
        
        }]
param_list_sp = list(ParameterGrid(param_grid_sp))
print(param_list_sp)

{'base': MultiOutputRegressor(estimator=SVR()), 'base__estimator__C': 0.1}
[{'random_state': 1, 'test_size': 0.3}, {'random_state': 2, 'test_size': 0.3}, {'random_state': 3, 'test_size': 0.3}, {'random_state': 4, 'test_size': 0.3}, {'random_state': 5, 'test_size': 0.3}]


In [45]:
from itertools import product

reg = MyModel(base=None, scalerX=StandardScaler(), scalerY=StandardScaler())

df_arr = []
for param, param_sp in product(param_list, param_list_sp):
    ds.split(**param_sp)
    X_train, Y_train = ds.get_train()
    X_test, Y_test = ds.get_test()
    reg.set_params(**param)
    reg.fit(X_train, Y_train)
    _df = reg.eval(
        X_train=X_train, X_test=X_test, Y_train=Y_train, Y_test=Y_test, param={**param, **param_sp}
    )
    _df["random_state"] = param_sp["random_state"]
    df_arr.append(_df)

df_eval = pd.concat(df_arr)
df_eval

Unnamed: 0,model,param,Y,MSE Train,MSE Test,MAPE Train,MAPE Test,R2 Train,R2 Test,random_state
0,SVR,{'base': MultiOutputRegressor(estimator=SVR(C=...,Y-1,0.894472,0.891485,0.906385,0.964281,0.105528,0.005793,1
1,SVR,{'base': MultiOutputRegressor(estimator=SVR(C=...,Y-2,0.902810,0.751289,1.374025,0.989006,0.097190,0.056342,1
2,SVR,{'base': MultiOutputRegressor(estimator=SVR(C=...,Y-3,0.938257,0.611090,9.763102,1.789892,0.061743,0.042087,1
3,SVR,{'base': MultiOutputRegressor(estimator=SVR(C=...,Y-All,0.911846,0.751288,4.014504,1.247726,0.088154,0.034741,1
0,SVR,{'base': MultiOutputRegressor(estimator=SVR(C=...,Y-1,0.894765,0.896674,0.882962,1.044920,0.105235,0.030311,2
...,...,...,...,...,...,...,...,...,...,...
3,RandomForestRegressor,{'base': MultiOutputRegressor(estimator=Random...,Y-All,0.020664,0.102848,0.342115,0.762779,0.979336,0.903564,4
0,RandomForestRegressor,{'base': MultiOutputRegressor(estimator=Random...,Y-1,0.017482,0.093820,0.283582,0.320117,0.982518,0.915095,5
1,RandomForestRegressor,{'base': MultiOutputRegressor(estimator=Random...,Y-2,0.029704,0.295497,1.474737,0.585021,0.970296,0.758435,5
2,RandomForestRegressor,{'base': MultiOutputRegressor(estimator=Random...,Y-3,0.018046,0.120582,0.331250,0.700155,0.981954,0.914064,5


### Save data

In [46]:
filename = f"data_{reg.dt}.pkl"

data_save = {
    "model": reg,
    "desc": "This is the saved data",
    "Y_train": Y_train,
    "Y_test": Y_test,
}

# Save the model
MyUtil.save_data(filename=filename, data=data_save)

### Test loading data

In [47]:
data_load = MyUtil.load_data(filename=filename)

print(data_load)

{'model': MyModel(base=MultiOutputRegressor(estimator=RandomForestRegressor(n_estimators=200)),
        scalerX=StandardScaler(), scalerY=StandardScaler()), 'desc': 'This is the saved data', 'Y_train': array([[ 53.76740232,   1.25958759,  34.66449993],
       [ 53.91496353,   1.35261509,  32.35478778],
       [ 63.12832471,   1.1126281 ,  95.71603586],
       [ 58.77307369,   1.05310803,  82.68135243],
       [ 56.75477485,   1.14303335,  98.32897644],
       [ 70.06614987,   0.98118122, 126.63417574],
       [ 53.73461703,   0.9148905 , 113.14798104],
       [ 54.52385084,   1.10044252, 103.64481054],
       [ 70.13160757,   0.85268767, 180.73437975],
       [ 63.45964248,   0.84066015, 172.87738751],
       [ 61.78086658,   1.18711531,  73.52215379],
       [ 53.3739997 ,   0.96700595,  91.67638076],
       [ 60.86099747,   0.87854192, 130.6644162 ],
       [ 60.67603464,   1.05429122,  58.5953597 ],
       [ 43.6133633 ,   1.70625093,  19.20400046],
       [ 57.96819834,   0.9300264