In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('bmh')
import itertools
from tqdm import tqdm
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 5 Publish

In [6]:
class regVariableConversion(object):
    def __init__(self, **kwargs):
        # optional 
        self.on_cpPlot = kwargs.get("on_cpPlot", False)
        self.on_combine = kwargs.get("on_combine", False)
        self.on_log = kwargs.get("on_log", True)
        self.on_recCp = kwargs.get("on_recCp", True)
        self.on_recAll = kwargs.get("on_recAll", True)

        # key setting
        self.comb = kwargs.get("comb", 3)  # number of factor
        self.KEY = [
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "a",
            "b", "c", "d"
        ]

        self.comb_nCr = [0]  #n: key total, r:combination 
        sum = 0
        self.label_list = []  # all combination label
        for i_comb in range(1, 1+self.comb):
            x = len(list(itertools.combinations(range(len(self.KEY)), i_comb)))
            sum += x
            self.comb_nCr.append(sum)
            for combo in itertools.combinations(self.KEY, i_comb):
                label = ""
                for i in range(len(combo)):
                    label += combo[i]
                self.label_list.append(label)

        # decision
        self.sep_homos = 10  # number of separating for homoscedasticity
        self.th_homoAndMse = kwargs["th_homoAndMse"] if "th_homoAndMse" in kwargs else 1.05

        # optional
        self.now = datetime.now().strftime('%Y_%m%d_%H%M%S')

    def _print(self, log):
        if self.on_log == True:
            print(log)

    def clear(self):
        self.res_formula = []  # "Y ~ + X + np.log(X)"
        self.res_key = []  # "123"
        self.res_ind = []  # ['X3', 'X3', 'logX/X', 'logX']

        # predict
        idx_costs = ["mse", "r", "homo","aic", "1d"]
        self.df_bestCosts_score = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_key = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_mse = pd.DataFrame(index=idx_costs)

        self.df_output = pd.DataFrame()

        # slidingFilter
        self.GAP = []

        # record change point calculation
        self.cp_col  = []  # id
        self.cp_ls = []  # calculation result
        self.cp_cost = []  # cp cost

    def slidingFilter(self):
        min_ = list(self.df.iloc[:, :-1].min())
        for i_col in range(len(min_)):
            gap = 0
            if min_[i_col] < 1:
                if min_[i_col] >= 0:
                    gap = min_[i_col] + 1
                else:
                    gap = -(min_[i_col])*1.1 + 1

            if gap != 0:
                self.df.iloc[:, i_col] = self.df.iloc[:, i_col] + gap

            self.GAP.append(gap)

    def df_init(self, df, col_target):
        """
        change columns name of input dataframe
        ["AT", "V", "EP"] => ["X0", "X1", "Y"] 
        """
        cols = list(df.columns)
        
        col_ex = cols.copy()
        col_ex.remove(col_target)

        self.col_origin = col_ex.copy()
        self.col_origin.append(col_target)
        df = df.reindex(self.col_origin, axis="columns")

        col_replace = []
        for i in range(len(col_ex)): col_replace.append(f"X{i}")
        col_replace.append("Y")
        df.columns = col_replace
    
        self.col_target = col_replace[-1]
        self.col_ex = col_replace[:-1]
        self.df = df

        # sliding Filter
        self.slidingFilter()


    def key2smf(self, keys):
        """ key list is converted to smf format

        Args:
            keys (tuple, list, str): [ex] ("1", "2", "3")

        Returns:
            form (str) : [ex] "Y ~ + X + np.log(X)"
        """
        form = f"{self.col_target} ~ "
        if "1" in keys: form += f"+ {self.col_ex0} "
        if "2" in keys: form += f"+ np.square({self.col_ex0}) "
        if "3" in keys: form += f"+ np.power({(self.col_ex0)}, 3) "
        if "4" in keys: form += f"+ np.log({self.col_ex0}) "
        if "5" in keys: form += f"+ np.reciprocal({self.col_ex0}) "
        if "6" in keys: form += f"+ np.sqrt({self.col_ex0}) "
        if "7" in keys: form += f"+ {self.col_ex0}:np.log({self.col_ex0}) "
        if "8" in keys: form += f"+ np.square({self.col_ex0}):np.log({self.col_ex0}) "
        if "9" in keys: form += f"+ np.log({self.col_ex0}):np.reciprocal({self.col_ex0}) "
        if "a" in keys: form += f"+ np.reciprocal(np.square({self.col_ex0})) "
        if "b" in keys: form += f"+ {self.col_ex0}:np.sqrt({self.col_ex0}) "
        if "c" in keys: form += f"+ np.sqrt(np.log({self.col_ex0})) "
        if "d" in keys: form += f"+ np.sqrt({self.col_ex0}*np.log({self.col_ex0})) "

        return form

    def changepoint(self, Y):
        score = []
        X = range(len(Y))
        for i in range(len(Y)-1):
            if i == 0:
                res1 = sm.OLS(Y, sm.add_constant(X)).fit()
                score.append(np.mean(np.abs(res1.resid)))
            else:
                X1, X2, Y1, Y2 = X[:i+1], X[i:], Y[:i+1], Y[i:]
                res1 = sm.OLS(Y1, sm.add_constant(X1)).fit()
                res2 = sm.OLS(Y2, sm.add_constant(X2)).fit()
                score1 = np.abs(res1.resid)
                score.append(np.sum(score1 + np.abs(res2.resid))/(len(X)+1))

        idx_cp = np.array(score).argmin()

        if self.on_recCp:
            self.cp_col.append(f"{self.col_ex0}")
            self.cp_ls.append(idx_cp)  # calculation result
            self.cp_cost.append(np.array(score))  # cp cost

        return idx_cp

    def publish_cpTable(self):
        df_cp = pd.DataFrame(self.cp_cost, index=self.cp_col).T
        idx_cp = self.cp_ls
        return df_cp, idx_cp

    def cost(self):
        """
        In given a variable, it calculates cost for all formula combination.
        """
        x_ = self.df[self.col_ex0].values
        idx_xsort = np.argsort(x_)
        unit = round(len(x_)/self.sep_homos)

        # formula combination loop
        score = []
        for i_comb in tqdm(range(1, 1+self.comb)):
            for combo in itertools.combinations(self.KEY, i_comb):
                formula= self.key2smf(combo) 
                results = smf.ols(formula, data=self.df).fit()
                resid_sort = results.resid.values[idx_xsort]

                residMean = []
                for i_homos in range(self.sep_homos):
                    residMean.append(np.mean(resid_sort[unit*i_homos:unit*(i_homos+1)]))

                score.append([results.mse_resid, results.rsquared, np.std(residMean), results.aic])

        self.costs = np.array(score).T  #[cost type]x[factor combination]

    def evaluateOnLayer(self):
        evals = [] #  mse, r, homo, aic
        evals_id = [] #  mse, r, homo, aic
        nCr = self.comb_nCr

        # best evals in each factor numbers[layer](1, 2, 3....)
        for i in range(len(nCr)-1):
            c_ = self.costs[:, nCr[i]:nCr[i+1]]
            evals.append(np.min(c_, axis=1))
            evals_id.append(np.argmin(c_, axis=1) + nCr[i])

        evals = np.asarray(evals).T  # [cost type]x[factor layer]
        evals_id = np.asarray(evals_id).T  # [cost type]x[factor layer]

        # Which layer(factor) is appropriate in each cost type?
        # Basically if layer increased, performace also increase. it needs to find converged point.
        res0, res1 = [], []
        # MSE
        idx_mse = self.changepoint(evals[0, :])
        res0.append(evals[0, idx_mse])
        res1.append(int(evals_id[0, idx_mse]))
        # R
        idx_r = self.changepoint(evals[1, :])
        res0.append(evals[1, idx_r])
        res1.append(int(evals_id[1, idx_r]))
        # Homo
        idx_homo = int((evals[2, :]).argmin())
        res0.append(evals[2, idx_homo])
        res1.append(int(evals_id[2, idx_homo]))
        # AIC
        idx_aic = int((evals[3, :]).argmin())
        res0.append(evals[3, idx_aic])
        res1.append(int(evals_id[3, idx_aic]))
        # 1Factor limitation
        res0.append(evals[0, 0])
        res1.append(int(evals_id[0, 0]))

        self.df_bestCosts_score[self.col_ex0] = res0
        self.df_bestCosts_key[self.col_ex0] = res1
        self.df_bestCosts_mse[self.col_ex0] = [
            res0[0],
            self.costs[0, res1[1]],
            self.costs[0, res1[2]],
            self.costs[0, res1[3]],
            self.costs[0, res1[4]]
        ]

    def decision(self):
        costs = self.df_bestCosts_score.loc[:, self.col_ex0].values
        keys = self.df_bestCosts_key.loc[:, self.col_ex0].values
        labels = self.label_list
        r = costs[1]
        numComb_mse, numComb_homo = len(labels[keys[0]]), len(labels[keys[2]])

        if r < 0.6:  # well fitted? => No
            key_ = labels[keys[4]]
            self.res_formula.append(self.key2smf(key_))
            self._print(f"[Result/{self.col_ex0}]:r is {r} >> 1d")
        else:
            mse_homo = self.df_bestCosts_mse.loc["homo", self.col_ex0]
            mse_mse = self.df_bestCosts_mse.loc["mse", self.col_ex0]
            if mse_homo/mse_mse > self.th_homoAndMse:
                key_ = labels[keys[0]]
                self.res_formula.append(self.key2smf(key_))
                self._print(f"[Result/{self.col_ex0}]:homo/mse is {mse_homo/mse_mse} >> mse")
            else:
                if numComb_mse >= numComb_homo:
                    key_ = labels[keys[2]]
                    self.res_formula.append(self.key2smf(key_))
                    self._print(f"[Result/{self.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> mse")
                else:
                    key_ = labels[keys[0]]
                    self.res_formula.append(self.key2smf(key_))
                    self._print(f"[Result/{self.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> homo")

    def publish(self):
        """ 
        Mods:
            df_output
        """
        index_ = self.col_ex.copy()
        index_.append("Y")
        formula_ = self.res_formula.copy()
        formula_.append(None)
        GAP_ = self.GAP.copy()
        GAP_.append(np.nan)

        self.df_output.index = index_
        self.df_output["formula"] = formula_
        self.df_output["col_origin"] = self.col_origin
        self.df_output["gap"] = GAP_

    def fit(self, df, col_target):
        self.clear()
        self.df_init(df, col_target)

        for i_col in range(len(self.col_ex)):
            self.col_ex0 = self.col_ex[i_col]
            self.cost()
            self.evaluateOnLayer()
            self.decision()
            
            if self.on_recAll: 
                pd.to_pickle(self.costs, f"../_temp/{self.now}_{self.col_ex0}.pkl")

        self.publish()

        return self.df_output

dfx = pd.read_csv("../data/df_g1.csv", index_col=0, usecols=[0, 1, 2, 3, 4, 5])
rvc = regVariableConversion()
out = rvc.fit(dfx, "EP")

100%|██████████| 3/3 [00:02<00:00,  1.47it/s]


[Result/X0]:mse,homo:2,3 >> homo


100%|██████████| 3/3 [00:02<00:00,  1.48it/s]


[Result/X1]:mse,homo:2,1 >> mse


100%|██████████| 3/3 [00:01<00:00,  1.52it/s]


[Result/X2]:r is 0.2708026338317573 >> 1d


100%|██████████| 3/3 [00:01<00:00,  1.53it/s]

[Result/X3]:r is 0.1535596796629648 >> 1d





In [69]:
out

Unnamed: 0,formula,col_origin,gap
X0,"Y ~ + np.square(X0) + np.power(X0, 3)",AT,0.0
X1,Y ~ + np.reciprocal(X1),V,0.0
X2,"Y ~ + np.power(X2, 3)",AP,0.0
X3,Y ~ + X3:np.log(X3),RH,0.0
Y,,EP,


In [14]:
dfx

Unnamed: 0,AT,V,AP,RH,EP
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.40,74.20,445.75
2,29.74,56.90,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.80,40.66,1017.13,97.20,464.43
...,...,...,...,...,...
9563,15.12,48.92,1011.80,72.93,462.59
9564,33.41,77.95,1010.30,59.72,432.90
9565,15.99,43.34,1014.20,78.66,465.96
9566,17.65,59.87,1018.58,94.65,450.93


# 4 decision

In [42]:
class regVariableConversion(object):
    def __init__(self, **kwargs):
        # optional 
        self.on_cpPlot = kwargs.get("on_cpPlot", False)
        self.on_combine = kwargs.get("on_combine", False)
        self.on_log = kwargs.get("on_log", True)
        self.on_recCp = kwargs.get("on_recCp", True)
        self.on_recAll = kwargs.get("on_recAll", True)

        # key setting
        self.comb = kwargs.get("comb", 3)  # number of factor
        self.KEY = [
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "a",
            "b", "c", "d"
        ]

        self.comb_nCr = [0]  #n: key total, r:combination 
        sum = 0
        self.label_list = []  # all combination label
        for i_comb in range(1, 1+self.comb):
            x = len(list(itertools.combinations(range(len(self.KEY)), i_comb)))
            sum += x
            self.comb_nCr.append(sum)
            for combo in itertools.combinations(self.KEY, i_comb):
                label = ""
                for i in range(len(combo)):
                    label += combo[i]
                self.label_list.append(label)

        # decision
        self.sep_homos = 10  # number of separating for homoscedasticity
        self.th_homoAndMse = kwargs["th_homoAndMse"] if "th_homoAndMse" in kwargs else 1.05

        # optional
        self.now = datetime.now().strftime('%Y_%m%d_%H%M%S')

    def _print(self, log):
        if self.on_log == True:
            print(log)

    def clear(self):
        self.res_formula = []  # "Y ~ + X + np.log(X)"
        self.res_key = []  # "123"
        self.res_ind = []  # ['X3', 'X3', 'logX/X', 'logX']

        # predict
        idx_costs = ["mse", "r", "homo","aic", "1d"]
        self.df_bestCosts_score = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_key = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_mse = pd.DataFrame(index=idx_costs)

        self.ls_df_cost = []
        self.df_output = pd.DataFrame()

        # _slidingFilter
        self.GAP = {}

        # record change point calculation
        self.cp_col  = []  # id
        self.cp_ls = []  # calculation result
        self.cp_cost = []  # cp cost

    def df_init(self, df, col_target):
        """
        change columns name of input dataframe
        ["AT", "V", "EP"] => ["X0", "X1", "Y"] 
        """
        cols = list(df.columns)
        
        col_ex = cols.copy()
        col_ex.remove(col_target)

        self.col_origin = col_ex.copy()
        self.col_origin.append(col_target)
        df = df.reindex(self.col_origin, axis="columns")

        col_replace = []
        for i in range(len(col_ex)): col_replace.append(f"X{i} ")
        col_replace.append("Y")
        df.columns = col_replace
    
        self.col_target = col_replace[-1]
        self.col_ex = col_replace[:-1]
        self.df = df

    def key2smf(self, keys):
        """ key list is converted to smf format

        Args:
            keys (tuple, list, str): [ex] ("1", "2", "3")

        Returns:
            form (str) : [ex] "Y ~ + X + np.log(X)"
        """
        form = f"{self.col_target} ~ "
        if "1" in keys: form += f"+ {self.col_ex0} "
        if "2" in keys: form += f"+ np.square({self.col_ex0}) "
        if "3" in keys: form += f"+ np.power({(self.col_ex0)}, 3) "
        if "4" in keys: form += f"+ np.log({self.col_ex0}) "
        if "5" in keys: form += f"+ np.reciprocal({self.col_ex0}) "
        if "6" in keys: form += f"+ np.sqrt({self.col_ex0}) "
        if "7" in keys: form += f"+ {self.col_ex0}:np.log({self.col_ex0}) "
        if "8" in keys: form += f"+ np.square({self.col_ex0}):np.log({self.col_ex0}) "
        if "9" in keys: form += f"+ np.log({self.col_ex0}):np.reciprocal({self.col_ex0}) "
        if "a" in keys: form += f"+ np.reciprocal(np.square({self.col_ex0})) "
        if "b" in keys: form += f"+ {self.col_ex0}:np.sqrt({self.col_ex0}) "
        if "c" in keys: form += f"+ np.sqrt(np.log({self.col_ex0})) "
        if "d" in keys: form += f"+ np.sqrt({self.col_ex0}*np.log({self.col_ex0})) "

        return form

    def key2ind(self, key):
        """ key is converted the name human can recognize easily.

        Args:
            key (str): [ex] "3394"

        Returns:
            list: [ex] ['X3', 'X3', 'logX/X', 'logX']
        """
        result = [s.replace('1', 'X') for s in key]
        result = [s.replace('2', 'X2') for s in result]
        result = [s.replace('3', 'X3') for s in result]
        result = [s.replace('4', 'logX') for s in result]
        result = [s.replace('5', '1/X') for s in result]
        result = [s.replace('6', '√X') for s in result]
        result = [s.replace('7', 'X_logX') for s in result]
        result = [s.replace('8', 'X2_logX') for s in result]
        result = [s.replace('9', 'logX/X') for s in result]
        result = [s.replace('a', '1/X2') for s in result]
        result = [s.replace('b', 'X√X') for s in result]
        result = [s.replace('c', '√logx') for s in result]
        result = [s.replace('d', '√XlogX') for s in result]

        return result

    def changepoint(self, Y):
        score = []
        X = range(len(Y))
        for i in range(len(Y)-1):
            if i == 0:
                res1 = sm.OLS(Y, sm.add_constant(X)).fit()
                score.append(np.mean(np.abs(res1.resid)))
            else:
                X1, X2, Y1, Y2 = X[:i+1], X[i:], Y[:i+1], Y[i:]
                res1 = sm.OLS(Y1, sm.add_constant(X1)).fit()
                res2 = sm.OLS(Y2, sm.add_constant(X2)).fit()
                score1 = np.abs(res1.resid)
                score.append(np.sum(score1 + np.abs(res2.resid))/(len(X)+1))

        idx_cp = np.array(score).argmin()

        if self.on_recCp:
            self.cp_col.append(f"{self.col_ex0}")
            self.cp_ls = [idx_cp]  # calculation result
            self.cp_cost = [np.array(score)]  # cp cost

        return idx_cp

    # def publish_cpTable(self):

    def cost(self):
        """
        In given a variable, it calculates cost for all formula combination.
        """
        x_ = self.df[self.col_ex0].values
        idx_xsort = np.argsort(x_)
        unit = round(len(x_)/self.sep_homos)

        # formula combination loop
        score = []
        for i_comb in tqdm(range(1, 1+self.comb)):
            for combo in itertools.combinations(self.KEY, i_comb):
                formula= self.key2smf(combo) 
                results = smf.ols(formula, data=self.df).fit()
                resid_sort = results.resid.values[idx_xsort]

                residMean = []
                for i_homos in range(self.sep_homos):
                    residMean.append(np.mean(resid_sort[unit*i_homos:unit*(i_homos+1)]))

                score.append([results.mse_resid, results.rsquared, np.std(residMean), results.aic])

        self.costs = np.array(score).T  #[cost type]x[factor combination]

    def evaluateOnLayer(self):
        evals = [] #  mse, r, homo, aic
        evals_id = [] #  mse, r, homo, aic
        nCr = self.comb_nCr

        # best evals in each factor numbers[layer](1, 2, 3....)
        for i in range(len(nCr)-1):
            c_ = self.costs[:, nCr[i]:nCr[i+1]]
            evals.append(np.min(c_, axis=1))
            evals_id.append(np.argmin(c_, axis=1) + nCr[i])

        evals = np.asarray(evals).T  # [cost type]x[factor layer]
        evals_id = np.asarray(evals_id).T  # [cost type]x[factor layer]

        # Which layer(factor) is appropriate in each cost type?
        # Basically if layer increased, performace also increase. it needs to find converged point.
        res0, res1 = [], []
        # MSE
        idx_mse = self.changepoint(evals[0, :])
        res0.append(evals[0, idx_mse])
        res1.append(int(evals_id[0, idx_mse]))
        # R
        idx_r = self.changepoint(evals[1, :])
        res0.append(evals[1, idx_r])
        res1.append(int(evals_id[1, idx_r]))
        # Homo
        idx_homo = int((evals[2, :]).argmin())
        res0.append(evals[2, idx_homo])
        res1.append(int(evals_id[2, idx_homo]))
        # AIC
        idx_aic = int((evals[3, :]).argmin())
        res0.append(evals[3, idx_aic])
        res1.append(int(evals_id[3, idx_aic]))
        # 1Factor limitation
        res0.append(evals[0, 0])
        res1.append(int(evals_id[0, 0]))

        self.df_bestCosts_score[self.col_ex0] = res0
        self.df_bestCosts_key[self.col_ex0] = res1
        self.df_bestCosts_mse[self.col_ex0] = [
            res0[0],
            self.costs[0, res1[1]],
            self.costs[0, res1[2]],
            self.costs[0, res1[3]],
            self.costs[0, res1[4]]
        ]

    def store_result(self, key):
        self.res_key.append(key)
        self.res_ind.append(self.key2ind(key))
        self.res_formula.append(self.key2smf(key))

    def decision(self):
        costs = self.df_bestCosts_score.loc[:, self.col_ex0].values
        keys = self.df_bestCosts_key.loc[:, self.col_ex0].values
        labels = self.label_list
        r = costs[1]
        numComb_mse, numComb_homo = len(labels[keys[0]]), len(labels[keys[2]])

        if r < 0.6:  # well fitted? => No
            key_ = labels[keys[4]]
            self.store_result(key_)
            self._print(f"[Result/{self.col_ex0}]:r is {r} >> 1d")
        else:
            mse_homo = self.df_bestCosts_mse.loc["homo", self.col_ex0]
            mse_mse = self.df_bestCosts_mse.loc["mse", self.col_ex0]
            if mse_homo/mse_mse > self.th_homoAndMse:
                key_ = labels[keys[0]]
                self.store_result(key_)
                self._print(f"[Result/{self.col_ex0}]:homo/mse is {mse_homo/mse_mse} >> mse")
            else:
                if numComb_mse >= numComb_homo:
                    key_ = labels[keys[2]]
                    self.store_result(key_)
                    self._print(f"[Result/{self.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> mse")
                else:
                    key_ = labels[keys[0]]
                    self.store_result(key_)
                    self._print(f"[Result/{self.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> homo")

    def publish(self):
        """ publish each transformed variables in a dataframe including target columns

        Mods:
            df_output
        """
        out = self.df_output
        out[self.col_target] = self.df[self.col_target].values
        for i in range(len(self.res_key)):
            colx = self.col_ex[i]
            x = self.df[colx].values
            if "1" in self.res_key[i]: out[f"{colx}"] = x
            if "2" in self.res_key[i]: out[f"{colx}2"] = np.square(x)
            if "3" in self.res_key[i]: out[f"{colx}3"] = np.power(x, 3)
            if "4" in self.res_key[i]: out[f"log{colx}"] = np.log(x)
            if "5" in self.res_key[i]: out[f"1/{colx}"] = np.reciprocal(x)
            if "6" in self.res_key[i]: out[f"√{colx}"] = np.sqrt(x)
            if "7" in self.res_key[i]: out[f"{colx}_log{colx}"] = x*np.log(x)
            if "8" in self.res_key[i]: out[f"{colx}2_log{colx}"] = np.square(x)*np.log(x)
            if "9" in self.res_key[i]: out[f"log{colx}/{colx}"] = np.log(x)/x
            if "a" in self.res_key[i]: out[f"1/{colx}2"] = np.reciprocal(np.square(x))
            if "b" in self.res_key[i]: out[f"{colx}√{colx}"] = x*np.sqrt(x)
            if "c" in self.res_key[i]: out[f"√log{colx}"] = np.sqrt(np.log(x))
            if "d" in self.res_key[i]: out[f"√{colx}log{colx}"] = np.sqrt(x*np.log(x))

    def fit(self, df, col_target):
        self.clear()
        self.df_init(df, col_target)

        for i_col in range(len(self.col_ex)):
            self.col_ex0 = self.col_ex[i_col]
            self.cost()
            self.evaluateOnLayer()
            self.decision()
            
            if self.on_recAll: 
                pd.to_pickle(self.costs, f"../_temp/{self.now}_{self.col_ex0}.pkl")

        self.publish()


dfx = pd.read_csv("../data/df_g1.csv", index_col=0, usecols=[0, 1, 2, 3, 4, 5])
rvc = regVariableConversion()
rvc.fit(dfx, "EP")

100%|██████████| 3/3 [00:02<00:00,  1.47it/s]


[Result/X0]:mse,homo:2,3 >> homo


100%|██████████| 3/3 [00:02<00:00,  1.49it/s]


[Result/X1]:mse,homo:2,1 >> mse


100%|██████████| 3/3 [00:01<00:00,  1.52it/s]


[Result/X2]:r is 0.2708026338317573 >> 1d


100%|██████████| 3/3 [00:01<00:00,  1.52it/s]

[Result/X3]:r is 0.1535596796629648 >> 1d





In [43]:
rvc.df_output

Unnamed: 0,Y,X02,X03,1/X1,X23,X3_logX3
0,480.48,69.5556,580.093704,0.024528,1.032874e+09,405.037869
1,445.75,558.8496,13211.204544,0.017097,1.034591e+09,319.561900
2,438.76,884.4676,26304.066424,0.017575,1.021604e+09,156.555830
3,453.09,363.6649,6935.089643,0.020125,1.021817e+09,333.351105
4,464.43,139.2400,1643.032000,0.024594,1.052275e+09,444.862113
...,...,...,...,...,...,...
9534,462.59,228.6144,3456.649728,0.020442,1.035819e+09,312.833241
9535,432.90,1116.2281,37293.180821,0.012829,1.031219e+09,244.234912
9536,465.96,255.6801,4088.324799,0.023073,1.043208e+09,343.361501
9537,450.93,311.5225,5498.372125,0.016703,1.056782e+09,430.675093


In [5]:
"X0_2"

array([2.54474561e+01, 2.54474561e+01, 1.66234336e-01, 5.79334325e+04,
       2.78711345e+01])

In [16]:
costs = rvc.df_bestCosts_score.loc[:, rvc.col_ex0].values
keys = rvc.df_bestCosts_key.loc[:, rvc.col_ex0].values
keys

array([ 25,  25, 106, 255,  12], dtype=int64)

In [35]:
costs = rvc.df_bestCosts_score.loc[:, rvc.col_ex0].values
keys = rvc.df_bestCosts_key.loc[:, rvc.col_ex0].values
labels = rvc.label_list
r = costs[1]
numComb_mse, numComb_homo = len(labels[keys[0]]), len(labels[keys[2]])
rvc.res_key = []
rvc.res_ind = []
rvc.res_formula = []

if r < 0.6:  # well fitted? => No
    key_ = labels[keys[4]]
    rvc.res_key.append(key_)
    rvc.res_ind.append(rvc.key2ind(key_))
    rvc.res_formula.append(rvc.key2smf(key_))
    rvc._print(f"[Result/{rvc.col_ex0}]:r is {r} >> 1d")
else:
    mse_homo = rvc.df_bestCosts_mse.loc["homo", rvc.col_ex0]
    mse_mse = rvc.df_bestCosts_mse.loc["mse", rvc.col_ex0]
    if mse_homo/mse_mse > rvc.th_homoAndMse:
        key_ = labels[keys[0]]
        rvc.res_key.append(key_)
        rvc.res_ind.append(rvc.key2ind(key_))
        rvc.res_formula.append(rvc.key2smf(key_))
        rvc._print(f"[Result/{rvc.col_ex0}]:homo/mse is {mse_homo/mse_mse} >> mse")
    else:
        if numComb_mse >= numComb_homo:
            key_ = labels[keys[2]]
            rvc.res_key.append(key_)
            rvc.res_ind.append(rvc.key2ind(key_))
            rvc.res_formula.append(rvc.key2smf(key_))
            rvc._print(f"[Result/{rvc.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> mse")
        else:
            key_ = labels[keys[0]]
            rvc.res_key.append(key_)
            rvc.res_ind.append(rvc.key2ind(key_))
            rvc.res_formula.append(rvc.key2smf(key_))
            rvc._print(f"[Result/{rvc.col_ex0}]:mse,homo:{numComb_mse},{numComb_homo} >> homo")

print(rvc.res_key, rvc.res_ind, rvc.res_formula)

[Result/X0]:mse,homo:2,3 >> homo
['23'] [['X2', 'X3']] ['Y ~ + np.square(X0) + np.power(X0, 3) ']


# 3

In [58]:
class regVariableConversion(object):
    def __init__(self, **kwargs):
        # optional 
        self.on_cpPlot = kwargs.get("on_cpPlot", False)
        self.on_combine = kwargs.get("on_combine", False)
        self.on_log = kwargs.get("on_log", False)
        self.on_recCp = kwargs.get("on_rec", True)

        # key setting
        self.comb = kwargs.get("comb", 3)  # number of factor
        self.KEY = [
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "a",
            "b", "c", "d"
        ]

        self.comb_nCr = [0]  #n: key total, r:combination 
        sum = 0
        self.label_list = []  # all combination label
        for i_comb in range(1, 1+self.comb):
            x = len(list(itertools.combinations(range(len(self.KEY)), i_comb)))
            sum += x
            self.comb_nCr.append(sum)
            for combo in itertools.combinations(self.KEY, i_comb):
                label = ""
                for i in range(len(combo)):
                    label += combo[i]
                self.label_list.append(label)

        # decision
        self.sep_homos = 10  # number of separating for homoscedasticity
        self.th_homoAndMse = kwargs["th_homoAndMse"] if "th_homoAndMse" in kwargs else 1.05

    def clear(self):
        self.res_formula = []  # "Y ~ + X + np.log(X)"
        self.res_key = []  # "123"
        self.res_ind = []  # ['X3', 'X3', 'logX/X', 'logX']

        # predict
        idx_costs = ["mse", "r", "homo","aic", "1d"]
        self.df_bestCosts_score = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_key = pd.DataFrame(index=idx_costs)
        self.df_bestCosts_mse = pd.DataFrame(index=idx_costs)

        self.ls_df_cost = []
        self.df_output = pd.DataFrame()

        # _slidingFilter
        self.GAP = {}

        # record change point calculation
        self.cp_col  = []  # id
        self.cp_ls = []  # calculation result
        self.cp_cost = []  # cp cost

    def df_init(self, df, col_target):
        """
        change columns name of input dataframe
        ["AT", "V", "EP"] => ["X0", "X1", "Y"] 
        """
        cols = list(df.columns)
        
        col_ex = cols.copy()
        col_ex.remove(col_target)

        self.col_origin = col_ex.copy()
        self.col_origin.append(col_target)
        df = df.reindex(self.col_origin, axis="columns")

        col_replace = []
        for i in range(len(col_ex)): col_replace.append(f"X{i}")
        col_replace.append("Y")
        df.columns = col_replace
    
        self.col_target = col_replace[-1]
        self.df = df

    def key2smf(self, keys):
        """ key list is converted to smf format

        Args:
            keys (tuple, list, str): [ex] ("1", "2", "3")

        Returns:
            form (str) : [ex] "Y ~ + X + np.log(X)"
        """
        form = f"{self.col_target} ~ "
        if "1" in keys: form += f"+ {self.col_ex0} "
        if "2" in keys: form += f"+ np.square({self.col_ex0}) "
        if "3" in keys: form += f"+ np.power({(self.col_ex0)}, 3) "
        if "4" in keys: form += f"+ np.log({self.col_ex0}) "
        if "5" in keys: form += f"+ np.reciprocal({self.col_ex0}) "
        if "6" in keys: form += f"+ np.sqrt({self.col_ex0}) "
        if "7" in keys: form += f"+ {self.col_ex0}:np.log({self.col_ex0}) "
        if "8" in keys: form += f"+ np.square({self.col_ex0}):np.log({self.col_ex0}) "
        if "9" in keys: form += f"+ np.log({self.col_ex0}):np.reciprocal({self.col_ex0}) "
        if "a" in keys: form += f"+ np.reciprocal(np.square({self.col_ex0})) "
        if "b" in keys: form += f"+ {self.col_ex0}:np.sqrt({self.col_ex0}) "
        if "c" in keys: form += f"+ np.sqrt(np.log({self.col_ex0})) "
        if "d" in keys: form += f"+ np.sqrt({self.col_ex0}*np.log({self.col_ex0})) "

        return form

    def changepoint(self, Y):
        score = []
        X = range(len(Y))
        for i in range(len(Y)-1):
            if i == 0:
                res1 = sm.OLS(Y, sm.add_constant(X)).fit()
                score.append(np.mean(np.abs(res1.resid)))
            else:
                X1, X2, Y1, Y2 = X[:i+1], X[i:], Y[:i+1], Y[i:]
                res1 = sm.OLS(Y1, sm.add_constant(X1)).fit()
                res2 = sm.OLS(Y2, sm.add_constant(X2)).fit()
                score1 = np.abs(res1.resid)
                score.append(np.sum(score1 + np.abs(res2.resid))/(len(X)+1))

        idx_cp = np.array(score).argmin()

        if self.on_recCp:
            self.cp_col.append(f"{self.col_ex0}")
            self.cp_ls = [idx_cp]  # calculation result
            self.cp_cost = [np.array(score)]  # cp cost

        return idx_cp

    # def publish_cpTable(self):

    def cost(self):
        """
        In given a variable, it calculates cost for all formula combination.
        """
        col_ex = self.df.columns[:-1]
        self.col_ex0 = col_ex[0]

        x_ = self.df[self.col_ex0].values
        idx_xsort = np.argsort(x_)
        unit = round(len(x_)/self.sep_homos)

        # formula combination loop
        score = []
        for i_comb in tqdm(range(1, 1+self.comb)):
            for combo in itertools.combinations(self.KEY, i_comb):
                formula= self.key2smf(combo) 
                results = smf.ols(formula, data=self.df).fit()
                resid_sort = results.resid.values[idx_xsort]

                residMean = []
                for i_homos in range(self.sep_homos):
                    residMean.append(np.mean(resid_sort[unit*i_homos:unit*(i_homos+1)]))

                score.append([results.mse_resid, results.rsquared, np.std(residMean), results.aic])

        self.costs = np.array(score).T  #[cost type]x[factor combination]

    def evaluateOnLayer(self):
        evals = [] #  mse, r, homo, aic
        evals_id = [] #  mse, r, homo, aic
        nCr = self.comb_nCr

        # best evals in each factor numbers[layer](1, 2, 3....)
        for i in range(len(nCr)-1):
            c_ = self.costs[:, nCr[i]:nCr[i+1]]
            evals.append(np.min(c_, axis=1))
            evals_id.append(np.argmin(c_, axis=1) + nCr[i])

        evals = np.asarray(evals).T  # [cost type]x[factor layer]
        evals_id = np.asarray(evals_id).T  # [cost type]x[factor layer]

        # Which layer(factor) is appropriate in each cost type?
        # Basically if layer increased, performace also increase. it needs to find converged point.
        res0, res1 = [], []
        # MSE
        idx_mse = self.changepoint(evals[0, :])
        res0.append(evals[0, idx_mse])
        res1.append(int(evals_id[0, idx_mse]))
        # R
        idx_r = self.changepoint(evals[1, :])
        res0.append(evals[0, idx_r])
        res1.append(int(evals_id[0, idx_r]))
        # Homo
        idx_homo = int((evals[2, :]).argmin())
        res0.append(evals[2, idx_homo])
        res1.append(int(evals_id[2, idx_homo]))
        # AIC
        idx_aic = int((evals[3, :]).argmin())
        res0.append(evals[3, idx_aic])
        res1.append(int(evals_id[3, idx_aic]))
        # 1 Factor limitation
        res0.append(evals[0, 0])
        res1.append(int(evals_id[0, 0]))

        self.df_bestCosts_score[self.col_ex0] = res0
        self.df_bestCosts_key[self.col_ex0] = res1
        self.df_bestCosts_mse[self.col_ex0] = [
            res0[0],
            self.costs[0, res1[1]],
            self.costs[0, res1[2]],
            self.costs[0, res1[3]],
            self.costs[0, res1[4]]
        ]

    def fit(self, df, col_target):
        self.clear()

        self.df_init(df, col_target)
        self.cost()
        self.evaluateOnLayer()
        

dfx = pd.read_csv("../data/df_g1.csv", index_col=0, usecols=[0, 1, 2, 3, 4, 5])
rvc = regVariableConversion()
rvc.fit(dfx, "EP")

100%|██████████| 3/3 [00:02<00:00,  1.32it/s]


In [60]:
rvc.df_bestCosts_score


Unnamed: 0,X0
mse,25.447456
r,25.447456
homo,0.166234
aic,57933.432477
1d,27.871135


In [61]:
rvc.df_bestCosts_key

Unnamed: 0,X0
mse,25
r,25
homo,106
aic,255
1d,12


In [62]:
rvc.df_bestCosts_mse

Unnamed: 0,X0
mse,25.447456
r,25.447456
homo,25.424973
aic,25.407062
1d,27.871135


In [57]:
rvc.costs.shape

(4, 377)

In [44]:
evals = [] #  mse, r, homo, aic
evals_id = [] #  mse, r, homo, aic
nCr = rvc.comb_nCr

# best evals in each factor numbers[layer](1, 2, 3....)
for i in range(len(nCr)-1):
    c_ = rvc.costs[:, nCr[i]:nCr[i+1]]
    evals.append(np.min(c_, axis=1))
    evals_id.append(np.argmin(c_, axis=1) + nCr[i])

evals = np.asarray(evals)  # factor layer x cost
evals_id = np.asarray(evals_id)  # factor layer x cost

# which element number is appropriate in each cost?
# basically number increased, performace also increase. Need converged point
result = []
# MSE
best_idx = rvc.changepoint(evals[:, 0])
result.append([evals[best_idx, 0], evals_id[best_idx, 0]])
# R
best_idx = rvc.changepoint(evals[:, 1])
result.append([evals[best_idx, 1], evals_id[best_idx, 1]])
# Homo
best_idx = int((evals[:, 2]).argmin())
result.append([evals[best_idx, 2], evals_id[best_idx, 2]])
# AIC
best_idx = int((evals[:, 3]).argmin())
result.append([evals[best_idx, 3], evals_id[best_idx, 3]])
# 1 Factor limitation
result.append([evals[0, 0], evals_id[0, 0]])

result


[[25.447456116142128, 25],
 [0.8029389766960007, 42],
 [0.1662343356877426, 106],
 [57933.43247740091, 255],
 [27.871134514991247, 12]]

In [49]:
evals.shape

(3, 4)

In [None]:
def _evaluateOnLayer(self):
    """evaluate cost on each layer of element number,and decide best element
        combination in a cost. df_mse_allCost is for comapring these by same
        character.

    Returns: None

    Mods:df_bestEachCost, df_mse_allCost
    """
    def changepoint(Y):
        res = []
        X = range(len(Y))
        for i in range(len(Y)-1):
            if i == 0:
                model1 = sm.OLS(Y, sm.add_constant(X))
                result1 = model1.fit()
                res.append(np.abs(result1.resid).mean())
            else:
                X1, X2, Y1, Y2 = X[0:i+1], X[i:], Y[0:i+1], Y[i:]
                model1 = sm.OLS(Y1, sm.add_constant(X1))
                result1 = model1.fit()
                result_x = np.abs(result1.resid).mean()
                model2 = sm.OLS(Y2, sm.add_constant(X2))
                result2 = model2.fit()
                res.append((result_x + np.abs(result2.resid).mean())/2)

        out = np.array(res).argmin()
        if self.CPonPlot:
            plt.figure(figsize=(12, 3))
            plt.plot(Y)
            plt.axvline(out, color="r")
            plt.title(f"{self.col_ex0}")
            plt.show()

        return out

    mse, r, homo, aic = [], [], [], []
    mse_label, r_label, homo_label, aic_label = [], [], [], []
    nCr = self.comb_nCr
    df = self.df_cost.T

    # find best combination in each element number
    for i in range(len(nCr)-1):
        dfx = df.iloc[nCr[i]:nCr[i+1]]
        mse.append(dfx.loc[:, "mse"].min())
        mse_label.append(dfx.loc[:, "mse"].idxmin())
        r.append(dfx.loc[:, "r"].max())
        r_label.append(dfx.loc[:, "r"].idxmax())
        homo.append(dfx.loc[:, "homo"].min())
        homo_label.append(dfx.loc[:, "aic"].idxmin())
        aic.append(dfx.loc[:, "aic"].min())
        aic_label.append(dfx.loc[:, "aic"].idxmin())

    # which element number is appropriate in each cost?
    # basically number increased, performace also increase. Need converged point
    result = []
    # MSE
    mse_best_formatID = changepoint(np.array(mse))
    result.append(mse_label[mse_best_formatID])
    result.append(mse[mse_best_formatID])
    # R
    r_best_formatID = changepoint(np.array(r))
    result.append(r_label[r_best_formatID])
    result.append(r[r_best_formatID])
    # Homo
    homo_best_formatID = int(np.array(homo[0:3]).argmin())
    result.append(homo_label[homo_best_formatID])
    result.append(homo[homo_best_formatID])
    # AIC
    aic_best_formatID = int(np.array(aic).argmin())
    result.append(aic_label[aic_best_formatID])
    result.append(aic[aic_best_formatID])
    # 1 element limitation
    result.append(mse_label[0])
    result.append(mse[0])

    self.df_bestCosts[self.col_ex0] = result

    result_mse = [
        mse[mse_best_formatID],
        df.loc[r_label[r_best_formatID], "mse"],
        df.loc[homo_label[homo_best_formatID], "mse"],
        df.loc[aic_label[aic_best_formatID], "mse"],
        df.loc[mse_label[0], "mse"]
    ]
    self.df_mse_bestCosts[self.col_ex0] = result_mse

# 2

In [33]:
class regVariableConversion(object):
    def __init__(self, **kwargs):
        # optional 
        self.on_cpPlot = kwargs.get("on_cpPlot", False)
        self.on_combine = kwargs.get("on_combine", False)
        self.on_log = kwargs.get("on_log", False)

        # key setting
        self.comb = kwargs.get("comb", 3)  # number of factor
        self.KEY = [
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "a",
            "b", "c", "d"
        ]

        self.comb_nCr, sum, self.label_list = [0], 0, []
        for i_comb in range(1, 1+self.comb):
            x = len(list(itertools.combinations(range(len(self.KEY)), i)))
            sum += x
            self.comb_nCr.append(sum)
            for combo in itertools.combinations(self.KEY, i_comb):
                label = ""
                for i in range(len(combo)):
                    label += combo[i]
                self.label_list.append(label)

        # decision
        self.sep_homos = 10  # number of separating for homoscedasticity
        self.th_homoAndMse = kwargs["th_homoAndMse"] if "th_homoAndMse" in kwargs else 1.05

    def clear(self):
        self.res_formula = []  # "Y ~ + X + np.log(X)"
        self.res_key = []  # "123"
        self.res_ind = []  # ['X3', 'X3', 'logX/X', 'logX']

        # predict
        self.df_bestCosts_mse = pd.DataFrame(
            index=["mse", "r", "homo", "aic", "1d"])

        self.df_bestCosts = pd.DataFrame(
            index=[ "mse", "mse_c", "r", "r_c", "homo", "homo_c",
                "aic", "aic_c", "1d", "1d_c"])

        self.ls_df_cost = []
        self.df_output = pd.DataFrame()

        # _slidingFilter
        self.GAP = {}

    def df_init(self, df, col_target):
        """
        change columns name of input dataframe
        ["AT", "V", "EP"] => ["X0", "X1", "Y"] 
        """
        cols = list(df.columns)
        
        col_ex = cols.copy()
        col_ex.remove(col_target)

        self.col_origin = col_ex.copy()
        self.col_origin.append(col_target)
        df = df.reindex(self.col_origin, axis="columns")

        col_replace = []
        for i in range(len(col_ex)): col_replace.append(f"X{i}")
        col_replace.append("Y")
        df.columns = col_replace
    
        self.col_target = col_replace[-1]
        self.df = df

    def key2smf(self, keys):
        """ key list is converted to smf format

        Args:
            keys (tuple, list, str): [ex] ("1", "2", "3")

        Returns:
            form (str) : [ex] "Y ~ + X + np.log(X)"
        """
        form = f"{self.col_target} ~ "
        if "1" in keys: form += f"+ {self.col_ex0} "
        if "2" in keys: form += f"+ np.square({self.col_ex0}) "
        if "3" in keys: form += f"+ np.power({(self.col_ex0)}, 3) "
        if "4" in keys: form += f"+ np.log({self.col_ex0}) "
        if "5" in keys: form += f"+ np.reciprocal({self.col_ex0}) "
        if "6" in keys: form += f"+ np.sqrt({self.col_ex0}) "
        if "7" in keys: form += f"+ {self.col_ex0}:np.log({self.col_ex0}) "
        if "8" in keys: form += f"+ np.square({self.col_ex0}):np.log({self.col_ex0}) "
        if "9" in keys: form += f"+ np.log({self.col_ex0}):np.reciprocal({self.col_ex0}) "
        if "a" in keys: form += f"+ np.reciprocal(np.square({self.col_ex0})) "
        if "b" in keys: form += f"+ {self.col_ex0}:np.sqrt({self.col_ex0}) "
        if "c" in keys: form += f"+ np.sqrt(np.log({self.col_ex0})) "
        if "d" in keys: form += f"+ np.sqrt({self.col_ex0}*np.log({self.col_ex0})) "

        return form

    def cost(self):
        """
        In given a variable, it calculates cost for all formula combination.
        """
        col_ex = rvc.df.columns[:-1]
        rvc.col_ex0 = col_ex[0]

        x_ = rvc.df[rvc.col_ex0].values
        idx_xsort = np.argsort(x_)
        unit = round(len(x_)/rvc.sep_homos)

        # formula combination loop
        score = []
        for i_comb in tqdm(range(1, 1+self.comb)):
            for combo in itertools.combinations(self.KEY, i_comb):
                formula= self.key2smf(combo) 
                results = smf.ols(formula, data=rvc.df).fit()
                resid_sort = results.resid.values[idx_xsort]

                residMean = []
                for i_homos in range(rvc.sep_homos):
                    residMean.append(np.mean(resid_sort[unit*i_homos:unit*(i_homos+1)]))

                score.append([results.mse_resid, results.rsquared, np.std(residMean), results.aic])

        return np.array(score).T

    def fit(self, df, col_target):
        self.clear()

        self.df_init(df, col_target)
        self.costs = self.cost()

dfx = pd.read_csv("../data/df_g1.csv", index_col=0, usecols=[0, 1, 2, 3, 4, 5])
rvc = regVariableConversion()

100%|██████████| 3/3 [00:00<00:00, 2993.08it/s]


In [35]:
rvc.fit(dfx, "EP")
rvc.costs


100%|██████████| 3/3 [00:01<00:00,  1.57it/s]


(4, 377)

# 1

In [66]:
class regVariableConversion(object):
    def __init__(self, **kwargs):
        # optional 
        self.on_cpPlot = kwargs.get("on_cpPlot", False)
        self.on_combine = kwargs.get("on_combine", False)
        self.on_log = kwargs.get("on_log", False)

        # setting
        self.comb = kwargs.get("comb", 4)  # number of factor
        self.KEY = [
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "a",
            "b", "c", "d"
        ]
 
        self.sep_homos = 10  # number of separating for homoscedasticity
        self.th_homoAndMse = kwargs["th_homoAndMse"] if "th_homoAndMse" in kwargs else 1.05

        self.comb_nCr, sum = [0], 0  # sum of factor combination
        for i in range(1, 1+self.comb):
            x = len(list(itertools.combinations(range(len(self.KEY)), i)))
            sum += x
            self.comb_nCr.append(sum)

    def clear(self):
        self.res_formula = []  # "Y ~ + X + np.log(X)"
        self.res_key = []  # "123"
        self.res_ind = []  # ['X3', 'X3', 'logX/X', 'logX']

        # predict
        self.col_target, self.col_ex = None, None
        self.col_origin = None
        self.df, self.df_cost = None, None

        self.df_bestCosts_mse = pd.DataFrame(
            index=["mse", "r", "homo", "aic", "1d"])

        self.df_bestCosts = pd.DataFrame(
            index=[ "mse", "mse_c", "r", "r_c", "homo", "homo_c",
                "aic", "aic_c", "1d", "1d_c"])

        self.ls_df_cost = []

        # _slidingFilter
        self.GAP = {}
        self.df_output = pd.DataFrame()

    def df_init(self, col_target):
        """
        change columns name of input dataframe
        ["AT", "V", "EP"] => ["X0", "X1", "Y"] 
        """
        cols = list(self.df.columns)

        col_ex = cols.copy()
        col_ex.remove(col_target)

        self.col_origin = col_ex.copy()
        self.col_origin.append(col_target)
        self.df = self.df.reindex(self.col_origin, axis="columns")

        col_replace = []
        for i in range(len(col_ex)): col_replace.append(f"X{i}")
        col_replace.append("Y")
        self.df.columns = col_replace

    def fit(self, df, col_target):
        self.clear()

        self.df = df
        self.df_init(col_target)


dfx = pd.read_csv("../data/df_g1.csv", index_col=0, usecols=[0, 1, 2, 3, 4, 5])
rvc = regVariableConversion()
rvc.fit(dfx, "EP")
rvc.df.head()

Unnamed: 0,X0,X1,X2,X3,Y
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43
