# 선형 회귀 모델을 위한 데이터 변환
- feature value와 target value의 분포가 정규분포이면 좋다
- target value의 분포가 한쪽으로 치우친 왜곡(skew)된 형태이면 예측 성능에 부정적인 영향을 미칠 수 있다
- 일반적으로 선형 회귀 모델을 적용하기 전에 데이터 스케일링/정규화 작업을 수행한다

## Feature data set
- StandardScaler를 사용하여 평균 0, 분산 1이 되도록 표준화하거나, MinMaxScaler를 사용하여 최솟값 0, 최댓값 1 범위로 정규화
- 다항식 변환 추가 적용
- log 변환

## Target data set
- log 변환

In [3]:
import numpy as np 
import pandas as pd 
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score

# Load the Boston housing data into a DataFrame, one column per feature.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# pinned scikit-learn version still provides it.
boston = load_boston()
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Separate the feature columns from the regression target.
X_data = df.drop(columns=['PRICE'])
y_target = df['PRICE']

def get_linear_reg_eval(model_name, params=None, X_data_n=None, index_n=None, y_target_n=None):
    """Evaluate a regularized linear model over several alpha values.

    For every alpha in ``params`` the model is scored with 5-fold
    cross-validation, the mean of the per-fold RMSE values is printed, and the
    model is then refit on the full data so its coefficients can be collected.

    Args:
        model_name: One of 'Ridge', 'Lasso' or 'ElasticNet'.
        params: Iterable of alpha values to try. ``None`` is treated as an
            empty iterable, so an empty DataFrame is returned.
        X_data_n: Feature data passed to ``cross_val_score`` and ``fit``.
        index_n: Labels for the coefficient rows. ``None`` lets pandas fall
            back to its default integer index.
        y_target_n: Target values passed to ``cross_val_score`` and ``fit``.

    Returns:
        A DataFrame with one ``'alpha:<value>'`` column of fitted coefficients
        per alpha value.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    supported_models = ('Ridge', 'Lasso', 'ElasticNet')
    # Fail fast with a clear message; previously an unknown name left `model`
    # unassigned and surfaced as an UnboundLocalError inside the loop.
    if model_name not in supported_models:
        raise ValueError(
            'Unsupported model_name {!r}; expected one of {}'.format(model_name, supported_models)
        )

    # The default (None) used to crash on iteration; treat it as "no alphas".
    if params is None:
        params = ()

    coeff_df = pd.DataFrame()
    for param in params:
        if model_name == 'Ridge':
            model = Ridge(alpha=param)
        elif model_name == 'Lasso':
            model = Lasso(alpha=param)
        else:
            # Only 'ElasticNet' remains after validation; l1_ratio is fixed at 0.7.
            model = ElasticNet(alpha=param, l1_ratio=0.7)

        # The scorer returns negated MSE, so flip the sign before the square root.
        neg_mse_scores = cross_val_score(
            model, X_data_n, y_target_n, scoring='neg_mean_squared_error', cv=5
        )
        avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
        print('alpha {} : {}'.format(param, avg_rmse))

        # cross_val_score does not fit `model` itself, so fit on the full data
        # to obtain the coefficients for this alpha.
        model.fit(X_data_n, y_target_n)
        coeff = pd.Series(data=model.coef_, index=index_n)
        colname = 'alpha:' + str(param)
        coeff_df[colname] = coeff
    return coeff_df

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

def get_scaled_data(method='None', p_degree=None, input_data=None):
    """Scale or transform feature data, optionally adding polynomial terms.

    Args:
        method: 'Standard' applies StandardScaler, 'MinMax' applies
            MinMaxScaler, 'Log' applies ``np.log1p``. Any other value
            (including ``None`` and the default ``'None'``) leaves the data
            untouched.
        p_degree: Degree for PolynomialFeatures (without the bias column),
            applied after scaling. ``None`` skips the polynomial expansion.
        input_data: Feature data to transform.

    Returns:
        The transformed data. When no transformation applies, the very same
        ``input_data`` object is returned.
    """
    if method == 'Standard':
        scaled_data = StandardScaler().fit_transform(input_data)
    elif method == 'MinMax':
        scaled_data = MinMaxScaler().fit_transform(input_data)
    elif method == 'Log':
        # log1p computes log(1 + x), so zero-valued features stay finite.
        scaled_data = np.log1p(input_data)
    else:
        scaled_data = input_data

    # Identity check against None (PEP 8) rather than `!= None`.
    if p_degree is not None:
        scaled_data = PolynomialFeatures(
            degree=p_degree, include_bias=False
        ).fit_transform(scaled_data)
    return scaled_data

# Ridge alpha values to compare for every scaling configuration.
alphas = [0.1, 1, 10, 100]

# (scaling method, polynomial degree) pairs; None means "not applied".
scale_methods = [
    (None, None),
    ('Standard', None),
    ('Standard', 2),
    ('MinMax', None),
    ('MinMax', 2),
    ('Log', None),
]

# NOTE(review): each transform is fit on all of X_data before cross-validation
# splits it, so held-out folds influence the scaling — confirm this is acceptable.
for scale_method in scale_methods:
    scaler_name, poly_degree = scale_method
    X_data_scaled = get_scaled_data(
        method=scaler_name, p_degree=poly_degree, input_data=X_data
    )
    print(scaler_name, poly_degree)
    get_linear_reg_eval(
        'Ridge', params=alphas, X_data_n=X_data_scaled, y_target_n=y_target
    )

None None
alpha 0.1 : 5.78848662703241
alpha 1 : 5.6525709656135446
alpha 10 : 5.518166280868971
alpha 100 : 5.329589628472144
Standard None
alpha 0.1 : 5.825992799389435
alpha 1 : 5.80288951725773
alpha 10 : 5.636831222559067
alpha 100 : 5.421374711794966
Standard 2
alpha 0.1 : 8.827235873001744
alpha 1 : 6.871287525962218
alpha 10 : 5.484870425533342
alpha 100 : 4.634437778261926
MinMax None
alpha 0.1 : 5.763570225288303
alpha 1 : 5.465045081564939
alpha 10 : 5.754163637679423
alpha 100 : 7.634919863623539
MinMax 2
alpha 0.1 : 5.297560003127131
alpha 1 : 4.322660371524896
alpha 10 : 5.185204257580534
alpha 100 : 6.537866889704364
Log None
alpha 0.1 : 4.770429614681663
alpha 1 : 4.676227018966875
alpha 10 : 4.836439992635265
alpha 100 : 6.2409162327190355


### log 변환은 alpha 0.1 ~ 10 구간에서 고르게 낮은 RMSE(약 4.7 ~ 4.8)를 보여 대체적으로 좋은 성능을 나타낸다. 다만 가장 낮은 RMSE는 MinMax 스케일링 + 2차 다항식 변환, alpha=1인 경우(약 4.32)이다