# 보스턴 부동산 집값 예측문제
#### Features(X)
1. CRIM: 도시별 범죄발생률
2. ZN: 25,000평을 넘는 토지의 비율
3. INDUS: 도시별 비상업 지구의 비유
4. CHAS: 찰스 강의 더미 변수(1 = 강의 경계, 0 = 나머지)
5. NOX: 일산화질소 농도
6. RM: 주거할 수 있는 평균 방의개수
7. AGE: 1940년 이전에 지어진 주택의 비율
8. DIS: 5개의 고용지원센터까지의 가중치가 고려된 거리
9. RAD: 고속도로의 접근 용이성에 대한 지표
10. TAX: 10,000달러당 재산세 비율
11. PTRATIO: 도시별 교사와 학생의 비율 
12. B: 도시의 흑인 거주 비유
13. LSTAT: 저소득층의 비율
#### traget(y)
14. MEDV   : 본인 소유의 주택가격(중앙값) (단위: $1,000)  

# 필요한 라이브러리 임폴트

In [3]:
# 불필요한 경고 출력을 방지
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 데이터 확인

In [2]:
# 방법2 캐글에서 직접 다운로드해서 로딩하기
columns = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'price']
hpd_df = pd.read_csv('./data/bostton_house_prices.csv', header=None, delimiter=r'\s+', names=columns)
hpd_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [6]:
hpd_df_temp = hpd_df.drop('price',axis=1)
hpd_df_temp.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [13]:
# 객체생성 -> fit(hpd_df_temp) -> transform(hpd_df_temp)
mm_scale = MinMaxScaler(hpd_df_temp)

In [17]:
MinMaxScaler().fit_transform(hpd_df_temp)

In [19]:
mm_scale_data = MinMaxScaler().fit_transform(hpd_df_temp)
mm_scale_data

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, ...,
        2.87234043e-01, 1.00000000e+00, 8.96799117e-02],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, ...,
        5.53191489e-01, 1.00000000e+00, 2.04470199e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, ...,
        5.53191489e-01, 9.89737254e-01, 6.34657837e-02],
       ...,
       [6.11892474e-04, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 1.00000000e+00, 1.07891832e-01],
       [1.16072990e-03, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 9.91300620e-01, 1.31070640e-01],
       [4.61841693e-04, 0.00000000e+00, 4.20454545e-01, ...,
        8.93617021e-01, 1.00000000e+00, 1.69701987e-01]])

# 데이터 정규화 - MinMaxScaler()
- 피처들의 데이터 수준을 맞춰주기 위해서 MinMaxScaler 수행

In [20]:
from sklearn.preprocessing import MinMaxScaler

# make extend boston dataset
# Refernce : https://github.com/amueller/mglearn/blob/master/mglearn/datasets.py#L30
def data_pre(df, y):
    X =  df.drop(y, axis=1).values

    X = MinMaxScaler().fit_transform(X)
    y = df[y]
    return X, y

In [5]:
# 특징이 확장된 보스턴 부동산 가격 데이터 불러오기
X, y = data_pre(hpd_df, 'price')
print(X[:2])
print()
print(y[:2])
print('Extended Feature Shape :', X.shape)

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 0.00000000e+00
  3.14814815e-01 5.77505269e-01 6.41606591e-01 2.69203139e-01
  0.00000000e+00 2.08015267e-01 2.87234043e-01 1.00000000e+00
  8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 0.00000000e+00
  1.72839506e-01 5.47997701e-01 7.82698249e-01 3.48961980e-01
  4.34782609e-02 1.04961832e-01 5.53191489e-01 1.00000000e+00
  2.04470199e-01]]

0    24.0
1    21.6
Name: price, dtype: float64
Extended Feature Shape : (506, 13)


## baseline 성능
실습파일 : 2-2.Regression_boston_house_price_pred(EDA_Feature Selection)_cvs.ipynb
#### baseline #1 - Average MSE : 37.1318(기본 Linear Regression)
#### baseline #2 - Average MSE : 34.10008 (기본 Linear Regression + Feature Selection 적용)

In [21]:
from sklearn.model_selection import KFold

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 2024
kf = KFold(n_splits=num_split)  
# kf = KFold(n_splits=num_split, shuffle = True, random_state = 2024)  

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 선형회귀(Linear Regression) 모델 선언하기
    model_lr = LinearRegression()

    # 선형회귀(Linear Regression) 모델 학습하기
    model_lr.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = model_lr.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 37.13180746769898
Avergae RMSE : 6.093587405436881


# KFold 교차검증 + L2 규제 알고리즘

In [7]:
from sklearn.linear_model import Ridge  # L2 규제

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 2024
kf = KFold(n_splits=num_split)
# kf = KFold(n_splits=num_split, shuffle = True, random_state = 2024)  

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 선형회귀(Linear Regression) 모델 선언하기
    #ridge_reg = Ridge(alpha=0.8)
    ridge_reg = Ridge(alpha=0.2)
    #ridge_reg = Ridge(alpha=1)

    # 선형회귀(Linear Regression) 모델 학습하기
    ridge_reg.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = ridge_reg.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', tot_MSE)
print('Avergae RMSE :', np.sqrt(tot_MSE))

Average MSE : 177.9344604063832
Avergae RMSE : 13.339207637876518


# KFold 교차검증 + L1 규제 알고리즘

In [8]:
from sklearn.linear_model import Lasso  # L1 규제

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 2024
kf = KFold(n_splits=num_split)
# kf = KFold(n_splits=num_split, shuffle = True, random_state = 2024)  

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 선형회귀(Linear Regression) 모델 선언하기
    lasso_reg = Lasso(alpha=0.02) # 알파값을 작게 줌 -> 규제를 높임 

    # 선형회귀(Linear Regression) 모델 학습하기
    lasso_reg.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = lasso_reg.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 35.36971883670263
Avergae RMSE : 5.947244642412369


# KFold 교차검증 + ElasticNet(L1+ L2) 규제 알고리즘

In [9]:
from sklearn.linear_model import ElasticNet

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 2024
kf = KFold(n_splits=num_split)
# kf = KFold(n_splits=num_split, shuffle = True, random_state = 2024)  

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 선형회귀(Linear Regression) 모델 선언하기
    #elasticnet_reg = ElasticNet(alpha=1)
    elasticnet_reg = ElasticNet(alpha=0.02)

    # 선형회귀(Linear Regression) 모델 학습하기
    elasticnet_reg.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측을 수행합니다.
    y_pred = elasticnet_reg.predict(X_test)

    # MSE(Mean Squared Error)를 측정합니다.
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 32.420895381917
Avergae RMSE : 5.693934964672234


## 최적의 하이퍼 파라미터 찾는 방법

### GridSerachCV 활용  
- 모델링시 필요한 하이퍼파라미터를 설정할 때 가장 최적의 파리미터값을 찾아주는 방법중 하나  
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearchcv#sklearn.model_selection.GridSearchCV

#### GridSearchCV 주요 파라미터   
: estimator, param_grid, scoring=None, n_jobs=None, cv=None, refit=True 등이 있다.

- estimator : 평가할 모델을 전달  
- param_grid : 각 파라미터와 시험할 값들을 딕셔너리로 넣기  
  평가 방법은 scoring으로 측정하며 cv는 기본적으로 KFold의 횟수를 정하는 값

- https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
(평가 방식)

- refit=True : 생성된 GridSearchCV 객체를, 가장 좋은 파라미터를 전달한 estimator로 바꿔줌

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error

In [23]:
import numpy as np

arr = np.arange(0.01, 0.1, 0.01)
print(arr)


[0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09]


In [25]:
model_lasso = Lasso()
#print(model_lasso.get_params().keys()) 

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space (파이프라인 단계 이름과 맞추기 위해 lasso__ 추가)
param_grid = dict()
# param_grid['lasso__alpha'] = [0.01, 0.1, 1, 10, 100]
# param_grid['lasso__alpha'] = [0.009, 0.01, 0.02, 0.03, 0.1]
param_grid['lasso__alpha'] = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]
param_grid['lasso__fit_intercept'] = [True, False]

# 파이프라인: StandardScaler -> Lasso
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', model_lasso)
])

########## 모델 검증
#grid_search = GridSearchCV(pipeline, param_grid=param_grid) 
grid_search = GridSearchCV(pipeline, param_grid=param_grid, 
                           cv=cv, scoring='neg_mean_squared_error',
                           refit=True, return_train_score=True
                           ) 
# define search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=KFold(n_splits=5), scoring='r2') 
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring=make_scorer(r2_score))
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=KFold(n_splits=5), scoring=make_scorer(r2_score))

# execute search
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)
# print("3. 학습모델 best_score_R2 : ", grid_search.best_score_)

y_grid_pred = grid_search.predict(X_test)

# MSE, RMSE 평가
MSE = mean_squared_error(y_test, y_grid_pred)
RMSE = np.sqrt(MSE)

print('4. 테스트 데이터 : MSE : {0:.5f}, RMSE : {1:.5f}'.format(MSE, RMSE))

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()), ('lasso', Lasso(alpha=0.05))])
2. 학습모델 best_params_ :  {'lasso__alpha': 0.05, 'lasso__fit_intercept': True}
3. 학습모델 best_score_MSE :  25.344584947326602
4. 테스트 데이터 : MSE : 29.29703, RMSE : 5.41267


In [13]:
scores_df = pd.DataFrame(grid_search.cv_results_)
df_score = scores_df.sort_values(by='mean_test_score', ascending=False)
df_score.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__alpha,param_lasso__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,...,split22_train_score,split23_train_score,split24_train_score,split25_train_score,split26_train_score,split27_train_score,split28_train_score,split29_train_score,mean_train_score,std_train_score
8,0.000998,0.000257,0.000298,0.000456,0.05,True,"{'lasso__alpha': 0.05, 'lasso__fit_intercept':...",-20.587931,-38.794437,-28.88973,...,-22.329577,-24.293943,-23.746813,-20.368778,-23.659447,-23.827257,-19.914864,-22.969095,-22.726007,1.240465
6,0.000985,4.5e-05,0.000332,0.000469,0.04,True,"{'lasso__alpha': 0.04, 'lasso__fit_intercept':...",-20.559766,-38.697018,-28.838936,...,-22.28667,-24.246883,-23.712809,-20.327687,-23.617046,-23.784476,-19.868719,-22.92383,-22.683796,1.240504
10,0.00093,0.000248,0.000399,0.000488,0.06,True,"{'lasso__alpha': 0.06, 'lasso__fit_intercept':...",-20.605603,-38.917839,-28.965061,...,-22.381282,-24.334916,-23.783873,-20.412193,-23.710684,-23.879414,-19.95598,-23.024338,-22.773138,1.241594
4,0.00103,0.000179,0.000332,0.00047,0.03,True,"{'lasso__alpha': 0.03, 'lasso__fit_intercept':...",-20.537024,-38.635544,-28.798831,...,-22.246239,-24.206715,-23.686272,-20.289067,-23.580334,-23.751196,-19.831275,-22.888624,-22.648808,1.240754
12,0.001029,0.000179,0.000765,0.000422,0.07,True,"{'lasso__alpha': 0.07, 'lasso__fit_intercept':...",-20.630779,-39.06997,-29.079,...,-22.432837,-24.379575,-23.827629,-20.463499,-23.754776,-23.926322,-19.998757,-23.089639,-22.823574,1.243453


In [14]:
df_score.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_lasso__alpha', 'param_lasso__fit_intercept', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'split10_test_score', 'split11_test_score',
       'split12_test_score', 'split13_test_score', 'split14_test_score',
       'split15_test_score', 'split16_test_score', 'split17_test_score',
       'split18_test_score', 'split19_test_score', 'split20_test_score',
       'split21_test_score', 'split22_test_score', 'split23_test_score',
       'split24_test_score', 'split25_test_score', 'split26_test_score',
       'split27_test_score', 'split28_test_score', 'split29_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score',
       'split0_train_score', 'split1_train_score', 'split2_train_score'

In [15]:
 df_score.loc[:, ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
8,"{'lasso__alpha': 0.05, 'lasso__fit_intercept':...",-25.344585,12.63582,1
6,"{'lasso__alpha': 0.04, 'lasso__fit_intercept':...",-25.350288,12.54116,2
10,"{'lasso__alpha': 0.06, 'lasso__fit_intercept':...",-25.35305,12.731417,3
4,"{'lasso__alpha': 0.03, 'lasso__fit_intercept':...",-25.365258,12.445683,4
12,"{'lasso__alpha': 0.07, 'lasso__fit_intercept':...",-25.380208,12.827162,5
2,"{'lasso__alpha': 0.02, 'lasso__fit_intercept':...",-25.381766,12.350481,6
0,"{'lasso__alpha': 0.01, 'lasso__fit_intercept':...",-25.401663,12.256304,7
14,"{'lasso__alpha': 0.08, 'lasso__fit_intercept':...",-25.424167,12.921551,8
13,"{'lasso__alpha': 0.07, 'lasso__fit_intercept':...",-608.192955,44.582763,9
15,"{'lasso__alpha': 0.08, 'lasso__fit_intercept':...",-608.2009,44.768162,10


In [26]:
# grid search linear regression model on the auto insurance dataset
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# define model
model = Ridge()

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2024)

# define search space
param_grid = dict()
param_grid['ridge__solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
param_grid['ridge__alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
param_grid['ridge__fit_intercept'] = [True, False]

# 파이프라인: StandardScaler -> Ridge  ; 1.x 버전부터 변경
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', model)
])

# define search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, 
                          scoring='neg_mean_squared_error', 
                          n_jobs=-1, cv=cv,
                          refit=True, return_train_score=True
                          )

# execute search
result = grid_search.fit(X_train, y_train)

# summarize result
print('1. Best Score: %s' % -result.best_score_)
print('2. Best best_params_: %s' % result.best_params_)

1. Best Score: 25.23564208173714
2. Best best_params_: {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'cholesky'}


In [17]:
y_grid_pred = grid_search.predict(X_test)

In [18]:
# MSE, RMSE 평가
MSE = mean_squared_error(y_test, y_grid_pred)
RMSE = np.sqrt(MSE)
print('3. 테스트 데이터 : MSE : {0:.5f}, RMSE : {1:.5f}'.format(MSE, RMSE))
print('4. 테스트 데이터 :', grid_search.best_params_)

3. 테스트 데이터 : MSE : 29.88918, RMSE : 5.46710
4. 테스트 데이터 : {'ridge__alpha': 10, 'ridge__fit_intercept': True, 'ridge__solver': 'sag'}


In [19]:
scores_df = pd.DataFrame(grid_search.cv_results_)

In [20]:
#df_score = scores_df.sort_values(by='mean_test_score', ascending=False)
df_score = scores_df.sort_values(by='rank_test_score', ascending=False)
df_score

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ridge__alpha,param_ridge__fit_intercept,param_ridge__solver,params,split0_test_score,split1_test_score,...,split22_train_score,split23_train_score,split24_train_score,split25_train_score,split26_train_score,split27_train_score,split28_train_score,split29_train_score,mean_train_score,std_train_score
62,0.002060,0.000572,0.000565,0.000494,100.00000,False,lsqr,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-669.569354,-591.838165,...,-604.419889,-597.402846,-617.833678,-608.400090,-611.803226,-583.659845,-624.900854,-616.256283,-607.791454,8.188172
60,0.001761,0.000421,0.000598,0.000488,100.00000,False,svd,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-669.558530,-591.835968,...,-604.419550,-597.402550,-617.833339,-608.399742,-611.802911,-583.659489,-624.900517,-616.256162,-607.791134,8.188174
61,0.001495,0.000498,0.000565,0.000494,100.00000,False,cholesky,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-669.558530,-591.835968,...,-604.419550,-597.402550,-617.833339,-608.399742,-611.802911,-583.659489,-624.900517,-616.256162,-607.791134,8.188174
63,0.002192,0.000700,0.000631,0.000480,100.00000,False,sag,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-669.560846,-591.834019,...,-604.419599,-597.402487,-617.833232,-608.399770,-611.803088,-583.659502,-624.900554,-616.256447,-607.791110,8.188194
6,0.001627,0.000602,0.000532,0.000497,0.00001,False,lsqr,"{'ridge__alpha': 1e-05, 'ridge__fit_intercept'...",-676.195004,-581.163589,...,-601.962653,-594.874680,-615.167470,-605.757360,-609.173735,-581.248745,-622.050086,-613.789163,-605.232656,8.094599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,0.001960,0.000480,0.000598,0.000488,1.00000,True,lsqr,"{'ridge__alpha': 1, 'ridge__fit_intercept': Tr...",-28.734157,-18.252989,...,-23.759829,-23.771129,-23.383284,-21.775127,-22.128900,-21.065593,-22.647256,-22.861081,-22.611760,1.053554
50,0.002126,0.000339,0.000830,0.000452,10.00000,True,lsqr,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-28.061231,-18.424784,...,-23.866477,-23.878624,-23.499996,-21.881976,-22.238847,-21.171473,-22.767797,-22.963378,-22.720625,1.058882
48,0.001661,0.000536,0.000698,0.000457,10.00000,True,svd,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-28.062541,-18.424666,...,-23.866476,-23.878623,-23.499996,-21.881976,-22.238844,-21.171472,-22.767793,-22.963377,-22.720620,1.058881
49,0.001827,0.000452,0.000631,0.000480,10.00000,True,cholesky,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-28.062541,-18.424666,...,-23.866476,-23.878623,-23.499996,-21.881976,-22.238844,-21.171472,-22.767793,-22.963377,-22.720620,1.058881


In [21]:
# score 결과값(ndarray형태로 할당됨) 중 특정 칼럼들만 가져오기 
df_score[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score',
           'split0_test_score', 'split1_test_score', 'split2_test_score',
           'split3_test_score','split4_test_score']].sort_values('rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
51,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-22.720725,-25.235538,1,-28.063188,-18.425272,-26.199170,-37.990033,-17.191709
49,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-22.720620,-25.235642,2,-28.062541,-18.424666,-26.198882,-37.989387,-17.191180
48,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-22.720620,-25.235642,3,-28.062541,-18.424666,-26.198882,-37.989387,-17.191180
50,"{'ridge__alpha': 10, 'ridge__fit_intercept': T...",-22.720625,-25.235849,4,-28.061231,-18.424784,-26.198270,-37.985266,-17.198290
42,"{'ridge__alpha': 1, 'ridge__fit_intercept': Tr...",-22.611760,-25.246709,5,-28.734157,-18.252989,-26.139169,-37.183943,-17.404584
...,...,...,...,...,...,...,...,...,...
6,"{'ridge__alpha': 1e-05, 'ridge__fit_intercept'...",-605.232656,-608.843565,60,-676.195004,-581.163589,-557.055359,-578.341557,-585.689073
63,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-607.791110,-609.848951,61,-669.560846,-591.834019,-570.342769,-601.792841,-583.423390
61,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-607.791134,-609.849009,62,-669.558530,-591.835968,-570.340453,-601.792095,-583.427134
60,"{'ridge__alpha': 100, 'ridge__fit_intercept': ...",-607.791134,-609.849009,62,-669.558530,-591.835968,-570.340453,-601.792095,-583.427134


## [문제해결] ElasticNet 모델의 최적의 하이퍼파라미터 찾기

- 하이퍼파라미터 정의
  - elasticnet__alpha : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
  - elasticnet__l1_ratio =  [0.1, 0.3, 0.5, 0.7, 0.9]
  - elasticnet__fit_intercept =  [True, False]
- 파이프라인 정의
  - 'scaler', StandardScaler()
  - model = ElasticNet()
- 성능평가
  - scoring='neg_mean_squared_error'

In [24]:
# 최고의 Hyperparameter 찾기 위한 GridsearchCV 사용
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# 모델 정의
model_lr6 = ElasticNet()

# define evaluation
cv = 5
# cv = RepeatedKFold(n_splits=5)
# cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=2024)

# 하이퍼파라미터 정의
param_grid = dict()
param_grid['elasticnet__alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
param_grid['elasticnet__l1_ratio'] = [0.1, 0.3, 0.5, 0.7, 0.9]
param_grid['elasticnet__fit_intercept'] = [True, False]
# 또는
# param_grid = { 
#     "elasticnet__alpha" : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
#     "elasticnet__l1_ratio" : [0.1, 0.3, 0.5, 0.7, 0.9],
#     "elasticnet__fit_intercept" : [True, False]
# }

# 파이프라인: StandardScaler -> ElasticNet
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # 정규화를 위해 StandardScaler 추가
    ('elasticnet', model_lr6)
])

# define search with GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid,         
                           cv=cv,
                           scoring='neg_mean_squared_error', 
                           n_jobs=-1, 
                           refit=True, return_train_score=True
                          )

# GridSearchCV 인자설명
# cv = 하나의 파라미터 쌍으로 모델링할 때 train, test 교차검증을 7번실시하겠다는 뜻
# refit=True : GridSearch한 후 가장 최고로 좋은 파라미터로 학습시켜 놓겠다.
# ㄴ> 이것 때문에 애초에 GridSearchCV 적용한 객체만으로 최적의 파라미터 적용된 모델로드 가능

# GridSearch 하면서 모든 파라미터값들에 대해 학습 수행
grid_search.fit(X_train, y_train)

# 최적의 모델과 파라미터 출력
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_ : ", -grid_search.best_score_)

# 테스트 데이터에 대한 예측
y_grid_pred = grid_search.predict(X_test)

# MSE, RMSE 평가
MSE = mean_squared_error(y_test, y_grid_pred)
RMSE = np.sqrt(MSE)
print('4. 테스트 데이터 : MSE : {0:.5f}, RMSE : {1:.5f}'.format(MSE, RMSE))

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()),
                ('elasticnet', ElasticNet(alpha=1, l1_ratio=0.7))])
2. 학습모델 best_params_ :  {'elasticnet__alpha': 1, 'elasticnet__fit_intercept': True, 'elasticnet__l1_ratio': 0.7}
3. 학습모델 best_score_ :  38.871972335596624
4. 테스트 데이터 : MSE : 29.53210, RMSE : 5.43434


In [26]:
# grid_search.cv_results_

In [27]:
# 각 파라미터값들에 대한 모델 결과값들이 cv_results_ 객체에 할당됨
scores_df = pd.DataFrame(grid_search.cv_results_)
df_score = scores_df.sort_values(by='mean_test_score', ascending=False)
df_score.head()  # 상위 5개의 결과 출력

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_elasticnet__alpha,param_elasticnet__fit_intercept,param_elasticnet__l1_ratio,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
53,0.001993,3.568323e-07,0.000399,0.0004883441,1.0,True,0.7,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-10.759747,-16.792387,...,-38.871972,28.683112,1,-35.854179,-32.776675,-29.730925,-33.888579,-15.678108,-29.585693,7.230949
54,0.001993,0.0006304495,0.000598,0.0004883441,1.0,True,0.9,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-9.993937,-14.95707,...,-39.533061,32.593657,2,-35.019907,-32.117037,-29.128285,-33.244167,-14.066965,-28.715272,7.570664
52,0.001794,0.0003986597,0.000797,0.0003987074,1.0,True,0.5,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-12.00537,-18.375207,...,-39.766023,27.265457,3,-36.515934,-33.489781,-30.42809,-34.554857,-17.192242,-30.436181,6.908739
50,0.002193,0.0003986121,0.000797,0.0003986359,1.0,True,0.1,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-13.630169,-20.570802,...,-39.8736,24.578979,4,-37.852123,-35.061915,-31.098654,-35.342197,-19.084694,-31.687917,6.662185
51,0.001794,0.000398636,0.000997,1.168008e-07,1.0,True,0.3,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-12.946821,-19.409086,...,-39.995571,26.083503,5,-37.210119,-34.272602,-30.864804,-34.928225,-18.141268,-31.083404,6.782875


In [28]:
scores_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_elasticnet__alpha', 'param_elasticnet__fit_intercept',
       'param_elasticnet__l1_ratio', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'split3_train_score', 'split4_train_score',
       'mean_train_score', 'std_train_score'],
      dtype='object')

In [29]:
# score 결과값(ndarray형태로 할당됨) 중 특정 칼럼들만 가져오기 
df_score[['params','mean_test_score', 'rank_test_score',
           'split0_test_score', 'split1_test_score', 'split2_test_score',
           'split3_test_score','split4_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
53,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-38.871972,1,-10.759747,-16.792387,-50.204357,-27.057894,-89.545476
54,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-39.533061,2,-9.993937,-14.957070,-47.584100,-25.758684,-99.371514
52,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-39.766023,3,-12.005370,-18.375207,-52.997622,-28.841613,-86.610306
50,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-39.873600,4,-13.630169,-20.570802,-55.822557,-29.503339,-79.841133
51,"{'elasticnet__alpha': 1, 'elasticnet__fit_inte...",-39.995571,5,-12.946821,-19.409086,-54.573916,-29.253148,-83.794883
...,...,...,...,...,...,...,...,...
76,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-668.544963,74,-487.993827,-485.980247,-1018.738395,-874.738642,-475.273704
77,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-668.544963,74,-487.993827,-485.980247,-1018.738395,-874.738642,-475.273704
78,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-668.544963,74,-487.993827,-485.980247,-1018.738395,-874.738642,-475.273704
79,"{'elasticnet__alpha': 100, 'elasticnet__fit_in...",-668.544963,74,-487.993827,-485.980247,-1018.738395,-874.738642,-475.273704
