# Voting (회귀) 하이퍼파라미터 튜닝

## #01. 준비작업

### [1] 패키지 가져오기

In [11]:
# 연결된 모듈이 업데이트 되면 즉시 자동 로드함
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings(action='ignore')

from hossam.core import *
from hossam.util import *
from hossam.plot import *
from hossam.analysis import *
from hossam.classification import *

# 보팅 회귀
from sklearn.ensemble import VotingRegressor

# 회귀모형 클래스
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

# 하이퍼파라미터 튜닝 객체
from sklearn.model_selection import RandomizedSearchCV

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### #02. 데이터 가져오기

In [2]:
origin = my_read_excel("https://data.hossam.kr/mldata/boston.xlsx")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
None

상위 5개 행
+----+---------+------+---------+--------+-------+-------+-------+--------+-------+-------+-----------+--------+---------+--------+
|    |    CRIM |   ZN |   INDUS |   CHAS |   NOX |    RM |   AGE |    DIS |   RAD |   TAX

## #02. 데이터 전처리

[1] 훈련/검증 데이터 분할 및 데이터 표준화

In [3]:
x_train, x_test, y_train, y_test = my_train_test_split(origin, 'MEDV', scalling=True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404,), (102,))

## #03. 하이퍼파라미터 튜닝

### [1] Voting에 참여시킬 개별 알고리즘 객체 생성

In [5]:
lrg = get_estimator(classname=LinearRegression)
rgr = get_estimator(classname=Ridge)
lsr = get_estimator(classname=Lasso)
knr = get_estimator(classname=KNeighborsRegressor)
dtr = get_estimator(classname=DecisionTreeRegressor)
# svr = get_estimator(classname=SVR)
# sgdr = get_estimator(classname=SGDRegressor)

lrg, rgr, lsr, knr, dtr, svr, sgdr

(LinearRegression(n_jobs=-1),
 Ridge(max_iter=1000, n_jobs=-1, random_state=1000),
 Lasso(n_jobs=-1, random_state=1000),
 KNeighborsRegressor(n_jobs=-1),
 DecisionTreeRegressor(random_state=1000),
 SVR(max_iter=1000, n_jobs=-1),
 SGDRegressor(early_stopping=True, random_state=1000))

### [2] 보팅 분류기 생성

In [6]:
vo = VotingRegressor(
    # 결합할 알고리즘의 리스트 (이름, 알고리즘객체) - 이름은 개발자가 마음대로 지정
    #estimators=[('lrg', lrg), ('rgr', rgr), ('lsr', lsr), ('knr', knr), ('dtr', dtr), ('svr', svr), ('sgdr', sgdr)],
    estimators=[('lrg', lrg), ('rgr', rgr), ('lsr', lsr), ('knr', knr), ('dtr', dtr)],
    n_jobs=-1)

### [3] 모듈을 통한 기본 하이퍼파라미터 가져오기

In [12]:
lr_params = get_hyper_params(classname=LinearRegression, key='lrg')
lr_params

{'lrg__fit_intercept': [True, False]}

In [13]:
rgr_params = get_hyper_params(classname=Ridge, key='rgr')
rgr_params

{'rgr__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
 'rgr__solver': ['auto',
  'svd',
  'cholesky',
  'lsqr',
  'sparse_cg',
  'sag',
  'saga']}

In [14]:
lsr_params = get_hyper_params(classname=Lasso, key='lsr')
lsr_params

{'lsr__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
 'lsr__selection': ['cyclic', 'random']}

In [15]:
knr_params = get_hyper_params(classname=KNeighborsRegressor, key='knr')
knr_params

{'knr__n_neighbors': array([2, 3, 4, 5, 6, 7, 8, 9]),
 'knr__weights': ['uniform', 'distance'],
 'knr__metric': ['euclidean', 'manhattan', 'minkowski']}

In [16]:
dtr_params = get_hyper_params(classname=DecisionTreeRegressor, key='dtr')
dtr_params

{'dtr__criterion': ['squared_error',
  'friedman_mse',
  'absolute_error',
  'poisson'],
 'dtr__splitter': ['best', 'random']}

### [4] Voting용 하이퍼 파라미터로 생성

회귀를 위한 Voting의 경우 soft, hard를 구분하지 않는다.

In [18]:
params = {}
params.update(lr_params)
params.update(rgr_params)
params.update(lsr_params)
params.update(knr_params)
params.update(dtr_params)
params

{'lrg__fit_intercept': [True, False],
 'rgr__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
 'rgr__solver': ['auto',
  'svd',
  'cholesky',
  'lsqr',
  'sparse_cg',
  'sag',
  'saga'],
 'lsr__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
 'lsr__selection': ['cyclic', 'random'],
 'knr__n_neighbors': array([2, 3, 4, 5, 6, 7, 8, 9]),
 'knr__weights': ['uniform', 'distance'],
 'knr__metric': ['euclidean', 'manhattan', 'minkowski'],
 'dtr__criterion': ['squared_error',
  'friedman_mse',
  'absolute_error',
  'poisson'],
 'dtr__splitter': ['best', 'random']}

### [5] 하이퍼파라미터 튜닝

In [None]:
rcv = RandomizedSearchCV(
    estimator=vo, 
    param_distributions=params, 
    n_iter=get_max_iter(), 
    n_jobs=get_n_jobs(),
    random_state=get_random_state(), 
    cv=5)

rcv.fit(x_train, y_train)

result_df = DataFrame(rcv.cv_results_['params'])
result_df['mean_test_score'] = rcv.cv_results_['mean_test_score']
result_df.sort_values(by='mean_test_score', ascending=False)

result_df

### [6] 성능평가

## #04. 모듈화 기능 확인