### Ridge, Lasso, ElasticNet

#### 보스턴 집값 예측
- 데이터셋: boston.csv
- 학습방법: 지도학습 >> 회귀
- 피쳐/독립: 13개
- 타겟/종속: 1개 (MEDV)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# Location of the Boston housing CSV, relative to this notebook
DATA_PATH = '../data/boston.csv'

# Read the file into a DataFrame and preview its first three rows
dataDF = pd.read_csv(filepath_or_buffer=DATA_PATH)
dataDF.head(n=3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7


In [3]:
# Check basic dataset information (entry count, per-column non-null counts and dtypes)
dataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


#### 전처리
* [2_1] 데이터 정제: 결측치, 중복값, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크<br>
* [2_2] 데이터 스케일링: 표준화, 정규화, 인코딩		=> 적용 여부에 따른 성능 변화는 데이터와 모델에 따라 다름!!!
	- 정규분포 데이터셋을 기반으로 한 모델 => StandardScaler, Log 변환...
	- 피쳐의 값의 범위 차이를 줄이기	   => 피쳐 스케일링, MinMaxScaler, RobustScaler....
	- 범주형 피쳐 => 수치화 인코딩 OneHotEncoder, OrdinalEncoder
	- 문자열 타겟 => 정수 라벨인코딩 LabelEncoder




##### 수치 피쳐 스케일링 w/ StandardScaler

In [5]:
# Split the frame into features (X) and target (y).
# Both are selected via the target's column name rather than by position, so a
# reordered or extended CSV cannot silently leak MEDV into the features or
# drop a real feature.
featureDF = dataDF.drop(columns='MEDV')
targetSR = dataDF['MEDV']

# Sanity check: feature matrix and target vector must have the same row count
print(f'featureDF: {featureDF.shape}\ttargetSR: {targetSR.shape}')

featureDF: (506, 13)	targetSR: (506,)


In [6]:
# Split into training and test sets.
# test_size is left at train_test_split's default; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, random_state=10)

# Report the shape of each split. The second line describes the TEST split
# (it was previously mislabelled as X_train / y_train).
print(f'X_train:\t{X_train.shape}\t\ty_train:\t{y_train.shape}')
print(f'X_test:\t{X_test.shape}\t\ty_test:\t{y_test.shape}')

X_train:	(379, 13)		y_train:	(379,)
X_train:	(127, 13)		y_train:	(127,)


In [7]:
# Build the scaler from the training split only, so the test split never
# influences the preprocessing.
ssScaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    ssScaler.transform(split) for split in (X_train, X_test)
)

#### 교차검증 학습 진행

In [8]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

In [25]:
### Hyper-parameter tuning: compare several Ridge regularisation strengths
alpha_values = [0., 1., 10, 100]  # Ridge's default alpha is 1.0

# Evaluation setup used for every alpha:
# - cv: 3 folds
# - scoring: 'neg_mean_squared_error' and 'r2'
# - return_train_score: keep train scores so they can be compared with test scores
# - return_estimator: keep each fold's fitted model so its coefficients can be printed
for value in alpha_values:

    # Fresh model instance for this alpha.
    # NOTE(review): max_iter caps the number of solver iterations; whether
    # Ridge's default solver actually honours it should be confirmed.
    ridge_model = Ridge(alpha=value, max_iter=5)

    result = cross_validate(
        estimator=ridge_model,
        X=X_train_scaled,
        y=y_train,
        scoring=['neg_mean_squared_error', 'r2'],
        cv=3,
        return_train_score=True,
        return_estimator=True,
    )
    print("="*60)
    print()

    # Per-fold R2 on held-out vs. training data, plus their absolute gap
    resultDF = pd.DataFrame({name: result[name] for name in ("test_r2", "train_r2")})
    resultDF['diff'] = (resultDF['test_r2'] - resultDF['train_r2']).abs()

    # Fold with the smallest train/test gap: first label after an ascending sort on diff
    best_model_idx = resultDF.sort_values(by='diff').index[0]

    # Coefficients of the model fitted on that fold
    print(result['estimator'][best_model_idx].coef_)

    print(f'\n[alpha= {value}] 모델 결과: \n{resultDF}\n\n' )
    print("="*60)

dict_keys(['fit_time', 'score_time', 'estimator', 'test_neg_mean_squared_error', 'train_neg_mean_squared_error', 'test_r2', 'train_r2'])

[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
[alpha= 0.0] 모델 결과: 
    test_r2  train_r2      diff
0  0.747022  0.755720  0.008699
1  0.756482  0.740082  0.016400
2  0.680801  0.786156  0.105355


dict_keys(['fit_time', 'score_time', 'estimator', 'test_neg_mean_squared_error', 'train_neg_mean_squared_error', 'test_r2', 'train_r2'])

[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[alpha= 1.0] 모델 결과: 
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106


dict_keys(['fit_time', 'score_time', 'estimator', 'test_neg_mean_squared_error', 'train_neg_mean_squar

#### GridSearchCV
* 하이퍼 파라미터 튜닝 & 교차 검증 동시 진행
	- Dictionary 형태로 하이퍼-파라미터 선언

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Hyper-parameter grid for Ridge.
# Every (alpha, max_iter) pair becomes one candidate:
#   (0, 3), (0, 5), (0.1, 3), (0.1, 5), ..., (1.0, 3), (1.0, 5)
# i.e. 4 alphas x 2 max_iter values = 8 Ridge models.
params = dict(
    alpha=[0, 0.1, 0.5, 1.0],
    max_iter=[3, 5],
)

In [30]:
# Base estimator; the grid in `params` supplies alpha / max_iter for each candidate
rModel = Ridge()

# 3-fold grid search over `params`.
# - verbose=True: print progress messages while fitting
# - return_train_score=True: also record training-fold scores in the results
searchCV = GridSearchCV(
    estimator=rModel,
    param_grid=params,
    cv=3,
    verbose=True,
    return_train_score=True,
)

In [31]:
# Run the grid search on the scaled training data
searchCV.fit(X_train_scaled, y_train)

# Progress message printed by this run: Fitting 3 folds for each of 8 candidates, totalling 24 fits

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [32]:
# Inspect the hyper-parameter combination selected by the search (populated by .fit())
searchCV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [34]:
# Keep a handle on the estimator the search selected as best, and display it
best_model = searchCV.best_estimator_
best_model

In [36]:
# Tabulate the grid-search results: one row per candidate, with per-split
# and aggregated train/test scores as columns
Ridge_resultDF = pd.DataFrame(data=searchCV.cv_results_)
Ridge_resultDF.head(n=3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001821,0.000226,0.000667,0.0004713215,0.0,3,"{'alpha': 0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.000999,4e-06,0.000335,0.0004735058,0.0,5,"{'alpha': 0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.000997,1e-06,0.000996,5.15043e-07,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
