#### 보스턴 집갑 예측 모델
- DataSet : boston.csv
- 학습방법 : 지도학습 >> 회귀
- 피처/독립 : 13개
- 타겟/종속 : 13개

[1] 데이터 준비

In [1]:
# 모듈로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# 데이터
DATA_FILE = r"C:\Hwan\ML_Work\D0830\boston.csv"

In [4]:
# CSV => DataFrame
dataDF = pd.read_csv(DATA_FILE)
dataDF.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6


In [5]:
# 데이터 기본 정보 확인
dataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


[2] 전처리
- [2-1] Data 정제  

#### 결측치, 중복값, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크

In [7]:
# 결측치 확인
dataDF.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

- [2-2] 표준화 & 정규화 ====> 잔행 여부에 따라 성능의 변화는 경우에 따라 다름 
   * 정규분포 데이터셋을 기반으로 한 모델 ==> StandardScaler, Log 변환, MinMaxScaler
   * Feature의 값의 범위 차이를 줄이기 ==> 피처 스케일링, MinMaxScaler, RobustScaler . . . 
   * 범주형 피쳐 ==> 수치화 인코딩 OneHotEncoder, OrdinalEncoder
   * 글자 or 문자열 타겟 ==> 정수, 라벨 인코딩 LabelEncoder
   *   ===> 이것들을 수행한다고 성능이 확 좋아지진 않음 (좋아지는 경우도 존재함)
   

[2-3] 피처와 타겟 분리

In [10]:
featureDF = dataDF.iloc[:,:-1]
featureDF

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48


In [9]:
targerSR = dataDF["MEDV"]
targerSR

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [11]:
print(f"featureDF : {featureDF.shape} taergerSR : {targerSR.shape}")

featureDF : (506, 13) taergerSR : (506,)


[3] 학습준비
 

[3-1] 학습용 데이터셋과 테스트용 데이터셋 분리

In [12]:
X_train, X_test, y_train, y_test = train_test_split(featureDF, targerSR, random_state=10)

In [13]:
print(f"X_train : {X_train.shape}  y_train : {y_train.shape}")
print(f"X_test : {X_test.shape}  y_test : {y_test.shape}")

X_train : (379, 13)  y_train : (379,)
X_test : (127, 13)  y_test : (127,)


[3-2] 학습용 데이터셋으로 스케일러 생성

In [14]:
## 수치피처 값의 범위 차가 큰 상황 -> Scaling(스케일링) 진행 
ssScaler = StandardScaler()
ssScaler.fit(X_train)

In [16]:
X_train_scaled = ssScaler.transform(X_train)
X_test_scaled = ssScaler.transform(X_test)

[4] 학습진행 => 교차검증으로 진행

In [18]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

In [20]:
# 모델이 있어야함
# 모델 인스턴스 생성
ridge_model = Ridge(alpha = 1.0) # alpha는 1.0이 기본 값

In [23]:
# 학습진행
# - CV: 3개
# - scoring : "neg_mean_squared_error", "r2"
# - return_train_score
result = cross_validate(ridge_model, X_train_scaled, y_train, cv=3, scoring=["neg_mean_squared_error", "r2"], return_train_score= True)

In [24]:
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.035321,0.001012,-17.320297,-20.143636,0.748283,0.755663
1,0.000999,0.0,-22.582566,-18.210772,0.756292,0.740039
2,0.000992,0.0,-22.657585,-17.293662,0.680991,0.786097


In [29]:
## 모델의 성능을 좌우하는 Hyper_parameter 제어 ==> 튜닝
alpha_value = [0., 1., 10, 100]

for value in alpha_value :
    ridge_model = Ridge(alpha = value)
    result = cross_validate(ridge_model, X_train_scaled, y_train, cv=3, scoring=["neg_mean_squared_error", "r2"], return_train_score= True, return_estimator=True)

resultDF = pd.DataFrame(result)

print(f"[Ridge(alpha = {value})]")
print(resultDF, end = "\n\n")

[Ridge(alpha = 100)]
   fit_time  score_time         estimator  test_neg_mean_squared_error  \
0  0.000999    0.001002  Ridge(alpha=100)                   -18.988666   
1  0.001000    0.000000  Ridge(alpha=100)                   -25.390202   
2  0.001000    0.001000  Ridge(alpha=100)                   -26.468497   

   train_neg_mean_squared_error   test_r2  train_r2  
0                    -24.050879  0.724036  0.708269  
1                    -21.952310  0.725993  0.686628  
2                    -20.660563  0.627335  0.744452  



In [30]:
resultDF = pd.DataFrame(result)[["test_r2", "train_r2"]]

In [32]:
resultDF["diff"] = abs(resultDF["test_r2"]-resultDF["train_r2"])
print(result["estimator"][0].coef_)
print(f"[Ridge(alpha={value})]")
print(resultDF, end ="\n\n")

[-0.78141029  0.70910255 -0.46407849  0.72503917 -0.69294458  2.41757287
 -0.24148703 -1.21831206  0.28616643 -0.63423538 -1.31602563  0.78528977
 -2.39571659]
[Ridge(alpha=100)]
    test_r2  train_r2      diff
0  0.724036  0.708269  0.015767
1  0.725993  0.686628  0.039365
2  0.627335  0.744452  0.117117



In [None]:
# 릿지 대신에 라소를 입력해도 됨
# 라소를 입력하면  필요없는 것은 0으로 만듦
# 0으로 만드는 것은 위험함 ==> 릿지를 많이 사용 

###===> 이제껏 한게 하이퍼파라미터의 튜닝

[Lasso로 튜닝]

In [34]:
from sklearn.linear_model import Lasso

alpha_value = [0., 1., 10, 100]

for value in alpha_value :
    ridge_model = Lasso(alpha = value, max_iter = 3)
    result = cross_validate(ridge_model, X_train_scaled, y_train, cv=3, scoring=["neg_mean_squared_error", "r2"], return_train_score= True, return_estimator=True)

resultDF = pd.DataFrame(result)

print(f"[Ridge(alpha = {value})]")
print(resultDF, end = "\n\n")

resultDF = pd.DataFrame(result)[["test_r2", "train_r2"]]

resultDF["diff"] = abs(resultDF["test_r2"]-resultDF["train_r2"])
print(result["estimator"][0].coef_)
print(f"[Ridge(alpha={value})]")
print(resultDF, end ="\n\n")


[Ridge(alpha = 100)]
   fit_time  score_time                     estimator  \
0  0.001000    0.000000  Lasso(alpha=100, max_iter=3)   
1  0.000998    0.001002  Lasso(alpha=100, max_iter=3)   
2  0.000000    0.001000  Lasso(alpha=100, max_iter=3)   

   test_neg_mean_squared_error  train_neg_mean_squared_error   test_r2  \
0                   -68.809168                    -82.441875 -0.000010   
1                   -94.035455                    -70.051907 -0.014817   
2                   -72.336970                    -80.848183 -0.018473   

   train_r2  
0       0.0  
1       0.0  
2       0.0  

[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[Ridge(alpha=100)]
    test_r2  train_r2      diff
0 -0.000010       0.0  0.000010
1 -0.014817       0.0  0.014817
2 -0.018473       0.0  0.018473



  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


- Hyper-parameter 튜닝과 교차 검증을 동시에 진행

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
# Ridge의 Hyper-paramter 값 설정
params = {"alpha":[0.,0.1, 0.5, 1.0], "max_ilter":[3,5]}

# 결과 : 0.,3 -> Model, 0.,5 -> Model 총 2개
# 결과 : 0.1, 3 -> Model, 0.1,5 -> Model 총 2개
# 결과 : 0.5, 3 -> Model, 0.5, 5 -> Model 총 2개
# 결과 : 1.0, 3 -> Model, 1.0, 5 -> Model 총 2개
# 총 8개의 model 생성 

In [39]:
# 인스턴스 생성
rModel = Ridge()

# GridSearchCV 인스턴스 생성
serchCV = GridSearchCV(rModel, params, cv = 3, verbose = True, return_train_score= True) # 기본 cv = 5

In [40]:
# 학습 진행
serchCV.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


ValueError: Invalid parameter 'max_ilter' for estimator Ridge(alpha=0.0). Valid parameters are: ['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'positive', 'random_state', 'solver', 'tol'].

In [42]:
# fit() 진행 후 모델 파라미터 확인
serchCV.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [43]:
serchCV.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [44]:
serchCV.best_index_

AttributeError: 'GridSearchCV' object has no attribute 'best_index_'

In [45]:
bestModel = serchCV.best_estimator_
resultDF = pd.DataFrame(serchCV.cv_results_)
resultDF

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'