<a href="https://colab.research.google.com/github/johyunkang/MLwithPythonCookbook/blob/main/11_%EB%AA%A8%EB%8D%B8%ED%8F%89%EA%B0%80.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 11 모델평가

## 11.1 교차검증 모델 만들기

In [15]:
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# Feature matrix and target vector.
features = digits.data
target = digits.target

# Show the shape and the first three rows/labels of each.
print('features shape:', features.shape)
print('features sample:', features[:3])
print('\n\ntarget shape:', target.shape)
print('target sample:', target[:3])

# Standardisation followed by logistic regression, chained into a single
# pipeline so that both steps are evaluated together as one estimator.
scaler = StandardScaler()
lr = LogisticRegression()
pipeline = make_pipeline(scaler, lr)

# Two 10-fold splitters, both shuffled with the same seed: a plain K-fold
# and a stratified K-fold.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Score the pipeline with each splitter in turn. `cv_result` is rebound on
# every pass, so after the loop it holds the StratifiedKFold scores.
for heading, splitter in (
    ('\n\n평균 계산(KFold):', kf),
    ('\n\n평균 계산(StratifiedKFold):', skf),
):
    cv_result = cross_val_score(
        pipeline,            # estimator under evaluation
        features,            # feature matrix
        target,              # target vector
        cv=splitter,         # cross-validation strategy
        scoring='accuracy',  # evaluation metric
        n_jobs=-1,           # use every available CPU core
    )

    # Mean accuracy across folds, followed by the per-fold scores.
    print(heading, cv_result.mean())
    print(cv_result)

features shape: (1797, 64)
features sample: [[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]
 [ 0.  0.  0. 12. 13.  5.  0.  0.  0.  0.  0. 11. 16.  9.  0.  0.  0.  0.
   3. 15. 16.  6.  0.  0.  0.  7. 15. 16. 16.  2.  0.  0.  0.  0.  1. 16.
  16.  3.  0.  0.  0.  0.  1. 16. 16.  6.  0.  0.  0.  0.  1. 16. 16.  6.
   0.  0.  0.  0.  0. 11. 16. 10.  0.  0.]
 [ 0.  0.  0.  4. 15. 12.  0.  0.  0.  0.  3. 16. 15. 14.  0.  0.  0.  0.
   8. 13.  8. 16.  0.  0.  0.  0.  1.  6. 15. 11.  0.  0.  0.  1.  8. 13.
  15.  1.  0.  0.  0.  9. 16. 16.  5.  0.  0.  0.  0.  3. 13. 16. 16. 11.
   5.  0.  0.  0.  0.  3. 11. 16.  9.  0.]]


target shape: (1797,)
target sample: [0 1 2]


평균 계산(KFold): 0.9693916821849783
[0.97777778 0.98888889 0.96111111 0.94444444 0.97777778 0.98333333
 0.95555556

## 11.2 기본 회귀 모델 만들기 

과제: 다른 모델과 비교하기 위해 기본 회귀 모델을 만들고 싶습니다.

해결 : `DummyRegressor`를 사용하여 기본 모델로 사용할 간단한 더미 모델을 만듭니다.

In [20]:
import warnings

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch

# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices from libraries.
warnings.filterwarnings(action='ignore')
# warnings.filterwarnings(action='default')  # uncomment to show warnings again

try:
    from sklearn.datasets import load_boston
except ImportError:
    # `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Provide a drop-in replacement so this cell keeps producing the same
    # data (and therefore the same scores) on current releases.
    def load_boston():
        """Return the Boston housing data as a Bunch with `data` and `target`.

        Rebuilds the arrays from the original CMU StatLib file, following the
        recipe given in scikit-learn's deprecation notice. Requires network
        access.
        """
        url = 'http://lib.stat.cmu.edu/datasets/boston'
        # The first 22 lines are a free-text description. After that, each
        # record is wrapped over two physical lines.
        raw = pd.read_csv(url, sep=r'\s+', skiprows=22, header=None).values
        # Even rows carry the first 11 features; odd rows carry the last two
        # features followed by the target value.
        data = np.hstack([raw[::2, :], raw[1::2, :2]])
        target = raw[1::2, 2]
        return Bunch(data=data, target=target)

boston = load_boston()

features, target = boston.data, boston.target
print('features shape:', features.shape)
print('features sample:', features[:3])
print('\n\ntarget shape:', target.shape)
print('target sample:', target[:3])

# Hold out a test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=0)

# Baseline regressor configured with the 'mean' strategy.
dummy = DummyRegressor(strategy='mean')

# Fit the baseline on the training split.
dummy.fit(x_train, y_train)

# Score the baseline on the held-out split (R^2).
r2 = dummy.score(x_test, y_test)
print('R2 score:', r2)

features shape: (506, 13)
features sample: [[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00 1.0000e+00 2.9600e+02 1.5300e+01 3.9690e+02
  4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
  9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 7.1850e+00
  6.1100e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9283e+02
  4.0300e+00]]


target shape: (506,)
target sample: [24.  21.6 34.7]
R2 score: -0.001119359203955339


다른 모델을 훈련하고 평가하여 성능점수 비교

In [21]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split and report its R^2 on the
# held-out split, for comparison against the dummy baseline's score.
ols = LinearRegression().fit(x_train, y_train)
r2_ols = ols.score(x_test, y_test)
print('r2 score:', r2_ols)

r2 score: 0.635463843320211


## 11.4 이진 분류기의 예측 평가하기 

In [None]:
# p.288