## CHAPTER 5.  모델 평가와 성능 향상

In [1]:
%matplotlib inline
import sys 
sys.path.append('..')
from preamble import *

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a toy dataset with make_blobs; the seed is fixed so the run is repeatable.
X, y = make_blobs(random_state=0)
print("X.shape:", X.shape)
print("y.shape:", y.shape)

# Hold out part of the samples as a test set (the split is seeded as well).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Create the classifier first, then train it on the training portion only.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Score the fitted model on the held-out samples and report it with two decimals.
test_score = logreg.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_score))

X.shape: (100, 2)
y.shape: (100,)
Test set score: 0.88


### 5.1 교차 검증
- 교차 검증
  - 데이터를 여러 번 반복해서 나누어 모델 학습
- K-Fold cross-validation
  - Fold: 원본 데이터에 대한 부분 집합
  - K로는 5나 10을 주로 사용
    - 첫번째 모델은 첫번째 fold를 테스트 데이터로 사용하고 나머지를 훈련 데이터로 사용
    - 두번째 모델은 두번째 fold를 테스트 데이터로 사용하고 나머지를 훈련 데이터로 사용
    - 세번째 모델은...

#### 5.1.1 scikit-learn의 교차 검증
- scikit-learn의 교차 검증
  - model_selection.cross_val_score(estimator, X, y=None, cv=None) 함수 사용
    - estimator
      - estimator object implementing ‘fit’
      - The object to use to fit the data.
    - X
      - The data to fit.
    - y
      - The target variable to try to predict in the case of supervised learning.
    - cv
      - K-Fold의 K값 (기본 값: 3, scikit-learn 0.22 이상에서는 5)

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the iris dataset and show the shapes of its data and target arrays.
iris = load_iris()
print("iris.data.shape:", iris.data.shape)
print("iris.target.shape:", iris.target.shape)

# The estimator is passed unfitted; cross_val_score takes care of fitting it.
logreg = LogisticRegression()

# Request 3 folds explicitly instead of relying on the library default:
# the default changed from 3 to 5 in scikit-learn 0.22, so an implicit `cv`
# would yield a different number of scores depending on the installed version.
scores = cross_val_score(logreg, iris.data, iris.target, cv=3)
print("Cross-validation scores: {}".format(scores))

iris.data.shape: (150, 4)
iris.target.shape: (150,)
Cross-validation scores: [0.961 0.922 0.958]


In [3]:
# Repeat the evaluation with five folds; `scores` is overwritten with the new results.
scores = cross_val_score(
    logreg,
    iris.data,
    iris.target,
    cv=5,
)
print("Cross-validation scores: {}".format(scores))

Cross-validation scores: [1.    0.967 0.933 0.9   1.   ]


In [4]:
# Collapse the per-fold scores into a single summary number.
mean_cv_score = scores.mean()
print("Average cross-validation score: {:.2f}".format(mean_cv_score))

Average cross-validation score: 0.96
