In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pylab as plt
import statsmodels.api as sm
from sklearn.metrics import classification_report,confusion_matrix

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


### Scikit-Learn의 교차 검증 기능
- data를 train set과 test set으로 단순 분리
    - data splitter
        - `train_test_split()`
- 복수의 test set 준비
    - cross validation generator
        - `KFold`
        - `LeaveOneOut`
        - `ShuffleSplit`
- 복수의 test set 사용하여 평가 과정 반복
    - cross validation calculator
        - `cross_val_score()`

In [3]:
# Toy feature matrix: the integers 0..9 laid out as 5 samples x 2 features.
X = np.arange(10).reshape(-1, 2)

In [4]:
# Toy targets: one integer label (0..4) per row of the 5-sample X.
y = np.arange(0, 5)

In [6]:
from sklearn.model_selection import train_test_split

# Single train/test partition: test_size=0.33 holds out part of the data,
# and the fixed random_state makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [7]:
# Feature rows assigned to the training split by train_test_split.
X_train

array([[4, 5],
       [0, 1],
       [6, 7]])

In [8]:
# Feature rows held out as the test split by train_test_split.
X_test

array([[2, 3],
       [8, 9]])

In [9]:
# Targets for the training rows (same order as X_train).
y_train

array([2, 0, 3])

In [10]:
# Targets for the held-out rows (same order as X_test).
y_test

array([1, 4])

### Cross-Validation
- KFold 클래스를 비롯한 교차 검증 클래스 객체들은 Cross Validation Generator 들로서 트레이닝/테스트용 데이터 인덱스를 내보내는 split method를 제공한다.
####  K-fold CV
- K-fold CV 방법은 데이터 셋을 K개의 Sub-set으로 분리하는 방법이다.

In [11]:
# Demo data for the K-fold examples: 4 classes with N samples each.
N = 5
# 8*N consecutive integers reshaped to (4*N, 2) and scaled by 10, so every
# row is easy to recognise from its values.
X = 10 * np.arange(8 * N).reshape(-1, 2)
# Float labels 1., 2., 3., 4., each repeated N times in class order.
y = np.repeat([1.0, 2.0, 3.0, 4.0], N)
print("X:\n", X, sep="")
print("y:\n", y, sep="")

X:
[[  0  10]
 [ 20  30]
 [ 40  50]
 [ 60  70]
 [ 80  90]
 [100 110]
 [120 130]
 [140 150]
 [160 170]
 [180 190]
 [200 210]
 [220 230]
 [240 250]
 [260 270]
 [280 290]
 [300 310]
 [320 330]
 [340 350]
 [360 370]
 [380 390]]
y:
[1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 3. 3. 3. 3. 3. 4. 4. 4. 4. 4.]


In [12]:
from sklearn.model_selection import KFold

# Three shuffled folds; the fixed random_state keeps the printed index
# sets the same on every run.
cv = KFold(n_splits=3, shuffle=True, random_state=0)

# split() yields (train indices, test indices) once per fold.
for train_index, test_index in cv.split(X):
    print("test index :", test_index)
    print("." * 80)
    print("train index:", train_index)
    print("=" * 80)

test index : [ 1  6  8 10 17 18 19]
................................................................................
train index: [ 0  2  3  4  5  7  9 11 12 13 14 15 16]
test index : [ 2  4  5  7  9 13 14]
................................................................................
train index: [ 0  1  3  6  8 10 11 12 15 16 17 18 19]
test index : [ 0  3 11 12 15 16]
................................................................................
train index: [ 1  2  4  5  6  7  8  9 10 13 14 17 18 19]


##### KFold parameter descriptions
- n_splits : int, default=3 (changed to 5 in scikit-learn 0.22)
    Number of folds. Must be at least 2.
- shuffle : boolean, optional
    Whether to shuffle the data before splitting into batches.
- random_state : int, RandomState instance or None, optional, default=None
    If int, random_state is the seed used by the random number generator;
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by `np.random`. Used when ``shuffle`` == True.

### 교차 평가 시행
- Cross Validation 은 단순히 데이터 셋을 나누는 역할을 수행할 뿐이다. 실제로 모형의 성능을 구하려면 이렇게 나누어진 데이터셋을 사용하여 평가를 반복하여야 한다. 이 과정을 자동화하는 명령이 바로 `cross_val_score()`이다.

### without cross_val_score method

In [14]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic single-feature regression problem (1000 samples, noise=20).
# coef=True makes make_regression return the generating coefficient as well.
X, y, coef = make_regression(
    n_samples=1000, n_features=1, noise=20, coef=True, random_state=0
)

model = LinearRegression()
cv = KFold(n_splits=10)

# Size the score buffer from the splitter itself instead of repeating the
# literal 10: if the fold count is edited later, a hard-coded length would
# either leave trailing zeros in `scores` or raise IndexError in the loop.
scores = np.zeros(cv.get_n_splits())

# Manual cross-validation: fit on each training fold, score R^2 on the
# matching held-out fold.
for i, (train_index, test_index) in enumerate(cv.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores[i] = r2_score(y_test, y_pred)
scores

array([0.95636425, 0.94908323, 0.93880683, 0.92906829, 0.93119768,
       0.95362566, 0.93217768, 0.94308775, 0.94579305, 0.94749884])

### with cross_val_score method

In [15]:
from sklearn.model_selection import cross_val_score

# One call replaces the manual fit/predict/score loop: the same estimator,
# data and KFold splitter are reused, with R^2 as the per-fold metric.
cross_val_score(model, X, y, scoring="r2", cv=cv)

array([0.95636425, 0.94908323, 0.93880683, 0.92906829, 0.93119768,
       0.95362566, 0.93217768, 0.94308775, 0.94579305, 0.94749884])

## Practicing CV from scikit-learn.org

#### Cross-validation iterators for i.i.d. data
- Assuming that some data is independent and identically distributed (i.i.d.) means assuming that all samples stem from the `same generative process` and that this generative process has no memory of `past generated` samples.

### K-fold
- KFold divides all the samples into k groups of samples, called folds, of equal sizes if possible (if k = n, this is equivalent to the Leave One Out strategy). The prediction function is learned using k-1 folds, and the fold left out is used for testing.

In [16]:
import numpy as np
from sklearn.model_selection import KFold

# Four samples split into two folds; each iteration prints the train
# indices followed by the test indices for that fold.
X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s,%s" % (train, test))

[2 3],[0 1]
[0 1],[2 3]


### Computing cross-validated metrics
- The simplest way to use cross-validation is to call the `cross_val_score` helper function on the estimator and the dataset.