## k-fold

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

x_data = np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4]
])

In [4]:
kf = KFold(n_splits=5)

In [5]:
for train_index, test_index in kf.split(x_data):
    print('trian_index :', train_index)
    print('test_index :', test_index)

trian_index : [ 3  4  5  6  7  8  9 10 11]
test_index : [0 1 2]
trian_index : [ 0  1  2  6  7  8  9 10 11]
test_index : [3 4 5]
trian_index : [ 0  1  2  3  4  5  8  9 10 11]
test_index : [6 7]
trian_index : [ 0  1  2  3  4  5  6  7 10 11]
test_index : [8 9]
trian_index : [0 1 2 3 4 5 6 7 8 9]
test_index : [10 11]


#### K=Fold 교차검증 -> 보통 회귀 문제에서 사용됨
- 학습 데이터와 테스트 에이터를 k개의 세트로 나누어 검증하는 방법
- 데이터셋이 굉장히 적을 때 훈련데이터를 어떻게든 최대한 늘려보려고 사용되기도 하는 방법
- 여러 개의 훈련 테스트 짝으로 검증과정을 거침

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### 2. 데이터 수집

In [8]:
x_data = np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4]
])

y_data = np.array([3, 5, 7, 10, 12, 7, 13, 13, 12, 13, 12, 6])

### 3. 데이처 전처리

### 4. EDA

### 5~7 모델링(모델선택, 학습, 평가)

In [9]:
lr = LinearRegression()

In [12]:
train_scores = []
test_scores = []

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x_data):  # 5번 검증
    x_train = np.array(x_data)[train_index]
    y_train = np.array(y_data)[train_index]
    x_test = np.array(x_data)[test_index]
    y_test = np.array(y_data)[test_index]

    model_kf = LinearRegression()
    model_kf.fit(x_train, y_train)
    
    score= model_kf.score(x_train, y_train)   #R2
    train_scores.append(score)
    
    score= model_kf.score(x_test, y_test)     #R2
    test_scores.append(score)

In [13]:
train_scores

[0.9522707858769932,
 0.9469593697441799,
 0.9446524178499608,
 0.9232432525564045,
 0.9166499001004778]

In [14]:
test_scores

[-1.1475590101753324,
 0.56847222331606,
 0.0,
 -11.7747639790487,
 0.9602035173350366]

In [15]:
print(np.array(train_scores).mean())

0.9367551452256032


In [16]:
print(np.array(test_scores).mean())

-2.278729449714587


#### cross_validation

In [24]:
from sklearn.model_selection import cross_validate

In [25]:
model = LinearRegression()

In [26]:
cv_results= cross_validate(model, x_data, y_data)

In [27]:
print(cv_results['test_score'].mean())

-2.278729449714587


In [28]:
df = pd.DataFrame(cv_results)
df= df.sort_values(by='test_score', ascending=False)

In [29]:
df

Unnamed: 0,fit_time,score_time,test_score
4,0.0,0.001,0.960204
1,0.001,0.0,0.568472
2,0.001,0.0,0.0
0,0.001001,0.0,-1.147559
3,0.000999,0.0,-11.774764


#### cross_val_score

In [30]:
from sklearn.model_selection import cross_val_score

In [31]:
model = LinearRegression()
model.fit(x_data,y_data)

LinearRegression()

In [32]:
cv_score =  cross_val_score(model,x_data,y_data, cv=5)

In [33]:
print(' cross_val_score :',  cv_score.mean())

 cross_val_score : -2.278729449714587


## 분류

In [35]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [37]:
x_data = np.array([
    [2,1],
    [3,2],
    [3,4],
    [5,5],
    [7,5],
    [2,5],
    [8,9],
    [9,10],
    [6,12],
    [9,2],
    [6,10],
    [2,4]
])
y_data = np.array([2, 2, 2, 1, 1, 2, 0, 0, 0, 1, 0, 2])


label =['A','B','C']

In [38]:
model = LogisticRegression()

In [42]:
# 내부적으로 알아서 학습(fit)이 됨
cv_results = cross_validate(model, x_data, y_data, return_estimator=True)



In [43]:
print(cv_results['test_score'].mean())

0.9333333333333332


In [44]:
df= pd.DataFrame(cv_results)
df = df.sort_values(by='test_score', ascending=False)

In [45]:
df

Unnamed: 0,fit_time,score_time,estimator,test_score
1,0.003,0.0,LogisticRegression(),1.0
2,0.004,0.0,LogisticRegression(),1.0
3,0.002999,0.001,LogisticRegression(),1.0
4,0.002999,0.0,LogisticRegression(),1.0
0,0.002506,0.001003,LogisticRegression(),0.666667


### 계층적 k-겹 교차검증(Stratified k-fold cross validation)

- 분류 모델에 적용
- k-겹 교차검증 모델은 k-fold가 원본 데이터 집합의 레이블 분포를 학습 및 검증 데이터 세트에 제대로 분배하지 못하는 문제를 해결해줌
- target값(정답값)=레이블/클래스의 속성값의 개수를 골고루 넣어주게 됨

In [50]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [53]:
from sklearn.datasets import load_iris

In [54]:
iris = load_iris()

In [55]:
x =iris.data
y= iris.target

In [56]:
skf = StratifiedKFold(n_splits=5, random_state =42, shuffle=True)

In [61]:
idx_iter = 0
cv_accuray = []

for train_index, test_index in skf.split(x,y):
    
    #split으로 반환된 인덱스를 이용하여 학습데이터, 테스트데이터를 정의
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #학습
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    
    idx_iter += 1
    accuracy = np.round(accuracy_score(y_test, pred),4)
    train_size = x_train.shape[0]
    test_size = x_test.shape[0]
    
    print('{0}번째 교차 검증 정확도 : {1} \n 학습 데이터 크기 : {2} \n 검증 데이터 크기 : {3}'.format(idx_iter,
                                                                               accuracy,
                                                                              train_size,
                                                                              test_size))
    
    cv_accuray.append(accuracy)


1번째 교차 검증 정확도 : 1.0 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
2번째 교차 검증 정확도 : 0.9667 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
3번째 교차 검증 정확도 : 0.9333 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
4번째 교차 검증 정확도 : 1.0 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30
5번째 교차 검증 정확도 : 0.9333 
 학습 데이터 크기 : 120 
 검증 데이터 크기 : 30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt