# 붓꽃 품종 분류(Stratified KFold 모델)

In [9]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import accuracy_score 
import pandas as pd 
import numpy as np 

In [10]:
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()

# Keep the feature array and the label array under their own names.
iris_data = iris.data
iris_label = iris.target
print(iris_label)
print(iris.target_names)

# Build a DataFrame of the features and attach the labels as a 'label' column.
iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names).assign(
    label=iris_label
)
# Per-class sample counts (displayed as the cell's result).
iris_df['label'].value_counts()

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
['setosa' 'versicolor' 'virginica']


2    50
1    50
0    50
Name: label, dtype: int64

## 기존 KFold의 문제점: 데이터가 레이블 순서대로 정렬되어 있으면, 셔플 없이 순차 분할할 때 검증 폴드의 클래스가 학습 세트에 전혀 포함되지 않는다(n_splits=3인 경우 각 검증 폴드가 한 클래스 전체가 되어 accuracy가 0이 됨)

In [13]:
# A single classifier instance is refit on every fold; random_state is fixed.
dt_clf = DecisionTreeClassifier(random_state=156)

# Three folds; no shuffle argument is passed, so KFold's defaults apply.
kfold = KFold(n_splits=3)
cv_accuracy = []

# enumerate(start=1) supplies the 1-based fold counter.
for n_iter, (train_index, test_index) in enumerate(kfold.split(iris_data), start=1):
    # Slice features and labels with the index arrays produced for this fold.
    X_train = iris_data[train_index]
    X_test = iris_data[test_index]
    y_train = iris_label[train_index]
    y_test = iris_label[test_index]

    # Train on the training part and predict the held-out part.
    pred = dt_clf.fit(X_train, y_train).predict(X_test)

    # Fold accuracy rounded to four decimals, plus the sizes of both parts.
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size, test_size = len(X_train), len(X_test)

    print(n_iter, accuracy, train_size, test_size)
    print(n_iter, test_index)
    print()
    cv_accuracy.append(accuracy)

# Mean accuracy over all folds.
print(np.mean(cv_accuracy))

1 0.0 100 50
1 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49]

2 0.0 100 50
2 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
 98 99]

3 0.0 100 50
3 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149]

0.0


## StratifiedKFold로 해결: 전체 데이터의 레이블 분포와 같은 비율이 각 폴드의 학습/검증 세트에 유지되도록 분할한다

In [18]:
skf = StratifiedKFold(n_splits=3)

# The label column is passed as y to split(); the splitter needs it to keep
# the class proportions in each fold.
labels = iris_df['label']

for n_iter, (train_index, test_index) in enumerate(skf.split(iris_df, labels), start=1):
    # Labels that ended up in the training and held-out parts of this fold.
    label_train = labels.iloc[train_index]
    label_test = labels.iloc[test_index]

    # Show the per-class counts of both parts.
    print('iteration', n_iter)
    print(label_train.value_counts())
    print(label_test.value_counts())

iteration 1
2    33
1    33
0    33
Name: label, dtype: int64
2    17
1    17
0    17
Name: label, dtype: int64
iteration 2
2    33
1    33
0    33
Name: label, dtype: int64
2    17
1    17
0    17
Name: label, dtype: int64
iteration 3
2    34
1    34
0    34
Name: label, dtype: int64
2    16
1    16
0    16
Name: label, dtype: int64
