# iris data 이용해 교차 검증 
- sklearn.model_selection.KFold

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

In [65]:
# iris 데이터 로드

from sklearn.datasets import load_iris
iris = load_iris()
features = iris.data # 독립변수 값 (feature 값)
label = iris.target # 종속변수 값 (label 값)
dt_clf = DecisionTreeClassifier(random_state=156) # DecisionTreeClassifier 객체 생성

# KFold

## 방법 1

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=11)

kfold = KFold(n_splits=5) # 기본값 5
cv_accuracy = []

# 여기서 적힌 test는 실제 test가 아닌 validation 검증을 위한 것 ! 
for i,j in kfold.split(X_train) :
#     print(f'train set : {i}') # 인덱스 번호를 줌
#     print(f'validation set : {j}')
    train_X, test_X = X_train[i], X_train[j] # 인덱스가 아닌 실제값으로 넣어주는 작업
    train_y, test_y = y_train[i], y_train[j] 
    # 학습 및 예측
    dt_clf.fit(train_X, train_y)
    pred = dt_clf.predict(test_X)
    accuracy = np.round(accuracy_score(test_y, pred), 4)
    cv_accuracy.append(accuracy)
print(cv_accuracy)
print(np.mean(cv_accuracy))
    

[0.875, 0.9583, 1.0, 0.9167, 0.9583]
0.94166


## 방법2
- 3개로 나누기 때문에 검증데이터에 0,1,2가 모두 포함되지 않을 수 있기 때문에 accuracy = 0

In [50]:
from sklearn.model_selection import train_test_split
kfold = KFold(n_splits=3) 
cv_accuracy = []
n_iter = 0

for train_index  , test_index in kfold.split(features):
#     print(f"train set :{i}")
#     print(f"validation set :{j}")
#     print('\n')
    n_iter +=1
    X_train , X_test = features[train_index] , features[test_index]
    y_train , y_test = label[train_index] , label[test_index]
    
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = np.round(accuracy_score(y_test,pred), 4)
    print(f'\n{n_iter}번째 정확도 {accuracy}  , 학습데이터 크기 {X_train.shape[0]}, 검증데이터 크기 {X_test.shape[0]}')
    print(f'\n {n_iter}번째 검증데이터 인덱스 {test_index} ')
    cv_accuracy.append(accuracy)
    
# print(accuracy)
# print(cv_accuracy)
# np.mean(cv_accuracy)


1번째 정확도 0.0  , 학습데이터 크기 100, 검증데이터 크기 50

 1번째 검증데이터 인덱스 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49] 

2번째 정확도 0.0  , 학습데이터 크기 100, 검증데이터 크기 50

 2번째 검증데이터 인덱스 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
 98 99] 

3번째 정확도 0.0  , 학습데이터 크기 100, 검증데이터 크기 50

 3번째 검증데이터 인덱스 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 136 137 138 139 140 141 142 143 144 145 146 147 148 149] 


# KFold vs Stratified K fold
- KFold의 순서대로 뽑는 단점을 보완하기 위해 Stratified K fold 이용
- iris 데이터 처럼 y값이 0,0,0,0,1,1,1,2,2,2 순서가 있으면 정확한 예측 못 하므로 ! 
- Stratified K fold 사용하면 0에서 몇개 1에서 몇개 2에서 몇개 골고루 뽑아줌 !

# Stratified K fold
- sklearn.model_selection.StratifiedKFold
- class sklearn.model_selection.StratifiedKFold(n_splits=5, *, shuffle=False, random_state=None)

In [78]:
from sklearn.model_selection import StratifiedKFold

sfk = StratifiedKFold(n_splits=3)
cv_accuracy = []
n_iter = 0

for train_index, test_index in sfk.split(features, label) :
    n_iter +=1
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accuracy = np.round(accuracy_score(y_test,pred), 4)
    cv_accuracy.append(accuracy)
    
print(accuracy)
print(cv_accuracy)
np.mean(cv_accuracy)


0.98
[0.98, 0.94, 0.98]


0.9666666666666667

# cross_val_score 한줄로
- sklearn.model_selection.cross_val_score
- sklearn.model_selection.cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=nan)
- estimator : 알고리즘 / cv : 교차검증 몇 겹으로 할 것인지  

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris

iris_data = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

features = iris_data.data
label = iris_data.target

# 성능 지표는 정확도, 교차 검증 세트는 3개
# dt_clf.fit도 여기서 해 줌
# predict 하는 것은 검증 데이터에 대한 것이고 test에 대한 것 아님
scores = cross_val_score(dt_clf, features, label, scoring='accuracy', cv = 3)
print('교차 검증별 정확도 :', np.round(scores, 4))
print('평균 검증 정확도 :', np.round(np.mean(scores), 4))

교차 검증별 정확도 : [0.98 0.94 0.98]
평균 검증 정확도 : 0.9667


# GridSearchCV
- cv 값을 이용해 최적의 하이퍼파라미터를 찾아주는 것
- sklearn.model_selection.GridSearchCV
- class sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)

In [84]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# 데이터 로딩하고 학습데이터와 테스트 데이터 분리
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.2,random_state=121)

dtree = DecisionTreeClassifier()

parameters = {'max_depth' : [1,2,3], 'min_samples_split' : [2,3]} # dict 형태여야 함

import pandas as pd

grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)

# 붓꽃 Train 데이터로 param_grid의 하이퍼 파라미터들을 순차적으로 학습/평가
grid_dtree.fit(X_train, y_train)
# GridSearchCV 결과 추출하여 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',\
    'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [88]:
# GridSearchCV의 refit으로 이미 학습이 된 estimator 반환
estimator = grid_dtree.best_estimator_

# GridSearchCV의 best_estimator_는 이미 최적 하이퍼 파라미터로 학습이 됨
pred = estimator.predict(X_test)
print('테스트 데이터 세트 정확도 : {0:.4f}'.format(accuracy_score(y_test,pred)))

테스트 데이터 세트 정확도 : 0.9667


# 분류용 가상 데이터 생성 (make_classification)
- sklearn.datasets.make_classification
- sklearn.datasets.make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

In [127]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

In [134]:
X, y = make_classification(n_samples=300,
                           n_features=2, 
                           n_informative=2,
                           n_redundant=0, 
                           n_clusters_per_class=1, 
                           random_state=42,
                           n_classes=3
                          )
print(len(X))

300


## 위 iris 데이터 이용해서 한 걸 수정해서 가져온 것 


In [135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# 데이터 로딩하고 학습데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=121)

dtree = DecisionTreeClassifier()

parameters = {'max_depth' : [1,2,3], 'min_samples_split' : [2,3]}

import pandas as pd

grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)

# 붓꽃 Train 데이터로 param_grid의 하이퍼 파라미터들을 순차적으로 학습/평가
grid_dtree.fit(X_train, y_train)
# GridSearchCV 결과 추출하여 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',\
    'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.625,5,0.6125,0.625,0.6375
1,"{'max_depth': 1, 'min_samples_split': 3}",0.625,5,0.6125,0.625,0.6375
2,"{'max_depth': 2, 'min_samples_split': 2}",0.891667,3,0.8875,0.875,0.9125
3,"{'max_depth': 2, 'min_samples_split': 3}",0.891667,3,0.8875,0.875,0.9125
4,"{'max_depth': 3, 'min_samples_split': 2}",0.9125,1,0.9,0.875,0.9625
5,"{'max_depth': 3, 'min_samples_split': 3}",0.9125,1,0.9,0.875,0.9625


In [138]:
pred = grid_dtree.predict(X_test)

In [139]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)

0.95

# diabtes 데이터

In [171]:
import pandas as pd

diabet_df = pd.read_csv('./datasets/diabetes (2).csv')
diabet_df.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


In [172]:
features = diabet_df.drop('Outcome', axis=1)
label = diabet_df['Outcome']
features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [173]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# 데이터 로딩하고 학습데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2,random_state=121)

dtree = DecisionTreeClassifier()

parameters = {'max_depth' : [1,2,3], 'min_samples_split' : [2,3]}

import pandas as pd

grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)

# 붓꽃 Train 데이터로 param_grid의 하이퍼 파라미터들을 순차적으로 학습/평가
grid_dtree.fit(X_train, y_train)
# GridSearchCV 결과 추출하여 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',\
    'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.711701,3,0.731707,0.707317,0.696078
1,"{'max_depth': 1, 'min_samples_split': 3}",0.711701,3,0.731707,0.707317,0.696078
2,"{'max_depth': 2, 'min_samples_split': 2}",0.723139,1,0.731707,0.707317,0.730392
3,"{'max_depth': 2, 'min_samples_split': 3}",0.723139,1,0.731707,0.707317,0.730392
4,"{'max_depth': 3, 'min_samples_split': 2}",0.695441,5,0.707317,0.682927,0.696078
5,"{'max_depth': 3, 'min_samples_split': 3}",0.695441,5,0.707317,0.682927,0.696078


In [181]:
import numpy as np
test = X_train.iloc[0,:]
np.array(list(test))

array([  6.   , 108.   ,  44.   ,  20.   , 130.   ,  24.   ,   0.813,
        35.   ])

In [176]:
grid_dtree.predict(X_train.iloc[])

Pregnancies                  6.00
Glucose                     96.00
BloodPressure                0.00
SkinThickness                0.00
Insulin                      0.00
BMI                         23.70
DiabetesPedigreeFunction     0.19
Age                         28.00
Name: 601, dtype: float64