# 붓꽃 데이터 

## 학습/테스트 데이터 셋 분리하지 않고 예측

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the iris dataset and use every sample for training.
iris_data = load_iris()
train_data, train_label = iris_data.data, iris_data.target

# Fit a decision tree on the full dataset.
dt_clf = DecisionTreeClassifier()
dt_clf.fit(train_data, train_label)

# Predict the very same samples the model was fitted on, so the reported
# accuracy is measured on data the model has already seen.
pred = dt_clf.predict(train_data)
print("예측 정확도:", accuracy_score(train_label, pred))

예측 정확도: 1.0


#### 학습에 사용한 train_data로 그대로 예측했기 때문에 정확도가 100%로 나옴

## 학습/테스트 데이터 셋 분리하고 예측

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
# Load the iris dataset and create an unfitted decision tree.
iris_data = load_iris()
dt_clf = DecisionTreeClassifier()

# Hold out 30% of the samples as the test set; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.3,
    random_state=21,
)

# Show the labels that ended up in the training split.
print(y_train)

[0 0 0 1 0 1 0 1 2 1 1 0 0 2 0 0 0 0 0 1 1 0 2 1 0 0 2 1 0 2 1 2 0 2 0 0 1
 2 1 1 0 2 1 0 2 1 1 2 1 2 1 2 2 0 0 2 2 0 1 2 1 1 2 1 2 0 2 2 0 0 1 2 0 0
 1 2 0 0 1 2 2 0 2 1 0 1 2 1 0 2 2 1 1 2 2 2 1 2 1 1 2 2 0 0 1]


In [6]:
# Fit on the training split, then predict the held-out test split.
pred = dt_clf.fit(X_train, y_train).predict(X_test)

# Accuracy measured on samples the model did not see during fitting.
print("예측 정확도:", accuracy_score(y_test, pred))

예측 정확도: 0.9333333333333333


## 넘파이 ndarray 뿐만 아니라 판다스 DataFrame/Series도 train_test_split() 가능

In [8]:
import pandas as pd

# Build a DataFrame from the feature matrix, naming the columns after the
# features, then attach the class labels as a 'target' column.
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
iris_df['target'] = iris_data.target

# Display the first rows.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# Feature DataFrame: every column except the last one ('target').
feature_df = iris_df.iloc[:, :-1]

# Target Series: the last column only.
target_df = iris_df.iloc[:, -1]

# Split the pandas objects themselves (feature DataFrame and target Series),
# rather than the raw ndarray, holding out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    target_df,
    test_size=0.3,
    random_state=4,
)


In [10]:
# Check the type of the training features produced by the split.
type(X_train)

pandas.core.frame.DataFrame

In [12]:
# Fit a new decision tree on the training split and predict the test split.
dt_clf = DecisionTreeClassifier()
pred = dt_clf.fit(X_train, y_train).predict(X_test)

# Report the test accuracy with three decimal places.
print("예측 정확도: {0:0.3f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.978


## K-폴드 예제

In [14]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load the iris dataset and keep the feature matrix and labels separately.
iris = load_iris()
features, label = iris.data, iris.target

# Display the shape of the feature matrix.
features.shape

(150, 4)

In [16]:
# List that will collect the accuracy of each fold.
cv_accuracy = []

# KFold object that divides the data into 5 fold sets.
kfold = KFold(n_splits=5)

# Decision tree with a fixed random_state.
dt_clf = DecisionTreeClassifier(random_state=156)

# Print the row indices assigned to the training and validation sets of
# every fold.
for train_index, test_index in kfold.split(features):
    print(train_index, test_index)

[ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 1

In [19]:
import numpy as np

# Reset the per-fold accuracies so that re-running this cell does not keep
# appending to results left over from a previous run.
cv_accuracy = []

for train_index, test_index in kfold.split(features):
    # Select the rows of this fold's training and validation sets.
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    # Fit on the training rows and predict the validation rows.
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Accuracy for this fold, rounded to three decimals.
    acc = np.round(accuracy_score(y_test, pred), 3)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]

    print('정확도: %f, 학습데이터크기: %d, 검증데이터크기: %d' %(acc, train_size, test_size))
    cv_accuracy.append(acc)

# Mean of the per-fold accuracies.
print('평균 검증 정확도: ', np.mean(cv_accuracy))

정확도: 1.000000, 학습데이터크기: 120, 검증데이터크기: 30
정확도: 0.967000, 학습데이터크기: 120, 검증데이터크기: 30
정확도: 0.867000, 학습데이터크기: 120, 검증데이터크기: 30
정확도: 0.933000, 학습데이터크기: 120, 검증데이터크기: 30
정확도: 0.733000, 학습데이터크기: 120, 검증데이터크기: 30
평균 검증 정확도:  0.9


## cross_val_score()

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris
import numpy as np

# Load the data and create a decision tree with a fixed random_state.
iris = load_iris()
features, label = iris.data, iris.target
dt_clf = DecisionTreeClassifier(random_state=156)

# Evaluate accuracy with cv=3; the result holds one score per fold.
scores = cross_val_score(dt_clf, features, label, scoring='accuracy', cv=3)

# Print the individual fold scores and their mean rounded to 4 decimals.
print("교차 검증별 정확도:", scores)
print("평균 검증 정확도:", np.round(np.mean(scores), 4))

교차 검증별 정확도: [0.98 0.94 0.98]
평균 검증 정확도: 0.9667


In [None]:
# cross_val_score()는 cv로 지정된 횟수만큼
# scoring 파라미터로 지정된 평가 지표로 평가 결과값을 배열로 반환
# 일반적으로 평가 결과값 평균을 평가 수치로 사용

## 최적의 하이퍼파라미터를 찾는 기법 : 사이킷런의 GridSearchCV클래스

In [None]:
# classifier나 Regressor와 같은 알고리즘에 사용된다.
# 그리드서치의 주요 파라미터
# estimator : classifier, regressor, pipeline
# param_grid : key + 리스트 값을 가지는 딕셔너리
# scoring : 예측 성능을 측정할 평가 방법
# cv : 교차 검증을 위해 분할되는 학습/테스트 세트의 개수


In [6]:
# GridSearchCV를 이용해 결정 트리 알고리즘의 여러가지 최적화 파라미터를 
# 순차적으로 적용해서 붓꽃 데이터 예측 분석

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load the iris data and hold out 20% of the samples as the test set.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=121,
)

# Estimator whose hyperparameters will be searched.
dt_clf = DecisionTreeClassifier()

# The search grid is given as a dictionary:
#   key   - name of a decision tree hyperparameter
#   value - list of candidate values for that hyperparameter
parameters = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}

In [8]:
import pandas as pd
# Search the parameter grid with 3-fold cross-validation, also recording the
# training scores; refit=True refits the estimator with the best parameters.
grid_tree = GridSearchCV(
    dt_clf,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
)
grid_tree.fit(X_train, y_train)

# The raw cv_results_ output is hard to read, so convert it to a DataFrame
# before displaying it.
scores_df = pd.DataFrame(grid_tree.cv_results_)
scores_df


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.004965,0.002147,0.001665,0.000473,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
1,0.003988,0.00082,0.001661,0.000469,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
2,0.002661,0.000939,0.002331,0.000476,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
3,0.003319,0.000468,0.00166,0.000477,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
4,0.003984,7e-06,0.002668,0.001252,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511
5,0.002666,0.000951,0.002653,0.000467,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511


In [9]:
# Show only the parameter combination, its mean test score and its rank.
scores_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1


In [12]:
# Print the best-performing parameter combination, then its score, one per line.
print(grid_tree.best_params_, grid_tree.best_score_, sep="\n")

{'max_depth': 3, 'min_samples_split': 2}
0.975


In [13]:
# Estimator that the grid search refit with the best parameters.
best_dt = grid_tree.best_estimator_

# Predict the held-out test set with it and display the resulting accuracy.
pred = best_dt.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

In [None]:
# 일반적으로 학습 데이터를 GridSearchCV를 이용해
# 최적 하이퍼 파라미터로 튜닝