In [6]:
# Use GridSearchCV to try several decision-tree hyperparameter combinations
# one after another, then evaluate predictions on the iris dataset.

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

# One seed shared by the train/test split and the tree, so that
# Restart Kernel -> Run All reproduces the same scores.
RANDOM_STATE = 121

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=RANDOM_STATE
)

# DecisionTreeClassifier permutes the features randomly at every split, so
# without random_state tied splits may be resolved differently on each run.
dt_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Hyperparameter grid for GridSearchCV, given as a dict:
#   key   -> name of a DecisionTreeClassifier hyperparameter
#   value -> list of candidate values to try for that hyperparameter
# 3 max_depth values x 2 min_samples_split values = 6 candidate combinations.
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

In [7]:
import pandas as pd

In [9]:
# Exhaustive search over every combination in `parameters`, scoring each
# candidate with 3-fold cross-validation on the training split only.
#   refit=True              -> refit the best combination once the search ends
#   return_train_score=True -> also record the *_train_score entries
grid_tree = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    cv=3,
    refit=True,
    return_train_score=True,
).fit(X_train, y_train)  # fit() returns the fitted search object itself

# Load cv_results_ into a DataFrame so it can be viewed as a table.
scores_df = pd.DataFrame(data=grid_tree.cv_results_)

In [10]:
# Display the full cross-validation result table (one row per parameter combination).
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.000667,0.000472,0.000333,0.00047,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
1,0.000676,0.000479,0.0,0.0,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5,0.7,0.7,0.7,0.7,1.110223e-16
2,0.0,0.0,0.000672,0.000475,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
3,0.000333,0.00047,0.0,0.0,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3,0.975,0.9375,0.9625,0.958333,0.01559024
4,0.000665,0.00047,0.0,0.0,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511
5,0.000998,3e-06,0.000342,0.000484,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1,0.9875,0.9625,0.9875,0.979167,0.01178511


In [11]:
# Display the raw cv_results_ dict that scores_df was built from.
grid_tree.cv_results_

{'mean_fit_time': array([0.00066686, 0.00067639, 0.        , 0.00033251, 0.00066495,
        0.0009981 ]),
 'std_fit_time': array([4.71544255e-04, 4.78508207e-04, 0.00000000e+00, 4.70246438e-04,
        4.70191058e-04, 2.65729483e-06]),
 'mean_score_time': array([0.00033259, 0.        , 0.0006717 , 0.        , 0.        ,
        0.00034237]),
 'std_score_time': array([0.00047036, 0.        , 0.00047502, 0.        , 0.        ,
        0.00048418]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'ma

In [12]:
# cv_results_ was converted to a DataFrame earlier; keep only the columns
# needed to compare candidates: the parameter combination, its mean CV test
# score, and its rank.
scores_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1


In [16]:
# Report the best-performing parameter combination and its rank-1
# cross-validation score.
best_params = grid_tree.best_params_
best_cv_score = grid_tree.best_score_

print('최적 파라미터: ', best_params)
print('최고 정확도 : ', best_cv_score)

최적 파라미터:  {'max_depth': 3, 'min_samples_split': 2}
최고 정확도 :  0.975


In [17]:
# The search was created with refit=True (the default), so GridSearchCV has
# already trained an estimator with the best hyperparameters and stored it
# in best_estimator_. It can predict right away; no extra fit() is needed.
best_dt = grid_tree.best_estimator_

# Score the refit tree on the held-out test split.
pred = best_dt.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9666666666666667