### 1. K Fold

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
from sklearn.datasets import load_iris

In [11]:
iris = load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state = 123)

In [10]:
kfold = KFold(n_splits = 5)
cv_acc = []
n_iter = 0

In [15]:
for tr_idx, te_idx in kfold.split(features):
    # tr, te Split
    X_tr, X_te = features[tr_idx], features[te_idx]
    y_tr, y_te = label[tr_idx], label[te_idx]
    
    # Training 
    dt_clf.fit(X_tr, y_tr)
    
    n_iter +=1 
    
    # Test
    pred = dt_clf.predict(X_te)
    # Test Score
    acc = np.round(accuracy_score(y_te, pred),4)
   
    cv_acc.append(acc)
    
print(np.mean(cv_acc))

0.92


### 2. Stratified

In [16]:
from sklearn.model_selection import StratifiedKFold

In [18]:
df_clf = DecisionTreeClassifier(random_state = 112)

skfold = StratifiedKFold(n_splits = 3)
n_iter = 0
cv_acc = []

for tr_idx, te_idx in skfold.split(features, label):
    # Tr, Te Split
    X_tr, X_te = features[tr_idx], features[te_idx]
    y_tr, y_te = label[tr_idx], label[te_idx]
    
    # Training 
    dt_clf.fit(X_tr, y_tr)
    
    n_iter +=1 
    
    # Test
    pred = dt_clf.predict(X_te)
    # Test Score
    acc = np.round(accuracy_score(y_te, pred),4)
   
    cv_acc.append(acc)
    
print(np.mean(cv_acc))

0.9604


### 3. Easier API 

- 내부적으로 StratifiedKFold

In [26]:
from sklearn.model_selection import cross_val_score

In [25]:
dt_clf = DecisionTreeClassifier(random_state = 134)

data = iris.data
label = iris.target

scores = cross_val_score(dt_clf, data, label, scoring = "accuracy", cv = 5)

print(np.mean(scores))

0.9600000000000002


### 4. GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [29]:
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size = 0.2, random_state = 12)

In [31]:
dtree = DecisionTreeClassifier()

parameters = {'max_depth' : [1,2,3], 'min_samples_split':[2,3]}

grid_dtree = GridSearchCV(dtree, param_grid = parameters, cv=3, refit = True)

In [32]:
grid_dtree.fit(X_tr, y_tr)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]},
             pre_dispatch='2*n_jobs', refit=True, return_

In [34]:
grid_dtree.cv_results_

{'mean_fit_time': array([0.00066527, 0.        , 0.0009977 , 0.        , 0.00033228,
        0.00033243]),
 'std_fit_time': array([4.70415277e-04, 0.00000000e+00, 1.62093465e-06, 0.00000000e+00,
        4.69909263e-04, 4.70134046e-04]),
 'mean_score_time': array([0.00033267, 0.00033196, 0.        , 0.00033228, 0.        ,
        0.        ]),
 'std_score_time': array([0.00047047, 0.00046946, 0.        , 0.00046991, 0.        ,
        0.        ]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'ma

In [39]:
print(grid_dtree.best_params_)
print(grid_dtree.best_score_)

estimator = grid_dtree.best_estimator_

{'max_depth': 3, 'min_samples_split': 2}
0.9333333333333333
