# 교차 검증(Cross validation)과 그리드 서치(Grid search)

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/hg-mldl/blob/master/5-2.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩에서 실행하기</a>
  </td>
</table>

## 검증 세트

In [1]:
import pandas as pd

# Download the wine dataset (CSV) and load it into a DataFrame.
WINE_CSV_URL = 'https://bit.ly/wine-date'
wine = pd.read_csv(WINE_CSV_URL)

In [3]:
# Preview the first rows of the frame (shown as the cell output).
# Alternatives left by the author for inspection: wine.info(), wine["class"].
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [11]:
# Feature matrix: the three measurement columns, converted to a NumPy array.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
# Label vector taken from the 'class' column.
target = wine['class'].to_numpy()

### Training, Validation, Test dataset으로 분할

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data, target, test_size=0.2, random_state=42)

In [14]:
# Split the training portion once more: 20% becomes the validation set and
# the rest (sub_*) is what the first model is fit on.
(sub_input, val_input,
 sub_target, val_target) = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

In [15]:
# Confirm the sizes of the fitting subset and the validation subset.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [16]:
from sklearn.tree import DecisionTreeClassifier

# DecisionTreeClassifier parameters noted for later tuning (text from the
# scikit-learn docs):
#   random_state          - controls the randomness of the estimator; the
#                           algorithm will select ``max_features`` at random
#                           at each split.
#   max_depth             - int, default=None. The maximum depth of the tree.
#                           If None, nodes are expanded until all leaves are
#                           pure or until all leaves contain less than
#                           min_samples_split samples.
#   min_impurity_decrease - float, default=0.0. A node will be split if this
#                           split induces a decrease of the impurity greater
#                           than or equal to this value.
#   min_samples_split     - int or float, default=2. The minimum number of
#                           samples required to split an internal node.
#   min_samples_leaf      - int or float, default=1. The minimum number of
#                           samples required to be at a leaf node.

# Fit an unconstrained tree on the training subset (fit returns the estimator).
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Overfitting check: accuracy on the data the tree was fit on, followed by
# accuracy on the held-out validation data.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


## 교차 검증

In [17]:
# cross_validate evaluates metric(s) by cross-validation and also records fit/score times.
from sklearn.model_selection import cross_validate
# cv = 10 requests 10 folds here; passing None instead would use the default 5-fold cross validation.
scores = cross_validate(dt, train_input, train_target, cv = 10)
# One validation score per fold.
print(scores['test_score'])
# Alternative: print(scores['test_score'].mean()) to show only the average.

[0.84807692 0.85769231 0.875      0.86730769 0.88461538 0.87692308
 0.875      0.86705202 0.8477842  0.81695568]


In [27]:
import numpy as np

# Average the per-fold scores into a single cross-validation estimate.
print(np.mean(scores['test_score']))

0.8616407292129834


In [30]:
# For a classification model the target-class ratio should be kept intact:
# StratifiedKFold makes folds that preserve the percentage of samples for
# each class.
from sklearn.model_selection import StratifiedKFold

# shuffle=True shuffles each class's samples before splitting into batches
# (the samples within each split are not shuffled); random_state pins it.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [None]:
### Run the hyperparameter search and the cross-validation in a single step

In [42]:
from sklearn.model_selection import GridSearchCV

# Grid of tree depths to try: 2 through 6.
# (Alternative grid noted by the author: min_impurity_decrease over
#  0.0001, 0.0002, 0.0003, 0.0004, 0.0005.)
params = {'max_depth': list(range(2, 7))}

In [43]:
# 5 candidate values x 5 cross-validation folds each = 25 model fits in total.
# cv=None keeps the default fold count; n_jobs=-1 means using all processors.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=None,
    n_jobs=-1,
)

In [44]:
### Run the search; per the author's note, the model is then retrained using the best setting found

gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6]})

In [45]:
# Take the winning tree from the search and report its accuracy on the
# training data.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.8672310948624207


In [46]:
# Show the parameter setting the grid search selected.
print(gs.best_params_)

{'max_depth': 5}


### 5개 후보(max_depth)별 교차검증 평균 점수 확인

In [47]:
# Mean cross-validation score of every candidate in the grid (one entry per candidate).
print(gs.cv_results_['mean_test_score'])

[0.81393555 0.84125583 0.85337806 0.85780355 0.8558801 ]


In [48]:
# Locate the candidate with the highest mean CV score ...
mean_scores = gs.cv_results_['mean_test_score']
best_index = np.argmax(mean_scores)
print(best_index)

# ... and look up the parameter setting stored at that position.
print(gs.cv_results_['params'][best_index])


3
{'max_depth': 5}


In [49]:
# Larger grid over three hyperparameters:
#   9 impurity thresholds x 15 depths x 10 split sizes = 1350 candidates.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [53]:
# Exhaustive search over the current params grid, using all processors.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

# Kept as the last expression so the fitted search object is displayed.
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [51]:
# Show the best parameter combination found by the three-parameter grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [52]:
# Highest mean cross-validation score among all candidates of the grid search.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


### 랜덤 서치
#### 매개변수로서 확률분포객체를 전달

In [54]:
from scipy.stats import uniform, randint

In [81]:
# randint builds a frozen uniform discrete random variable over the integers
# low .. high-1, i.e. 0..8 here (`high` is exclusive).
# The definition must be live code: the next cell calls rgen.rvs(...), and
# with this cell commented out a fresh kernel fails with NameError.
rgen = randint(low=0, high=9)

# Draw ten samples; kept as the last expression so the array is displayed.
rgen.rvs(size=10)

array([1, 6, 8, 5, 6, 3, 0, 1, 7, 5])

In [62]:
# Find the unique elements of an array.
np.unique(rgen.rvs(1000), return_counts=True)
# np.unique(rgen.rvs(1000))

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [49]:
# Frozen uniform continuous random variable with loc=0 and scale=1.
ugen = uniform(loc=0, scale=1)
# Ten draws, displayed as the cell output.
ugen.rvs(size=10)

array([0.39440966, 0.19262961, 0.10935117, 0.8668987 , 0.82391654,
       0.11823552, 0.05774906, 0.38668945, 0.77125242, 0.32254741])

In [74]:
# Search space for the randomized search: every entry is a frozen scipy
# distribution from which candidate values are sampled.
# Note: uniform(loc, scale) covers [loc, loc + scale], so the impurity
# threshold is drawn from [0.0001, 0.0011]; randint(low, high) excludes high.
params = dict(
    min_impurity_decrease=uniform(loc=0.0001, scale=0.001),
    max_depth=randint(low=20, high=50),
    min_samples_split=randint(low=2, high=25),
    min_samples_leaf=randint(low=1, high=25),
)

# Earlier variant of the same space kept by the author: identical entries,
# except max_depth was the fixed grid range(5, 20, 1).

In [75]:
# Unlike GridSearchCV, RandomizedSearchCV does not try out every parameter
# value: a fixed number of settings (n_iter) is sampled from the specified
# distributions.
from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
# Kept as the last expression so the fitted search object is displayed.
gs.fit(train_input, train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002178CDB1988>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002178D79B488>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002178CE77088>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002178CDB1808>},
                   random_state=42)

In [76]:
# Show the best parameter setting sampled by the randomized search.
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}


In [77]:
# Highest mean cross-validation score among the sampled candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8695428296438884


In [78]:
# Score the best tree from the randomized search on the held-out test set.
dt = gs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


## 확인문제

In [None]:
# Exercise: repeat the randomized search with a tree that uses
# splitter='random' instead of the default splitter.
random_split_tree = DecisionTreeClassifier(splitter='random', random_state=42)
gs = RandomizedSearchCV(
    estimator=random_split_tree,
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
# Kept as the last expression so the fitted search object is displayed.
gs.fit(train_input, train_target)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=42,
         

In [None]:
# Report the selected setting and the best mean cross-validation score ...
print(gs.best_params_)
best_cv_score = np.max(gs.cv_results_['mean_test_score'])
print(best_cv_score)

# ... then score the best tree on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': 0.00011407982271508446, 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077
