## Cross Validation

In [1]:
import pandas as pd

# Download the wine dataset as a DataFrame.
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Feature matrix: the three input columns; label vector: the 'class' column.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [16]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all samples as the final test set,
# stratified on the class label so both parts keep the same class ratio.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

# Second split: carve 20% of the remaining 80% off as a validation set,
# giving roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=42,
)


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyperparameters, fitted on the
# training split only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Compare accuracy on the training split with accuracy on the held-out
# validation split.
train_accuracy = dt.score(X_train, y_train)
val_accuracy = dt.score(X_val, y_val)
print(train_accuracy)
print(val_accuracy)

0.9980755352417608
0.8740384615384615


In [19]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

# Cross-validate the tree on the combined train+validation data.
scores = cross_validate(
    dt,
    X_train_val,
    y_train_val,
    cv=StratifiedKFold(),  # written out explicitly; this is what the default cv resolves to for a classifier
)
print(scores)

# Mean score over the validation folds.
scores['test_score'].mean()

{'fit_time': array([0.00652575, 0.00500464, 0.0045073 , 0.00550723, 0.004004  ]), 'score_time': array([0.00100017, 0.        , 0.00100136, 0.        , 0.        ]), 'test_score': array([0.86730769, 0.86923077, 0.84696824, 0.85370549, 0.84985563])}


0.8574135633375286

In [20]:
# The splitter's parameters can also be set directly: here 10 stratified
# folds, shuffled with a fixed seed.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, X_train_val, y_train_val, cv=splitter)

# Mean score over the 10 validation folds.
scores['test_score'].mean()

0.8576078257003111

## Grid Search

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the single hyperparameter being tuned.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

# Exhaustive search with cross-validation over every candidate.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
gs.fit(X_train_val, y_train_val)

dt = gs.best_estimator_  # the best model found by the search
print(gs.best_params_)

# Cross-check: look up the best parameters by hand from the raw CV results.
best_index = gs.cv_results_['mean_test_score'].argmax()  # index of the highest mean score
gs.cv_results_['params'][best_index]

{'min_impurity_decrease': 0.0002}


{'min_impurity_decrease': 0.0002}

In [22]:
# Tune three hyperparameters jointly; the grid is the Cartesian product of
# the value ranges below.
params: dict = {
    # 0.0001 upwards in steps of 0.0001, stopping before 0.001
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    # 5, 6, ..., 19
    'max_depth': range(5, 20),
    # 2, 12, ..., 92
    'min_samples_split': range(2, 100, 10),
}

gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(X_train_val, y_train_val)
gs.best_params_

{'max_depth': 19, 'min_impurity_decrease': 0.0002, 'min_samples_split': 2}

In [23]:
# Highest mean cross-validation score among all grid candidates.
print(gs.cv_results_['mean_test_score'].max())

0.8731935292811135


## Random Search

In [24]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# Distributions to sample hyperparameters from (instead of fixed grids).
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011], not [0.0001, 0.001].
# Confirm this is the intended range.
# randint(low, high) draws integers from low up to high - 1.
params = {
    'min_impurity_decrease': uniform(loc=0.0001, scale=0.001),
    'max_depth': randint(low=20, high=50),
    'min_samples_split': randint(low=2, high=25),
    'min_samples_leaf': randint(low=1, high=25),
}

# Evaluate 100 randomly sampled parameter settings with cross-validation.
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(X_train_val, y_train_val)
print(gs.best_params_)

{'max_depth': 26, 'min_impurity_decrease': 0.0002743664290049914, 'min_samples_leaf': 1, 'min_samples_split': 9}


In [25]:
# Highest mean cross-validation score among the sampled candidates.
print(gs.cv_results_['mean_test_score'].max())

0.8689625749611313


In [26]:
# Final evaluation: score the best model from the randomized search on the
# test set, which was not used for fitting or for the search.
dt = gs.best_estimator_
test_accuracy = dt.score(X_test, y_test)
print(test_accuracy)

0.8630769230769231
