In [3]:
import pandas as pd

wine = pd.read_csv('http://bit.ly/wine_csv_data')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [4]:
data = wine[['alcohol', 'sugar', 'pH']]
target = wine[['class']]

In [5]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = \
train_test_split(data, target, random_state=42)

In [16]:
# 검증 세트 만들기 (훈련 세트 / 검증 세트 / 테스트 세트)

sub_input, val_input, sub_target, val_target = \
train_test_split(train_input, train_target, random_state=42)

In [17]:
print(data.shape, train_input.shape, test_input.shape, sub_input.shape, val_input.shape)

(6497, 3) (4872, 3) (1625, 3) (3654, 3) (1218, 3)


In [18]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)

dt.fit(sub_input, sub_target)
print( dt.score(sub_input, sub_target) )
print( dt.score(val_input, val_target) )

0.9978106185002736
0.8571428571428571


In [29]:
from sklearn.model_selection import cross_validate
# 검증하고 싶은 모델, 인풋, 타겟
# 데이터를 섞어주는 기능은 없어, 정렬된 데이터라면 따로 섞기 필요
cross_validate(dt, train_input, train_target)

{'fit_time': array([0.00975966, 0.00697827, 0.00897956, 0.00992274, 0.0069685 ]),
 'score_time': array([0.00299835, 0.00297523, 0.00399017, 0.0030334 , 0.00199509]),
 'test_score': array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])}

In [21]:
scores = cross_validate(dt, train_input, train_target)
scores['test_score']

array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])

In [22]:
import numpy as np
np.mean(scores['test_score'])

np.float64(0.8546818301479492)

In [30]:
# 정렬된 데이터라면...
# 기본 n_splits=5, suffle=False,
from sklearn.model_selection import StratifiedKFold

splitter = StratifiedKFold(n_splits=10, shuffle=True)
score = cross_validate(dt, train_input, train_target)
score

{'fit_time': array([0.00839281, 0.0069766 , 0.0059607 , 0.0059731 , 0.00593328]),
 'score_time': array([0.00301504, 0.00327063, 0.00302052, 0.0019958 , 0.00298929]),
 'test_score': array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])}

In [31]:
from sklearn.model_selection import GridSearchCV

In [38]:
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
}

# 차례로 변경하면서 학습, gs라는 매개변수에 저장

In [35]:
dt = DecisionTreeClassifier(random_state=42)
# 모델, 변경할 파라미터 요소 (dict 형태)
gs = GridSearchCV(dt, params)

In [36]:
gs.fit(train_input, train_target)

In [39]:
# 파라미터를 바꾸며 나온 제일 좋은 값
gs.best_estimator_

In [40]:
gs.best_params_

{'min_impurity_decrease': 0.0003}

In [42]:
gs.cv_results_

{'mean_fit_time': array([0.00621214, 0.01190033, 0.00677657, 0.00478368, 0.00720048]),
 'std_fit_time': array([0.00076626, 0.0016213 , 0.00239193, 0.00039878, 0.00230347]),
 'mean_score_time': array([0.00360579, 0.00382199, 0.00259166, 0.00199351, 0.002774  ]),
 'std_score_time': array([7.70020620e-04, 7.46124528e-04, 1.19624139e-03, 1.90734863e-07,
        9.78410916e-04]),
 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'min_impurity_decrease': 0.0001},
  {'min_impurity_decrease': 0.0002},
  {'min_impurity_decrease': 0.0003},
  {'min_impurity_decrease': 0.0004},
  {'min_impurity_decrease': 0.0005}],
 'split0_test_score': array([0.87384615, 0.87076923, 0.87282051, 0.86461538, 0.86051282]),
 'split1_test_score': array([0.86666667, 0.86871795, 0.87794872, 0.88512821, 0.87794872]),
 'split2_test_score': array([0.88603696, 0.88295688, 0.8798768 , 0.87

In [43]:
import numpy as np
# np.arange() --> 0.0001부터 0.001까지 0.0001의 간격으로
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20, 1),
    'min_samples_split': range(2, 100, 10),
}

In [45]:
dt = DecisionTreeClassifier(random_state=42)
# n_jobs; 컴퓨터 코어를 얼만큼 쓸 건지 (-1은 전부)
gs = GridSearchCV(dt, params, n_jobs=-1)

In [46]:
gs.fit(train_input, train_target)

In [47]:
gs.best_params_

{'max_depth': 15,
 'min_impurity_decrease': np.float64(0.0001),
 'min_samples_split': 22}

In [50]:
gs.cv_results_['mean_test_score']

array([0.85837161, 0.85837161, 0.85837161, ..., 0.86309693, 0.86309693,
       0.86309693])

In [51]:
from scipy.stats import uniform, randint

In [53]:
randint(0, 10).rvs(10) #10개를 랜덤출력

array([7, 6, 5, 7, 5, 9, 9, 6, 6, 9])

In [58]:
rgen = randint(0, 10)
np.unique(rgen.rvs(1000)) # 고유값 출력

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [57]:
np.unique(rgen.rvs(1000), return_counts=True) # 몇개 나왔나

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([119,  93, 118, 101, 101,  88,  84,  97, 103,  96]))

In [60]:
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001), #~사이의 실수값
    'max_depth': randint(20, 50), #~사이의 정수값
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

In [61]:
from sklearn.model_selection import RandomizedSearchCV

In [62]:
gs = RandomizedSearchCV(dt, params, n_iter=100, n_jobs=-1) #n_iter 반복수
gs.fit(train_input, train_target)

In [63]:
gs.best_estimator_

In [64]:
gs.best_params_

{'max_depth': 39,
 'min_impurity_decrease': np.float64(0.00028113101260921386),
 'min_samples_leaf': 2,
 'min_samples_split': 23}