In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [3]:
wine = pd.read_csv('https://raw.githubusercontent.com/rickiepark/hg-mldl/master/wine.csv')
wine.head(3)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0


In [4]:
X = wine[['alcohol', 'sugar', 'pH']].to_numpy()
y = wine['class'].to_numpy()


In [5]:
X.shape, y.shape

((6497, 3), (6497,))

학습 데이터 분할 

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=50)

In [8]:
from scipy.stats import uniform, randint

In [9]:
params = {
    'min_impurity_decrease' : uniform(0.0001, 0.001),
    'max_depth' : randint(20, 50),
    'min_samples_split' : randint(2, 25),
    'min_samples_leaf' : randint(1, 25)
}

샘플링 횟수는 사이킷런의 랜덤 서치 클래스인 RandomizedSearchCV의 n_iter 매개변수에 지정한다.

In [13]:
from sklearn.model_selection import RandomizedSearchCV

dt_cls =  DecisionTreeClassifier(random_state=50)
gs = RandomizedSearchCV(dt_cls, 
                            params,
                            n_iter=100,
                            n_jobs=-1,
                            random_state=50 
                            )
gs.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=50)
,param_distributions,"{'max_depth': <scipy.stats....001B7F5AEA840>, 'min_impurity_decrease': <scipy.stats....001B7F5AEBBC0>, 'min_samples_leaf': <scipy.stats....001B7F5AE9C10>, 'min_samples_split': <scipy.stats....001B7F5AEAFF0>}"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,50

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,36
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,50
,max_leaf_nodes,
,min_impurity_decrease,np.float64(0....0163917149342)


In [14]:
gs.best_params_

{'max_depth': 36,
 'min_impurity_decrease': np.float64(0.00015640163917149342),
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [15]:
dt = gs.best_estimator_  # <-- DescionTreeClass
print(dt.score(X_train, y_train))

0.938041177602463


0.0002 이 가장 좋은 값으로 선택되었다. 

각 매개변수에서 수행한 교차 검증의 평균 점수는 cvresults 속성의 'mean_test_score' 키에 저장되어 있다. 5번의 교차 검증으로 얻은 점수를 출력해보자.

In [None]:
gs.cv_results_['mean_test_score']

array([0.86934923, 0.86973273, 0.8683875 , 0.86511513, 0.86588325])

위처럼 수동으로 고르는 것보다 넘파이의 argmax() 함수를 사용하면 가장 큰 값의 인덱스를 추출할 수 있다. 그 다음 이 인덱스를 사용해 params 키에 저장된 매개변수를 출력할 수 있다.

In [None]:
idx = np.argmax(gs.cv_results_['mean_test_score'])
gs.cv_results_['params'][idx]

{'min_impurity_decrease': 0.0002}

In [17]:
wine.min(), wine.max()

(alcohol    8.00
 sugar      0.60
 pH         2.72
 class      0.00
 dtype: float64,
 alcohol    14.90
 sugar      65.80
 pH          4.01
 class       1.00
 dtype: float64)

In [None]:
wine[wine['class']==0]

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
1594,10.5,2.0,3.45,0.0
1595,11.2,2.2,3.52,0.0
1596,11.0,2.3,3.42,0.0
1597,10.2,2.0,3.57,0.0


In [19]:
dt.predict( [[8.0, 0.6, 2.7]])
# dt.predict( [[10.5, 2.1, 3.45]])

array([1.])