In [1]:
# 교차검증, 그리드 서치

In [2]:
# 1. Prepare the data
import pandas as pd
from sklearn.model_selection import train_test_split

wine = pd.read_csv('http://bit.ly/wine_csv_data')
wine.info()
# Not the last expression of the cell, so this summary is computed but not displayed.
wine.describe()

# 1-2. Separate the features from the label and build the data sets
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

# Hold out 20% of all samples as the test set.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42
)

# Hold out 20% of the training set as the validation set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)
print(sub_input.shape, val_input.shape)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB
(4157, 3) (1040, 3)


In [3]:
# 1-3. Evaluate on the validation set
from sklearn.tree import DecisionTreeClassifier

# fit() returns the fitted estimator, so construction and training are chained.
# No random_state is passed here, so the scores may differ slightly between runs.
dt = DecisionTreeClassifier().fit(sub_input, sub_target)

# Accuracy on the split the tree was trained on, then on the validation split.
for inputs, targets in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(inputs, targets))

0.9971133028626413
0.8653846153846154


In [4]:
# 2. Cross-validation (with no cv argument, cross_validate uses 5 folds)
import numpy as np
from sklearn.model_selection import cross_validate

score = cross_validate(dt, train_input, train_target)
print(score)

# Average the per-fold validation scores into a single number.
mean_test_score = np.mean(score['test_score'])
print(mean_test_score)


{'fit_time': array([0.00800085, 0.00800109, 0.00800085, 0.00800085, 0.        ]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([0.86923077, 0.85192308, 0.87680462, 0.85563041, 0.83638114])}
0.8579940031094988


In [5]:
# 2-2. Cross-validation with an explicit splitter
from sklearn.model_selection import StratifiedKFold
# StratifiedKFold is for classification models, KFold for regression models.

default_splitter = StratifiedKFold()
score = cross_validate(dt, train_input, train_target, cv=default_splitter)
print(np.mean(score['test_score']))

# 10-fold cross-validation with shuffling
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
score = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(score['test_score']))

0.8579941881987118
0.859151104194457


In [6]:
# 3. Hyperparameter tuning
# GridSearchCV performs the hyperparameter search and the cross-validation in one step.
from sklearn.model_selection import GridSearchCV

# The key must be the exact name of a DecisionTreeClassifier parameter
# ('min_impurity_decrease'); a misspelled name is rejected as an invalid
# parameter once fit() is called.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

# n_jobs controls how many CPU cores are used; -1 uses all of them.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)


In [7]:
# Run the grid search on the training set.
gs.fit(train_input, train_target)

# Take the estimator selected by the search and score it on the training data.
dt = gs.best_estimator_
train_score = dt.score(train_input, train_target)
print(train_score)

In [None]:
# Best parameter combination found by the grid search.
print(gs.best_params_)

# cv_results_ keeps each candidate's mean validation score under
# 'mean_test_score' (there is no 'mean_lost_score' key).
mean_test_scores = gs.cv_results_['mean_test_score']
print(mean_test_scores)

# Look up the parameters of the candidate with the highest mean score;
# this is expected to agree with best_params_ printed above.
best_index = np.argmax(mean_test_scores)
print(gs.cv_results_['params'][best_index])

In [None]:
# Candidate values for each of the three hyperparameters searched together.
impurity_grid = np.arange(0.0001, 0.001, 0.0001)
depth_grid = range(5, 20, 1)
split_grid = range(2, 100, 10)

params = {
    'min_impurity_decrease': impurity_grid,
    'max_depth': depth_grid,
    'min_samples_split': split_grid,
}

# Every combination of the values above is cross-validated.
base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(base_tree, params, n_jobs=-1)
gs.fit(train_input, train_target)

In [None]:
# Random search: sampling from probability distributions
# uniform draws real values and randint draws integers; both are provided by
# scipy.stats, not by the top-level scipy package.
from scipy.stats import uniform, randint

# Integers from 0 through 9 (the upper bound 10 is exclusive).
rgen = randint(0, 10)
# Printed explicitly: a notebook only echoes the last expression of a cell.
print(rgen.rvs(10))

# Count how often each integer appears in 1000 draws.
print(np.unique(rgen.rvs(1000), return_counts=True))

# Real values from the interval [0, 1].
ugen = uniform(0, 1)
ugen.rvs(10)


In [None]:
# Distributions that the randomized search samples from.
# Each key must be the exact name of a DecisionTreeClassifier parameter;
# the previous spelling 'min_impurity_decrese' is not one.
params = {
    # scipy.stats.uniform(loc, scale) spans [loc, loc + scale], i.e. [0.0001, 0.0011].
    'min_impurity_decrease': uniform(0.0001, 0.001),
    # scipy.stats.randint(low, high) excludes the upper bound.
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Sample 100 parameter combinations from the distributions in params and
# cross-validate each one.
base_tree = DecisionTreeClassifier(random_state=42)
gs = RandomizedSearchCV(
    base_tree,
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

print(gs.best_params_)


In [None]:
# Best cross-validation score
mean_test_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_test_scores))

# Score the estimator selected by the search on the held-out test set.
dt = gs.best_estimator_
test_score = dt.score(test_input, test_target)
print(test_score)