<a href="https://colab.research.google.com/github/junyeog/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [None]:
import pandas as pd

# Load the wine dataset from the remote CSV into a DataFrame.
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [None]:
# Show the first five rows of wine
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [None]:
# Check the total number of rows (and columns) in wine
print(wine.shape)

(6497, 4)


In [None]:
# Summary statistics of wine (mean, standard deviation, min, max, etc. for each column)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [None]:
# Count how many samples belong to each class (white wine vs. red wine)
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [None]:
# Feature matrix: the three numeric columns, as a NumPy array.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()

# Label vector: the 'class' column, as a NumPy array.
target = wine['class'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42)  # split into training and test sets

In [None]:
# Hold out 20% of the training set as a validation set, leaving the test set untouched.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)  # split again into training and validation sets

In [None]:
# Shapes of the reduced training set and of the validation set
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyperparameters, fit on the reduced training set.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Compare the score on the data the tree was fit on with the score on the
# held-out validation set; a large gap between the two indicates overfitting.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


## 교차 검증

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the full training set with the default splitter.
# The returned dict holds per-fold 'fit_time', 'score_time' and 'test_score' arrays.
scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.00869679, 0.00819588, 0.00862765, 0.0089829 , 0.0080893 ]), 'score_time': array([0.0012641 , 0.00114655, 0.00122619, 0.00126743, 0.00119567]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [None]:
import numpy as np

# Collapse the per-fold validation scores into a single cross-validation score.
fold_scores = scores['test_score']
print(np.mean(fold_scores))

0.855300214703487


In [None]:
from sklearn.model_selection import StratifiedKFold
# k-fold cross-validation with an explicitly passed stratified splitter
# (default StratifiedKFold settings).
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

0.855300214703487


In [None]:
# 10-fold stratified cross-validation; shuffle the samples before splitting,
# with a fixed seed so the folds are reproducible.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV  # grid search

# Candidate values to try for the tree's min_impurity_decrease hyperparameter.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [None]:
# Grid search over `params` with cross-validation; n_jobs=-1 runs the fits in
# parallel on all available CPU cores.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [None]:
# Run the search: every candidate in the grid is cross-validated on the training set.
gs.fit(train_input, train_target)

In [None]:
dt = gs.best_estimator_  # model refit with the best parameters found by the search
# Score of that model on the training set (not a held-out score).
print(dt.score(train_input, train_target))

0.9615162593804117


In [None]:
print(gs.best_params_)  # best parameter combination found by the search

{'min_impurity_decrease': 0.0001}


In [None]:
print(gs.cv_results_['mean_test_score'])  # mean cross-validation score of each candidate

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [None]:
# Recover the best candidate by hand: locate the highest mean cross-validation
# score, then look up the parameter dict stored at the same position.
mean_scores = gs.cv_results_['mean_test_score']
best_index = np.argmax(mean_scores)
best_candidate = gs.cv_results_['params'][best_index]
print(best_candidate)

{'min_impurity_decrease': 0.0001}


In [None]:
# Larger grid over three hyperparameters; the search tries every combination.
#   min_impurity_decrease: np.arange excludes the stop value, so the nominal
#                          grid is 0.0001 ... 0.0009 in steps of 0.0001
#   max_depth:             5, 6, ..., 19
#   min_samples_split:     2, 12, ..., 92
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }

In [None]:
# Rebuild the grid search with the larger grid and run it on the training set.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [None]:
# Best parameter combination found in the larger grid
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [None]:
# Highest mean cross-validation score among all candidates
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [None]:
# Show the mean fit time recorded for each candidate during cross-validation
gs.cv_results_['mean_fit_time']

array([0.00697494, 0.00761018, 0.00720248, ..., 0.0068985 , 0.01218619,
       0.01275029])

### 랜덤 서치

In [None]:
from scipy.stats import uniform, randint

In [None]:
# Sampling from a uniform distribution: a frozen discrete uniform
# distribution over the integers 0-9 (the upper bound 10 is excluded).
rgen = randint(low=0, high=10)
rgen.rvs(size=10)  # draw ten samples

array([3, 7, 4, 8, 9, 6, 7, 3, 7, 3])

In [None]:
np.unique(rgen.rvs(1000), return_counts=True)  # also return how often each value was drawn

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([104,  94,  98, 107, 107,  93,  93,  97, 118,  89]))

In [None]:
# Frozen continuous uniform distribution on [0, 1] (loc=0, scale=1).
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)  # draw ten samples

array([0.56383634, 0.32077923, 0.20397974, 0.09388598, 0.05200386,
       0.17659594, 0.95319748, 0.89260683, 0.58310755, 0.10412731])

In [None]:
# Sampling distributions for the randomized search (one draw per hyperparameter
# for each sampled candidate).
# NOTE(review): scipy's uniform takes (loc, scale) and covers [loc, loc + scale],
# so min_impurity_decrease is drawn from [0.0001, 0.0011], not [0.0001, 0.001] —
# confirm this is the intended range.
# randint(low, high) excludes `high`, e.g. max_depth is drawn from 20..49.
params = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: sample 100 parameter combinations from `params` and
# cross-validate each one; random_state fixes which combinations are sampled.
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

In [None]:
print(rs.best_params_)  # best parameter combination found by the randomized search

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [None]:
# Highest mean cross-validation score among the sampled candidates
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [None]:
# Take the model refit with the best parameters and evaluate it on the
# held-out test set.
dt = rs.best_estimator_

print(dt.score(test_input, test_target))

0.86


In [None]:
# Mean fit time recorded for each sampled candidate
rs.cv_results_['mean_fit_time']

array([0.01470914, 0.015625  , 0.01367307, 0.01490641, 0.00873423,
       0.01277633, 0.01035175, 0.01425924, 0.00852489, 0.0135282 ,
       0.00896735, 0.00652895, 0.01499815, 0.01576314, 0.01141129,
       0.01565313, 0.00734262, 0.00699811, 0.01012492, 0.00681448,
       0.00824609, 0.01032405, 0.01369858, 0.0082026 , 0.00881133,
       0.01385999, 0.01543746, 0.01553659, 0.00775986, 0.00667996,
       0.00823107, 0.01740861, 0.01453276, 0.01332679, 0.01112285,
       0.00756178, 0.00676632, 0.00772028, 0.00737247, 0.0067811 ,
       0.01139245, 0.01381359, 0.00868664, 0.00937853, 0.01681857,
       0.014293  , 0.01093774, 0.01184454, 0.0078176 , 0.00706148,
       0.00743408, 0.00789165, 0.00673599, 0.00682955, 0.00668421,
       0.00721312, 0.00691209, 0.00910668, 0.00719209, 0.00680289,
       0.00885425, 0.00797615, 0.00663013, 0.00682979, 0.00647945,
       0.00675664, 0.00672593, 0.00794234, 0.00673723, 0.00729446,
       0.0072618 , 0.0080162 , 0.0068965 , 0.00655599, 0.00751

In [None]:
# Average of the per-candidate mean fit times
print(np.mean(rs.cv_results_['mean_fit_time']))

0.009221738338470458


### 결정트리 분할 옵션 변경

In [None]:
# Same randomized search as before (same distributions, n_iter and seed), but the
# tree uses splitter='random' instead of the default splitter.
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)

In [None]:
# Best parameters and best mean cross-validation score of the random-splitter search
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))

# Evaluate the refit best model on the held-out test set
dt = rs2.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [None]:
# Mean fit time recorded for each sampled candidate of the random-splitter search
rs2.cv_results_['mean_fit_time']

array([0.00404553, 0.00347109, 0.00366411, 0.0035358 , 0.00348153,
       0.00316892, 0.00314102, 0.00331669, 0.00379257, 0.00474005,
       0.00367193, 0.00330095, 0.00335436, 0.00306888, 0.0030458 ,
       0.00320311, 0.00297918, 0.00325165, 0.00342913, 0.00329361,
       0.00323882, 0.00432572, 0.00317721, 0.00307283, 0.00448341,
       0.0045548 , 0.00299692, 0.0039608 , 0.00319343, 0.0031374 ,
       0.00297341, 0.00323191, 0.00287528, 0.00334969, 0.00352688,
       0.00317039, 0.0030273 , 0.0051168 , 0.00621128, 0.0046185 ,
       0.00410028, 0.00376   , 0.00314393, 0.00328064, 0.00977077,
       0.0073143 , 0.00712986, 0.00804701, 0.01025424, 0.00335073,
       0.00640764, 0.00722551, 0.00485597, 0.00746522, 0.01046538,
       0.00417995, 0.00318551, 0.00389957, 0.0030931 , 0.00295777,
       0.00510082, 0.00297976, 0.00319443, 0.00509753, 0.00377827,
       0.0029057 , 0.00314369, 0.00954409, 0.00321569, 0.00303187,
       0.00652399, 0.00912371, 0.00694032, 0.00751443, 0.00506

In [None]:
# Average of the per-candidate mean fit times for the random-splitter search
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.004737856388092041


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

In [None]:
# splitter값이 기존에는 기본값 'best' 에서 'random'으로 바뀜, 분할을 랜덤으로 하기 떄문에 속도가 빨라