# 파라미터 튜닝

***
### 각 모델별로 성능을 결정하는 하이퍼 파라미터의 최적의 조합을 찾는 과정 - cross validation 형태로 진행된다.
- GridSearchCV
- RandomizedSearchCV
- BayesianOptimization - 최적화

#### KFold 방식으로 하이퍼 파라미터 조합을 탐색한다.

***
### 알고리즘별 하이퍼 파라미터
- 트리 & 부스팅 계열(RandomForest, XGBoost, LightGBM, Catboost 등) : max_depth, n_estimators, learning_rate(랜덤포레스트는 없음)
- 서포트벡터머신(SVM, SVC, SVR 등) : C, alpha

***
### 튜닝 방법
#### 1. GridSearchCV : 그물망 방식으로 모든 파라미터 조합을 사용하여 최적의 조합을 찾는다. 시간이 많이 걸리지만 모든 경우의 수를 다 활용하는 장점이 있다.
#### 2. RandomizedSearchCV : 모든 조합 중 랜덤하게 선택하여 최적의 조합을 찾는다. Grid 방식에 비해 시간은 빠르지만 모든 경우의 수를 활용하진 않는다.
***


In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.DataFrame(load_breast_cancer().data, columns = load_breast_cancer().feature_names)
data['target'] = load_breast_cancer().target

In [3]:
data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [4]:
test = data.sample(69)
train = data.query('index not in @test.index')

In [5]:
X = train.drop('target', axis = 1)
y = train.target

In [6]:
target = test[X.columns]

In [7]:
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [8]:
lgbm_pred = np.zeros((target.shape[0]))
for tr_idx, val_idx in skf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm = LGBMClassifier(random_state = 42)
    lgbm.fit(tr_x, tr_y)
    
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    val_score = roc_auc_score(val_y, val_pred)
    print(val_score)
    fold_pred = lgbm.predict_proba(target)[:, 1] / 10
    lgbm_pred += fold_pred

0.9881154499151104
0.9779286926994906
0.9898132427843803
0.9915110356536502
0.9949066213921902
0.9949066213921902
0.9813242784380306
1.0
0.9965277777777778
0.9982638888888888


In [9]:
roc_auc = roc_auc_score(test.target, lgbm_pred)

In [10]:
roc_auc

1.0

#### GridSearchCV

- 모델을 call
- 탐색하고자 하는 파라미터 조합을 세팅
- GridSearchCV(사용할 모델, 탐색할 파라미터, 평가지표, cross_validation 구성 방법)

In [11]:
lgbm = LGBMClassifier(random_state = 42)

In [12]:
lgbm_params = {'max_depth' : [3, 4, 5, 6, 7], 'n_estimators' : [50, 100, 150, 175, 200], 'learning_rate' : [0.005, 0.05, 0.01, 0.1]}

5 x 5 x 4 = 100개의 모델이 생성

In [13]:
grid_cv = GridSearchCV(lgbm, lgbm_params, scoring = 'roc_auc', n_jobs = -1, verbose = 1, cv = skf)

- n_jobs = -1 : CPU 사용 개수로 -1은 모든 CPU 사용
- verbose = 1 : 매 과정을 출력을 하면서 보여라. verbose = 100이면 100번 마다 출력

In [14]:
%%time
grid_cv.fit(X, y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   11.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   11.9s finished


CPU times: user 1.59 s, sys: 345 ms, total: 1.94 s
Wall time: 12 s


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=LGBMClassifier(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': [0.005, 0.05, 0.01, 0.1],
                         'max_depth': [3, 4, 5, 6, 7],
                         'n_estimators': [50, 100, 150, 175, 200]},
             scoring='roc_auc', verbose=1)

파라미터 조합으로 생성된 모델 100개와 cross_validation 10fold이기 때문에 총 1000번의 fit이 일어난다.

In [15]:
grid_cv.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [16]:
grid_cv.best_estimator_

LGBMClassifier(max_depth=5, n_estimators=200, random_state=42)

In [18]:
lgbm_pred = np.zeros((target.shape[0]))
for tr_idx, val_idx in skf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm = LGBMClassifier(**grid_cv.best_params_)
    # **은 unpack을 의미하는데 딕셔너리 형태로 되어있는 값을 자동으로 배분해서 넣어준다.
    # grid_cv.best_estimator_
    # LGBMClassifier(max_depth = 5, learning_rate = 0.1, n_estimators = 200, random_state = 42)
    lgbm.fit(tr_x, tr_y)
    
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    val_score = roc_auc_score(val_y, val_pred)
    print(val_score)
    fold_pred = lgbm.predict_proba(target)[:, 1] / 10
    lgbm_pred += fold_pred

0.9881154499151104
0.9830220713073005
0.9915110356536503
0.9966044142614601
0.9983022071307301
0.9983022071307301
0.9847198641765705
1.0
0.9930555555555555
1.0


In [19]:
roc_auc = roc_auc_score(test.target, lgbm_pred)

In [20]:
roc_auc

1.0

#### RandomizedSearchCV

In [21]:
rand_cv = RandomizedSearchCV(lgbm, lgbm_params, scoring = 'roc_auc', n_jobs = -1, verbose = 1, cv = skf)

In [22]:
%%time
rand_cv.fit(X, y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.7s


CPU times: user 628 ms, sys: 95.7 ms, total: 724 ms
Wall time: 1.3 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.2s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
                   estimator=LGBMClassifier(max_depth=5, n_estimators=200),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.005, 0.05, 0.01,
                                                          0.1],
                                        'max_depth': [3, 4, 5, 6, 7],
                                        'n_estimators': [50, 100, 150, 175,
                                                         200]},
                   scoring='roc_auc', verbose=1)

In [23]:
rand_cv.best_params_

{'n_estimators': 175, 'max_depth': 4, 'learning_rate': 0.1}

In [24]:
rand_cv.best_estimator_

LGBMClassifier(max_depth=4, n_estimators=175)

In [25]:
lgbm_pred = np.zeros((target.shape[0]))
for tr_idx, val_idx in skf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm = LGBMClassifier(**rand_cv.best_params_)
    # **은 unpack을 의미하는데 딕셔너리 형태로 되어있는 값을 자동으로 배분해서 넣어준다.
    # grid_cv.best_estimator_
    # LGBMClassifier(max_depth = 5, learning_rate = 0.1, n_estimators = 200, random_state = 42)
    lgbm.fit(tr_x, tr_y)
    
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    val_score = roc_auc_score(val_y, val_pred)
    print(val_score)
    fold_pred = lgbm.predict_proba(target)[:, 1] / 10
    lgbm_pred += fold_pred

0.9864176570458405
0.9779286926994906
0.9881154499151104
0.9949066213921902
0.9949066213921902
0.9983022071307301
0.9898132427843803
1.0
0.9947916666666666
0.9965277777777778


In [26]:
roc_auc = roc_auc_score(test.target, lgbm_pred)

In [27]:
roc_auc

1.0

### <font color = 'red'> Quiz> 와인의 등급(quailty)을 맞추는 모델을 만들어보시오.
    - 적절한 검증 방법으로 log_loss를 개선하는 파라미터 튜닝을 수행할 것.
    - 최적의 파라미터 조합으로 예측값을 도출한 후 submission을 제출하시오.
    - log_loss는 낮을수록 좋은 지표이기 때문에 파라미터 튜닝 때 scoring = 'neg_log_loss'로 지정

In [47]:
train = pd.read_csv('wine_train.csv')
test = pd.read_csv('wine_test.csv')

In [48]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
2,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
3,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
4,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7


In [49]:
test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.5,0.59,0.44,2.3,0.071,21.0,68.0,0.9992,3.46,0.63,9.5,5
1,7.3,0.585,0.18,2.4,0.078,15.0,60.0,0.99638,3.31,0.54,9.8,5
2,6.6,0.695,0.0,2.1,0.075,12.0,56.0,0.9968,3.49,0.67,9.2,5
3,6.2,0.46,0.17,1.6,0.073,7.0,11.0,0.99425,3.61,0.54,11.4,5
4,6.5,0.61,0.0,2.2,0.095,48.0,59.0,0.99541,3.61,0.7,11.5,6


In [50]:
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

In [51]:
X = train.iloc[:, :-1]
y = train.quality

target = test.iloc[:, :-1]

In [52]:
xgb = XGBClassifier(random_state = 42)

In [53]:
xgb_params = {'max_depth' : [4, 6, 8], 'learning_rate' : [0.05, 0.01, 0.1], 'n_estimators' : [100, 250]}

In [62]:
rand_cv = RandomizedSearchCV(xgb, xgb_params, cv = skf, scoring = 'neg_log_loss', n_jobs = -1)

In [38]:
from sklearn.metrics import log_loss

In [63]:
rand_cv.fit(X, y)



RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None,...
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=4

In [75]:
rand_cv.best_params_

{'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}

In [69]:
xgb_pred = np.zeros((target.shape[0], 3))
for tr_idx, val_idx in skf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb = rand_cv.best_estimator_
    xgb.fit(tr_x, tr_y)
    
    val_pred = xgb.predict_proba(val_x)
    val_log_loss = log_loss(val_y, val_pred)
    print('\n', val_log_loss, '\n')
    
    fold_pred = xgb.predict_proba(target) / 5
    xgb_pred += fold_pred


 0.6706769987510947 


 0.731327394159654 


 0.7085729155904399 


 0.6793391558771523 


 0.7372564816274322 



In [77]:
pred = np.argmax(xgb_pred, axis = 1) + 5

In [74]:
accuracy_score(test.quality, np.argmax(xgb_pred, axis = 1) + 5)

0.7293577981651376

In [81]:
submission = pd.DataFrame({'wine_id' : range(218), 'quality' : pred})

In [83]:
submission['actual'] = test.quality

In [85]:
submission

Unnamed: 0,wine_id,quality,actual
0,0,5,5
1,1,5,5
2,2,5,5
3,3,5,5
4,4,6,6
...,...,...,...
213,213,6,6
214,214,5,5
215,215,6,6
216,216,5,5
