## Bayesian Optimization

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from bayes_opt import BayesianOptimization

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_one = pd.get_dummies(train)
test_one = pd.get_dummies(test)

In [3]:
x = train_one.drop(columns=['index','quality'])
y = train['quality']

In [4]:
rf_parameter_bounds = {
                      'max_depth' : (1,3), # 나무의 깊이
                      'n_estimators' : (30,100),
                      }

In [5]:
def rf_bo(max_depth, n_estimators):
  rf_params = {
              'max_depth' : int(round(max_depth)),
               'n_estimators' : int(round(n_estimators)),      
              }
  rf = RandomForestClassifier(**rf_params)

  x_train, x_valid, y_train, y_valid = train_test_split(x,y,test_size = 0.2, )

  rf.fit(x_train,y_train)
  score = accuracy_score(y_valid, rf.predict(x_valid))
  return score

In [6]:
bo_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds,random_state = 0)

In [7]:
bo_rf.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.53    [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [0m 2       [0m | [0m 0.52    [0m | [0m 2.206   [0m | [0m 68.14   [0m |
| [95m 3       [0m | [95m 0.5436  [0m | [95m 1.847   [0m | [95m 75.21   [0m |
| [0m 4       [0m | [0m 0.5036  [0m | [0m 1.875   [0m | [0m 92.42   [0m |
| [0m 5       [0m | [0m 0.5345  [0m | [0m 2.927   [0m | [0m 56.84   [0m |
| [0m 6       [0m | [0m 0.5255  [0m | [0m 2.988   [0m | [0m 73.03   [0m |
| [0m 7       [0m | [0m 0.5327  [0m | [0m 2.07    [0m | [0m 80.1    [0m |
| [0m 8       [0m | [0m 0.5218  [0m | [0m 1.618   [0m | [0m 75.33   [0m |
| [0m 9       [0m | [0m 0.4927  [0m | [0m 1.988   [0m | [0m 75.16   [0m |
| [0m 10      [0m | [0m 0.5082  [0m | [0m 2.036   [0m | [0m 80.07   [0m |


In [8]:
max_params = bo_rf.max['params']

max_params['max_depth'] = int(max_params['max_depth'])
max_params['n_estimators'] = int(max_params['n_estimators'])
print(max_params)

{'max_depth': 1, 'n_estimators': 75}


In [9]:
RF = RandomForestClassifier(**max_params)
RF.fit(x,y)
pred = RF.predict(test_one.drop(columns=['index']))
sub = pd.read_csv('submission.csv')
sub['quality'] = pred
sub.to_csv('tune_rf.csv',index = False)

## xgboost 튜닝

xgboost의 하이퍼 파라미터

- learning rate : 높을수록 과적합 되기 쉬움
- subsample : weak learner가 학습에 사용하는 데이터 샘플링 비율, 보통 0.5~1 사이이며 낮을수록 과적합 방지
- n_estimators : 생성할 weak learner의 수, learning_rate가 낮을 때 n_estimators를 높여야 과적합 방지
- colsample_bytree : 각 tree 별 사용된 feature의 퍼센테이지, 보통 0.5~1 사용되며 낮을수록 과적합 방지
- max_depth : 트리의 maximum depth, 적절한 값이 제시되어야 하고 보통 3-10 사이, 높을수록 복잡도가 커져 과적합 하기 쉬움
- lambda : 가중치에 대한 L2 Regularization 적용 값, 피쳐 개수가 많을 때 적용을 검토하며 값이 클수록 과적합 감소
- gamma : 리프노드의 추가분할을 결정할 최소손실 감소값, 해당값보다 손실이 크게 감소할 때 분리
- alpha : 가중치에 대한 L1 Regularization 적용 값, 피쳐 개수가 많을 때 적용을 검토하며 값이 클수록 과적합 감소 효과

In [10]:
from xgboost import XGBClassifier

xgb_parameter_bounds = {
                      'gamma' : (0,10),
                      'max_depth' : (1,3), # 나무의 깊이
                      'subsample' : (0.5,1)
                      }

In [11]:
def xgb_bo(gamma,max_depth, subsample):
  xgb_params = {
              'gamma' : int(round(gamma)),
              'max_depth' : int(round(max_depth)),
               'subsample' : int(round(subsample)),      
              }
  xgb = XGBClassifier(**xgb_params)

  x_train, x_valid, y_train, y_valid = train_test_split(x,y,test_size = 0.2, )

  xgb.fit(x_train,y_train)
  score = accuracy_score(y_valid, xgb.predict(x_valid))
  return score

In [12]:
BO_xgb = BayesianOptimization(f = xgb_bo, pbounds = xgb_parameter_bounds,random_state = 0)
BO_xgb.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   |   gamma   | max_depth | subsample |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5664  [0m | [0m 5.488   [0m | [0m 2.43    [0m | [0m 0.8014  [0m |
| [0m 2       [0m | [0m 0.5664  [0m | [0m 5.449   [0m | [0m 1.847   [0m | [0m 0.8229  [0m |
| [95m 3       [0m | [95m 0.5809  [0m | [95m 4.376   [0m | [95m 2.784   [0m | [95m 0.9818  [0m |
| [0m 4       [0m | [0m 0.5555  [0m | [0m 3.834   [0m | [0m 2.583   [0m | [0m 0.7644  [0m |
| [0m 5       [0m | [0m 0.5555  [0m | [0m 5.68    [0m | [0m 2.851   [0m | [0m 0.5355  [0m |
| [0m 6       [0m | [0m 0.5245  [0m | [0m 6.692   [0m | [0m 2.421   [0m | [0m 0.9232  [0m |
| [0m 7       [0m | [0m 0.5591  [0m | [0m 2.303   [0m | [0m 2.915   [0m | [0m 0.6916  [0m |
| [95m 8       [0m | [95m 0.5836  [0m | [95m 4.354   [0m | [95m 2.839   [0m | [95m 0.9727  [0m |
| [0m 9       [0m | [0m 0.5464  [0m | [0m 4.5

In [13]:
max_params = BO_xgb.max['params']
max_params['max_depth'] = int(max_params['max_depth'])
XGB = XGBClassifier(**max_params)
XGB.fit(x,y)

XGBClassifier(gamma=4.353687657938132, max_depth=2, objective='multi:softprob',
              subsample=0.9726705213125111)

In [14]:
pred = XGB.predict(test_one.drop(columns=['index']))
sub = pd.read_csv('submission.csv')
sub['quality'] = pred
sub.to_csv('tune_xgb.csv',index = False)

## LGBM 튜닝

LGBM 하이퍼 파라미터

- learning rate : 높을수록 과적합 되기 쉬움
- subsample : weak learner가 학습에 사용하는 데이터 샘플링 비율, 보통 0.5~1 사이이며 낮을수록 과적합 방지
- n_estimators : 생성할 weak learner의 수, 너무 크면 과적합 발생
- colsample_bytree : 각 tree 별 사용된 feature의 퍼센테이지, 보통 0.5~1 사용되며 낮을수록 과적합 방지
- max_depth : 트리의 maximum depth, 적절한 값이 제시되어야 하고 보통 3-10 사이, 기본값은 깊이에 제한이 없음(-1)
- reg_lambda : 가중치에 대한 L2 Regularization 적용 값, 피쳐 개수가 많을 때 적용을 검토하며 값이 클수록 과적합 감소
- min_child_samples : 최종 리프 노드가 되기 위한 레코드 수로 과적합 제어용
- reg_alpha : 가중치에 대한 L1 Regularization 적용 값, 피쳐 개수가 많을 때 적용을 검토하며 값이 클수록 과적합 감소 효과

In [15]:
from lightgbm import LGBMClassifier

lgbm_parameter_bounds = {
                      'n_estimators' : (30,100),
                      'max_depth' : (1,3), 
                      'subsample' : (0.5,1)
                      }

In [16]:
def lgbm_bo(n_estimators,max_depth, subsample):

  lgbm_params = {
              'n_estimators' : int(round(n_estimators)),
              'max_depth' : int(round(max_depth)),
              'subsample' : int(round(subsample)),      
              }

  lgbm = LGBMClassifier(**lgbm_params)
  
  x_train, x_valid, y_train, y_valid = train_test_split(x,y,test_size = 0.2, )

  lgbm.fit(x_train,y_train)
  score = accuracy_score(y_valid, lgbm.predict(x_valid))
  return score

In [17]:
BO_lgbm = BayesianOptimization(f = lgbm_bo, pbounds = lgbm_parameter_bounds,random_state = 0)
BO_lgbm.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... | subsample |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5736  [0m | [0m 2.098   [0m | [0m 80.06   [0m | [0m 0.8014  [0m |
| [0m 2       [0m | [0m 0.5627  [0m | [0m 2.09    [0m | [0m 59.66   [0m | [0m 0.8229  [0m |
| [95m 3       [0m | [95m 0.5745  [0m | [95m 1.875   [0m | [95m 92.42   [0m | [95m 0.9818  [0m |
| [0m 4       [0m | [0m 0.5482  [0m | [0m 1.767   [0m | [0m 85.42   [0m | [0m 0.7644  [0m |
| [0m 5       [0m | [0m 0.5582  [0m | [0m 2.136   [0m | [0m 94.79   [0m | [0m 0.5355  [0m |
| [0m 6       [0m | [0m 0.56    [0m | [0m 2.086   [0m | [0m 79.98   [0m | [0m 0.7583  [0m |
| [0m 7       [0m | [0m 0.5545  [0m | [0m 1.461   [0m | [0m 97.03   [0m | [0m 0.6916  [0m |
| [0m 8       [0m | [0m 0.5482  [0m | [0m 2.082   [0m | [0m 68.72   [0m | [0m 0.8663  [0m |
| [0m 9       [0m | [0m 0.5264  [0m | [0m 1.391   

In [18]:
max_params = BO_lgbm.max['params']
max_params['max_depth'] = int(max_params['max_depth'])
max_params['n_estimators'] = int(max_params['n_estimators'])
LGBM = LGBMClassifier(**max_params)
LGBM.fit(x,y)


LGBMClassifier(max_depth=1, n_estimators=92, subsample=0.9818313802505146)

In [19]:
pred = LGBM.predict(test_one.drop(columns =['index']))
sub = pd.read_csv('submission.csv')
sub['quality'] = pred
sub.to_csv('tune_lgbm.csv',index = False)

## Voting Classifier

In [23]:
from sklearn.ensemble import VotingClassifier
VC = VotingClassifier(estimators=[('rf',RF),('xgb',XGB),('lgbm',LGBM)],voting = 'soft')

VC.fit(x,y)
pred = VC.predict(test_one.drop(columns = ['index']))

sub = pd.read_csv('submission.csv')
sub['quality'] = pred
sub.to_csv('tune_voting.csv',index = False)