In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action = 'ignore')
%matplotlib inline

# 데이터 분할
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# 모델
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# 파라미터 최적화
from bayes_opt import BayesianOptimization

# 평가지표
from sklearn.metrics import log_loss

# Data Loading

In [6]:
# Read the three CSV inputs in order: training features, test features, and the
# table that carries the LABEL column used as the target.
train, test, target = (
    pd.read_csv(csv_path, encoding = 'UTF-8')
    for csv_path in ('train_features.csv', 'test_features.csv', 'cust_train.csv')
)

In [7]:
# Split train_features into a training part and a held-out validation part
# (20%, stratified on the label). The held-out part is named X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target['LABEL'],
    test_size = 0.2,
    stratify = target['LABEL'],
    random_state = 516,
)

In [3]:
# Shared 5-fold stratified splitter (shuffled, fixed seed).
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 516)

# One default-parameter estimator per model family, all with the same seed.
xgb = XGBClassifier(random_state = 516)
lgb = LGBMClassifier(random_state = 516)
rf = RandomForestClassifier(random_state = 516)

# Candidate models, in the order they are evaluated.
clfs = [xgb, lgb, rf]

In [8]:
# Fit the default-parameter XGBoost classifier on the training split.
xgb.fit(X = X_train, y = y_train)

KeyboardInterrupt: 

In [None]:
# Fit the default-parameter LightGBM classifier on the training split.
# The estimator is bound to the name `lgb`; `lgbm` was never defined.
lgb.fit(X_train, y_train)

In [None]:
# Show the per-feature importances of the fitted XGBoost model
# (the attribute is `feature_importances_`; `feature_selection_` does not exist).
xgb.feature_importances_

In [None]:
# Print each training column next to its XGBoost importance.
# Column labels come from `X_train.columns`; a DataFrame has no `feature_names`.
for name, value in zip(X_train.columns, xgb.feature_importances_):
    print(f'{name} : {value}')

In [None]:
# Horizontal bar chart of XGBoost feature importances, one bar per training column.
# Column labels come from `X_train.columns`; a DataFrame has no `feature_names`.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.barplot(x = xgb.feature_importances_, y = X_train.columns);

In [None]:
# Search space for Bayesian optimisation of XGBoost. Each entry is a
# (lower, upper) bound, and each key must match an argument name of xgb_opt
# because the optimizer passes the sampled values as keyword arguments.
xgb_params = {'n_estimators' : (100, 500),
              'learning_rate' : (0.1, 0.5),
              'max_depth' : (5, 15),
              'subsample' : (0.8, 0.95),
              'colsample_bytree' : (0.8, 0.95)}

def xgb_opt(n_estimators, learning_rate, max_depth, subsample, colsample_bytree):
    """Objective function for Bayesian optimisation of XGBClassifier.

    The optimizer samples every hyperparameter as a float, so the
    integer-valued ones are rounded before the model is built.

    Parameters
    ----------
    n_estimators, max_depth : float
        Rounded to the nearest integer before use.
    learning_rate, subsample, colsample_bytree : float
        Passed through unchanged.

    Returns
    -------
    float
        Mean negative log loss over the folds of `kfold` on
        (X_train, y_train). Higher is better, so it can be maximized directly.
    """
    params = {'n_estimators' : int(round(n_estimators)),
              'learning_rate' : learning_rate,
              'max_depth' : int(round(max_depth)),
              'subsample' : subsample,
              'colsample_bytree' : colsample_bytree}

    clf = XGBClassifier(random_state = 516, **params)
    # 'neg_log_loss' is already sign-flipped, so a larger value means a better model.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring = 'neg_log_loss', cv = kfold)
    return fold_scores.mean()

In [None]:
# Search space for Bayesian optimisation of LightGBM. Each entry is a
# (lower, upper) bound, and each key must match an argument name of lgbm_opt
# because the optimizer passes the sampled values as keyword arguments.
lgb_params = {'n_estimators' : (100, 500),
              'learning_rate' : (0.1, 0.5),
              'max_depth' : (5, 15),
              'num_leaves' : (50, 100),
              'min_child_samples' : (20, 50),
              'subsample' : (0.8, 0.95),
              'colsample_bytree' : (0.8, 0.95)}

def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, subsample, colsample_bytree):
    """Objective function for Bayesian optimisation of LGBMClassifier.

    The optimizer samples every hyperparameter as a float, so the
    integer-valued ones are rounded before the model is built.

    Parameters
    ----------
    n_estimators, max_depth, num_leaves, min_child_samples : float
        Rounded to the nearest integer before use.
    learning_rate, subsample, colsample_bytree : float
        Passed through unchanged.

    Returns
    -------
    float
        Mean negative log loss over the folds of `kfold` on
        (X_train, y_train). Higher is better, so it can be maximized directly.
    """
    # NOTE(review): in LightGBM, `subsample` only takes effect when
    # `subsample_freq` > 0 — confirm whether row bagging is actually intended.
    params = {'n_estimators' : int(round(n_estimators)),
              'learning_rate' : learning_rate,
              'max_depth' : int(round(max_depth)),
              'num_leaves' : int(round(num_leaves)),
              'min_child_samples' : int(round(min_child_samples)),
              'subsample' : subsample,
              'colsample_bytree' : colsample_bytree}

    clf = LGBMClassifier(random_state = 516, **params)
    # 'neg_log_loss' is already sign-flipped, so a larger value means a better model.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring = 'neg_log_loss', cv = kfold)
    return fold_scores.mean()

In [None]:
# Search space for Bayesian optimisation of RandomForest. Each entry is a
# (lower, upper) bound, and each key must match an argument name of rf_opt.
# The optimizer only searches continuous ranges, so the categorical
# `criterion` is encoded on [0, 1] and decoded inside rf_opt.
rf_params = {'n_estimators' : (100, 500),
             'criterion' : (0, 1),
             'max_features' : (0.8, 0.95),
             'max_depth' : (5, 10),
             'min_samples_split' : (30, 100),
             'min_samples_leaf' : (30, 50)}

def rf_opt(n_estimators, criterion, max_features, max_depth, min_samples_split, min_samples_leaf):
    """Objective function for Bayesian optimisation of RandomForestClassifier.

    Parameters
    ----------
    n_estimators, max_depth, min_samples_split, min_samples_leaf : float
        Rounded to the nearest integer before use.
    criterion : float or str
        A number below 0.5 selects 'gini', anything else selects 'entropy'.
        A string is passed through unchanged.
    max_features : float
        Passed through unchanged (a fraction of the features).

    Returns
    -------
    float
        Mean negative log loss over the folds of `kfold` on
        (X_train, y_train). Higher is better, so it can be maximized directly.
    """
    if not isinstance(criterion, str):
        criterion = 'gini' if criterion < 0.5 else 'entropy'

    params = {'n_estimators' : int(round(n_estimators)),
              'criterion' : criterion,
              'max_features' : max_features,
              'max_depth' : int(round(max_depth)),
              'min_samples_split' : int(round(min_samples_split)),
              'min_samples_leaf' : int(round(min_samples_leaf))}

    clf = RandomForestClassifier(random_state = 516, **params)
    # 'neg_log_loss' is already sign-flipped, so a larger value means a better model.
    fold_scores = cross_val_score(clf, X_train, y_train, scoring = 'neg_log_loss', cv = kfold)
    return fold_scores.mean()

In [None]:
# Step 6. Create the BayesianOptimization object:
# `f` is the function to maximize, `pbounds` is its search space.
# The LightGBM search space is `lgb_params`; the name `pbounds` was never defined.
BO_lgbm = BayesianOptimization(f = lgbm_opt, pbounds = lgb_params, random_state = 516)

# Step 7. Maximize: 50 random initial points followed by 50 guided iterations.
BO_lgbm.maximize(init_points = 50, n_iter = 50)

In [None]:
# Step 8. Keep the hyperparameters that maximized the objective.
# Work on a copy so the optimizer's own record is not modified.
max_params = dict(BO_lgbm.max['params'])

# The optimizer returns every value as a float; convert the integer-valued
# parameters, using the same set that lgbm_opt rounds.
for int_param in ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples'):
    if int_param in max_params:
        max_params[int_param] = int(round(max_params[int_param]))

max_params

In [None]:
# Step 9. Retrain with the best hyperparameters and score on the held-out split.
# The seed is a default, so a `random_state` already present in max_params wins.
lgbm_clf = LGBMClassifier(**{'random_state' : 516, **max_params})
lgbm_clf.fit(X_train, y_train)

# train_test_split named the held-out split X_test / y_test;
# X_valid / y_valid were never defined.
score = log_loss(y_test, lgbm_clf.predict_proba(X_test))
print(score)

In [None]:
# Cross-validate each model and report its mean log loss across the folds.
clf_names = [type(clf).__name__ for clf in clfs]
for clf in clfs:
    fold_losses = []
    for train_idx, val_idx in kfold.split(X_train, y_train):
        # kfold.split yields positional indices, so select rows with .iloc;
        # plain [] would look up column or index labels instead.
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Fitting happens on the shared estimator objects, so after this cell
        # each one is left trained on the last fold only.
        clf.fit(X_tr, y_tr)
        pred = clf.predict_proba(X_val)

        # Passing the class labels keeps the columns of `pred` aligned with them.
        fold_losses.append(log_loss(y_val, pred, labels = clf.classes_))

    mean_loss = np.mean(fold_losses)
    name = type(clf).__name__
    print('model : %s\t\t\t\tlog_loss : %f'%(name, mean_loss))

In [None]:
# (estimator, hyperparameter search grid) pairs, one per model family.
# This rebinds `clfs`, which earlier held bare estimators; cells that iterate
# over it afterwards receive (estimator, grid) tuples.
# NOTE(review): CatBoostClassifier is not imported in the visible import cell —
# add `from catboost import CatBoostClassifier` or confirm it is imported elsewhere.
clfs = [
    (
        RandomForestClassifier(random_state=0),
        {'n_estimators': range(10, 310, 10),
         'max_depth': range(1, 12),
         'min_impurity_decrease': [0,0.0001,0.001,0.01,0.0002,0.002,0.02,0.0005,0.005,0.05,0.1,0.2,0.3,0.4,0.5],
         # 'auto' was dropped: it was an alias of 'sqrt' for classifiers and is
         # rejected by scikit-learn 1.3 and later.
         'max_features': ['sqrt', 'log2'],
         'bootstrap': [True, False],
         'criterion': ['gini', 'entropy'], 
         # NOTE(review): the empty dict presumably means "no class weighting" — confirm.
         'class_weight': ['balanced', 'balanced_subsample', {}],
         'min_samples_split': [2, 5, 7, 9, 10],
         'min_samples_leaf': [2, 3, 4, 5, 6]}
    ),
    (
        XGBClassifier(random_state=0),
        {'n_estimators': range(10, 310, 10),
         # Starts at 0.001 rather than 0.0: a zero learning rate cannot learn.
         'learning_rate': np.linspace(0.001, 0.5, 500),
         'subsample': [0.2, 0.3, 0.5, 0.7, 0.9, 1],
         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
         'colsample_bytree': [0.5, 0.7, 0.9, 1],
         'min_child_weight': [1, 2, 3, 4],
         'reg_alpha': [1e-07,1e-06,0.0001,0.001,0.01,0.0005,0.005,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.7,1,2,3,4,5,10],
         'reg_lambda': [1e-07,1e-06,0.0001,0.001,0.01,0.0005,0.005,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.7,1,2,3,4,5,10],
         # NOTE(review): this grid includes 0.0, which presumably zeroes the
         # weight of the positive class — confirm that is intended.
         'scale_pos_weight': np.arange(0.0, 50.1, 0.1)}
    ),
    (
        LGBMClassifier(random_state=0),
        {'n_estimators': range(10, 310, 10),
         # Starts at 0.001 rather than 0.0: LightGBM requires learning_rate > 0.
         'learning_rate': np.linspace(0.001, 0.5, 500),
         'num_leaves': [10,20,30,40,50,60,70,80,90,100,150,200],
         'min_split_gain': [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
         'reg_alpha': [1e-07,1e-06,0.0001,0.001,0.01,0.0005,0.005,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.7,1,2,3,4,5,10],
         'reg_lambda': [1e-07,1e-06,0.0001,0.001,0.01,0.0005,0.005,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.7,1,2,3,4,5,10],
         'feature_fraction': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         'bagging_fraction': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         'bagging_freq': [1, 2, 3, 4, 5, 6, 7],
         'min_child_samples': range(5,105,5)}
    ),
    (
        CatBoostClassifier(random_state=0, verbose=False),
        {'n_estimators': range(10, 310, 10),
         'depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
         'random_strength': [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8],
         'l2_leaf_reg': [1,2,3,4,5,6,7,8,9,10,20,30,50,100,200]}
    ),    
    (
        MLPClassifier(random_state=0),
        {'batch_size': [32, 64, 128],
         'learning_rate' : ['constant', 'adaptive', 'invscaling'],
         'activation': ['tanh', 'relu', 'logistic'],
         'solver': ['sgd', 'adam'],
         'alpha': [1e-07,1e-06,0.0001,0.001,0.01,0.0005,0.005,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.7,0.9],
         'hidden_layer_sizes': [(32,),(64,),(128,),(32,16),(64,32,16)]}
    ),    
]