In [1]:
import random

import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
import optuna
# Set the default font family for every matplotlib figure created in this notebook.
# NOTE(review): IPAGothic is presumably chosen so Japanese text renders in plots;
# the font has to be installed on the machine running the notebook — confirm.
matplotlib.rcParams['font.family']='IPAGothic'

In [2]:
def cv_generator(base_path="../../data/std_data/", years=range(1978, 2020)):
    """Lazily yield one (train, test) split per year from pickled pandas objects.

    For every ``year`` the following files are read, in this order:
    ``<base_path>/train/<year>_x.pkl``, ``<base_path>/test/<year>_x.pkl``,
    ``<base_path>/train/<year>_y.pkl`` and ``<base_path>/test/<year>_y.pkl``.

    Parameters
    ----------
    base_path : str or path-like, optional
        Directory containing the ``train/`` and ``test/`` sub-directories.
        A trailing ``/`` is optional.
    years : iterable of int, optional
        Years to iterate over; defaults to 1978 through 2019 inclusive.

    Yields
    ------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where each element is the
        ``.values`` array of the corresponding unpickled pandas object.

    Notes
    -----
    ``pd.read_pickle`` can execute arbitrary code while unpickling, so
    ``base_path`` must only point at files from a trusted source.
    """
    prefix = str(base_path)
    # Tolerate a missing trailing separator, but keep "" meaning "current dir".
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    def _load(split, year, kind):
        # Read one pickled pandas object and return its underlying array.
        path = "{}{}/{}_{}.pkl".format(prefix, split, year, kind)
        return pd.read_pickle(path).values

    for year in years:
        train_x = _load("train", year, "x")
        test_x = _load("test", year, "x")
        train_y = _load("train", year, "y")
        test_y = _load("test", year, "y")
        yield (train_x, train_y, test_x, test_y)

In [3]:
def objective(trial):
    """Optuna objective: pooled out-of-sample ROC AUC of a LightGBM regressor.

    Hyper-parameters are sampled from ``trial``; the resulting
    ``lgb.LGBMRegressor`` is fitted on the training part of every split
    yielded by ``cv_generator()`` and used to predict the matching test part.
    The predictions of all splits are pooled and scored with one ROC curve.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Area under the ROC curve (positive label ``1``) computed over the
        pooled test predictions of all splits.

    Raises
    ------
    ValueError
        If ``cv_generator()`` yields no split at all.
    """
    # Hyper-parameter search space (parameter tuning with Optuna).
    # NOTE(review): suggest_uniform / suggest_loguniform are deprecated in
    # recent Optuna releases in favour of suggest_float; they are kept so the
    # cell keeps working with older Optuna versions — confirm before upgrading.
    param = {
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0., 1.0),
    }
    # Search dimensions that are currently disabled:
    #   'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss'])
    #       dart -> 'drop_rate', 'skip_drop': suggest_loguniform(1e-8, 1.0)
    #       goss -> 'top_rate': suggest_uniform(0.0, 1.0),
    #               'other_rate': suggest_uniform(0.0, 1.0 - top_rate)
    #   'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 100)
    #   'num_leaves': trial.suggest_int('num_leaves', 2, 1000)
    #   'num_threads': trial.suggest_int('num_threads', 5, 10)
    #   'min_sum_hessian_in_leaf': trial.suggest_int('min_sum_hessian_in_leaf', 1, 10)
    #   'reg_alpha': trial.suggest_uniform('reg_alpha', 0., 1.0)
    #
    # Recorded best trial (its parameter names differ from the active search
    # space above, so it presumably comes from an earlier configuration):
    #   Value: 0.7416173570019723
    #   bagging_freq: 5, min_data_in_leaf: 17, max_depth: 8,
    #   learning_rate: 0.06221834301779217, num_leaves: 17,
    #   num_threads: 9, min_sum_hessian_in_leaf: 4

    lightgbm_tuna = lgb.LGBMRegressor(
        random_state=0,
        verbosity=1,
        bagging_seed=0,
        boost_from_average='true',
        metric='auc',
        **param,
    )

    # Collect per-split results in lists and concatenate once at the end
    # instead of re-copying the growing arrays on every iteration.
    fold_predictions = []
    fold_targets = []

    for (train_x, train_y, test_x, test_y) in cv_generator():
        lightgbm_tuna.fit(train_x, train_y)
        fold_predictions.append(lightgbm_tuna.predict(test_x))
        fold_targets.append(test_y)

    if not fold_predictions:
        raise ValueError("cv_generator() yielded no train/test splits")

    pred_y_all = np.concatenate(fold_predictions)
    y_true_all = np.concatenate(fold_targets)

    # The thresholds returned by roc_curve are not needed for the AUC.
    fpr, tpr, _ = metrics.roc_curve(y_true_all, pred_y_all, pos_label=1)

    return auc(fpr, tpr)

In [None]:
def main(n_trials=100, seed=0):
    """Run the Optuna hyper-parameter search and print a summary.

    Parameters
    ----------
    n_trials : int, optional
        Number of trials passed to ``study.optimize``.
    seed : int or None, optional
        Seed for the TPE sampler so that repeated runs sample the same
        hyper-parameters. Pass ``None`` for an unseeded sampler.

    Returns
    -------
    optuna.study.Study
        The finished study, so later cells can inspect trials or the best
        parameters without re-running the search.
    """
    # TPE is Optuna's default sampler; it is created explicitly only so that
    # a seed can be supplied.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    return study

# Bind the result so the study stays available to later cells (and so the
# returned object is not echoed as the cell's output).
study = main()

[I 2019-08-22 12:13:36,393] Finished trial#0 resulted in value: 0.719498450267681. Current best value is 0.719498450267681 with parameters: {'bagging_freq': 6, 'max_depth': 5, 'learning_rate': 0.09099024210322172, 'n_estimators': 1445, 'subsample': 0.8289763724166712, 'reg_lambda': 0.0447949026795208}.
[I 2019-08-22 12:15:07,240] Finished trial#1 resulted in value: 0.7499295576218652. Current best value is 0.7499295576218652 with parameters: {'bagging_freq': 1, 'max_depth': 20, 'learning_rate': 0.0170903088404055, 'n_estimators': 8860, 'subsample': 0.6066692578268549, 'reg_lambda': 0.6440067411035559}.
[I 2019-08-22 12:16:02,586] Finished trial#2 resulted in value: 0.7385178923640462. Current best value is 0.7499295576218652 with parameters: {'bagging_freq': 1, 'max_depth': 20, 'learning_rate': 0.0170903088404055, 'n_estimators': 8860, 'subsample': 0.6066692578268549, 'reg_lambda': 0.6440067411035559}.
[I 2019-08-22 12:16:36,771] Finished trial#3 resulted in value: 0.7299239222316145. 