In [1]:
import random

import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
import optuna
# Use the IPAGothic font family for every matplotlib figure in this notebook.
# NOTE(review): presumably chosen so Japanese text renders in plots -- confirm.
matplotlib.rcParams['font.family']='IPAGothic'

In [2]:
def cv_generator(base_path="../../data/std_data/", years=range(1978, 2020)):
    """Yield one cross-validation fold per year, loaded from pickled pandas objects.

    For every ``year`` the following four files are read:
    ``<base_path>/train/<year>_x.pkl``, ``<base_path>/train/<year>_y.pkl``,
    ``<base_path>/test/<year>_x.pkl`` and ``<base_path>/test/<year>_y.pkl``.

    Args:
        base_path: directory containing the ``train`` and ``test`` sub-folders.
            A trailing separator is optional.
        years: iterable of year identifiers used in the file names. Defaults
            to 1978..2019 inclusive.

    Yields:
        tuple: ``(train_x, train_y, test_x, test_y)`` where every element is
        the ``.values`` array of the corresponding pickled pandas object.

    Note:
        ``pd.read_pickle`` can execute arbitrary code while unpickling; only
        point ``base_path`` at files from a trusted source.
    """
    # Local import keeps this cell self-contained; ``os`` is not imported above.
    import os

    def _load(split, year, suffix):
        # os.path.join tolerates a base_path with or without a trailing slash.
        file_name = "{}_{}.pkl".format(year, suffix)
        return pd.read_pickle(os.path.join(base_path, split, file_name)).values

    for year in years:
        train_x = _load("train", year, "x")
        test_x = _load("test", year, "x")
        train_y = _load("train", year, "y")
        test_y = _load("test", year, "y")
        yield (train_x, train_y, test_x, test_y)

In [4]:
def objective(trial):
    """Optuna objective: pooled out-of-sample ROC AUC of a LightGBM regressor.

    Samples one set of LightGBM hyper-parameters from ``trial``, refits the
    model on every fold yielded by ``cv_generator()``, pools the test-set
    predictions of all folds and scores the pooled predictions with the area
    under the ROC curve.

    Args:
        trial: the Optuna trial object used to sample hyper-parameters.

    Returns:
        The ROC AUC (positive label ``1``) computed over the concatenated
        test predictions of all folds.

    Raises:
        ValueError: if ``cv_generator()`` yields no folds.
    """
    # Hyper-parameter search space sampled through Optuna.
    param = {
        #'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        #'num_threads': trial.suggest_int('num_threads',5, 10),
        #'min_sum_hessian_in_leaf': trial.suggest_int('min_sum_hessian_in_leaf', 1, 10),
        #'reg_alpha': trial.suggest_uniform('reg_alpha', 0., 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0., 1.0),
        #'class_weight': {str(class_name): 'balanced' for class_name in df.drop("year", axis=1).columns}
    }

    # Result recorded from an earlier run of this search, kept for reference:
    #   Number of finished trials: 100
    #   Best trial:
    #     Value: 0.7737390814313891
    #     Params:
    #       bagging_freq: 4
    #       min_data_in_leaf: 21
    #       max_depth: 13
    #       learning_rate: 0.08731913651405197
    #       n_estimators: 3394
    #       subsample: 0.7054763057027115
    #       num_leaves: 438
    #       reg_lambda: 0.9377125325944119

    lightgbm_tuna = lgb.LGBMRegressor(
        random_state=0,
        verbosity=-1,
        bagging_seed=0,
        boost_from_average='true',
        metric='auc',
        **param,
    )

    # Collect per-fold arrays and concatenate once at the end instead of
    # re-copying the growing arrays with np.hstack on every iteration.
    fold_predictions = []
    fold_targets = []

    for (train_x, train_y, test_x, test_y) in cv_generator():
        # The same estimator object is refit on each fold's training data.
        lightgbm_tuna.fit(train_x, train_y)
        fold_predictions.append(np.ravel(lightgbm_tuna.predict(test_x)))
        fold_targets.append(np.ravel(test_y))

    if not fold_predictions:
        raise ValueError("cv_generator() yielded no folds; nothing to evaluate")

    pred_y_all = np.concatenate(fold_predictions)
    y_true_all = np.concatenate(fold_targets)

    # AUC is computed once over the pooled predictions, not averaged per fold.
    fpr, tpr, _ = metrics.roc_curve(y_true_all, pred_y_all, pos_label=1)

    return auc(fpr, tpr)

In [5]:
def main():
    """Run a 100-trial Optuna search that maximises ``objective`` and report it.

    Prints the number of finished trials, followed by the best trial's value
    and each of its sampled parameters. Returns nothing.
    """
    search = optuna.create_study(direction='maximize')
    search.optimize(objective, n_trials=100)

    finished_count = len(search.trials)
    print('Number of finished trials: {}'.format(finished_count))

    print('Best trial:')
    best = search.best_trial

    print('  Value: {}'.format(best.value))

    print('  Params: ')
    chosen = best.params
    for name in chosen:
        print('    {}: {}'.format(name, chosen[name]))

# Launch the Optuna search defined in main() and print the best trial found.
main()

[I 2019-08-22 12:59:41,369] Finished trial#0 resulted in value: 0.3696111580726965. Current best value is 0.3696111580726965 with parameters: {'bagging_freq': 6, 'min_data_in_leaf': 82, 'max_depth': 1, 'learning_rate': 0.021191996812785448, 'n_estimators': 3776, 'subsample': 0.9036629282884031, 'num_leaves': 891, 'reg_lambda': 0.5935000410640767}.
[I 2019-08-22 12:59:49,013] Finished trial#1 resulted in value: 0.397083685545224. Current best value is 0.397083685545224 with parameters: {'bagging_freq': 3, 'min_data_in_leaf': 42, 'max_depth': 9, 'learning_rate': 0.015695824165448728, 'n_estimators': 1000, 'subsample': 0.9089248001669274, 'num_leaves': 724, 'reg_lambda': 0.05248430232358636}.
[I 2019-08-22 12:59:49,656] Finished trial#2 resulted in value: 0.4971118624964778. Current best value is 0.4971118624964778 with parameters: {'bagging_freq': 1, 'min_data_in_leaf': 86, 'max_depth': 13, 'learning_rate': 0.01198060957039323, 'n_estimators': 123, 'subsample': 0.5056426241252854, 'num_l

Number of finished trials: 100
Best trial:
  Value: 0.7737390814313891
  Params: 
    bagging_freq: 4
    min_data_in_leaf: 21
    max_depth: 13
    learning_rate: 0.08731913651405197
    n_estimators: 3394
    subsample: 0.7054763057027115
    num_leaves: 438
    reg_lambda: 0.9377125325944119
