In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger
logger = getLogger(__name__)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import log_loss, roc_auc_score
import lightgbm as lgb
import optuna

from LoadDate import loadTrainData, loadTestData

In [2]:
LogDir = 'logs/'
SubmitDir = 'data/'
SampleSubmitFile = SubmitDir + 'sample_submission.csv'

In [3]:
def eval_gini(y_true, y_prob):
    '''
    gini係数を計算する関数
    '''
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [5]:
def objective(trial):
    '''
    Optunaを使用してパラメータチューニング
    '''
    # クロスバリデーション
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    drop_rate = trial.suggest_uniform('drop_rate', 0, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0, 1.0)
    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.8, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 5, 1000)
    verbosity = trial.suggest_int('verbosity', -1, 1)
    num_boost_round = trial.suggest_int('num_boost_round', 10, 100000)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 1000)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 500)
    min_child_weight = trial.suggest_int('min_child_weight', 5, 500)

    params = {"objective": "binary",
              "boosting_type": "gbdt",
 #             "learning_rate": learning_rate
 #             "num_leaves": num_leaves,
 #             "max_bin": 256,
 #             "feature_fraction": feature_fraction,
 #             "verbosity": verbosity,
 #             "drop_rate": drop_rate,
 #             "is_unbalance": False,
 #             "max_drop": 50,
 #             "min_child_samples": min_child_samples,
 #             "min_child_weight": min_child_weight,
 #             "min_split_gain": 0,
 #             "num_boost_round": num_boost_round,
              "min_data_in_leaf": min_data_in_leaf,
 #             "subsample": subsample
              }
    
    # トレーニングデータと検証用データに分割する
    for trainIdx, validIdx in cv.split(xTrain, yTrain):
        # インデックスが返ってくるため、ilocで行を特定する
        # トレーニング用
        trn_x = xTrain.iloc[trainIdx, :]
        # 検証用
        val_x = xTrain.iloc[validIdx, :]

        # トレーニング用
        trn_y = yTrain[trainIdx]
        # 検証用
        val_y = yTrain[validIdx]

        # **変数名で、キーワードargsとして渡せる
        clf = lgb.LGBMClassifier(**params)
        # トレーニングデータでモデル作成
        clf.fit(trn_x, trn_y, verbose=False)

        # 検証用データで予測する
        pred = clf.predict_proba(val_x)[:, 1]
        # Optunaが最小化の最適化を行うため符号を反転する
        scGini = - eval_gini(val_y, pred)

        return(scGini)

In [9]:
df = loadTrainData()

In [10]:
xTrain = df.drop('target', axis=1)
yTrain = df['target'].values

In [11]:
study = optuna.create_study()
study.optimize(objective, n_trials=1)

[I 2019-03-02 22:46:43,410] Finished a trial resulted in value: -0.27540932154206366. Current best value is -0.27540932154206366 with parameters: {'drop_rate': 0.07053069513198229, 'feature_fraction': 0.37059684629635137, 'learning_rate': 0.33655180084643166, 'subsample': 0.9589796585607453, 'num_leaves': 632, 'verbosity': -1, 'num_boost_round': 94697, 'min_data_in_leaf': 52, 'min_child_samples': 99, 'min_child_weight': 488}.


In [12]:
study.best_params

{'drop_rate': 0.07053069513198229,
 'feature_fraction': 0.37059684629635137,
 'learning_rate': 0.33655180084643166,
 'subsample': 0.9589796585607453,
 'num_leaves': 632,
 'verbosity': -1,
 'num_boost_round': 94697,
 'min_data_in_leaf': 52,
 'min_child_samples': 99,
 'min_child_weight': 488}

In [None]:
# **変数名で、キーワードargsとして渡せる
clf = lgb.LGBMClassifier(**study.best_params)
# トレーニングデータでモデル作成
clf.fit(xTrain, yTrain, verbose=False)

