In [1]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn import metrics
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Set to True to send the tuning result to Slack (see the `slack.notify` call
# in the reporting cell); leave False to run without any Slack dependency.
slack_notify=False

if slack_notify:
    # Slack-only imports are kept inside this branch so they are required
    # only when notifications are switched on.
    import sys
    # Extend the import path with the parent directory before importing
    # `slack_url` (presumably that module lives there — confirm).
    sys.path.append("../")
    import slackweb
    from slack_url import slack_url
    # Client object; only defined when slack_notify is True.
    slack = slackweb.Slack(url=slack_url())

In [20]:
# loading data
# Read the feature table, then split it on the day index:
#   date_id <= 1913          -> train
#   1914 <= date_id <= 1941  -> test (both bounds inclusive)
data_ex = pd.read_csv('./data/backup/data_ex2_light.csv')
day_index = data_ex["date_id"]
train = data_ex[day_index <= 1913]
test = data_ex[day_index.between(1914, 1941)]

In [18]:
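# One-off export that wrote the "light" CSV loaded above: only rows with 1548 <= date_id <= 1941 are kept.
# data_ex[(1548 <= data_ex["date_id"]) & (data_ex["date_id"] <= 1941)].to_csv('./data/backup/data_ex2_light.csv', index=False)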

In [22]:
# Report the in-memory size of each frame in decimal gigabytes (bytes / 1000^3),
# one line per object, in the order listed below.
# Names are looked up defensively: `subm` is not created by any cell visible in
# this notebook, so an undefined name is skipped instead of raising NameError.
for _frame_name in ("data_ex", "train", "test", "subm"):
    if _frame_name in globals():
        print(f"{sys.getsizeof(globals()[_frame_name])/1000/1000/1000} GB")

5.052784092 GB
4.781747627999999 GB
0.366881216 GB
1.5999999999999998e-08 GB


In [7]:
# Reference run with the full data_ex2.csv loaded (the output below is kept for comparison with the light file):
# print(f"{sys.getsizeof(data_ex)/1000/1000/1000} GB")
# print(f"{sys.getsizeof(train)/1000/1000/1000} GB")
# print(f"{sys.getsizeof(test)/1000/1000/1000} GB")
# print(f"{sys.getsizeof(subm)/1000/1000/1000} GB")

20.133087609999997 GB
19.781208258000003 GB
0.366881216 GB
0.366881216 GB


In [23]:
# define features and params
# Model input columns, written group by group and concatenated in this order.
# NOTE(review): 'date', 'item_id' and the event_name_* columns must already be
# numeric/categorical for LightGBM to accept them — confirm upstream encoding.
_id_and_calendar_cols = [
    'item_id', 'date_id', 'date', 'wm_yr_wk', 'wday', 'month', 'year',
]
_event_name_cols = ['event_name_1', 'event_name_2']
_snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
_price_cols = ['sell_price']
_dept_cols = [
    'dept_id_FOODS_1', 'dept_id_FOODS_2', 'dept_id_FOODS_3',
    'dept_id_HOBBIES_1', 'dept_id_HOBBIES_2',
    'dept_id_HOUSEHOLD_1', 'dept_id_HOUSEHOLD_2',
]
_cat_cols = ['cat_id_FOODS', 'cat_id_HOBBIES', 'cat_id_HOUSEHOLD']
_store_cols = [
    'store_id_CA_1', 'store_id_CA_2', 'store_id_CA_3', 'store_id_CA_4',
    'store_id_TX_1', 'store_id_TX_2', 'store_id_TX_3',
    'store_id_WI_1', 'store_id_WI_2', 'store_id_WI_3',
]
_state_cols = ['state_id_CA', 'state_id_TX', 'state_id_WI']
_event_type_cols = [
    'event_type_1_Cultural', 'event_type_1_National',
    'event_type_1_Religious', 'event_type_1_Sporting',
    'event_type_1_nan',
]

features = (
    _id_and_calendar_cols
    + _event_name_cols
    + _snap_cols
    + _price_cols
    + _dept_cols
    + _cat_cols
    + _store_cols
    + _state_cols
    + _event_type_cols
)

In [24]:
# Baseline LightGBM settings.
# NOTE(review): the hyperopt cells pass their own sampled dict to lgb.train, so
# this dict is not referenced anywhere else in the visible notebook — confirm
# whether it was meant to seed the search space.
params = dict(
    # task definition
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    # runtime / reproducibility
    n_jobs=-1,
    seed=236,
    # boosting and sampling
    learning_rate=0.1,
    bagging_fraction=0.75,
    bagging_freq=10,
    colsample_bytree=0.75,
    # regularisation and tree shape
    lambda_l1=0.0,
    lambda_l2=1.0,
    min_data_in_leaf=1,
    max_depth=5,
)

# params = {
#     #'boosting_type': 'gbdt',
#     #'metric': 'rmse',
#     #'objective': 'regression',
#     #'learning_rate': 0.1,
#     'gamma': 0.0,
#     'lambda_l1': 0.0,
#     'lambda_l2': 1.0,
#     'min_data_in_leaf': 1,
#     'max_depth': 5,
#     #'bagging_fraction': 0.8,
#     #'colsample_bytree': 0.8,
#     #'seed': 71,
# }

In [25]:
# dataset
def _to_lgb_dataset(frame):
    """Wrap ``frame`` as a LightGBM Dataset: ``features`` columns as data, ``num`` as label."""
    # free_raw_data=False is passed through unchanged for both splits.
    return lgb.Dataset(data=frame[features], label=frame['num'], free_raw_data=False)


train_set = _to_lgb_dataset(train)
test_set = _to_lgb_dataset(test)

In [26]:
# for train
def score(params):
    """Hyperopt objective: train LightGBM with one sampled parameter set and return its RMSE on ``test``.

    Args:
        params: dict of LightGBM parameters sampled from the search space.
            ``min_data_in_leaf`` and ``max_depth`` may arrive as floats
            (``hp.quniform`` produces values such as ``7.0``) and are cast to
            ``int`` before training. The caller's dict is not modified.

    Returns:
        dict with ``"loss"`` (RMSE of the predictions against ``test["num"]``)
        and ``"status"`` (``STATUS_OK``).

    Side effects:
        Prints the parameters used, and appends ``(params_used, rmse)`` to the
        module-level ``history`` list, which must exist before the call.
    """
    # Work on a copy so the int casts below do not leak into the caller's dict.
    trial_params = dict(params)
    for int_key in ("min_data_in_leaf", "max_depth"):
        trial_params[int_key] = int(trial_params[int_key])

    # Log names *and* values so each training log can be tied to its trial.
    for name, value in trial_params.items():
        print(f"{name}: {value}")

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer ones expect callbacks
    # instead — confirm against the installed version.
    model = lgb.train(
        trial_params,
        train_set,
        num_boost_round=10000,
        early_stopping_rounds=50,
        valid_sets=[train_set, test_set],
        valid_names=["train", "test"],
        verbose_eval=100,
    )

    pred = model.predict(test[features])
    # Arguments follow the documented (y_true, y_pred) order.
    rmse = np.sqrt(metrics.mean_squared_error(test["num"], pred))
    history.append((trial_params, rmse))
    return {"loss": rmse, "status": STATUS_OK}

In [27]:
# explore range for params
# (low, high, step) for every hyperparameter sampled with hp.quniform.
_quniform_ranges = {
    'min_data_in_leaf': (0, 100, 10),
    'max_depth': (3, 9, 1),
    'bagging_fraction': (0.6, 0.95, 0.05),
    'colsample_bytree': (0.6, 0.95, 0.05),
}

param_space = {
    label: hp.quniform(label, *bounds)
    for label, bounds in _quniform_ranges.items()
}
# Settings held constant across all trials.
param_space['metrics'] = 'rmse'
param_space['bagging_freq'] = 10

# Candidates that are not searched yet:
# 'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
# If time allows, tune lambda_l1 and lambda_l2 as well:
# 'lambda_l1' : hp.loguniform('lambda_l1', np.log(1e-8), np.log(1.0)),
# 'lambda_l2' : hp.loguniform('lambda_l2', np.log(1e-6), np.log(10.0))


In [None]:
# params exploring by hyperopt
# `history` is filled by score() with one (params, loss) tuple per evaluation;
# `trials` is hyperopt's own record of the same runs.
history = []
trials = Trials()
max_evals = 10
fmin(
    fn=score,
    space=param_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
)

In [None]:
# get params and score
# Order the evaluated trials by loss (ascending); the first entry is the winner.
history = sorted(history, key=lambda trial: trial[1])
best = history[0]
report = f"best params:{best[0]}, score:{best[1]:.4f}"
print(report)
if slack_notify:
    # Same summary line, prefixed with the completion banner.
    slack.notify(text=f"*params_tuning.py has finished!:*\n{report}")

param_space = {
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 0, 100, 10),
    'max_depth': hp.quniform('max_depth', 3, 9, 1),
    'bagging_fraction': hp.quniform('bagging_fraction', 0.6, 0.95, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05)}
With the search space above, the best parameters found were:
{'bagging_fraction': 0.8,
 'colsample_bytree': 0.9,
 'max_depth': 7.0,
 'min_data_in_leaf': 0.0}