In [1]:
import math
import os
import re
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import hyperopt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

%matplotlib inline

In [4]:
DATA_DIR = '/Users/keiji/work/kaggle/sales1c/'

def read(file_name):
    pickle = DATA_DIR + file_name + '.pickle'
    if os.path.exists(pickle):
        return pd.read_pickle(pickle)
    df = pd.read_csv(DATA_DIR + file_name)
    df.to_pickle(pickle)
    return df

df_icats = read('item_categories.csv')
df_items = read('items.csv')
df_shops = read('shops.csv')
df_test = read('test.csv.gz')
df_sales = read('sales_train.csv.gz')

X_train = read('X_train')
y_train = read('y_train')
X_val = read('X_val')
y_val = read('y_val')

In [7]:
def clip(s):
    return s.map(lambda x: max(0.0, min(20.0, x)))

def rmse(x, y):
    return math.sqrt(mean_squared_error(clip(x), clip(y)))

# Hyperparameter tuning

In [8]:
objective_cnt = 0
min_rmse = 10
def objective(params):
    global min_rmse
    global objective_cnt
    objective_cnt += 1

    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    
    model = LGBMRegressor(random_state=42, n_jobs=-1, **params)
    
    model.fit(X_train, clip_train(y_train), early_stopping_rounds=200, eval_set=[(X_val, clip_train(y_val))], verbose=False)
    y_pred = pd.Series(model.predict(X_val))
    score = rmse(y_val, y_pred)
    if score < min_rmse:
        min_rmse = score
        print("RMSE #{}: {:.6f}: params={}".format(objective_cnt, score, params))
    return -score

space = {
    'num_leaves': hyperopt.hp.quniform('num_leaves', 50, 200, 10),
    'max_depth': hyperopt.hp.quniform('max_depth', 3, 10, 1),
    'min_data_in_leaf': hyperopt.hp.quniform('min_data_in_leaf',  5, 25, 2),
    'colsample_bytree': hyperopt.hp.uniform('colsample_bytree', 0.5, 1.0),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 0.03, 0.9),
    'subsample': hyperopt.hp.uniform('subsample', 0.5, 1.0),
    'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0.0, 50.0),
}

best_params = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=2)
best_params

RMSE #1: 0.925480: params={'num_leaves': 60, 'subsample': 0.7433111808365149, 'learning_rate': 0.21907777859246094, 'max_depth': 7, 'colsample_bytree': 0.7629884602449744, 'reg_lambda': 6.4721944579138935, 'min_data_in_leaf': 10}


{'colsample_bytree': 0.7337997721976569,
 'learning_rate': 0.20632038461069757,
 'max_depth': 10.0,
 'min_data_in_leaf': 6.0,
 'num_leaves': 180.0,
 'reg_lambda': 7.643714160645771,
 'subsample': 0.9483551739837238}