In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed training and test tables.
train_df = pd.read_csv('../preprocessing_data/train.csv')
test_df = pd.read_csv('../preprocessing_data/test.csv')

# Feature matrix: every training column except the target.
X = np.array(train_df.drop(columns=['SalePrice']).to_numpy())
# Target vector: the 'SalePrice' column on its own.
y = np.array(train_df['SalePrice'].to_numpy())
# Test features: all columns of the test table, used as-is.
X_test = np.array(test_df.to_numpy())

In [3]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import get_scorer, mean_squared_error
import time

In [4]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values, passed to ``mean_squared_error`` alongside ``y_true``.

    Returns
    -------
    The square root (via ``np.sqrt``) of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [5]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
def rid_cl_bo(num_leaves, learning_rate, n_estimators, max_bin,
              feature_fraction_seed, bagging_seed):
    """Objective function for the Bayesian search over LightGBM settings.

    Builds an ``lgb.LGBMRegressor`` (``objective='regression'``) from the
    proposed values, fits it on the notebook-level ``X_train``/``y_train`` and
    scores it on ``X_valid``/``y_valid``.

    Parameters
    ----------
    num_leaves, n_estimators, max_bin, feature_fraction_seed, bagging_seed : float
        Proposed values for integer-valued settings. Each is rounded to the
        nearest integer and converted to a plain Python ``int`` before being
        handed to the regressor.
    learning_rate : float
        Passed through unchanged.

    Returns
    -------
    float
        The negated validation RMSE, so that a maximiser drives the error down.
    """
    def _as_int(value):
        # round() on a NumPy float is not guaranteed to yield a built-in int
        # on every NumPy version; the explicit cast makes the type certain.
        return int(round(value))

    # NOTE(review): the two *_seed entries are searched like ordinary
    # hyperparameters; presumably they only influence the model when feature /
    # bagging subsampling is enabled -- confirm against the LightGBM docs.
    params = {
        'num_leaves': _as_int(num_leaves),
        'learning_rate': learning_rate,
        'n_estimators': _as_int(n_estimators),
        'max_bin': _as_int(max_bin),
        'feature_fraction_seed': _as_int(feature_fraction_seed),
        'bagging_seed': _as_int(bagging_seed),
    }

    model = lgb.LGBMRegressor(objective='regression', **params)
    model.fit(X_train, y_train)

    # Negate: the search maximises its target, while RMSE should be minimised.
    return -rmse(y_valid, model.predict(X_valid))

# Bayesian search over the LightGBM settings, timed end to end.
start = time.time()

# (low, high) search bounds, one entry per argument of rid_cl_bo.
# The two seed arguments are searched exactly like the other settings.
params_lgbm = dict(
    num_leaves=(8, 10),
    learning_rate=(0.001, 0.02),
    n_estimators=(1000, 1200),
    max_bin=(50, 70),
    feature_fraction_seed=(15, 25),
    bagging_seed=(5, 8),
)

lgbm_bo = BayesianOptimization(
    f=rid_cl_bo,
    pbounds=params_lgbm,
    random_state=111,
)
# init_points=20 initial evaluations followed by n_iter=4 further iterations.
lgbm_bo.maximize(init_points=20, n_iter=4)

print('It takes %s minutes' % ((time.time() - start) / 60))

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------
| 1         | -0.1328   | 6.837     | 16.69     | 0.009285  | 65.39     | 1.059e+03 | 8.298     |
| 2         | -0.1348   | 5.067     | 19.2      | 0.005535  | 56.75     | 1.198e+03 | 8.475     |
| 3         | -0.1309   | 5.244     | 21.7      | 0.0128    | 55.49     | 1.093e+03 | 8.237     |
| 4         | -0.1337   | 5.222     | 24.01     | 0.01609   | 66.81     | 1.163e+03 | 9.982     |
| 5         | -0.1306   | 6.732     | 23.14     | 0.009005  | 50.55     | 1.091e+03 |

In [7]:
# Fit a regressor with hand-entered settings and report its RMSE on the
# held-out validation split (the bare last expression is the cell's output).
lgbm = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=8,
    learning_rate=0.015,
    n_estimators=1033,
    max_bin=62,
    feature_fraction_seed=25,
    bagging_seed=5,
)
lgbm.fit(X_train, y_train)
rmse(y_valid, lgbm.predict(X_valid))

0.13039814718287585

In [8]:
# Reference values: everything in cheat.csv except the 'Id' column,
# flattened into a 1-D array.
ans = pd.read_csv('../submission/cheat.csv').drop(columns='Id')
ans = np.array(ans.to_numpy()).reshape(-1)

# Predict on the test features and compare with log1p of the reference values
# (presumably the model was trained on log1p-transformed prices -- confirm).
y_pred = lgbm.predict(X_test)
print(f"Error: {rmse(np.log1p(ans), y_pred)}")

Error: 0.12960138973508817
