In [83]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Import packages for hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd

In [84]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is the square root of scikit-learn's ``mean_squared_error``
    evaluated on the two inputs.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [85]:
# Load the preprocessed train/test tables; the 'Id' column is dropped from both.
train_df = pd.read_csv('../preprocessing_data/preprocessed_train.csv').drop(columns='Id')
test_df = pd.read_csv('../preprocessing_data/preprocessed_test.csv').drop(columns='Id')

# Feature matrix: every training column except the target.
X = train_df.drop(columns=['SalePrice']).to_numpy(copy=True)
# The target is modelled on the log1p scale.
y = np.log1p(train_df['SalePrice'].to_numpy())
# Test features as a plain array.
X_test = test_df.to_numpy(copy=True)

In [86]:
# Hold out 20% of the training rows as a validation set; the seed is fixed so
# the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [87]:
n_folds = 5

def getRMSLE(model):
    """
    Cross-validate ``model`` on the training split and return the mean fold RMSE.

    The module-level ``X_train`` / ``y_train`` are used as data. ``y_train``
    comes from the log1p-transformed SalePrice, so the RMSE computed here is a
    root-mean-squared *log* error with respect to the raw prices.
    """
    # Shuffled K-fold splitter with a fixed seed.
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # cross_val_score reports the negated MSE for each fold.
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        cv=splitter,
        scoring="neg_mean_squared_error",
    )

    # Flip the sign, take the root per fold, then average across folds.
    fold_errors = np.sqrt(-neg_mse_per_fold)
    return fold_errors.mean()

In [88]:
# Shuffled 5-fold splitter with a fixed seed, for k-fold cross-validation.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

# 1. XGBRegressor (XGBR)

In [89]:
# Hyperopt search space for the XGBRegressor hyperparameters.
# Tree depth and number of trees are integer-valued (hp.randint); every other
# entry is a continuous uniform draw (hp.uniform) over the listed interval.
space = dict(
    max_depth=hp.randint('max_depth', 5, 10),
    min_child_weight=hp.uniform('min_child_weight', 1, 1.5),
    gamma=hp.uniform('gamma', 0, 1),
    subsample=hp.uniform('subsample', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    learning_rate=hp.uniform('learning_rate', 0.05, 0.3),
    reg_alpha=hp.uniform('reg_alpha', 0, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    n_estimators=hp.randint('n_estimators', 1000, 3000),
)

In [90]:
def objective(params):
    """Hyperopt objective: score one XGBRegressor configuration.

    A regressor is built from ``params`` (with a fixed seed), fitted on the
    module-level ``X_train`` / ``y_train`` and evaluated with ``rmse`` on
    ``X_valid`` / ``y_valid``.

    Returns a dict with the validation ``loss``, the ``params`` that produced
    it, and ``status`` set to ``STATUS_OK``.
    """
    model = XGBRegressor(seed=0, **params)
    model.fit(X_train, y_train)
    valid_pred = model.predict(X_valid)
    return {
        'loss': rmse(y_valid, valid_pred),
        'params': params,
        'status': STATUS_OK,
    }

In [91]:
# Stage 1: run 100 TPE trials of `objective` over the full `space`.
# `trials` records every evaluation; `best` receives the winning values.
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)

print('Best hyperparameters:', best)

100%|██████████| 100/100 [08:22<00:00,  5.03s/trial, best loss: 0.12192005941284328]
Best hyperparameters: {'colsample_bytree': 0.9781980756358969, 'gamma': 0.13136823018988167, 'learning_rate': 0.05493140226506563, 'max_depth': 8, 'min_child_weight': 1.4783695196881401, 'n_estimators': 2174, 'reg_alpha': 0.17155914762735375, 'reg_lambda': 0.2811705695969712, 'subsample': 0.386800026462253}


In [92]:
# Second-stage search space: every hyperparameter stays fixed at its tuned
# value and only the learning rate is re-sampled, between 0.0001 and the value
# found by the first search.
#
# The space is built as a *new* dict. Plain assignment (`= best`) would only
# alias `best`, so storing the hyperopt expression would overwrite
# `best['learning_rate']` itself, and running this cell a second time would
# pass that expression (not a float) as the upper bound of hp.uniform.
best_learning_rate = {
    **best,
    'learning_rate': hp.uniform('learning_rate', 0.0001, best['learning_rate']),
}

In [93]:
# Stage 2: run 100 TPE trials over the learning-rate-only space.
# Note that `best_learning_rate` is the search space going in and is rebound
# to the fmin result coming out; `trials` is replaced by a fresh Trials object.
trials = Trials()
best_learning_rate = fmin(
    objective,
    best_learning_rate,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

print('Best hyperparameters:', best_learning_rate)

100%|██████████| 100/100 [10:48<00:00,  6.48s/trial, best loss: 0.11905596699644086]
Best hyperparameters: {'learning_rate': 0.047550847646322746}


In [94]:
# Write the learning rate found by the second search back into `best`.
best.update(learning_rate=best_learning_rate['learning_rate'])

In [95]:
# Refit XGBoost on the training split with the tuned hyperparameters. The seed
# is the one `objective` passes during the search, so the model evaluated here
# is configured the same way as the models that were scored while tuning.
xgb = XGBRegressor(seed=0, **best)
xgb.fit(X_train, y_train)

# Reference values for the test rows, flattened to a 1-D array.
# NOTE(review): presumably the true SalePrice for each test row, in the same
# row order as the preprocessed test file; confirm against how cheat.csv is built.
ans = pd.read_csv('../submission/cheat.csv').drop(columns='Id').to_numpy().ravel()

# The model was fitted on log1p(SalePrice), so its predictions are compared
# with the log1p of the reference values.
y_pred = xgb.predict(X_test)
print(f"Error: {rmse(np.log1p(ans), y_pred)}")

Error: 0.1317009436603947
