In [None]:
import json
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import lightgbm as lgb
import sklearn.model_selection as model_selection

sns.set_style('darkgrid')
pd.set_option('display.max_colwidth', None)

# Utils

In [None]:
def rmlse(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

# Load Data

In [None]:
apartments = pd.read_csv('resources/data/apartments_train.csv')
buildings = pd.read_csv('resources/data/buildings_train.csv')
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)

apartments_test = pd.read_csv('resources/data/apartments_test.csv')
buildings_test = pd.read_csv('resources/data/buildings_test.csv')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)

# LightGBM
I try some hyperparameter tuning with optuna, but I am not able to use the rmlse, which is strange. Need to experiment more with this.

In [85]:
import optuna 
from sklearn.model_selection import KFold
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param = {
        'verbosity': 1,
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': 3000,
        #'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        #'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        #'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        #'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300)
        #'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMRegressor(objective="regression", **param)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=100
        )
        preds = model.predict(X_test)
        cv_scores[idx] = rmlse(y_test, preds)

    return np.mean(cv_scores)

In [86]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']
X = data[features]   
y = np.log(data['price'])

study = optuna.create_study(direction="minimize", study_name="LGBM Regressor")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 889
[LightGBM] [Info] Number of data points in the train set: 18628, number of used features: 5
[LightGBM] [Info] Start training from score 16.425089
[1]	valid_0's rmse: 0.881675
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.874066
[3]	valid_0's rmse: 0.866543
[4]	valid_0's rmse: 0.859105
[5]	valid_0's rmse: 0.85174
[6]	valid_0's rmse: 0.844454
[7]	valid_0's rmse: 0.837271
[8]	valid_0's rmse: 0.83015
[9]	valid_0's rmse: 0.823126
[10]	valid_0's rmse: 0.816168
[11]	valid_0's rmse: 0.809297
[12]	valid_0's rmse: 0.802489
[13]	valid_0's rmse: 0.795753
[14]	valid_0's rmse: 0.789092
[15]	valid_0's rmse: 0.782513
[16]	valid_0's rmse: 0.776036
[17]	valid_0's rmse: 0.769648
[18]	valid_0's rmse: 0.763322
[19]	valid_0's rmse: 0.757029
[20]	valid_0's rmse: 0.750819
[21]	valid_0's rmse: 0.7447
[22]	valid_0's rmse:

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 886
[LightGBM] [Info] Number of data points in the train set: 18628, number of used features: 5
[LightGBM] [Info] Start training from score 16.433117
[1]	valid_0's rmse: 0.86775
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.860189
[3]	valid_0's rmse: 0.852747
[4]	valid_0's rmse: 0.845356
[5]	valid_0's rmse: 0.838087
[6]	valid_0's rmse: 0.830851
[7]	valid_0's rmse: 0.823737
[8]	valid_0's rmse: 0.816634
[9]	valid_0's rmse: 0.809689
[10]	valid_0's rmse: 0.802779
[11]	valid_0's rmse: 0.795915
[12]	valid_0's rmse: 0.789128
[13]	valid_0's rmse: 0.782439
[14]	valid_0's rmse: 0.775809
[15]	valid_0's rmse: 0.769279
[16]	valid_0's rmse: 0.762797
[17]	valid_0's rmse: 0.756375
[18]	valid_0's rmse: 0.750047
[19]	valid_0's rmse: 0.743757
[20]	valid_0's rmse: 0.737595
[21]	valid_0's rmse: 0.73149
[22]	valid_0's rms

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 889
[LightGBM] [Info] Number of data points in the train set: 18628, number of used features: 5
[LightGBM] [Info] Start training from score 16.433689
[1]	valid_0's rmse: 0.848245
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.84094
[3]	valid_0's rmse: 0.83372
[4]	valid_0's rmse: 0.826589
[5]	valid_0's rmse: 0.819552
[6]	valid_0's rmse: 0.812553
[7]	valid_0's rmse: 0.805649
[8]	valid_0's rmse: 0.798807
[9]	valid_0's rmse: 0.792027
[10]	valid_0's rmse: 0.785358
[11]	valid_0's rmse: 0.778734
[12]	valid_0's rmse: 0.772187
[13]	valid_0's rmse: 0.76574
[14]	valid_0's rmse: 0.759341
[15]	valid_0's rmse: 0.753059
[16]	valid_0's rmse: 0.746805
[17]	valid_0's rmse: 0.740641
[18]	valid_0's rmse: 0.734514
[19]	valid_0's rmse: 0.728474
[20]	valid_0's rmse: 0.722518
[21]	valid_0's rmse: 0.716604
[22]	valid_0's rmse

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 885
[LightGBM] [Info] Number of data points in the train set: 18628, number of used features: 5
[LightGBM] [Info] Start training from score 16.428518
[1]	valid_0's rmse: 0.84053
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.83323
[3]	valid_0's rmse: 0.825995
[4]	valid_0's rmse: 0.818855
[5]	valid_0's rmse: 0.811808
[6]	valid_0's rmse: 0.804819
[7]	valid_0's rmse: 0.797932
[8]	valid_0's rmse: 0.7911
[9]	valid_0's rmse: 0.784289
[10]	valid_0's rmse: 0.777575
[11]	valid_0's rmse: 0.770963
[12]	valid_0's rmse: 0.764362
[13]	valid_0's rmse: 0.757921
[14]	valid_0's rmse: 0.751496
[15]	valid_0's rmse: 0.745152
[16]	valid_0's rmse: 0.738884
[17]	valid_0's rmse: 0.732701
[18]	valid_0's rmse: 0.726587
[19]	valid_0's rmse: 0.720506
[20]	valid_0's rmse: 0.714507
[21]	valid_0's rmse: 0.708585
[22]	valid_0's rmse: 0.702739
[23]	valid_0's rmse: 0.696931
[24]	valid_0's rmse: 0.6

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 886
[LightGBM] [Info] Number of data points in the train set: 18628, number of used features: 5
[LightGBM] [Info] Start training from score 16.431997
[1]	valid_0's rmse: 0.8459
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.838505
[3]	valid_0's rmse: 0.831215
[4]	valid_0's rmse: 0.823984
[5]	valid_0's rmse: 0.816915
[6]	valid_0's rmse: 0.809844
[7]	valid_0's rmse: 0.80287
[8]	valid_0's rmse: 0.795914
[9]	valid_0's rmse: 0.789043
[10]	valid_0's rmse: 0.782316
[11]	valid_0's rmse: 0.775603
[12]	valid_0's rmse: 0.768964
[13]	valid_0's rmse: 0.762461
[14]	valid_0's rmse: 0.755962
[15]	valid_0's rmse: 0.749598
[16]	valid_0's rmse: 0.743247
[17]	valid_0's rmse: 0.736926
[18]	valid_0's rmse: 0.730718
[19]	valid_0's rmse: 0.72456
[20]	valid_0's rmse: 0.718571
[21]	valid_0's rmse: 0.712554
[22]	valid_0's rmse: 0.706623
[23]	valid_0's rmse: 0.700772
[24]	valid_0's rmse: 0.6

In [87]:
print(f"\tBest value (rmlse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

   number     value             datetime_start          datetime_complete  \
0       0  0.009838 2021-10-10 14:12:14.181975 2021-10-10 14:13:43.648580   

                duration  params_learning_rate  params_max_depth  \
0 0 days 00:01:29.466605                  0.01                20   

   params_min_child_samples  params_num_leaves     state  
0                       242                604  COMPLETE  


0.009837794076996646

In [88]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']

X_train = data[features]
y_train = data.loc[X_train.index].price

lgb_mod = lgb.LGBMRegressor(
    random_state = 48,
    metric='rmse',
    n_estimators=20000,
    **study.best_params
)

lgb_mod.fit(X_train, np.log(y_train))

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=20,
              metric='rmse', min_child_samples=242, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=20000, n_jobs=-1, num_leaves=604,
              objective=None, random_state=48, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [75]:
preds_test = lgb_mod.predict(data_test[features])
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = np.exp(preds_test)
submission.to_csv('submissions/lgbm2_submission.csv', index=False)

In [28]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']

data_train, data_valid = model_selection.train_test_split(data, test_size=0.33, stratify=np.log(data.price).round())
X_train = data_train[features]
y_train = data_train.loc[X_train.index].price
X_valid = data_valid[features]
y_valid = data_valid.loc[X_valid.index].price


lgb_mod = lgb.LGBMRegressor(
    num_leaves=10,
    max_depth=5, 
    random_state=42, 
    silent=True, 
    metric='rmsle',
    n_jobs=4, 
    n_estimators=2000,
    colsample_bytree=0.95,
    subsample=0.9,
    learning_rate=0.05,
    seed = 42
)

lgb_mod.fit(X_train, np.log(y_train))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)




LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.95,
              importance_type='split', learning_rate=0.05, max_depth=5,
              metric='rmsle', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=2000, n_jobs=4, num_leaves=10,
              objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
              seed=42, silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

In [29]:
preds_valid = lgb_mod.predict(X_valid)
print('Validation RMSLE: ', rmlse(y_valid, np.exp(preds_valid)))

Validation RMSLE:  0.16431015822158682


In [37]:
preds_test = lgb_mod.predict(data_test[features])
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = np.exp(preds_test)
submission.to_csv('submissions/lgbm_submission.csv', index=False)


(9937, 33)
(9937,)


array([34783427.25415785,  7639045.30896283,  6145455.69914033, ...,
       10082195.97498193,  8750817.10927868,  6105413.72482848])