In [None]:
import json
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import lightgbm as lgb
import sklearn.model_selection as model_selection

sns.set_style('darkgrid')
pd.set_option('display.max_colwidth', None)

# Utils

In [None]:
def rmlse(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all() 
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

# Load Data

In [None]:
apartments = pd.read_csv('resources/data/apartments_train.csv')
buildings = pd.read_csv('resources/data/buildings_train.csv')
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)

apartments_test = pd.read_csv('resources/data/apartments_test.csv')
buildings_test = pd.read_csv('resources/data/buildings_test.csv')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)

# LightGBM
I try some hyperparameter tuning with optuna, but I am not able to use the rmlse, which is strange. Need to experiment more with this.

In [None]:
import optuna 
from sklearn.model_selection import KFold
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param = {
        'metric': 'rmse', 
        'random_state': 48,
        'n_estimators': 20000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMRegressor(objective="regression", **param)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=100
        )
        preds = model.predict(X_test)
        cv_scores[idx] = rmlse(y_test, preds)

    return np.mean(cv_scores)

In [70]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']
X = data[features]   
y = np.log(data['price'])

study = optuna.create_study(direction="minimize", study_name="LGBM Regressor")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


[1]	valid_0's rmse: 0.878114
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.867018
[3]	valid_0's rmse: 0.856087
[4]	valid_0's rmse: 0.845321
[5]	valid_0's rmse: 0.834694
[6]	valid_0's rmse: 0.824212
[7]	valid_0's rmse: 0.813908
[8]	valid_0's rmse: 0.803717
[9]	valid_0's rmse: 0.793683
[10]	valid_0's rmse: 0.783764
[11]	valid_0's rmse: 0.774003
[12]	valid_0's rmse: 0.764412
[13]	valid_0's rmse: 0.754993
[14]	valid_0's rmse: 0.745685
[15]	valid_0's rmse: 0.736529
[16]	valid_0's rmse: 0.727507
[17]	valid_0's rmse: 0.718593
[18]	valid_0's rmse: 0.70977
[19]	valid_0's rmse: 0.701136
[20]	valid_0's rmse: 0.69262
[21]	valid_0's rmse: 0.684231
[22]	valid_0's rmse: 0.675921
[23]	valid_0's rmse: 0.667785
[24]	valid_0's rmse: 0.659708
[25]	valid_0's rmse: 0.651817
[26]	valid_0's rmse: 0.644024
[27]	valid_0's rmse: 0.636272
[28]	valid_0's rmse: 0.628703
[29]	valid_0's rmse: 0.621212
[30]	valid_0's rmse: 0.613829
[31]	valid_0's rmse: 0.606563
[32]	valid_0's rmse

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


[1]	valid_0's rmse: 0.864287
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.853328
[3]	valid_0's rmse: 0.842426
[4]	valid_0's rmse: 0.831797
[5]	valid_0's rmse: 0.821245
[6]	valid_0's rmse: 0.810846
[7]	valid_0's rmse: 0.800678
[8]	valid_0's rmse: 0.790596
[9]	valid_0's rmse: 0.780633
[10]	valid_0's rmse: 0.770878
[11]	valid_0's rmse: 0.761282
[12]	valid_0's rmse: 0.751774
[13]	valid_0's rmse: 0.74246
[14]	valid_0's rmse: 0.733267
[15]	valid_0's rmse: 0.724157
[16]	valid_0's rmse: 0.715278
[17]	valid_0's rmse: 0.706463
[18]	valid_0's rmse: 0.697804
[19]	valid_0's rmse: 0.689216
[20]	valid_0's rmse: 0.680744
[21]	valid_0's rmse: 0.672487
[22]	valid_0's rmse: 0.664279
[23]	valid_0's rmse: 0.656124
[24]	valid_0's rmse: 0.648115
[25]	valid_0's rmse: 0.640241
[26]	valid_0's rmse: 0.632543
[27]	valid_0's rmse: 0.624884
[28]	valid_0's rmse: 0.617381
[29]	valid_0's rmse: 0.609978
[30]	valid_0's rmse: 0.602623
[31]	valid_0's rmse: 0.595477
[32]	valid_0's rms

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


[1]	valid_0's rmse: 0.844773
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.834072
[3]	valid_0's rmse: 0.823523
[4]	valid_0's rmse: 0.813105
[5]	valid_0's rmse: 0.802801
[6]	valid_0's rmse: 0.792655
[7]	valid_0's rmse: 0.782667
[8]	valid_0's rmse: 0.772795
[9]	valid_0's rmse: 0.763037
[10]	valid_0's rmse: 0.753451
[11]	valid_0's rmse: 0.744038
[12]	valid_0's rmse: 0.734731
[13]	valid_0's rmse: 0.725546
[14]	valid_0's rmse: 0.716481
[15]	valid_0's rmse: 0.707635
[16]	valid_0's rmse: 0.698905
[17]	valid_0's rmse: 0.690241
[18]	valid_0's rmse: 0.68177
[19]	valid_0's rmse: 0.673413
[20]	valid_0's rmse: 0.665159
[21]	valid_0's rmse: 0.65699
[22]	valid_0's rmse: 0.648996
[23]	valid_0's rmse: 0.641067
[24]	valid_0's rmse: 0.633272
[25]	valid_0's rmse: 0.625565
[26]	valid_0's rmse: 0.618005
[27]	valid_0's rmse: 0.610651
[28]	valid_0's rmse: 0.603364
[29]	valid_0's rmse: 0.596164
[30]	valid_0's rmse: 0.589042
[31]	valid_0's rmse: 0.582012
[32]	valid_0's rmse

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


[1]	valid_0's rmse: 0.837008
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.826249
[3]	valid_0's rmse: 0.815675
[4]	valid_0's rmse: 0.805204
[5]	valid_0's rmse: 0.794941
[6]	valid_0's rmse: 0.784839
[7]	valid_0's rmse: 0.774823
[8]	valid_0's rmse: 0.76496
[9]	valid_0's rmse: 0.755287
[10]	valid_0's rmse: 0.745776
[11]	valid_0's rmse: 0.736356
[12]	valid_0's rmse: 0.727123
[13]	valid_0's rmse: 0.717981
[14]	valid_0's rmse: 0.70892
[15]	valid_0's rmse: 0.700001
[16]	valid_0's rmse: 0.691233
[17]	valid_0's rmse: 0.682568
[18]	valid_0's rmse: 0.674052
[19]	valid_0's rmse: 0.665652
[20]	valid_0's rmse: 0.657377
[21]	valid_0's rmse: 0.649216
[22]	valid_0's rmse: 0.641191
[23]	valid_0's rmse: 0.633244
[24]	valid_0's rmse: 0.625428
[25]	valid_0's rmse: 0.617714
[26]	valid_0's rmse: 0.610109
[27]	valid_0's rmse: 0.602627
[28]	valid_0's rmse: 0.595249
[29]	valid_0's rmse: 0.588005
[30]	valid_0's rmse: 0.580869
[31]	valid_0's rmse: 0.573821
[32]	valid_0's rmse

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


[1]	valid_0's rmse: 0.842448
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 0.831756
[3]	valid_0's rmse: 0.821138
[4]	valid_0's rmse: 0.810704
[5]	valid_0's rmse: 0.800455
[6]	valid_0's rmse: 0.790323
[7]	valid_0's rmse: 0.780363
[8]	valid_0's rmse: 0.770523
[9]	valid_0's rmse: 0.760809
[10]	valid_0's rmse: 0.75125
[11]	valid_0's rmse: 0.741812
[12]	valid_0's rmse: 0.732546
[13]	valid_0's rmse: 0.723376
[14]	valid_0's rmse: 0.714348
[15]	valid_0's rmse: 0.705432
[16]	valid_0's rmse: 0.696671
[17]	valid_0's rmse: 0.688065
[18]	valid_0's rmse: 0.679546
[19]	valid_0's rmse: 0.671133
[20]	valid_0's rmse: 0.662886
[21]	valid_0's rmse: 0.654746
[22]	valid_0's rmse: 0.646688
[23]	valid_0's rmse: 0.638742
[24]	valid_0's rmse: 0.630958
[25]	valid_0's rmse: 0.623232
[26]	valid_0's rmse: 0.615612
[27]	valid_0's rmse: 0.608173
[28]	valid_0's rmse: 0.600811
[29]	valid_0's rmse: 0.593552
[30]	valid_0's rmse: 0.586402
[31]	valid_0's rmse: 0.579379
[32]	valid_0's rms

In [76]:
print(study.trials_dataframe())
study.best_params
study.best_value

   number     value             datetime_start          datetime_complete  \
0       0  0.008677 2021-10-07 17:44:55.773620 2021-10-07 17:48:07.514163   

                duration  params_colsample_bytree  params_learning_rate  \
0 0 days 00:03:11.740543                      1.0                 0.014   

   params_max_depth  params_min_child_samples  params_min_data_per_groups  \
0               100                         9                          49   

   params_num_leaves  params_reg_alpha  params_reg_lambda  params_subsample  \
0                822          0.047569           1.780968               0.4   

      state  
0  COMPLETE  


0.008677403446608733

In [74]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']

X_train = data[features]
y_train = data.loc[X_train.index].price

lgb_mod = lgb.LGBMRegressor(
    random_state = 48,
    metric='rmse',
    n_estimators=20000,
    **study.best_params
)

lgb_mod.fit(X_train, np.log(y_train))



LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.014, max_depth=100,
              metric='rmse', min_child_samples=9, min_child_weight=0.001,
              min_data_per_groups=49, min_split_gain=0.0, n_estimators=20000,
              n_jobs=-1, num_leaves=822, objective=None, random_state=48,
              reg_alpha=0.04756900410153506, reg_lambda=1.7809676540061132,
              silent=True, subsample=0.4, subsample_for_bin=200000,
              subsample_freq=0)

In [75]:
preds_test = lgb_mod.predict(data_test[features])
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = np.exp(preds_test)
submission.to_csv('submissions/lgbm2_submission.csv', index=False)

In [28]:
features = ['latitude', 'longitude', 'constructed', 'area_total', 'rooms']

data_train, data_valid = model_selection.train_test_split(data, test_size=0.33, stratify=np.log(data.price).round())
X_train = data_train[features]
y_train = data_train.loc[X_train.index].price
X_valid = data_valid[features]
y_valid = data_valid.loc[X_valid.index].price


lgb_mod = lgb.LGBMRegressor(
    num_leaves=10,
    max_depth=5, 
    random_state=42, 
    silent=True, 
    metric='rmsle',
    n_jobs=4, 
    n_estimators=2000,
    colsample_bytree=0.95,
    subsample=0.9,
    learning_rate=0.05,
    seed = 42
)

lgb_mod.fit(X_train, np.log(y_train))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)




LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.95,
              importance_type='split', learning_rate=0.05, max_depth=5,
              metric='rmsle', min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=2000, n_jobs=4, num_leaves=10,
              objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
              seed=42, silent=True, subsample=0.9, subsample_for_bin=200000,
              subsample_freq=0)

In [29]:
preds_valid = lgb_mod.predict(X_valid)
print('Validation RMSLE: ', rmlse(y_valid, np.exp(preds_valid)))

Validation RMSLE:  0.16431015822158682


In [37]:
preds_test = lgb_mod.predict(data_test[features])
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = np.exp(preds_test)
submission.to_csv('submissions/lgbm_submission.csv', index=False)


(9937, 33)
(9937,)


array([34783427.25415785,  7639045.30896283,  6145455.69914033, ...,
       10082195.97498193,  8750817.10927868,  6105413.72482848])