In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_csv("preprocessed_data.csv")
df.head()

Unnamed: 0,rooms,floor,floorCount,latitude,longitude,poiCount,schoolDistance,clinicDistance,postOfficeDistance,kindergartenDistance,...,city_lodz,city_lublin,city_poznan,city_radom,city_rzeszow,city_szczecin,city_warszawa,city_wroclaw,ownership_cooperative,ownership_udział
0,4.0,3.0,4.0,1.094088,-2.820267,-0.816073,1.487393,-0.026132,0.230208,1.028105,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,5.0,5.0,1.092505,-2.781375,1.647109,-0.585392,-0.607703,-0.383685,-0.380457,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,5.0,4.0,4.0,1.102938,-2.777586,-0.440334,-0.432099,-0.410059,0.479928,-0.185681,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,3.0,7.0,7.0,1.106274,-2.779584,-0.440334,-0.316574,-0.228318,0.340501,-0.233188,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3.0,4.0,4.0,1.112952,-2.765224,-0.56558,-0.603165,0.413455,0.86075,1.358273,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [3]:
X = df.drop(columns=["log_price"])
y = df["log_price"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Tuning

In [5]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }
    model = lgb.LGBMRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
best_params = study.best_params

[I 2025-03-13 22:35:28,262] A new study created in memory with name: no-name-0908b654-c82e-4510-b2d7-ef863954ed59


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:35:33,419] Trial 0 finished with value: 0.07940447102127102 and parameters: {'n_estimators': 900, 'learning_rate': 0.09438794747742887, 'max_depth': 9, 'num_leaves': 35, 'subsample': 0.9350265479714479, 'colsample_bytree': 0.6017162763709837}. Best is trial 0 with value: 0.07940447102127102.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:35:39,912] Trial 1 finished with value: 0.08863343352236716 and parameters: {'n_estimators': 1400, 'learning_rate': 0.027346736903409136, 'max_depth': 8, 'num_leaves': 34, 'subsample': 0.5051200082051728, 'colsample_bytree': 0.8769088278940664}. Best is trial 0 with value: 0.07940447102127102.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787
























[I 2025-03-13 22:35:47,442] Trial 2 finished with value: 0.06845551203461071 and parameters: {'n_estimators': 1700, 'learning_rate': 0.06109134162844115, 'max_depth': 7, 'num_leaves': 88, 'subsample': 0.8812220866778089, 'colsample_bytree': 0.9235176718667909}. Best is trial 2 with value: 0.06845551203461071.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787




[I 2025-03-13 22:35:51,583] Trial 3 finished with value: 0.07104718914628569 and parameters: {'n_estimators': 1000, 'learning_rate': 0.06306764423483976, 'max_depth': 9, 'num_leaves': 80, 'subsample': 0.8791826746443905, 'colsample_bytree': 0.7352880630243479}. Best is trial 2 with value: 0.06845551203461071.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:35:53,371] Trial 4 finished with value: 0.09527048360976378 and parameters: {'n_estimators': 500, 'learning_rate': 0.02773836154080725, 'max_depth': 9, 'num_leaves': 48, 'subsample': 0.6416649865242521, 'colsample_bytree': 0.5306967544593518}. Best is trial 2 with value: 0.06845551203461071.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787






















[I 2025-03-13 22:36:00,070] Trial 5 finished with value: 0.07745890930408618 and parameters: {'n_estimators': 2000, 'learning_rate': 0.04516575502232121, 'max_depth': 6, 'num_leaves': 50, 'subsample': 0.6608744349572249, 'colsample_bytree': 0.8423003187661446}. Best is trial 2 with value: 0.06845551203461071.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787




[I 2025-03-13 22:36:02,451] Trial 6 finished with value: 0.09955553253607567 and parameters: {'n_estimators': 600, 'learning_rate': 0.018234846461689498, 'max_depth': 6, 'num_leaves': 44, 'subsample': 0.8410523377989416, 'colsample_bytree': 0.6957326943872584}. Best is trial 2 with value: 0.06845551203461071.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787












[I 2025-03-13 22:36:08,922] Trial 7 finished with value: 0.06464355103723839 and parameters: {'n_estimators': 1400, 'learning_rate': 0.07476411409868984, 'max_depth': 8, 'num_leaves': 88, 'subsample': 0.9470750611460343, 'colsample_bytree': 0.8412465782199863}. Best is trial 7 with value: 0.06464355103723839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787




















[I 2025-03-13 22:36:11,634] Trial 8 finished with value: 0.08191944220723003 and parameters: {'n_estimators': 1300, 'learning_rate': 0.08970822480367567, 'max_depth': 5, 'num_leaves': 96, 'subsample': 0.7289500645278548, 'colsample_bytree': 0.7119777514151695}. Best is trial 7 with value: 0.06464355103723839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:36:23,217] Trial 9 finished with value: 0.07031498389711252 and parameters: {'n_estimators': 2000, 'learning_rate': 0.02330657962768317, 'max_depth': 15, 'num_leaves': 97, 'subsample': 0.7566200700014762, 'colsample_bytree': 0.8343205972467096}. Best is trial 7 with value: 0.06464355103723839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:36:30,365] Trial 10 finished with value: 0.062453992511593816 and parameters: {'n_estimators': 1600, 'learning_rate': 0.07587100475135046, 'max_depth': 12, 'num_leaves': 69, 'subsample': 0.9906317648180712, 'colsample_bytree': 0.9786270879452045}. Best is trial 10 with value: 0.062453992511593816.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:36:37,192] Trial 11 finished with value: 0.06208902504736001 and parameters: {'n_estimators': 1600, 'learning_rate': 0.07954561142993039, 'max_depth': 12, 'num_leaves': 70, 'subsample': 0.999527969149327, 'colsample_bytree': 0.9985852503510244}. Best is trial 11 with value: 0.06208902504736001.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:36:45,024] Trial 12 finished with value: 0.061438619908949006 and parameters: {'n_estimators': 1700, 'learning_rate': 0.07898682881738107, 'max_depth': 12, 'num_leaves': 68, 'subsample': 0.9860976154013635, 'colsample_bytree': 0.9997947056721763}. Best is trial 12 with value: 0.061438619908949006.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:36:51,852] Trial 13 finished with value: 0.0630001864540458 and parameters: {'n_estimators': 1700, 'learning_rate': 0.08019828304745706, 'max_depth': 12, 'num_leaves': 63, 'subsample': 0.9962007343028578, 'colsample_bytree': 0.9922853039665502}. Best is trial 12 with value: 0.061438619908949006.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:00,192] Trial 14 finished with value: 0.058802877192458365 and parameters: {'n_estimators': 1800, 'learning_rate': 0.09941983773683896, 'max_depth': 12, 'num_leaves': 71, 'subsample': 0.8109185365430529, 'colsample_bytree': 0.9259895448473529}. Best is trial 14 with value: 0.058802877192458365.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:08,828] Trial 15 finished with value: 0.0573831543762593 and parameters: {'n_estimators': 1900, 'learning_rate': 0.09880695299818623, 'max_depth': 14, 'num_leaves': 74, 'subsample': 0.8042068321578268, 'colsample_bytree': 0.9223542176970055}. Best is trial 15 with value: 0.0573831543762593.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010853 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:14,572] Trial 16 finished with value: 0.07403911383223366 and parameters: {'n_estimators': 1900, 'learning_rate': 0.0976354286607104, 'max_depth': 15, 'num_leaves': 23, 'subsample': 0.8082177782108438, 'colsample_bytree': 0.9103697971725381}. Best is trial 15 with value: 0.0573831543762593.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:19,754] Trial 17 finished with value: 0.07238668511534535 and parameters: {'n_estimators': 1100, 'learning_rate': 0.047085332891749, 'max_depth': 14, 'num_leaves': 79, 'subsample': 0.7572874138151058, 'colsample_bytree': 0.7673598285325841}. Best is trial 15 with value: 0.0573831543762593.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:27,071] Trial 18 finished with value: 0.06163943245293069 and parameters: {'n_estimators': 1900, 'learning_rate': 0.08836356396316501, 'max_depth': 13, 'num_leaves': 57, 'subsample': 0.6692402271173643, 'colsample_bytree': 0.918935157563197}. Best is trial 15 with value: 0.0573831543762593.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-03-13 22:37:35,282] Trial 19 finished with value: 0.05769306098972569 and parameters: {'n_estimators': 1800, 'learning_rate': 0.09897453519998078, 'max_depth': 11, 'num_leaves': 81, 'subsample': 0.5930156850525687, 'colsample_bytree': 0.7949028743508253}. Best is trial 15 with value: 0.0573831543762593.


In [6]:
lgb_model = lgb.LGBMRegressor(**best_params, random_state=42)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


In [7]:
y_pred_lgb = lgb_model.predict(X_test)

In [10]:
mae = mean_absolute_error(y_test, y_pred_lgb)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print('mae:',mae)
print('rmse:',rmse)

mae: 0.0573831543762593
rmse: 0.09261737537582033
