In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_csv("preprocessed_data.csv")
df.head()

Unnamed: 0,rooms,floor,floorCount,latitude,longitude,poiCount,schoolDistance,clinicDistance,postOfficeDistance,kindergartenDistance,...,city_lodz,city_lublin,city_poznan,city_radom,city_rzeszow,city_szczecin,city_warszawa,city_wroclaw,ownership_cooperative,ownership_udział
0,4.0,3.0,4.0,1.094088,-2.820267,-0.816073,1.487393,-0.026132,0.230208,1.028105,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,5.0,5.0,1.092505,-2.781375,1.647109,-0.585392,-0.607703,-0.383685,-0.380457,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,5.0,4.0,4.0,1.102938,-2.777586,-0.440334,-0.432099,-0.410059,0.479928,-0.185681,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,3.0,7.0,7.0,1.106274,-2.779584,-0.440334,-0.316574,-0.228318,0.340501,-0.233188,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3.0,4.0,4.0,1.112952,-2.765224,-0.56558,-0.603165,0.413455,0.86075,1.358273,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [3]:
X = df.drop(columns=["log_price"])
y = df["log_price"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
feature_names = X_train.columns.tolist()
print(feature_names)

['rooms', 'floor', 'floorCount', 'latitude', 'longitude', 'poiCount', 'schoolDistance', 'clinicDistance', 'postOfficeDistance', 'kindergartenDistance', 'restaurantDistance', 'collegeDistance', 'pharmacyDistance', 'hasParkingSpace', 'hasBalcony', 'hasElevator', 'hasSecurity', 'hasStorageRoom', 'squareMeters_capped', 'centreDistance_winsorized', 'location_cluster', 'type_encoded', 'age', 'city_bydgoszcz', 'city_czestochowa', 'city_gdansk', 'city_gdynia', 'city_katowice', 'city_krakow', 'city_lodz', 'city_lublin', 'city_poznan', 'city_radom', 'city_rzeszow', 'city_szczecin', 'city_warszawa', 'city_wroclaw', 'ownership_cooperative', 'ownership_udział']


## Model Tuning

In [6]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }
    model = lgb.LGBMRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
best_params = study.best_params

[I 2025-04-26 20:46:07,310] A new study created in memory with name: no-name-0813323e-fde2-42d5-a6c0-4e87b7593e42


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:46:11,235] Trial 0 finished with value: 0.07196290548648807 and parameters: {'n_estimators': 600, 'learning_rate': 0.09101944530615423, 'max_depth': 14, 'num_leaves': 81, 'subsample': 0.9519875117641186, 'colsample_bytree': 0.7573675454740083}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:46:13,225] Trial 1 finished with value: 0.08492994470393633 and parameters: {'n_estimators': 500, 'learning_rate': 0.05688520881209664, 'max_depth': 8, 'num_leaves': 68, 'subsample': 0.8800957269101718, 'colsample_bytree': 0.7999285185952795}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787






[I 2025-04-26 20:46:17,015] Trial 2 finished with value: 0.09092442668034034 and parameters: {'n_estimators': 2000, 'learning_rate': 0.03086719387015696, 'max_depth': 5, 'num_leaves': 21, 'subsample': 0.6110644369871567, 'colsample_bytree': 0.8216183804088593}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:46:19,595] Trial 3 finished with value: 0.0984947877111893 and parameters: {'n_estimators': 600, 'learning_rate': 0.019689461119105148, 'max_depth': 11, 'num_leaves': 37, 'subsample': 0.9192410003362774, 'colsample_bytree': 0.7985446744800623}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:46:38,402] Trial 4 finished with value: 0.08492696226597753 and parameters: {'n_estimators': 1900, 'learning_rate': 0.010009526242571279, 'max_depth': 11, 'num_leaves': 84, 'subsample': 0.8719753156240861, 'colsample_bytree': 0.7976578297012202}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787




[I 2025-04-26 20:46:41,222] Trial 5 finished with value: 0.07412225922061298 and parameters: {'n_estimators': 700, 'learning_rate': 0.08924911275540198, 'max_depth': 8, 'num_leaves': 70, 'subsample': 0.5972737642136214, 'colsample_bytree': 0.8875456176003458}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787










[I 2025-04-26 20:46:47,258] Trial 6 finished with value: 0.07914091942537882 and parameters: {'n_estimators': 1800, 'learning_rate': 0.04944863806738817, 'max_depth': 6, 'num_leaves': 38, 'subsample': 0.7321528542904697, 'colsample_bytree': 0.7435920698630051}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:46:49,940] Trial 7 finished with value: 0.09963173367530902 and parameters: {'n_estimators': 600, 'learning_rate': 0.031266346306501755, 'max_depth': 12, 'num_leaves': 22, 'subsample': 0.95957043966513, 'colsample_bytree': 0.6889614484912148}. Best is trial 0 with value: 0.07196290548648807.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:00,183] Trial 8 finished with value: 0.06287297819476781 and parameters: {'n_estimators': 1800, 'learning_rate': 0.06232799601458336, 'max_depth': 11, 'num_leaves': 76, 'subsample': 0.8826151702344036, 'colsample_bytree': 0.7672539360436119}. Best is trial 8 with value: 0.06287297819476781.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787










[I 2025-04-26 20:47:04,535] Trial 9 finished with value: 0.08618105937031165 and parameters: {'n_estimators': 800, 'learning_rate': 0.031728461892588426, 'max_depth': 7, 'num_leaves': 79, 'subsample': 0.8776461669954987, 'colsample_bytree': 0.7391360081388206}. Best is trial 8 with value: 0.06287297819476781.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:16,475] Trial 10 finished with value: 0.0613306714139275 and parameters: {'n_estimators': 1500, 'learning_rate': 0.07011703589938538, 'max_depth': 15, 'num_leaves': 96, 'subsample': 0.7556073847790201, 'colsample_bytree': 0.5604657651551779}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:24,081] Trial 11 finished with value: 0.062067299394607596 and parameters: {'n_estimators': 1500, 'learning_rate': 0.06450193060748055, 'max_depth': 15, 'num_leaves': 99, 'subsample': 0.7278061041548702, 'colsample_bytree': 0.5101231983854826}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:29,851] Trial 12 finished with value: 0.061587262928918064 and parameters: {'n_estimators': 1400, 'learning_rate': 0.07269461009247105, 'max_depth': 15, 'num_leaves': 99, 'subsample': 0.7364608246337628, 'colsample_bytree': 0.5073503049653416}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:34,488] Trial 13 finished with value: 0.06331100486830626 and parameters: {'n_estimators': 1200, 'learning_rate': 0.07569396828397788, 'max_depth': 14, 'num_leaves': 99, 'subsample': 0.6736280918450901, 'colsample_bytree': 0.5132268844040023}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:40,670] Trial 14 finished with value: 0.06239995727221071 and parameters: {'n_estimators': 1300, 'learning_rate': 0.07607133690464649, 'max_depth': 13, 'num_leaves': 91, 'subsample': 0.5081854770763933, 'colsample_bytree': 0.5997709111436187}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:48,339] Trial 15 finished with value: 0.06744889253018403 and parameters: {'n_estimators': 1500, 'learning_rate': 0.07802239682628256, 'max_depth': 15, 'num_leaves': 55, 'subsample': 0.8054035802566131, 'colsample_bytree': 0.6299950366492809}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004800 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:47:52,037] Trial 16 finished with value: 0.06797744851879568 and parameters: {'n_estimators': 1100, 'learning_rate': 0.09865341656008877, 'max_depth': 13, 'num_leaves': 58, 'subsample': 0.8026302561558947, 'colsample_bytree': 0.5792136286697178}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787






[I 2025-04-26 20:48:01,047] Trial 17 finished with value: 0.0675206692290518 and parameters: {'n_estimators': 1600, 'learning_rate': 0.045966339922049584, 'max_depth': 9, 'num_leaves': 90, 'subsample': 0.7925718134290933, 'colsample_bytree': 0.9985720208648836}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005285 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:48:05,152] Trial 18 finished with value: 0.06745741913741224 and parameters: {'n_estimators': 1000, 'learning_rate': 0.06952656086239098, 'max_depth': 15, 'num_leaves': 91, 'subsample': 0.6663268304604388, 'colsample_bytree': 0.5474273808881309}. Best is trial 10 with value: 0.0613306714139275.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


[I 2025-04-26 20:48:09,713] Trial 19 finished with value: 0.0687694483663155 and parameters: {'n_estimators': 1400, 'learning_rate': 0.08566874831166853, 'max_depth': 13, 'num_leaves': 48, 'subsample': 0.6808601138841142, 'colsample_bytree': 0.6541030821999934}. Best is trial 10 with value: 0.0613306714139275.


In [7]:
lgb_model = lgb.LGBMRegressor(**best_params, random_state=42)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017256 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3347
[LightGBM] [Info] Number of data points in the train set: 33385, number of used features: 38
[LightGBM] [Info] Start training from score 13.507787


In [8]:
y_pred_lgb = lgb_model.predict(X_test)

In [9]:
mae = mean_absolute_error(y_test, y_pred_lgb)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print('mae:',mae)
print('rmse:',rmse)

mae: 0.0613306714139275
rmse: 0.0937712707932223


In [10]:
import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)