In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [56]:
# Load the real-estate dataset and report its dimensions.
df = pd.read_csv("realestate_data.csv")
rows = len(df)
columns = len(df.columns)
print(f"行数: {rows}, 列数: {columns}")
# Preview the first few records.
df.head()

行数: 7496, 列数: 9


Unnamed: 0,rent_price,house_area,year_from_built,distance,built_date,balcony_area,house_structure,floor,total_floor
0,81000.0,22.627647,21,360.0,2001-04-01,0.0,RC,4.0,10.0
1,119000.0,29.487423,11,720.0,2011-03-01,0.0,RC,3.0,10.0
2,65000.0,13.960667,32,640.0,1990-03-01,0.0,RC,3.0,6.0
3,230000.0,79.860208,13,480.0,2009-03-01,0.0,RC,15.0,29.0
4,102000.0,34.471313,23,320.0,1999-05-01,0.0,RC,2.0,7.0


In [57]:
# Split into training and test data: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable.
train, test = train_test_split(df, random_state=0, test_size=0.2)
# Sanity check: the two partitions together account for every row of df.
len(df) == len(train) + len(test)

True

In [58]:
# Machine learning libraries (regression models and the error metric)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [59]:
# Target variable: the column the models are trained to predict.
target_col = "rent_price"
# Explanatory variables: the columns passed to the models as inputs.
feature_cols = [
    "house_area",
    "distance",
    "year_from_built",
    "floor",
]
# Candidate regressors, keyed by the name used when reporting results.
regression_models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=0.1),
    Lasso=Lasso(alpha=0.1),
    LightGBM=lgb.LGBMRegressor(),
)

In [60]:
# Fit every candidate model on the training split and record its mean squared
# error on the test split.
#
# The feature/target slices are identical for every model, so they are built
# once here instead of being re-sliced on each loop iteration.
train_X, train_y = train[feature_cols], train[target_col]
test_X, test_y = test[feature_cols], test[target_col]

result = {}  # model name -> mean squared error on the test split
for model_name, model in regression_models.items():
    model.fit(train_X, train_y)
    pred_rent_price = model.predict(test_X)
    # Score against the prepared test target rather than re-slicing `test`.
    error = mean_squared_error(test_y, pred_rent_price)
    print(f"Model: {model_name}, Error: {error}")
    result[model_name] = error

Model: LinearRegression, Error: 360488504.5014344
Model: Ridge, Error: 360488516.60167414
Model: Lasso, Error: 360488540.89717674
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 403
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 4
[LightGBM] [Info] Start training from score 107613.777852
Model: LightGBM, Error: 284171430.9677962


In [61]:
# Evaluate performance by mean squared error: the model with the smallest
# error is selected.
#
# `result` arrives as a {model name: error} dict and is rebound below to a list
# of (model name, error) pairs sorted by ascending error. Passing it through
# dict() accepts either form, so re-running this cell no longer fails with
# AttributeError on `.items()` after the first run.
result = sorted(dict(result).items(), key=lambda x: x[1])
good_model_name = result[0][0]
print(f"Good model name: {good_model_name}")

Good model name: LightGBM


In [62]:
# 予測したいデータを読み込む
df_pred = pd.read_csv("realestate_pred.csv")
df_pred.head()

Unnamed: 0,house_area,year_from_built,distance,built_date,balcony_area,house_structure,floor,total_floor
0,27.688916,17,360.0,2005-04-01,0.0,RC,11.0,11.0
1,34.501054,1,1040.0,2021-08-01,0.0,RC,4.0,7.0
2,20.243089,34,1000.0,1988-03-01,0.0,鉄骨,1.0,4.0
3,50.870947,33,520.0,1989-06-01,0.0,RC,4.0,4.0
4,65.707831,30,480.0,1992-01-01,0.0,鉄骨,3.0,4.0


In [63]:
# Look up the model selected by good_model_name (already fitted in the
# training loop) and use it to predict from the same feature columns.
best_model = regression_models[good_model_name]
X_pred = df_pred[feature_cols]
# Store the predictions alongside the input records.
df_pred["pred_rent_price"] = best_model.predict(X_pred)
df_pred.head()

Unnamed: 0,house_area,year_from_built,distance,built_date,balcony_area,house_structure,floor,total_floor,pred_rent_price
0,27.688916,17,360.0,2005-04-01,0.0,RC,11.0,11.0,96047.486605
1,34.501054,1,1040.0,2021-08-01,0.0,RC,4.0,7.0,139239.087889
2,20.243089,34,1000.0,1988-03-01,0.0,鉄骨,1.0,4.0,64908.453384
3,50.870947,33,520.0,1989-06-01,0.0,RC,4.0,4.0,123320.034471
4,65.707831,30,480.0,1992-01-01,0.0,鉄骨,3.0,4.0,154095.499573
