In [25]:
import glob
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib 

def data_pre(df):
    all_null_list = []

    for col in df.columns:
        if df[col].count() == 0:
            all_null_list.append(col)

    df = df.drop(all_null_list, axis=1)

    df = df.drop("種類", axis=1)
    df = df.drop("市区町村コード", axis=1)

    dis = {
        "30分?60分": 45,
        "1H?1H30": 75,
        "1H30?2H": 105,
        "2H?": 120
    }

    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float)

    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float)

    year_dist = {}
    for i in df["建築年"].value_counts().keys():
        if "平成" in i:
            num = i.split("平成")[1].split("年")[0]
            year = 33 - int(num)
        if "昭和" in i:
            num = i.split("昭和")[1].split("年")[0]
            year = 96 - int(num)
        if "令和" in i:
            num = i.split("令和")[1].split("年")[0]
            year = 3 - int(num)
        if "戦前" in i:
            year = 76
        year_dist[i] = year

    df["建築年"] = df["建築年"].replace(year_dist).astype(float)

    year = {
        "年第１四半期":".25",
        "年第２四半期":".50",
        "年第３四半期":".75",
        "年第４四半期":".99"
    }

    year_list = {}
    for i in df["取引時点"].value_counts().keys():
        for k, j in year.items():
            if k in i:
                year_rep = i.replace(k, j)

        year_list[i] = year_rep

    df["取引時点"] = df["取引時点"].replace(year_list).astype(float)

    df["場所"] = df["都道府県名"].str.cat(df['市区町村名'])
    area = pd.read_csv('area_axis.csv')
    df = pd.merge(df, area, on="場所")
    
    df = df.drop(["場所"], axis=1)

    for col in ["都道府県名", "市区町村名", "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]:
        df[col] = df[col].astype("category")
 
    return df

In [2]:
# データセットの用意

files = glob.glob("/tmp/working/dataset/nishika/old_apartment_2020/train/*")

# trainデータ読み込み
data_list = []
for i, file in enumerate(files):
    data_list.append(pd.read_csv(file, index_col=0))

df = pd.concat(data_list)

df = data_pre(df)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
import lightgbm
import optuna.integration.lightgbm as lgb


df_train, df_val = train_test_split(df, test_size=0.2)

col = "取引価格（総額）_log"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

In [51]:

opt_params = {
    "objective":"regression",
    "metric":"mae"
}


opt=lgb.train(
    opt_params,
    trains,
    valid_sets = valids,
    verbose_eval=200,
    num_boost_round = 1500,
    early_stopping_rounds = 100
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12314
[LightGBM] [Info] Number of data points in the train set: 521580, number of used features: 19
[LightGBM] [Info] Start training from score 7.219298
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.0790029
[400]	valid_0's l1: 0.0775119
[600]	valid_0's l1: 0.0771135
[800]	valid_0's l1: 0.0769008
[1000]	valid_0's l1: 0.0767922
[1200]	valid_0's l1: 0.07671
[1400]	valid_0's l1: 0.0766892
Early stopping, best iteration is:
[1380]	valid_0's l1: 0.0766856


In [5]:
vals = model.predict(val_x)
mae(vals, val_y)

0.07661169311914949

In [37]:
df_test = pd.read_csv("/tmp/working/dataset/nishika/old_apartment_2020/test.csv")
df_test = data_pre(df_test)
index = df_test["ID"]
df_test = df_test.drop("ID", axis=1)
predict = model.predict(df_test)
df_test["取引価格（総額）_log"] = predict
df_test = pd.concat([df_test, index], axis=1)
df_test[["ID", "取引価格（総額）_log"]].to_csv("submit_test2.csv", index=False)

In [35]:
df_test.head()

Unnamed: 0,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,間取り,面積（㎡）,建築年,建物の構造,用途,...,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等,緯度,経度,取引価格（総額）_log,ID
0,北海道,札幌市中央区,旭ケ丘,円山公園,26.0,３ＬＤＫ,75.0,32.0,ＲＣ,,...,第１種低層住居専用地域,40.0,60.0,2020.5,未改装,,43.046848,141.322114,7.020458,1000000
1,北海道,札幌市中央区,大通西,西１８丁目,2.0,１ＬＤＫ,45.0,30.0,ＳＲＣ,住宅,...,商業地域,80.0,400.0,2020.75,未改装,,43.046848,141.322114,6.899303,1000110
2,北海道,札幌市中央区,大通西,西１８丁目,2.0,２ＬＤＫ,50.0,32.0,ＳＲＣ,住宅,...,商業地域,80.0,400.0,2020.75,改装済,,43.046848,141.322114,7.132475,1000111
3,北海道,札幌市中央区,大通西,西１８丁目,3.0,２ＬＤＫ,80.0,24.0,ＲＣ,,...,商業地域,80.0,400.0,2020.75,改装済,,43.046848,141.322114,7.42504,1000112
4,北海道,札幌市中央区,大通西,西１８丁目,3.0,１Ｋ,20.0,40.0,ＲＣ,,...,商業地域,80.0,400.0,2020.75,未改装,,43.046848,141.322114,6.465629,1000113


In [9]:
pd.DataFrame(model.feature_importance(), index=val_x.columns, columns=["importance"]).sort_values("importance", ascending=False)

Unnamed: 0,importance
地区名,7892
最寄駅：名称,4745
建築年,3155
取引時点,2889
市区町村名,2043
面積（㎡）,1988
最寄駅：距離（分）,1678
間取り,816
緯度,537
経度,379


In [4]:
# lightGBM
import lightgbm as lgb

trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

params = {
    "objective": "regression",
    "metrics": "mae"
}

model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12321
[LightGBM] [Info] Number of data points in the train set: 521580, number of used features: 19
[LightGBM] [Info] Start training from score 7.219423




[1]	valid_0's l1: 0.253165
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.236032
[3]	valid_0's l1: 0.220883
[4]	valid_0's l1: 0.207435
[5]	valid_0's l1: 0.195508
[6]	valid_0's l1: 0.184848
[7]	valid_0's l1: 0.175579
[8]	valid_0's l1: 0.167425
[9]	valid_0's l1: 0.159959
[10]	valid_0's l1: 0.15351
[11]	valid_0's l1: 0.147736
[12]	valid_0's l1: 0.142584
[13]	valid_0's l1: 0.137902
[14]	valid_0's l1: 0.133841
[15]	valid_0's l1: 0.130176
[16]	valid_0's l1: 0.126816
[17]	valid_0's l1: 0.123718
[18]	valid_0's l1: 0.121147
[19]	valid_0's l1: 0.118803
[20]	valid_0's l1: 0.116311
[21]	valid_0's l1: 0.114156
[22]	valid_0's l1: 0.112225
[23]	valid_0's l1: 0.110469
[24]	valid_0's l1: 0.10881
[25]	valid_0's l1: 0.107285
[26]	valid_0's l1: 0.105822
[27]	valid_0's l1: 0.104602
[28]	valid_0's l1: 0.103298
[29]	valid_0's l1: 0.102114
[30]	valid_0's l1: 0.101092
[31]	valid_0's l1: 0.100184
[32]	valid_0's l1: 0.0993809
[33]	valid_0's l1: 0.0985019
[34]	valid_0's l1: 0.09

[281]	valid_0's l1: 0.0778611
[282]	valid_0's l1: 0.0778548
[283]	valid_0's l1: 0.0778497
[284]	valid_0's l1: 0.0778473
[285]	valid_0's l1: 0.0778406
[286]	valid_0's l1: 0.0778325
[287]	valid_0's l1: 0.0778277
[288]	valid_0's l1: 0.0778197
[289]	valid_0's l1: 0.0778064
[290]	valid_0's l1: 0.0778024
[291]	valid_0's l1: 0.0777876
[292]	valid_0's l1: 0.0777873
[293]	valid_0's l1: 0.0777821
[294]	valid_0's l1: 0.0777797
[295]	valid_0's l1: 0.0777697
[296]	valid_0's l1: 0.077768
[297]	valid_0's l1: 0.0777554
[298]	valid_0's l1: 0.0777525
[299]	valid_0's l1: 0.0777447
[300]	valid_0's l1: 0.0777426
[301]	valid_0's l1: 0.0777416
[302]	valid_0's l1: 0.0777352
[303]	valid_0's l1: 0.0777209
[304]	valid_0's l1: 0.0777191
[305]	valid_0's l1: 0.0777086
[306]	valid_0's l1: 0.077704
[307]	valid_0's l1: 0.0777001
[308]	valid_0's l1: 0.0776944
[309]	valid_0's l1: 0.0776916
[310]	valid_0's l1: 0.077682
[311]	valid_0's l1: 0.0776773
[312]	valid_0's l1: 0.0776685
[313]	valid_0's l1: 0.0776428
[314]	valid_0

[558]	valid_0's l1: 0.0769107
[559]	valid_0's l1: 0.0769102
[560]	valid_0's l1: 0.0769111
[561]	valid_0's l1: 0.0769105
[562]	valid_0's l1: 0.0769107
[563]	valid_0's l1: 0.0769076
[564]	valid_0's l1: 0.0769072
[565]	valid_0's l1: 0.0769059
[566]	valid_0's l1: 0.0769028
[567]	valid_0's l1: 0.0769008
[568]	valid_0's l1: 0.0768996
[569]	valid_0's l1: 0.0768975
[570]	valid_0's l1: 0.0768969
[571]	valid_0's l1: 0.0768932
[572]	valid_0's l1: 0.0768918
[573]	valid_0's l1: 0.0768917
[574]	valid_0's l1: 0.0768903
[575]	valid_0's l1: 0.0768886
[576]	valid_0's l1: 0.0768897
[577]	valid_0's l1: 0.0768884
[578]	valid_0's l1: 0.076889
[579]	valid_0's l1: 0.0768877
[580]	valid_0's l1: 0.0768862
[581]	valid_0's l1: 0.0768858
[582]	valid_0's l1: 0.0768844
[583]	valid_0's l1: 0.0768849
[584]	valid_0's l1: 0.0768826
[585]	valid_0's l1: 0.0768825
[586]	valid_0's l1: 0.0768838
[587]	valid_0's l1: 0.0768827
[588]	valid_0's l1: 0.0768823
[589]	valid_0's l1: 0.0768832
[590]	valid_0's l1: 0.0768809
[591]	valid

[838]	valid_0's l1: 0.076672
[839]	valid_0's l1: 0.0766721
[840]	valid_0's l1: 0.076672
[841]	valid_0's l1: 0.0766717
[842]	valid_0's l1: 0.0766677
[843]	valid_0's l1: 0.0766666
[844]	valid_0's l1: 0.0766577
[845]	valid_0's l1: 0.0766558
[846]	valid_0's l1: 0.0766564
[847]	valid_0's l1: 0.0766565
[848]	valid_0's l1: 0.0766569
[849]	valid_0's l1: 0.0766578
[850]	valid_0's l1: 0.0766574
[851]	valid_0's l1: 0.0766562
[852]	valid_0's l1: 0.0766573
[853]	valid_0's l1: 0.0766572
[854]	valid_0's l1: 0.0766588
[855]	valid_0's l1: 0.0766571
[856]	valid_0's l1: 0.0766569
[857]	valid_0's l1: 0.0766563
[858]	valid_0's l1: 0.0766541
[859]	valid_0's l1: 0.0766543
[860]	valid_0's l1: 0.0766542
[861]	valid_0's l1: 0.0766545
[862]	valid_0's l1: 0.0766541
[863]	valid_0's l1: 0.0766522
[864]	valid_0's l1: 0.0766517
[865]	valid_0's l1: 0.0766512
[866]	valid_0's l1: 0.0766504
[867]	valid_0's l1: 0.0766481
[868]	valid_0's l1: 0.0766467
[869]	valid_0's l1: 0.0766486
[870]	valid_0's l1: 0.0766461
[871]	valid_

In [None]:
# CatBoost
# from catboost import Pool
# from catboost import CatBoostRegressor, FeaturesData, Pool
# import numpy as np
# # カテゴリのカラムのみを抽出
# # categorical_features_indices =  ["都道府県名", "市区町村名",  "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]
# categorical_features_indices = np.where(df.dtypes != np.float)[0]

# # データセットの作成。Poolで説明変数、目的変数、
# # カラムのデータ型を指定できる
# train_pool = Pool(train_x, train_y, cat_features=categorical_features_indices)
# validate_pool = Pool(val_x, val_y, cat_features=categorical_features_indices)

# params = {
#     'depth' : 6,                  # 木の深さ
#     'learning_rate' : 0.16,       # 学習率
#     'early_stopping_rounds' : 100,
#     'iterations' : 1000, 
#     'loss_function' : 'RMSE', 
#     'random_seed' :42
# }

# model = CatBoostRegressor(**params)
# model.fit(train_pool)

In [None]:
#XGBoost
# import xgboost as xgb
# dtrain = xgb.DMatrix(train_x, label=train_y)
# dvalid = xgb.DMatrix(val_x, label=val_y)

# params = {
#         'objective': 'reg:squarederror','silent':1, 'random_state':1234, 
#         'eval_metric': 'mae',
#     }

# num_round = 1000
# # モデルのインスタンス作成
# model = xgb.XGBRegressor()
# model.fit(train_x, train_y)

Unnamed: 0,importance
地区名,8107
最寄駅：名称,4879
建築年,3490
取引時点,3303
面積（㎡）,2196
市区町村名,2035
最寄駅：距離（分）,1931
間取り,935
容積率（％）,480
改装,416


In [47]:
opt_params = {
    "objective":"regression",
    "metric":"mae"
}

import lightgbm
import optuna.integration.lightgbm as lgbo

opt=lgbo.train(
    opt_params,
    trains,
    valid_sets = valids,
    verbose_eval=False,
    #ラウンド数
    num_boost_round = 5,
    #打ち切り
    early_stopping_rounds = 100
)

AttributeError: module 'optuna.integration.lightgbm' has no attribute 'train'