In [22]:
import glob
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib 

def data_pre(df):
    all_null_list = []

    for col in df.columns:
        if df[col].count() == 0:
            all_null_list.append(col)

    df = df.drop(all_null_list, axis=1)

    df = df.drop("種類", axis=1)
    df = df.drop("市区町村コード", axis=1)

    dis = {
        "30分?60分": 45,
        "1H?1H30": 75,
        "1H30?2H": 105,
        "2H?": 120
    }

    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float)

    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float)

    year_dist = {}
    for i in df["建築年"].value_counts().keys():
        if "平成" in i:
            num = i.split("平成")[1].split("年")[0]
            year = 33 - int(num)
        if "昭和" in i:
            num = i.split("昭和")[1].split("年")[0]
            year = 96 - int(num)
        if "令和" in i:
            num = i.split("令和")[1].split("年")[0]
            year = 3 - int(num)
        if "戦前" in i:
            year = 76
        year_dist[i] = year

    df["建築年"] = df["建築年"].replace(year_dist).astype(float)
    
    df["改装"] = df["改装"].apply(lambda x: 0 if x == "未改装" else 1)

    year = {
        "年第１四半期":".25",
        "年第２四半期":".50",
        "年第３四半期":".75",
        "年第４四半期":".99"
    }

    year_list = {}
    for i in df["取引時点"].value_counts().keys():
        for k, j in year.items():
            if k in i:
                year_rep = i.replace(k, j)

        year_list[i] = year_rep

    df["取引時点"] = df["取引時点"].replace(year_list).astype(float)

    for col in ["都道府県名", "市区町村名",  "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]:
        df[col] = df[col].astype("category")
 
    return df

In [23]:
# データセットの用意

files = glob.glob("/tmp/working/dataset/nishika/old_apartment_2020/train/*")

# trainデータ読み込み
data_list = []
for i, file in enumerate(files):
    data_list.append(pd.read_csv(file, index_col=0))

df = pd.concat(data_list)

df = data_pre(df)

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

df_train, df_val = train_test_split(df, test_size=0.2)

col = "取引価格（総額）_log"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

In [25]:
# lightGBM
import lightgbm as lgb

trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

params = {
    "objective": "regression",
    "metrics": "mae"
}

model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)



[1]	valid_0's l1: 0.25149
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's l1: 0.234145
[3]	valid_0's l1: 0.219064
[4]	valid_0's l1: 0.205549
[5]	valid_0's l1: 0.193681
[6]	valid_0's l1: 0.183479
[7]	valid_0's l1: 0.174226
[8]	valid_0's l1: 0.166282
[9]	valid_0's l1: 0.158916
[10]	valid_0's l1: 0.152296
[11]	valid_0's l1: 0.14664
[12]	valid_0's l1: 0.141391
[13]	valid_0's l1: 0.136767
[14]	valid_0's l1: 0.13262
[15]	valid_0's l1: 0.12885
[16]	valid_0's l1: 0.125646
[17]	valid_0's l1: 0.122747
[18]	valid_0's l1: 0.120094
[19]	valid_0's l1: 0.117638
[20]	valid_0's l1: 0.115406
[21]	valid_0's l1: 0.113345
[22]	valid_0's l1: 0.111548
[23]	valid_0's l1: 0.109807
[24]	valid_0's l1: 0.108207
[25]	valid_0's l1: 0.106612
[26]	valid_0's l1: 0.105291
[27]	valid_0's l1: 0.104072
[28]	valid_0's l1: 0.10276
[29]	valid_0's l1: 0.101624
[30]	valid_0's l1: 0.100674
[31]	valid_0's l1: 0.0997208
[32]	valid_0's l1: 0.0989071
[33]	valid_0's l1: 0.0980072
[34]	valid_0's l1: 0.097

[279]	valid_0's l1: 0.0779123
[280]	valid_0's l1: 0.0779115
[281]	valid_0's l1: 0.0779089
[282]	valid_0's l1: 0.0779056
[283]	valid_0's l1: 0.0778951
[284]	valid_0's l1: 0.0778804
[285]	valid_0's l1: 0.077878
[286]	valid_0's l1: 0.0778715
[287]	valid_0's l1: 0.0778687
[288]	valid_0's l1: 0.0778624
[289]	valid_0's l1: 0.0778592
[290]	valid_0's l1: 0.077853
[291]	valid_0's l1: 0.0778475
[292]	valid_0's l1: 0.0778444
[293]	valid_0's l1: 0.077822
[294]	valid_0's l1: 0.077807
[295]	valid_0's l1: 0.0778014
[296]	valid_0's l1: 0.0777914
[297]	valid_0's l1: 0.077788
[298]	valid_0's l1: 0.0777848
[299]	valid_0's l1: 0.0777666
[300]	valid_0's l1: 0.0777529
[301]	valid_0's l1: 0.0777506
[302]	valid_0's l1: 0.0777401
[303]	valid_0's l1: 0.0777277
[304]	valid_0's l1: 0.0777202
[305]	valid_0's l1: 0.0777142
[306]	valid_0's l1: 0.0777112
[307]	valid_0's l1: 0.0777094
[308]	valid_0's l1: 0.0777102
[309]	valid_0's l1: 0.0777106
[310]	valid_0's l1: 0.0777065
[311]	valid_0's l1: 0.0777065
[312]	valid_0's

[554]	valid_0's l1: 0.0768617
[555]	valid_0's l1: 0.0768607
[556]	valid_0's l1: 0.0768619
[557]	valid_0's l1: 0.0768573
[558]	valid_0's l1: 0.0768568
[559]	valid_0's l1: 0.076857
[560]	valid_0's l1: 0.0768569
[561]	valid_0's l1: 0.0768489
[562]	valid_0's l1: 0.0768492
[563]	valid_0's l1: 0.0768479
[564]	valid_0's l1: 0.0768466
[565]	valid_0's l1: 0.0768497
[566]	valid_0's l1: 0.0768473
[567]	valid_0's l1: 0.0768475
[568]	valid_0's l1: 0.0768475
[569]	valid_0's l1: 0.0768462
[570]	valid_0's l1: 0.0768419
[571]	valid_0's l1: 0.076835
[572]	valid_0's l1: 0.0768368
[573]	valid_0's l1: 0.0768322
[574]	valid_0's l1: 0.0768305
[575]	valid_0's l1: 0.0768304
[576]	valid_0's l1: 0.0768312
[577]	valid_0's l1: 0.0768294
[578]	valid_0's l1: 0.0768296
[579]	valid_0's l1: 0.076826
[580]	valid_0's l1: 0.0768107
[581]	valid_0's l1: 0.0768093
[582]	valid_0's l1: 0.0767991
[583]	valid_0's l1: 0.0767983
[584]	valid_0's l1: 0.0767946
[585]	valid_0's l1: 0.0767949
[586]	valid_0's l1: 0.0767946
[587]	valid_0

[830]	valid_0's l1: 0.0766219
[831]	valid_0's l1: 0.0766207
[832]	valid_0's l1: 0.0766198
[833]	valid_0's l1: 0.0766212
[834]	valid_0's l1: 0.0766207
[835]	valid_0's l1: 0.0766201
[836]	valid_0's l1: 0.0766205
[837]	valid_0's l1: 0.0766208
[838]	valid_0's l1: 0.0766211
[839]	valid_0's l1: 0.0766202
[840]	valid_0's l1: 0.0766204
[841]	valid_0's l1: 0.076619
[842]	valid_0's l1: 0.0766182
[843]	valid_0's l1: 0.0766197
[844]	valid_0's l1: 0.0766165
[845]	valid_0's l1: 0.0766161
[846]	valid_0's l1: 0.0766066
[847]	valid_0's l1: 0.0766048
[848]	valid_0's l1: 0.0766046
[849]	valid_0's l1: 0.0766043
[850]	valid_0's l1: 0.0766056
[851]	valid_0's l1: 0.0766066
[852]	valid_0's l1: 0.0766061
[853]	valid_0's l1: 0.0766052
[854]	valid_0's l1: 0.0766049
[855]	valid_0's l1: 0.0766058
[856]	valid_0's l1: 0.0766053
[857]	valid_0's l1: 0.0766046
[858]	valid_0's l1: 0.0765968
[859]	valid_0's l1: 0.0765895
[860]	valid_0's l1: 0.0765895
[861]	valid_0's l1: 0.0765901
[862]	valid_0's l1: 0.0765897
[863]	valid

In [None]:
# CatBoost
# from catboost import Pool
# from catboost import CatBoostRegressor, FeaturesData, Pool
# import numpy as np
# # カテゴリのカラムのみを抽出
# # categorical_features_indices =  ["都道府県名", "市区町村名",  "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]
# categorical_features_indices = np.where(df.dtypes != np.float)[0]

# # データセットの作成。Poolで説明変数、目的変数、
# # カラムのデータ型を指定できる
# train_pool = Pool(train_x, train_y, cat_features=categorical_features_indices)
# validate_pool = Pool(val_x, val_y, cat_features=categorical_features_indices)

# params = {
#     'depth' : 6,                  # 木の深さ
#     'learning_rate' : 0.16,       # 学習率
#     'early_stopping_rounds' : 100,
#     'iterations' : 1000, 
#     'loss_function' : 'RMSE', 
#     'random_seed' :42
# }

# model = CatBoostRegressor(**params)
# model.fit(train_pool)

In [None]:
#XGBoost
# import xgboost as xgb
# dtrain = xgb.DMatrix(train_x, label=train_y)
# dvalid = xgb.DMatrix(val_x, label=val_y)

# params = {
#         'objective': 'reg:squarederror','silent':1, 'random_state':1234, 
#         'eval_metric': 'mae',
#     }

# num_round = 1000
# # モデルのインスタンス作成
# model = xgb.XGBRegressor()
# model.fit(train_x, train_y)

In [26]:
vals = model.predict(val_x)
mae(vals, val_y)

0.07656452380738607

In [27]:
df_test = pd.read_csv("/tmp/working/dataset/nishika/old_apartment_2020/test.csv", index_col=0)
df_test = data_pre(df_test)

In [28]:
predict = model.predict(df_test)
df_test["取引価格（総額）_log"] = predict

In [29]:
df_test[["取引価格（総額）_log"]].to_csv("submit_test.csv")

In [30]:
pd.DataFrame(model.feature_importance(), index=val_x.columns, columns=["importance"]).sort_values("importance", ascending=False)

Unnamed: 0,importance
地区名,8107
最寄駅：名称,4879
建築年,3490
取引時点,3303
面積（㎡）,2196
市区町村名,2035
最寄駅：距離（分）,1931
間取り,935
容積率（％）,480
改装,416


In [47]:
opt_params = {
    "objective":"regression",
    "metric":"mae"
}

import lightgbm
import optuna.integration.lightgbm as lgbo

opt=lgbo.train(
    opt_params,
    trains,
    valid_sets = valids,
    verbose_eval=False,
    #ラウンド数
    num_boost_round = 5,
    #打ち切り
    early_stopping_rounds = 100
)

AttributeError: module 'optuna.integration.lightgbm' has no attribute 'train'

In [45]:
!pip install --upgrade pip
!pip install -U optuna

