In [468]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import category_encoders as ce
import mojimoji
import re
from cmath import nan

さらなる特徴量の抽出を行う

まずは、現在ある「市区町村」ではターゲットを推測するにあたって弱い気がするので、カテゴリ変数である「所在地」特徴量を作成する。

In [469]:
# Read the raw training and test tables from the working directory.
df_target = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

一括で処理ができるように、訓練データとテストデータを結合する

In [470]:
# Stack the training rows on top of the test rows so both can be processed in one pass.
# NOTE: no ignore_index is passed, so each half keeps its own row labels and the labels repeat.
df = pd.concat([df_target, df_test], axis=0)

In [471]:
# Sanity check: the combined frame must hold exactly the train rows plus the test rows.
n_target_rows = len(df_target)
n_test_rows = len(df_test)
print('結合する前のdf_targetのサイズ：', n_target_rows)
print('結合する前のdf_testのサイズ：', n_test_rows)
print('len(df_target)+len(df_test)=', n_target_rows + n_test_rows)
print('結合したdfのサイズ：', len(df))

結合する前のdf_targetのサイズ： 31470
結合する前のdf_testのサイズ： 31262
len(df_target)+len(df_test)= 62732
結合したdfのサイズ： 62732


In [472]:
# Peek at the first few raw address strings before parsing them.
df['所在地'].head(5)

0          東京都北区滝野川３丁目
1          東京都中央区月島３丁目
2          東京都渋谷区笹塚２丁目
3    東京都杉並区高円寺南２丁目23-2
4       東京都葛飾区金町３丁目7-2
Name: 所在地, dtype: object

"東京都○○区△△n丁目"のうちの、○○区△△nを抽出する

In [473]:
def _ward_and_town(raw_location):
    """Return the ward + town part of a raw address string.

    For '東京都北区滝野川３丁目' this yields '北区滝野川': everything after the
    first '都' and before the first run of digits (the block number itself is
    not included). When the address has no digits, the text runs to the end of
    the string. Non-string values (missing addresses) yield NaN.
    """
    if not isinstance(raw_location, str):
        return np.nan
    # Convert full-width digits (and kana) to half-width so the digit search sees them.
    loc = mojimoji.zen_to_han(raw_location, kana=True)
    # str.find returns -1 when '都' is absent, which makes the slice start at 0.
    start = loc.find("都") + 1
    first_number = re.search(r"\d+", loc)
    # The previous code used -1 as the "no digits" end, which cut off the last character.
    end = first_number.start() if first_number else len(loc)
    return loc[start:end]


locations = df['所在地']

# Pull the ward + town text out of every address (train and test together).
merge_addresses = pd.DataFrame({'所在地': [_ward_and_town(loc) for loc in locations]})
print("カテゴリ化前の訓練データ：")
print(merge_addresses.head(30))
print("カテゴリ化前の訓練データの大きさ：",len(merge_addresses))


# Encode the text as ordinal integer codes, fitted on train and test together.
list_cols = ['所在地']
addresses_encoder = ce.OrdinalEncoder(cols=list_cols, drop_invariant=True)
merge_addresses = addresses_encoder.fit_transform(merge_addresses['所在地'])

# Split back into the training rows and the test rows.
n_train_rows = len(df_target)
addresses = merge_addresses[:n_train_rows]
addresses.to_csv('addresses.csv',index=False)
# Renumber the test rows from 0 so they align with the other test feature frames
# when they are concatenated column-wise.
test_addresses = merge_addresses[n_train_rows:].reset_index(drop=True)
test_addresses.to_csv('test_addresses.csv',index=False)

カテゴリ化前の訓練データ：
          所在地
0       北区滝野川
1       中央区月島
2       渋谷区笹塚
3     杉並区高円寺南
4       葛飾区金町
5      荒川区南千住
6      練馬区東大泉
7       目黒区鷹番
8       文京区向丘
9       板橋区板橋
10     大田区西馬込
11    江戸川区北小岩
12      港区南青山
13    杉並区阿佐谷南
14       墨田区緑
15     渋谷区幡ヶ谷
16      板橋区桜川
17    江戸川区西瑞江
18    新宿区四谷三栄
19  中央区日本橋箱崎町
20     中野区江古田
21      文京区湯島
22       中央区佃
23     大田区大森西
24    江戸川区北小岩
25      豊島区千早
26      台東区台東
27      足立区綾瀬
28     江戸川区船堀
29      渋谷区上原
カテゴリ化前の訓練データの大きさ： 62732


テストデータ側の「所在地」のインデックスを0から振り直したうえで、作成済みの特徴量（訓練データ）の読み込みを行う

In [474]:
# Renumber the test rows from 0 so that the later column-wise concat with the other
# test feature frames lines the rows up.
test_addresses = test_addresses.reset_index(drop=True)
print(test_addresses)

       所在地
0      441
1      271
2      132
3      264
4      343
...    ...
31257  119
31258  316
31259   21
31260  845
31261  190

[31262 rows x 1 columns]


In [475]:
# Training-side feature tables, one CSV per feature. None of these files is written by the
# cells shown here (presumably produced by an earlier feature-extraction step -- TODO confirm).
house_age = pd.read_csv('house_age.csv')
area_size = pd.read_csv('area_size.csv')
room_arrange_scores = pd.read_csv('room_arrange_scores.csv')
contract_span = pd.read_csv('contract_span.csv')
floor_scores = pd.read_csv('floor_scores.csv')
Floor_scores = pd.read_csv('capital_floor_scores.csv')  # a different table from floor_scores; the names differ only by the capital F
wards = pd.read_csv('wards.csv')
stations = pd.read_csv('stations.csv')
minits = pd.read_csv('minits.csv')

# Used as the regression target (y_train) by the training cells.
rent = pd.read_csv('rent.csv')

テストデータの読み込み

In [476]:
# Test-side counterparts of the training feature tables, one CSV per feature.
test_house_age = pd.read_csv('test_house_age.csv')
test_area_size = pd.read_csv('test_area_size.csv')
test_room_arrange_scores = pd.read_csv('test_room_arrange_scores.csv')
test_contract_span = pd.read_csv('test_contract_span.csv')
test_floor_scores = pd.read_csv('test_floor_scores.csv')
test_Floor_scores = pd.read_csv('test_capital_floor_scores.csv')  # a different table from test_floor_scores; the names differ only by the capital F
test_wards = pd.read_csv('test_wards.csv')
test_stations = pd.read_csv('test_stations.csv')
test_minits = pd.read_csv('test_minits.csv')

これをlightGBMに突っ込む

In [477]:
# Feature frames for this run; the train and test lists are kept in the same column order.
train_parts = [house_age, area_size, room_arrange_scores, contract_span, floor_scores,
               Floor_scores, stations, minits, addresses]
test_parts = [test_house_age, test_area_size, test_room_arrange_scores, test_contract_span,
              test_floor_scores, test_Floor_scores, test_stations, test_minits,
              test_addresses['所在地']]
category_lists = ['最寄り駅', '所在地']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1868
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 9
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37615.7	valid_1's rmse: 42777.2


New categorical_feature is ['所在地', '最寄り駅']


[20]	training's rmse: 23279.9	valid_1's rmse: 31714.1
[30]	training's rmse: 17917.4	valid_1's rmse: 27630.5
[40]	training's rmse: 15470.1	valid_1's rmse: 25857
[50]	training's rmse: 14177.4	valid_1's rmse: 25019.9
[60]	training's rmse: 13348.8	valid_1's rmse: 24539.6
[70]	training's rmse: 12692.2	valid_1's rmse: 24212.8
[80]	training's rmse: 12171.4	valid_1's rmse: 23883.8
[90]	training's rmse: 11643.6	valid_1's rmse: 23608.5
[100]	training's rmse: 11166.3	valid_1's rmse: 23327.5
[110]	training's rmse: 10828.1	valid_1's rmse: 23149.4
[120]	training's rmse: 10534.7	valid_1's rmse: 23045.5
[130]	training's rmse: 10246.6	valid_1's rmse: 22926.6
[140]	training's rmse: 9980.65	valid_1's rmse: 22793.8
[150]	training's rmse: 9731.38	valid_1's rmse: 22665
[160]	training's rmse: 9539.54	valid_1's rmse: 22614
[170]	training's rmse: 9361.29	valid_1's rmse: 22545.4
[180]	training's rmse: 9176.06	valid_1's rmse: 22493.5
[190]	training's rmse: 8988.08	valid_1's rmse: 22421.3
[200]	training's rmse: 8

以前よりは良いがまだまだ

「所在地」と「最寄り駅」は情報として被っている気がするので、「最寄り駅」を外して「所在地」のみにする。

In [478]:
# Same run as before but with the station feature left out of both train and test.
train_parts = [house_age, area_size, room_arrange_scores, contract_span, floor_scores,
               Floor_scores, minits, addresses]
test_parts = [test_house_age, test_area_size, test_room_arrange_scores, test_contract_span,
              test_floor_scores, test_Floor_scores, test_minits, test_addresses['所在地']]
category_lists = ['所在地']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['所在地']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1472
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 8
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 38944.4	valid_1's rmse: 43756.6
[20]	training's rmse: 25518.8	valid_1's rmse: 32968.4
[30]	training's rmse: 20735.5	valid_1's rmse: 29576.1
[40]	training's rmse: 18502.8	valid_1's rmse: 27999.6
[50]	training's rmse: 17175.6	valid_1's rmse: 27120
[60]	training's rmse: 16245.8	valid_1's rmse: 26569.5
[70]	training's rmse: 15451.1	valid_1's rmse: 26058.9
[80]	training's rmse: 14773.3	valid_1's rmse: 25775.2
[90]	training's rmse: 14213.9	valid_1's rmse: 25557.2
[100]	training's rmse: 13744.5	valid_1's rmse: 25490.7
[110]	training's rmse: 13335.4	valid_1's rmse: 25371.6
[120]	training's rmse: 12993.1	valid_1'

「最寄り駅」を外すと検証データのRMSEが悪化したので、「最寄り駅」カテゴリは必要そうである

まだ未抽出の特徴量を抽出する

In [479]:
# Reload the raw training table and display it to look over the remaining raw columns.
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,賃料,所在地,アクセス,間取り,築年数,方角,面積,所在階,バス・トイレ,キッチン,放送・通信,室内設備,駐車場,周辺環境,建物構造,契約期間
0,1,75000,東京都北区滝野川３丁目,都営三田線\t西巣鴨駅\t徒歩4分\t\t埼京線\t板橋駅\t徒歩14分\t\t都電荒川線\...,1K,9年9ヶ月,南東,20.01m2,1階／12階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t浴室乾燥機\t／\t温水洗浄便座,ガスコンロ／\tコンロ2口／\tシステムキッチン\t／\t給湯,インターネット対応／\tCATV／\tCSアンテナ／\tBSアンテナ,エアコン付\tシューズボックス／\tバルコニー／\tフローリング／\t室内洗濯機置場／\t敷...,駐輪場\t空有,【小学校】 495m\t【大学】 461m\t【小学校】 962m\t【公園】 1103m\...,RC（鉄筋コンクリート）,2年間
1,2,76000,東京都中央区月島３丁目,都営大江戸線\t勝どき駅\t徒歩5分\t\t有楽町線\t月島駅\t徒歩9分\t\t日比谷線\...,1R,44年10ヶ月,,16.5m2,5階／10階建,専用トイレ／\tシャワー／\t温水洗浄便座,ガスコンロ／\tシステムキッチン\t／\t給湯,インターネット対応,エアコン付\tシューズボックス／\tバルコニー／\tフローリング／\t室内洗濯機置場／\t敷...,駐輪場\t空有\t駐車場\t無\tバイク置き場\t無,【スーパー】 1283m,鉄骨造,2年間
2,3,110000,東京都渋谷区笹塚２丁目,京王線\t笹塚駅\t徒歩6分\t\t京王線\t代田橋駅\t徒歩7分\t\t京王線\t明大前駅...,1K,8年6ヶ月,南,22.05m2,12階／15階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t浴室乾燥機\t／\t温水洗浄...,ガスコンロ／\tコンロ2口／\tシステムキッチン\t／\t給湯,インターネット対応／\t光ファイバー／\tCSアンテナ／\tBSアンテナ,エアコン付\tウォークインクローゼット\tシューズボックス／\tバルコニー／\tフローリング...,"駐輪場\t空有\tバイク置き場\t空有\t駐車場\t近隣\t30,000円(税込)\t距離100m",【スーパー】 89m\t【コンビニ】 184m\t【コンビニ】 392m\t【スーパー】 492m,RC（鉄筋コンクリート）,2年間
3,4,150000,東京都杉並区高円寺南２丁目23-2,総武線・中央線（各停）\t高円寺駅\t徒歩9分\t\t丸ノ内線(池袋－荻窪)\t新高円寺駅\...,2LDK,29年4ヶ月,南,60.48m2,3階／4階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t温水洗浄便座／\t洗面台独立,ガスコンロ／\t給湯,インターネット対応／\t光ファイバー,エアコン付\tシューズボックス／\tバルコニー／\t2面採光／\t室内洗濯機置場／\tエレベ...,駐車場\t無\t駐輪場\t無\tバイク置き場\t無,【スーパー】 225m\t【スーパー】 448m\t【スーパー】 619m\t【スーパー】 ...,RC（鉄筋コンクリート）,2年間\t※この物件は\t定期借家\tです。
4,5,74000,東京都葛飾区金町３丁目7-2,京成金町線\t京成金町駅\t徒歩5分\t\t常磐線\t金町(東京都)駅\t徒歩7分\t\t京...,2DK,31年7ヶ月,南,39.66m2,1階／2階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t追焚機能,給湯／\t独立キッチン,,バルコニー／\tフローリング／\t室内洗濯機置場\t公営水道／\t下水,"駐車場\t近隣\t17,000円(税込)\t距離300m\t駐輪場\t無\tバイク置き場\t無",【スーパー】 193m\t【スーパー】 298m\t【スーパー】 660m\t【スーパー】 ...,木造,2年間
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31465,31466,80000,東京都板橋区蓮根２丁目,都営三田線\t蓮根駅\t徒歩7分\t\t都営三田線\t西台駅\t徒歩10分\t\t都営三田線...,2DK,30年0ヶ月,南,37.9m2,1階／3階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t追焚機能／\t洗面台独立,ガスコンロ／\tコンロ2口／\t給湯,インターネット対応,エアコン付\tバルコニー／\t室内洗濯機置場\t公営水道／\t都市ガス,駐車場\t無,,鉄骨造,2年間
31466,31467,54500,東京都世田谷区太子堂５丁目17-1,東急田園都市線\t三軒茶屋駅\t徒歩6分\t\t東急世田谷線\t西太子堂駅\t徒歩4分\t\...,1R,39年7ヶ月,西,12.04m2,4階／4階建,専用バス／\t専用トイレ／\tシャワー,ガスコンロ／\tコンロ1口／\t給湯,,エアコン付\tバルコニー／\tフローリング／\t室外洗濯機置場／\tタイル張り\t公営水道／...,駐車場\t無\t駐輪場\t無\tバイク置き場\t無,【スーパー】 458m\t【スーパー】 540m\t【コンビニ】 131m\t【コンビニ】 ...,RC（鉄筋コンクリート）,2年間
31467,31468,125000,東京都江東区南砂４丁目,東西線\t南砂町駅\t徒歩5分\t\t都営新宿線\t大島(東京都)駅\t徒歩26分\t\t東...,3DK,45年10ヶ月,南,60m2,3階／5階建,バス・トイレ別／\tシャワー／\t追焚機能／\t温水洗浄便座,ガスコンロ／\tコンロ2口,光ファイバー,シューズボックス／\tバルコニー／\tフローリング／\t室内洗濯機置場\t都市ガス,駐輪場\t空有\tバイク置き場\t無,【コンビニ】 302m\t【コンビニ】 394m\t【コンビニ】 452m\t【コンビニ】 ...,鉄骨造,2年間
31468,31469,98000,東京都中野区中野２丁目,中央線（快速）\t中野(東京都)駅\t徒歩4分\t\t丸ノ内線(池袋－荻窪)\t新中野駅\t...,1DK,11年0ヶ月,南,29.59m2,1階／2階建,専用バス／\t専用トイレ／\tバス・トイレ別／\tシャワー／\t洗面台独立,ガスコンロ／\tシステムキッチン\t／\t給湯,インターネット対応／\t光ファイバー／\tCATV,エアコン付\tシューズボックス／\tバルコニー／\tフローリング／\t3面採光／\t室内洗濯...,駐輪場\t空有\t駐車場\t無\tバイク置き場\t無,【スーパー】 485m\t【スーパー】 1051m\t【コンビニ】 476m\t【コンビニ】...,木造,2年間


現在の間取りデータの抽出方法では不完全な気がするので、より表現力のある抽出方法にする。

In [480]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df = pd.concat([df_train, df_test], axis=0)# combine train and test so both are processed in one pass

merge_room_arange = df['間取り']# keep only the floor-plan (間取り) column

# Collectors for the separate parts of an "nLDK+S" layout string
merge_rooms = []
merge_livings = []
merge_dining = []
merge_kitchen = []
merge_service = []

In [481]:
def _parse_room_arrangement(ldks):
    """Split one floor-plan string (e.g. '2LDK', '1K+S') into its parts.

    Returns a 5-tuple ``(room_count, L, D, K, S)``:
    * ``room_count`` is the leading number of the string, or NaN when the
      string does not start with a digit;
    * ``L``/``D``/``K``/``S`` are 1 when that letter occurs in the string, else 0.
    A non-string value (missing layout) yields NaN in all five positions.
    Every input produces exactly one tuple, so the five columns stay aligned.
    """
    if not isinstance(ldks, str):
        return (np.nan,) * 5
    leading_number = re.match(r"\d+", ldks)
    room_count = int(leading_number.group()) if leading_number else np.nan
    return (room_count,) + tuple(int(letter in ldks) for letter in "LDKS")


# Parse straight from the raw column so that re-running this cell gives the same result.
parsed_layouts = [_parse_room_arrangement(ldks) for ldks in df['間取り']]
merge_room_arange = pd.DataFrame(parsed_layouts, columns=['部屋数', 'L', 'D', 'K', 'S'])

# Split back into training and test rows; the test rows are renumbered from 0 so they
# align with the other test feature frames when concatenated column-wise.
n_train_rows = len(df_train)
room_arange = merge_room_arange[:n_train_rows]
room_arange.to_csv('room_arange.csv', index=False)
test_room_arange = merge_room_arange[n_train_rows:].reset_index(drop=True)
test_room_arange.to_csv('test_room_arange.csv', index=False)

この最新の「間取り」特徴量と前の「間取り」特徴量を取り替えてlightGBMを動かす。

In [482]:
# This run uses room_arange (room count + L/D/K/S flags) instead of room_arrange_scores.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Note: this run sets lambda_l1, not lambda_l2.
params = dict(objective='regression', metrics='rmse', lambda_l1=0.000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', '所在地', '最寄り駅']


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1877
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 13
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37542.3	valid_1's rmse: 42434
[20]	training's rmse: 23375	valid_1's rmse: 31428.6
[30]	training's rmse: 18064.8	valid_1's rmse: 27445.8
[40]	training's rmse: 15772.5	valid_1's rmse: 25837.3
[50]	training's rmse: 14510.4	valid_1's rmse: 25036.2
[60]	training's rmse: 13481.7	valid_1's rmse: 24322.4
[70]	training's rmse: 12781.6	valid_1's rmse: 23907
[80]	training's rmse: 12151	valid_1's rmse: 23508.6
[90]	training's rmse: 11654.2	valid_1's rmse: 23219.2
[100]	training's rmse: 11256.2	valid_1's rmse: 23007.9
[110]	training's rmse: 10877.7	valid_1's rmse: 22904
[120]	training's rmse: 10574	valid_1's rmse: 22765.8
[130]	training's rmse: 10296	valid_1's rmse: 22651.8
[140

微改善を続けているが、まだまだ

次に「方角」の特徴量を取得する。

In [483]:
# Read both tables and stack them so the encoder sees train and test together.
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
df = pd.concat([df_train, df_test], axis=0)

# Ordinal-encode the facing-direction (方角) column.
list_cols = ['方角']
directions_encoder = ce.OrdinalEncoder(cols=list_cols, drop_invariant=True)
merge_directions = directions_encoder.fit_transform(df['方角'])

# Split back into training and test rows and write each part to its own CSV.
n_train_rows = len(df_train)
directions, test_directions = merge_directions[:n_train_rows], merge_directions[n_train_rows:]
for frame, out_path in ((directions, 'directions.csv'), (test_directions, 'test_directions.csv')):
    frame.to_csv(out_path, index=False)

「方角」特徴量をlightGBMに与えてみる。

In [484]:
# Previous feature set plus the encoded facing direction (方角).
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange, directions]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange, test_directions]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S', '方角']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', '所在地', '方角', '最寄り駅']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1887
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 14
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37542.3	valid_1's rmse: 42434
[20]	training's rmse: 23375	valid_1's rmse: 31428.6
[30]	training's rmse: 18063.1	valid_1's rmse: 27439.7
[40]	training's rmse: 15773.3	valid_1's rmse: 25937.6
[50]	training's rmse: 14453.5	valid_1's rmse: 25091.1
[60]	training's rmse: 13499.2	valid_1's rmse: 24445.1
[70]	training's rmse: 12724.6	valid_1's rmse: 23893.9
[80]	training's rmse: 12151.3	valid_1's rmse: 23610.3
[90]	training's rmse: 11669.6	valid_1's rmse: 23389.4
[100]	training's rmse: 11219.5	valid_1's rmse: 23202.1
[110]	training's rmse: 10823.8	valid_1's rmse: 23013
[120]	training's rmse: 10502.6	valid_1's r

「方角」特徴量を入れるとスコアが悪化する。

「バス・トイレ別」と「温水洗浄便座」に注目して抽出してみる。

In [485]:
# Read both tables and stack them so train and test are processed in one pass.
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
df = pd.concat([df_train, df_test], axis=0)

# Raw bath/toilet equipment text, one string per listing.
merge_bath_toilets = df['バス・トイレ']

In [486]:
# Turn the raw bath/toilet text into two flags: 1 when the keyword occurs, 0 when it
# does not, NaN when the text itself is missing (not a string).
merge_sepa_bath_toilets = pd.DataFrame({
    'バス・トイレ別': [
        int('バス・トイレ別' in text) if isinstance(text, str) else np.nan
        for text in merge_bath_toilets
    ]
})
merge_washlets = pd.DataFrame({
    '温水洗浄便座': [
        int('温水洗浄便座' in text) if isinstance(text, str) else np.nan
        for text in merge_bath_toilets
    ]
})

n_train_rows = len(df_train)

# Separate bath/toilet flag: split into train and test rows and write both files.
# The test rows are renumbered from 0; without this their labels start at len(df_train)
# and a column-wise concat with the other test features misaligns the rows.
sepa_bath_toilets = merge_sepa_bath_toilets[:n_train_rows]
test_sepa_bath_toilets = merge_sepa_bath_toilets[n_train_rows:].reset_index(drop=True)
sepa_bath_toilets.to_csv('sepa_bath_toilets.csv', index=False)
test_sepa_bath_toilets.to_csv('test_sepa_bath_toilets.csv', index=False)

# Washlet flag: same split, same index reset for the test rows.
washlets = merge_washlets[:n_train_rows]
test_washlets = merge_washlets[n_train_rows:].reset_index(drop=True)
washlets.to_csv('washlets.csv',index=False)
test_washlets.to_csv('test_washlets.csv',index=False)

「バス・トイレ別」と「温水洗浄便座」をlightGBMに突っ込む

In [487]:
# Previous best feature set plus both bath/toilet flags.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange, sepa_bath_toilets, washlets]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange, test_sepa_bath_toilets, test_washlets]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S', 'バス・トイレ別', '温水洗浄便座']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', 'バス・トイレ別', '所在地', '最寄り駅', '温水洗浄便座']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 15
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37523.6	valid_1's rmse: 42437.7
[20]	training's rmse: 23376.1	valid_1's rmse: 31410.3
[30]	training's rmse: 18004.7	valid_1's rmse: 27575.5
[40]	training's rmse: 15718.3	valid_1's rmse: 26016.9
[50]	training's rmse: 14371.8	valid_1's rmse: 25216.2
[60]	training's rmse: 13493.9	valid_1's rmse: 24581.2
[70]	training's rmse: 12769	valid_1's rmse: 24143.6
[80]	training's rmse: 12170.7	valid_1's rmse: 23831.9
[90]	training's rmse: 11617.2	valid_1's rmse: 23561.5
[100]	training's rmse: 11236	valid_1's rmse: 23359.2
[110]	training's rmse: 10857.5	valid_1's rmse: 23186.7
[120]	training's rmse: 10498	valid_1's r

再び悪化

「バス・トイレ別」だけならどうか

In [488]:
# Previous best feature set plus only the separate bath/toilet flag.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange, sepa_bath_toilets]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange, test_sepa_bath_toilets]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S', 'バス・トイレ別']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', 'バス・トイレ別', '所在地', '最寄り駅']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1880
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 14
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37541.1	valid_1's rmse: 42435.8
[20]	training's rmse: 23376.1	valid_1's rmse: 31428.8
[30]	training's rmse: 17971.3	valid_1's rmse: 27378.3
[40]	training's rmse: 15646.9	valid_1's rmse: 25915.4
[50]	training's rmse: 14306.3	valid_1's rmse: 25124.8
[60]	training's rmse: 13438	valid_1's rmse: 24581.1
[70]	training's rmse: 12785.2	valid_1's rmse: 24110.7
[80]	training's rmse: 12155.6	valid_1's rmse: 23740.5
[90]	training's rmse: 11596.3	valid_1's rmse: 23485.9
[100]	training's rmse: 11188.8	valid_1's rmse: 23325.6
[110]	training's rmse: 10852.3	valid_1's rmse: 23220.4
[120]	training's rmse: 10547.3	valid_1

まだ悪化の状態である。<br>
「温水洗浄便座」だけならどうか

In [489]:
# Previous best feature set plus only the washlet flag.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange, washlets]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange, test_washlets]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S', '温水洗浄便座']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', '所在地', '最寄り駅', '温水洗浄便座']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1880
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 14
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37529.9	valid_1's rmse: 42434.4
[20]	training's rmse: 23330.9	valid_1's rmse: 31482.4
[30]	training's rmse: 18036.7	valid_1's rmse: 27652.7
[40]	training's rmse: 15690.9	valid_1's rmse: 26134.3
[50]	training's rmse: 14373.1	valid_1's rmse: 25357.1
[60]	training's rmse: 13475.9	valid_1's rmse: 24778.9
[70]	training's rmse: 12738.1	valid_1's rmse: 24258.9
[80]	training's rmse: 12098.9	valid_1's rmse: 23926
[90]	training's rmse: 11626.2	valid_1's rmse: 23741.6
[100]	training's rmse: 11207.3	valid_1's rmse: 23475.8
[110]	training's rmse: 10856.9	valid_1's rmse: 23323.9
[120]	training's rmse: 10513.2	valid_1

「温水洗浄便座」も悪化の要因となってしまっている。

次に「建物構造」の”鉄”という文字に注目してフラグを立てたい。

In [490]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df = pd.concat([df_train, df_test], axis=0)# combine train and test so both are processed in one pass

merge_buildings = df['建物構造']

# Flag per listing: 1 when the structure text contains '鉄', 0 when it does not,
# NaN when the text itself is missing (not a string).
merge_buildings_cat = pd.DataFrame({
    '建物構造': [
        int('鉄' in structure) if isinstance(structure, str) else np.nan
        for structure in merge_buildings
    ]
})

# Split into train and test rows and write both files. The test rows are renumbered
# from 0; without this their labels start at len(df_train) and a column-wise concat
# with the other test features misaligns the rows.
n_train_rows = len(df_train)
buildings = merge_buildings_cat[:n_train_rows]
test_buildings = merge_buildings_cat[n_train_rows:].reset_index(drop=True)
buildings.to_csv('buildings.csv', index=False)
test_buildings.to_csv('test_buildings.csv', index=False)

「建物構造」についてlightGBMに突っ込んでみる。

In [491]:
# Previous best feature set plus the building-structure flag.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange, buildings]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange, test_buildings]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S', '建物構造']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = dict(objective='regression', metrics='rmse', lambda_l2=0.0000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', '建物構造', '所在地', '最寄り駅']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1880
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 14
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37542.3	valid_1's rmse: 42434
[20]	training's rmse: 23385.3	valid_1's rmse: 31429.2
[30]	training's rmse: 17989.4	valid_1's rmse: 27414.3
[40]	training's rmse: 15666.6	valid_1's rmse: 25848.3
[50]	training's rmse: 14377.9	valid_1's rmse: 24971.7
[60]	training's rmse: 13467.6	valid_1's rmse: 24369
[70]	training's rmse: 12761.1	valid_1's rmse: 23856.5
[80]	training's rmse: 12188.2	valid_1's rmse: 23629.9
[90]	training's rmse: 11704.8	valid_1's rmse: 23373.9
[100]	training's rmse: 11309.1	valid_1's rmse: 23210.5
[110]	training's rmse: 10917.2	valid_1's rmse: 23040.6
[120]	training's rmse: 10562.2	valid_1's

なかなか自己ベストを出すことができない。

過学習を改善するために、特徴量を減らしてみる（「方角」「バス・トイレ」「建物構造」を外し、改良版「間取り」までの特徴量構成に戻す）。

In [492]:
# Feature set without the direction, bath/toilet and building-structure features.
train_parts = [house_age, area_size, contract_span, floor_scores, Floor_scores,
               stations, minits, addresses, room_arange]
test_parts = [test_house_age, test_area_size, test_contract_span, test_floor_scores,
              test_Floor_scores, test_stations, test_minits, test_addresses['所在地'],
              test_room_arange]
category_lists = ['最寄り駅', '所在地', 'L', 'D', 'K', 'S']

# Hold out 30% of the training rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    pd.concat(train_parts, axis=1), rent, test_size=0.3, random_state=0
)
X_test = pd.concat(test_parts, axis=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Note: this run sets lambda_l1, not lambda_l2.
params = dict(objective='regression', metrics='rmse', lambda_l1=0.000001)

# Up to 3000 rounds, logging every 10 rounds, with a 10-round early-stopping window.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=10,
    categorical_feature=category_lists,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['D', 'K', 'L', 'S', '所在地', '最寄り駅']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1877
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 13
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37542.3	valid_1's rmse: 42434
[20]	training's rmse: 23375	valid_1's rmse: 31428.6
[30]	training's rmse: 18064.8	valid_1's rmse: 27445.8
[40]	training's rmse: 15772.5	valid_1's rmse: 25837.3
[50]	training's rmse: 14510.4	valid_1's rmse: 25036.2
[60]	training's rmse: 13481.7	valid_1's rmse: 24322.4
[70]	training's rmse: 12781.6	valid_1's rmse: 23907
[80]	training's rmse: 12151	valid_1's rmse: 23508.6
[90]	training's rmse: 11654.2	valid_1's rmse: 23219.2
[100]	training's rmse: 11256.2	valid_1's rmse: 23007.9
[110]	training's rmse: 10877.7	valid_1's rmse: 22904
[120]	training's rmse: 10574	valid_1's rmse: 2

予測結果のファイル出力

In [493]:
# Display the test feature matrix built by the most recently run training cell.
X_test

Unnamed: 0,築年数,面積,契約期間,所在階,全体の階数,最寄り駅,所要時間,所在地,部屋数,L,D,K,S
0,588.0,50.22,0.0,8.0,8.0,34,15.0,441,2,1,1,1,0
1,2.0,20.88,2.0,3.0,4.0,52,6.0,271,1,0,0,0,0
2,280.0,26.93,2.0,1.0,4.0,358,10.0,132,1,0,0,1,0
3,434.0,23.57,2.0,1.0,2.0,83,10.0,264,1,0,0,1,0
4,544.0,50.00,2.0,4.0,4.0,83,12.0,343,2,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31257,1.0,25.66,2.0,6.0,8.0,10,3.0,119,1,0,0,1,0
31258,186.0,22.71,0.0,8.0,15.0,171,2.0,316,1,0,0,0,0
31259,12.0,45.76,2.0,10.0,14.0,67,10.0,21,1,1,1,1,0
31260,184.0,55.20,0.0,14.0,14.0,434,3.0,845,1,0,0,1,0


In [494]:
# Write the predictions next to the test ids as result.csv.
df = pd.read_csv('test.csv')
test_ids = df['id']  # named test_ids so the builtin id() is not shadowed
y_pred = pd.DataFrame(y_pred)

# The concat below aligns on the row index and would silently pad with NaN on a
# mismatch, so fail loudly if the predictions do not cover the test rows exactly.
if len(y_pred) != len(test_ids):
    raise ValueError(
        f"y_pred has {len(y_pred)} rows but test.csv has {len(test_ids)} rows"
    )

result = pd.concat([test_ids, y_pred], axis=1)
result.to_csv('result.csv',index=False)

In [495]:
# result['id'] = result['id'].astype(int)