In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import category_encoders as ce
import mojimoji
import re

さらなる特徴量の抽出を行う

まずは、現在ある「市区町村」ではターゲットを推測するにあたって弱い気がするので、カテゴリ変数である「所在地」特徴量を作成する。

In [123]:
# Load the raw training and test tables from the working directory.
df_target, df_test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

一括で処理ができるように、訓練データとテストデータを結合する

In [124]:
# Stack the training rows on top of the test rows so both can be processed in
# a single pass. Row labels are not renumbered here.
df = pd.concat((df_target, df_test))

In [125]:
# Sanity check: the combined frame should hold exactly as many rows as the
# training and test frames together.
n_target, n_test = len(df_target), len(df_test)
print('結合する前のdf_targetのサイズ：', n_target)
print('結合する前のdf_testのサイズ：', n_test)
print('len(df_target)+len(df_test)=', n_target + n_test)
print('結合したdfのサイズ：', len(df))

結合する前のdf_targetのサイズ： 31470
結合する前のdf_testのサイズ： 31262
len(df_target)+len(df_test)= 62732
結合したdfのサイズ： 62732


In [126]:
# Preview the first five raw address strings.
df['所在地'].iloc[:5]

0          東京都北区滝野川３丁目
1          東京都中央区月島３丁目
2          東京都渋谷区笹塚２丁目
3    東京都杉並区高円寺南２丁目23-2
4       東京都葛飾区金町３丁目7-2
Name: 所在地, dtype: object

"東京都○○区△△n丁目"のうち、最初の数字の手前までの「○○区△△」の部分を抽出する

In [127]:
locations = df['所在地']


def _extract_ward_and_town(loc):
    """Return the part of an address between "都" and its first run of digits.

    The address is first passed through ``mojimoji.zen_to_han`` so that
    full-width digits become half-width. The returned text starts right after
    the first "都" (or at the beginning of the string when "都" is absent) and
    stops right before the first run of digits. When the address contains no
    digits at all, everything after "都" is returned.
    """
    loc = mojimoji.zen_to_han(loc, kana=True)
    # str.find returns -1 when "都" is missing, which makes the start index 0.
    start = loc.find("都") + 1
    first_digits = re.search(r"\d+", loc)
    # Without digits the slice has to run to the end of the string; using -1
    # as the end index would silently cut off the last character.
    end = first_digits.start() if first_digits else len(loc)
    return loc[start:end]


# Extract the "ward + town" text for every row of the combined train/test data.
merge_addresses = pd.DataFrame(
    {'所在地': [_extract_ward_and_town(loc) for loc in locations]}
)
print("カテゴリ化前の訓練データ：")
print(merge_addresses.head(20))
print("カテゴリ化前の訓練データの大きさ：",len(merge_addresses))


# Ordinal-encode the extracted text. The encoder is fitted on the combined
# train+test rows so both parts share one code table.
list_cols = ['所在地']
addresses_encoder = ce.OrdinalEncoder(cols=list_cols, drop_invariant=True)
merge_addresses = addresses_encoder.fit_transform(merge_addresses['所在地'])

# Split back into the training part (first len(df_target) rows) and the test
# part (the remaining rows), and persist each one without the index column.
n_train_rows = len(df_target)
addresses = merge_addresses.iloc[:n_train_rows]
addresses.to_csv('addresses.csv',index=False)
test_addresses = merge_addresses.iloc[n_train_rows:]
test_addresses.to_csv('test_addresses.csv',index=False)

カテゴリ化前の訓練データ：
          所在地
0       北区滝野川
1       中央区月島
2       渋谷区笹塚
3     杉並区高円寺南
4       葛飾区金町
5      荒川区南千住
6      練馬区東大泉
7       目黒区鷹番
8       文京区向丘
9       板橋区板橋
10     大田区西馬込
11    江戸川区北小岩
12      港区南青山
13    杉並区阿佐谷南
14       墨田区緑
15     渋谷区幡ヶ谷
16      板橋区桜川
17    江戸川区西瑞江
18    新宿区四谷三栄
19  中央区日本橋箱崎町
カテゴリ化前の訓練データの大きさ： 62732


テストデータ側の「所在地」のインデックスを振り直したうえで、作成済みの特徴量（訓練データ）の読み込みを行う

In [128]:
# The test part was sliced out of the combined frame, so its row labels do not
# start at 0. Renumber them from 0 so the column lines up with the test feature
# tables when they are concatenated column-wise further down.
test_addresses = test_addresses.reset_index(drop=True)
print(test_addresses)

       所在地
0      441
1      271
2      132
3      264
4      343
...    ...
31257  119
31258  316
31259   21
31260  845
31261  190

[31262 rows x 1 columns]


In [129]:
# Training features, one CSV per feature (presumably written by earlier
# preprocessing steps; confirm where these files come from).
train_feature_files = [
    'house_age.csv',
    'area_size.csv',
    'room_arrange_scores.csv',
    'contract_span.csv',
    'floor_scores.csv',
    'capital_floor_scores.csv',
    'wards.csv',
    'stations.csv',
    'minits.csv',
]
(house_age, area_size, room_arrange_scores, contract_span, floor_scores,
 Floor_scores, wards, stations, minits) = map(pd.read_csv, train_feature_files)

# Values used as the training target further down.
rent = pd.read_csv('rent.csv')

テストデータの読み込み

In [130]:
# Test-side counterparts of the training feature files, one CSV per feature.
test_feature_files = [
    'test_house_age.csv',
    'test_area_size.csv',
    'test_room_arrange_scores.csv',
    'test_contract_span.csv',
    'test_floor_scores.csv',
    'test_capital_floor_scores.csv',
    'test_wards.csv',
    'test_stations.csv',
    'test_minits.csv',
]
(test_house_age, test_area_size, test_room_arrange_scores, test_contract_span,
 test_floor_scores, test_Floor_scores, test_wards, test_stations,
 test_minits) = map(pd.read_csv, test_feature_files)

これらの特徴量をLightGBMに入力して、学習と予測を行う

In [131]:
# Feature tables that make up the model input, in the same column order for
# the training side and the test side.
train_parts = [house_age, area_size, room_arrange_scores, contract_span,
               floor_scores, Floor_scores, stations, minits, addresses]
test_parts = [test_house_age, test_area_size, test_room_arrange_scores,
              test_contract_span, test_floor_scores, test_Floor_scores,
              test_stations, test_minits, test_addresses['所在地']]

X_train = pd.concat(train_parts, axis=1)
y_train = rent

# Hold out 30% of the training rows for validation; the seed is fixed so the
# split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0
)

X_test = pd.concat(test_parts, axis=1)

# Columns that LightGBM should treat as categorical rather than numeric.
category_lists = ['最寄り駅', '所在地']
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'objective': 'regression',
    'metrics': 'rmse',
    'lambda_l2': 0.0000001,
}

# Train for at most 3000 rounds, logging every 10 rounds and stopping once the
# validation score has not improved for 10 rounds.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_eval],
    categorical_feature=category_lists,
    early_stopping_rounds=10,
    verbose_eval=10,
)

# Predict the test rows with the best iteration found during training.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1868
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 9
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 37615.7	valid_1's rmse: 42777.2


New categorical_feature is ['所在地', '最寄り駅']


[20]	training's rmse: 23279.9	valid_1's rmse: 31714.1
[30]	training's rmse: 17917.4	valid_1's rmse: 27630.5
[40]	training's rmse: 15470.1	valid_1's rmse: 25857
[50]	training's rmse: 14177.4	valid_1's rmse: 25019.9
[60]	training's rmse: 13348.8	valid_1's rmse: 24539.6
[70]	training's rmse: 12692.2	valid_1's rmse: 24212.8
[80]	training's rmse: 12171.4	valid_1's rmse: 23883.8
[90]	training's rmse: 11643.6	valid_1's rmse: 23608.5
[100]	training's rmse: 11166.3	valid_1's rmse: 23327.5
[110]	training's rmse: 10828.1	valid_1's rmse: 23149.4
[120]	training's rmse: 10534.7	valid_1's rmse: 23045.5
[130]	training's rmse: 10246.6	valid_1's rmse: 22926.6
[140]	training's rmse: 9980.65	valid_1's rmse: 22793.8
[150]	training's rmse: 9731.38	valid_1's rmse: 22665
[160]	training's rmse: 9539.54	valid_1's rmse: 22614
[170]	training's rmse: 9361.29	valid_1's rmse: 22545.4
[180]	training's rmse: 9176.06	valid_1's rmse: 22493.5
[190]	training's rmse: 8988.08	valid_1's rmse: 22421.3
[200]	training's rmse: 8

ファイル出力

In [132]:
# Build the submission table: the id of every test row next to its predicted
# value, written to result.csv without the pandas index column.
#
# The ids are taken from the already-loaded `df_test`, so test.csv is not read
# a second time and `df` (the combined train+test frame) is not overwritten.
# The ids are stored under `test_ids` rather than `id` so that the builtin
# `id()` is not shadowed for the rest of the session.
test_ids = df_test['id']
y_pred = pd.DataFrame(y_pred)

# A length mismatch would make the column-wise concat pad with NaN instead of
# failing, so check it explicitly.
assert len(test_ids) == len(y_pred), "number of predictions differs from number of test ids"

result = pd.concat([test_ids, y_pred], axis=1)
result.to_csv('result.csv', index=False)

In [133]:
# Cast the id column to int. result.csv has already been written by the
# previous cell, so this only changes the in-memory frame.
result = result.astype({'id': int})