In [88]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [89]:
# Load the training set into `df`.
# NOTE(review): `df` is rebound to test.csv (and train.csv again) by later cells,
# so its contents depend on which cell ran last.
df = pd.read_csv('train.csv')

いままで避けていた「所在地」特徴量について抽出する。

In [90]:
# Address ('所在地') column of the frame currently bound to `df`.
locations = df['所在地']

どうやらすべて"東京都"から始まる文字列である模様

In [91]:
# Sanity check: print the 3-character prefix of every address that does not
# start with '東京都'. No output means every address starts with '東京都'.
for loc in locations:
    prefix = loc[:3]
    if prefix != '東京都':
        print(prefix)

欠損値もない模様

In [92]:
# Count the missing values in the address column.
locations.isnull().sum()

0

東京都内のどの市区町村なのかを抽出していく。

区のカテゴリ化を行う

In [93]:
import category_encoders as ce

def _extract_ward(address):
    """Return the text between the first '都' and the first '区' of ``address``.

    Example: '東京都北区滝野川' -> '北'.

    NOTE(review): when '区' is missing, ``str.find`` returns -1 and the slice
    silently drops the last character instead of failing. The addresses seen so
    far all appear to contain a ward, but confirm before reusing this on data
    that includes cities/towns.
    """
    start = address.find("都") + 1
    end = address.find("区")
    return address[start:end]


# Training data: extract the ward name from every address so it can be label-encoded.
wards = pd.DataFrame({'市区町村': [_extract_ward(loc) for loc in locations]})
print("カテゴリ化前の訓練データ：")
print(wards.head())
print("カテゴリ化前の訓練データの大きさ：",len(wards))


# Test data: same extraction. `df` is rebound to the test set here.
df = pd.read_csv('test.csv')
test_locations = df['所在地']
test_wards = pd.DataFrame({'市区町村': [_extract_ward(loc) for loc in test_locations]})
print("カテゴリ化前のテストデータ：")
print(test_wards.head())
print("カテゴリ化前のテストデータの大きさ",len(test_wards))


# Prepare the ordinal (label) encoder for the ward column.
list_cols = ['市区町村']
wards_encoder = ce.OrdinalEncoder(cols=list_cols, drop_invariant=True)

# Train and test must share the same category codes, so stack them and fit once.
# The per-frame indexes are kept on purpose (no ignore_index): the test slice
# then still starts at 0 and lines up with the other test features when they
# are concatenated column-wise later.
merge_wards = pd.concat([wards, test_wards],axis=0)
print(merge_wards.head())
# If the two printed sizes below match, the stacking worked.
print("the size of merge_wards:", len(merge_wards))
print("len(wards)+len(test_wards)=",len(wards)+len(test_wards))

# Encode the ward names as integer codes.
merge_wards = wards_encoder.fit_transform(merge_wards['市区町村'])

# Split the encoded frame back into its train and test parts and persist each
# to its OWN file (the test part used to overwrite 'wards.csv').
n_train = len(wards)
wards = merge_wards[:n_train]
wards.to_csv('wards.csv',index=False)
test_wards = merge_wards[n_train:]
test_wards.to_csv('test_wards.csv',index=False)
print(wards.head())
print(test_wards.head())

カテゴリ化前の訓練データ：
  市区町村
0    北
1   中央
2   渋谷
3   杉並
4   葛飾
カテゴリ化前の訓練データの大きさ： 31470
カテゴリ化前のテストデータ：
  市区町村
0  世田谷
1   目黒
2   豊島
3   杉並
4   杉並
カテゴリ化前のテストデータの大きさ 31262
  市区町村
0    北
1   中央
2   渋谷
3   杉並
4   葛飾
the size of merge_wards: 62732
len(wards)+len(test_wards)= 62732
   市区町村
0     1
1     2
2     3
3     4
4     5
   市区町村
0    20
1     8
2    17
3     4
4     4


「アクセス」特徴量について抽出する。

最寄り駅のカテゴリ化を試みる。そのため、「所在地」と同様にして訓練データとテストデータを結合する。

In [94]:
from cmath import nan


def _parse_access(access):
    """Split one 'アクセス' string into ``(station, walking_minutes)``.

    * station: the text between the first tab character and the first '駅'.
    * walking_minutes: the integer between the first '歩' and the first '分',
      or NaN when that slice is not a valid integer.
    """
    station = access[access.find('\t') + 1:access.find('駅')]
    try:
        minutes = int(access[access.find('歩') + 1:access.find('分')])
    except ValueError:
        # Only a failed int() conversion is expected here; anything else should surface.
        minutes = np.nan
    return station, minutes


# Training data: nearest station and walking time for every listing.
df = pd.read_csv('train.csv')
accesses = df['アクセス']
train_parsed = [_parse_access(access) for access in accesses]
stations = pd.DataFrame({'最寄り駅': [station for station, _ in train_parsed]})
minits = pd.DataFrame({'所要時間': [minutes for _, minutes in train_parsed]})
minits.to_csv('minits.csv',index=False)


# Test data: same extraction. `df` stays bound to the test set after this cell;
# the submission cell reads df['id'] from it.
df = pd.read_csv('test.csv')
test_accesses = df['アクセス']
test_parsed = [_parse_access(access) for access in test_accesses]
test_stations = pd.DataFrame({'最寄り駅': [station for station, _ in test_parsed]})
test_minits = pd.DataFrame({'所要時間': [minutes for _, minutes in test_parsed]})
test_minits.to_csv('test_minits.csv',index=False)


# Stack train and test so both receive the same station codes. Indexes are kept
# as-is so the test slice still starts at 0 after the split below.
merge_stations = pd.concat([stations, test_stations],axis=0)


# Ordinal-encode the station names.
list_cols = ['最寄り駅']
stations_encoder = ce.OrdinalEncoder(cols=list_cols, drop_invariant=True)
merge_stations = stations_encoder.fit_transform(merge_stations['最寄り駅'])

# Split the encoded frame back into train / test and persist each part.
n_train = len(stations)
stations = merge_stations[:n_train]
stations.to_csv('stations.csv',index=False)
test_stations = merge_stations[n_train:]
test_stations.to_csv('test_stations.csv',index=False)

以前に抽出しておいた特徴量（「面積」「階数」「間取り」「契約期間」「築年数」）を読み込む。

In [95]:
# Load the previously extracted training features, one CSV per feature,
# in the same order as the names on the left-hand side.
area_size, house_age, n_floor, room_arrange, contract_span = map(
    pd.read_csv,
    [
        'area_size.csv',
        'house_age.csv',
        'n_floor.csv',
        'room_arrange.csv',
        'contract_span.csv',
    ],
)

目的変数（「賃料」）を読み込む。

In [96]:
# Load the target variable (rent) used as y for training.
rent = pd.read_csv('rent.csv')

In [97]:
# Score each floor-plan string: every digit contributes its numeric value and
# every 'L', 'D', 'K' or 'S' contributes 1; all other characters are ignored.
room_arrange_scores = pd.Series(
    data=[
        sum(
            int(ch) if ch.isdigit() else 1
            for ch in layout
            if ch.isdigit() or ch in ('L', 'D', 'K', 'S')
        )
        for layout in room_arrange['間取り']
    ],
    name='間取り得点',
)
room_arrange_scores.to_csv('room_arrange_scores.csv',index=False)

In [98]:
import re

# Replace every '所在階' entry with the list of digit runs it contains
# (re.findall(r"\d+", ...)); non-string entries such as NaN become NaN.
# The whole column is assigned at once instead of writing element-by-element
# through chained indexing, and entries that are already lists are kept, so
# re-running the cell does not wipe the parsed values.
n_floor["所在階"] = pd.Series(
    [
        cell if isinstance(cell, list)
        else re.findall(r"\d+", cell) if isinstance(cell, str)
        else np.nan
        for cell in n_floor["所在階"]
    ],
    index=n_floor.index,
    dtype=object,
)

In [99]:
# Turn each parsed '所在階' entry into two numeric features:
#   floor_scores -> int of element 0, Floor_scores -> int of element 1.
# A missing or unparsable element becomes NaN. Every row yields exactly one
# value per feature so the result stays row-aligned with the other features.
floor_scores = []
Floor_scores = []
for digits in n_floor["所在階"]:
    parsed = []
    for position in (0, 1):
        try:
            parsed.append(int(digits[position]))
        except (TypeError, IndexError, ValueError):
            # TypeError: NaN / non-subscriptable entry; IndexError: fewer than
            # two numbers; ValueError: element is not an integer string.
            parsed.append(np.nan)
    floor_scores.append(parsed[0])
    Floor_scores.append(parsed[1])

# Named like their test-set counterparts so train and test share column names.
floor_scores = pd.Series(floor_scores, name='所在階')
Floor_scores = pd.Series(Floor_scores, name='全体の階数')

In [100]:
# Assemble the training feature matrix by placing every feature block side by
# side (column-wise); the target is the rent frame.
train_feature_blocks = [
    house_age,
    area_size,
    room_arrange_scores,
    contract_span,
    floor_scores,
    Floor_scores,
    wards,
    stations,
    minits,
]
X_train = pd.concat(train_feature_blocks, axis=1)
y_train = rent

In [101]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, with a fixed random_state.
# NOTE(review): this rebinds X_train / y_train, so re-running this cell without
# rebuilding X_train first splits the already-reduced training set again.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

テストデータ作成

In [947]:
# Load the previously extracted test-set features, one CSV per feature,
# in the same order as the names on the left-hand side.
(
    test_area_size,
    test_house_age,
    test_n_floor,
    test_room_arrange,
    test_contract_span,
) = map(
    pd.read_csv,
    [
        'test_area_size.csv',
        'test_house_age.csv',
        'test_n_floor.csv',
        'test_room_arrange.csv',
        'test_contract_span.csv',
    ],
)

In [948]:
import re

# Replace every '所在階' entry of the test set with the list of digit runs it
# contains (re.findall(r"\d+", ...)); non-string entries such as NaN become NaN.
# The whole column is assigned at once instead of writing element-by-element
# through chained indexing, and entries that are already lists are kept, so
# re-running the cell does not wipe the parsed values.
test_n_floor["所在階"] = pd.Series(
    [
        cell if isinstance(cell, list)
        else re.findall(r"\d+", cell) if isinstance(cell, str)
        else np.nan
        for cell in test_n_floor["所在階"]
    ],
    index=test_n_floor.index,
    dtype=object,
)

In [949]:
# Turn each parsed '所在階' entry of the test set into two numeric features:
#   test_floor_scores -> int of element 0 (saved as '所在階'),
#   test_Floor_scores -> int of element 1 (saved as '全体の階数').
# A missing or unparsable element becomes NaN. Every row yields exactly one
# value per feature so the result stays row-aligned with the other features.
test_floor_scores = []
test_Floor_scores = []
for digits in test_n_floor["所在階"]:
    parsed = []
    for position in (0, 1):
        try:
            parsed.append(int(digits[position]))
        except (TypeError, IndexError, ValueError):
            # TypeError: NaN / non-subscriptable entry; IndexError: fewer than
            # two numbers; ValueError: element is not an integer string.
            parsed.append(np.nan)
    test_floor_scores.append(parsed[0])
    test_Floor_scores.append(parsed[1])

test_floor_scores = pd.Series(test_floor_scores,name='所在階')
test_floor_scores.to_csv('test_floor_scores.csv',index=False)
test_Floor_scores = pd.Series(test_Floor_scores,name='全体の階数')
test_Floor_scores.to_csv('test_capital_floor_scores.csv',index=False)

In [950]:
# Score each test-set floor-plan string: every digit contributes its numeric
# value and every 'L', 'D', 'K' or 'S' contributes 1; other characters are ignored.
test_room_arrange_scores = pd.Series(
    data=[
        sum(
            int(ch) if ch.isdigit() else 1
            for ch in layout
            if ch.isdigit() or ch in ('L', 'D', 'K', 'S')
        )
        for layout in test_room_arrange['間取り']
    ],
    name='間取り得点',
)
test_room_arrange_scores.to_csv('test_room_arrange_scores.csv',index=False)

In [951]:
# Assemble the test feature matrix column-wise, in the same feature order as
# the training matrix.
test_feature_blocks = [
    test_house_age,
    test_area_size,
    test_room_arrange_scores,
    test_contract_span,
    test_floor_scores,
    test_Floor_scores,
    test_wards,
    test_stations,
    test_minits,
]
X_test = pd.concat(test_feature_blocks, axis=1)

lightGBMに「築年数」「面積」「間取り」「契約期間」「その部屋のある階数」「全体の階数」「所在地」「最寄り駅」「最寄り駅までの所要時間」をいれる。

In [952]:
# Columns passed to LightGBM as categorical features (ward and nearest station codes).
category_lists = ['市区町村','最寄り駅']

# Training and validation datasets; the validation set references the training
# set via `reference=`.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Regression objective evaluated with RMSE, plus a very small L2 penalty.
params = {
    'objective':'regression',
    'metrics':'rmse',
    'lambda_l2':0.00001
}

# Train for up to 3000 rounds, logging every 10 rounds for both the training and
# validation sets, and stop once the validation score has not improved for 10 rounds.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` are keyword arguments of
# older LightGBM releases; newer releases expect the equivalent callbacks instead —
# confirm against the installed version before upgrading.
model = lgb.train(
                    params,
                    lgb_train, 
                    valid_sets=[lgb_train, lgb_eval], 
                    verbose_eval=10, 
                    num_boost_round=3000, 
                    early_stopping_rounds=10,
                    categorical_feature = category_lists
                    )

# Predict on the test features using the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

New categorical_feature is ['市区町村', '最寄り駅']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 22029, number of used features: 9
[LightGBM] [Info] Start training from score 118651.337373
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 35666.5	valid_1's rmse: 40678.1
[20]	training's rmse: 22792.8	valid_1's rmse: 30744.4
[30]	training's rmse: 18244.7	valid_1's rmse: 27457.7
[40]	training's rmse: 16313.8	valid_1's rmse: 26121.2
[50]	training's rmse: 15124.1	valid_1's rmse: 25296.8
[60]	training's rmse: 14285.3	valid_1's rmse: 24719.4
[70]	training's rmse: 13634.6	valid_1's rmse: 24307.2
[80]	training's rmse: 13096.2	valid_1's rmse: 24029.2
[90]	training's rmse: 12623.5	valid_1's rmse: 23792
[100]	training's rmse: 12203	valid_1's rmse: 23589
[110]	training's rmse: 11833.7	valid_1's rmse: 23428.9
[120]	training's rmse: 11539.1	valid_1's rm

スコアは下がったが、まだ過学習気味である。

In [953]:
# Wrap the predictions in a DataFrame so they can be concatenated with the ids.
# NOTE(review): this rebinds y_pred, so the cell is not idempotent in name only
# (re-wrapping a DataFrame in a DataFrame) — re-running is harmless but redundant.
y_pred = pd.DataFrame(y_pred)

In [954]:
# Write the bare predictions (with the index column) to result.csv.
# NOTE(review): a later cell writes result.csv again with the ids attached,
# so this file is overwritten; this write looks redundant.
y_pred.to_csv('result.csv')

In [955]:
# Take the id column from whatever frame `df` currently holds.
# NOTE(review): this only yields the test ids because the 'アクセス' cell loaded
# test.csv into `df` last — hidden state; confirm `df` is the test set here.
# NOTE(review): the name `id` shadows the Python builtin of the same name.
id = df['id']

In [956]:
# Place the ids and the predictions side by side (column-wise, aligned on index).
result = pd.concat([id, y_pred],axis=1)

In [957]:
# Write the final id / prediction table to result.csv without the index column.
result.to_csv('result.csv',index=False)