In [1]:
import glob
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
# 指定したファイルパスのcsvファイルを全部読み込む関数
def read_csv_all(target_path = '../input/'):
    for file in glob.glob(target_path+'*.csv'):
        file_name = file.replace(target_path,'').replace('.csv','')
        exec('{} = pd.read_csv(\'{}\')'.format(file_name, file), globals())
        print('read {} as {}'.format(file, file_name))

In [3]:
def encode_categorical(df, cols):
    for col in cols:
        le = LabelEncoder()
        # nullを文字列に置換
        df[col].fillna("missing", inplace=True)
        df[col] = le.fit_transform(df[col])

    return df

In [4]:
# 指定したファイルパスのcsvファイルを全部読み込む関数（train用）
def read_csv_train(target_path = '../input/train/'):
    dfs = [pd.read_csv(f) for f in glob.glob(target_path+'*.csv')]
    finaldf = pd.concat(dfs, axis=0)
    return finaldf

In [43]:
read_csv_all()
train = read_csv_train()

read ../input/test.csv as test
read ../input/sample_submission.csv as sample_submission


In [44]:
pd.set_option('display.max_columns', None)

In [45]:
feature_name = ['feature_{}'.format(i + 1) for i in range(train.shape[1] - 2)]
col_name_test = ['ID'] + feature_name
col_name_train = col_name_test + ['target']
train.columns = col_name_train
test.columns = col_name_test

In [46]:
# nullと1種類の変数しかない列を排除
feature_except = ['feature_1', 'feature_2', 'feature_12', 'feature_13', 'feature_18', 'feature_19', 'feature_20']
train = train.drop(feature_except, axis=1)
test = test.drop(feature_except, axis=1)

In [54]:
test['target'] = 0
train['part'] = 'train'
test['part'] = 'test'
df= pd.concat([train, test], axis=0)

In [55]:
# int型とstr型で混合しているものを明示的にstr型に変換
df['feature_10'] = df['feature_10'].astype(str)

In [56]:
# 文字列が入ってるものはすべてカテゴリカルに変換
# feature_8(最寄り駅：距離)、feature10（面積）は本来数値型として扱いたいが、最初はカテゴリカル変数として扱う
df = encode_categorical(
    df, ['feature_3', 'feature_4','feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10','feature_11','feature_14','feature_15','feature_16','feature_17','feature_21','feature_24','feature_25','feature_26']
)

In [57]:
train = df[df['part'] == 'train']
test = df[df['part'] == 'test']

In [66]:
train_y = train["target"]
train_x = train.iloc[:,1:train.shape[1] - 2]
test_x = test.iloc[:,1:test.shape[1] - 2]

In [67]:
# バリデーション用にデータを分割する
tr_x, va_x, tr_y, va_y = train_test_split(train_x , train_y,
                                         test_size=0.25, random_state=71, shuffle=True)

In [68]:
# lightgbmで学習してみる
## lightgbmの構造に変換
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

In [69]:
## ハイパーパラメータの設定
params = {'objective': 'regression', 'seed': 71, 'verbose': 0, 'metrics': 'rmse'}
num_round = 100

In [70]:
## カテゴリカル変数の指定
categorical_features = ['feature_3', 'feature_4','feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10','feature_11','feature_14','feature_15','feature_16','feature_17','feature_21','feature_24','feature_25','feature_26']
model = lgb.train(params, lgb_train, num_boost_round=num_round,
                  categorical_feature=categorical_features,
                  valid_names=['tarin', 'valid'], valid_sets=[lgb_train, lgb_eval])

[1]	tarin's rmse: 0.330216	valid's rmse: 0.32931
[2]	tarin's rmse: 0.309262	valid's rmse: 0.308187
[3]	tarin's rmse: 0.290913	valid's rmse: 0.289695
[4]	tarin's rmse: 0.274704	valid's rmse: 0.273354
[5]	tarin's rmse: 0.260519	valid's rmse: 0.259045
[6]	tarin's rmse: 0.248378	valid's rmse: 0.246753
[7]	tarin's rmse: 0.237332	valid's rmse: 0.235619
[8]	tarin's rmse: 0.227872	valid's rmse: 0.226083
[9]	tarin's rmse: 0.219293	valid's rmse: 0.217415
[10]	tarin's rmse: 0.21151	valid's rmse: 0.209529
[11]	tarin's rmse: 0.205179	valid's rmse: 0.203132
[12]	tarin's rmse: 0.199199	valid's rmse: 0.19705
[13]	tarin's rmse: 0.193688	valid's rmse: 0.191492
[14]	tarin's rmse: 0.18892	valid's rmse: 0.186693
[15]	tarin's rmse: 0.184839	valid's rmse: 0.182623
[16]	tarin's rmse: 0.181177	valid's rmse: 0.178936
[17]	tarin's rmse: 0.177737	valid's rmse: 0.175499
[18]	tarin's rmse: 0.174622	valid's rmse: 0.172373
[19]	tarin's rmse: 0.171927	valid's rmse: 0.169674
[20]	tarin's rmse: 0.169467	valid's rmse: 0.

In [72]:
preds = model.predict(test_x)

In [82]:
test['取引価格（総額）_log'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [87]:
submit_file = test[['ID', '取引価格（総額）_log']]
submit_file.to_csv('../output/submission_baseline.csv', index=False)