In [1]:
import glob
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
# 指定したファイルパスのcsvファイルを全部読み込む関数
def read_csv_all(target_path = '../input/'):
    for file in glob.glob(target_path+'*.csv'):
        file_name = file.replace(target_path,'').replace('.csv','')
        exec('{} = pd.read_csv(\'{}\')'.format(file_name, file), globals())
        print('read {} as {}'.format(file, file_name))

In [3]:
def encode_categorical(df, cols):
    for col in cols:
        le = LabelEncoder()
        # nullを文字列に置換
        df[col].fillna("missing", inplace=True)
        df[col] = le.fit_transform(df[col])

    return df

In [4]:
# 指定したファイルパスのcsvファイルを全部読み込む関数（train用）
def read_csv_train(target_path = '../input/train/'):
    dfs = [pd.read_csv(f) for f in glob.glob(target_path+'*.csv')]
    finaldf = pd.concat(dfs, axis=0)
    return finaldf

In [17]:
read_csv_all()
train = read_csv_train()

read ../input/test.csv as test
read ../input/sample_submission.csv as sample_submission


In [18]:
pd.set_option('display.max_columns', None)

In [19]:
feature_name = ['feature_{}'.format(i + 1) for i in range(train.shape[1] - 2)]
col_name_test = ['ID'] + feature_name
col_name_train = col_name_test + ['target']
train.columns = col_name_train
test.columns = col_name_test

In [20]:
# nullと1種類の変数しかない列を排除
feature_except = ['feature_1', 'feature_2', 'feature_12', 'feature_13', 'feature_18', 'feature_19', 'feature_20']
train = train.drop(feature_except, axis=1)
test = test.drop(feature_except, axis=1)

In [21]:
# int型とstr型で混合しているものを明示的にstr型に変換
train['feature_10'] = train['feature_10'].astype(str)

In [22]:
train['part'] = 'train'
test['part'] = 'test'

In [23]:
train_y = df["target"]
train_x = df.iloc[:,1:df.shape[1] - 1]

In [24]:
train_x.head()

Unnamed: 0,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_14,feature_15,feature_16,feature_17,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26
0,579,34,338,6213,908,18,38,88,0,8,5,6,0,14,60.0,200.0,26,1,0
1,556,34,104,1107,3072,2,48,109,0,27,5,6,3,11,60.0,200.0,41,1,0
2,563,34,492,2947,301,28,26,65,0,31,9,6,0,1,80.0,500.0,18,2,0
3,561,34,100,2699,3812,20,38,88,0,5,5,0,3,9,60.0,200.0,50,2,0
4,568,34,494,8749,3246,8,38,7,0,14,5,0,3,1,80.0,400.0,55,2,0


In [11]:
# 文字列が入ってるものはすべてカテゴリカルに変換
# feature_8(最寄り駅：距離)、feature10（面積）は本来数値型として扱いたいが、最初はカテゴリカル変数として扱う
df = encode_categorical(
    df, ['feature_3', 'feature_4','feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10','feature_11','feature_14','feature_15','feature_16','feature_17','feature_21','feature_24','feature_25','feature_26']
)

In [13]:
# バリデーション用にデータを分割する
tr_x, va_x, tr_y, va_y = train_test_split(train_x , train_y,
                                         test_size=0.25, random_state=71, shuffle=True)

In [14]:
# lightgbmで学習してみる
## lightgbmの構造に変換
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

In [15]:
## ハイパーパラメータの設定
params = {'objective': 'regression', 'seed': 71, 'verbose': 0, 'metrics': 'rmse'}
num_round = 100

In [16]:
## カテゴリカル変数の指定
categorical_features = ['feature_3', 'feature_4','feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10','feature_11','feature_14','feature_15','feature_16','feature_17','feature_21','feature_24','feature_25','feature_26']
model = lgb.train(params, lgb_train, num_boost_round=num_round,
                  categorical_feature=categorical_features,
                  valid_names=['tarin', 'valid'], valid_sets=[lgb_train, lgb_eval])

New categorical_feature is ['feature_10', 'feature_11', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_21', 'feature_24', 'feature_25', 'feature_26', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	tarin's rmse: 0.330216	valid's rmse: 0.32931
[2]	tarin's rmse: 0.309262	valid's rmse: 0.308187
[3]	tarin's rmse: 0.290913	valid's rmse: 0.289695
[4]	tarin's rmse: 0.274704	valid's rmse: 0.273354
[5]	tarin's rmse: 0.260519	valid's rmse: 0.259045
[6]	tarin's rmse: 0.248378	valid's rmse: 0.246753
[7]	tarin's rmse: 0.237332	valid's rmse: 0.235619
[8]	tarin's rmse: 0.227872	valid's rmse: 0.226083
[9]	tarin's rmse: 0.219293	valid's rmse: 0.217415
[10]	tarin's rmse: 0.21151	valid's rmse: 0.209529
[11]	tarin's rmse: 0.205179	valid's rmse: 0.203132
[12]	tarin's rmse: 0.199199	valid's rmse: 0.19705
[13]	tarin's rmse: 0.193688	valid's rmse: 0.191492
[14]	tarin's rmse: 0.18892	valid's rmse: 0.186693
[15]	tarin's rmse: 0.184839	valid's rmse: 0.182623
[16]	tarin's rmse: 0.181177	valid's rmse: 0.178936
[17]	tarin's rmse: 0.177737	valid's rmse: 0.175499
[18]	tarin's rmse: 0.174622	valid's rmse: 0.172373
[19]	tarin's rmse: 0.171927	valid's rmse: 0.169674
[20]	tarin's rmse: 0.169467	valid's rmse: 0.

In [37]:
df.head()

Unnamed: 0,ID,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_14,feature_15,feature_16,feature_17,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,target
0,40108844,579,34,338,昇町,博多南,23,３ＬＤＫ,70,0,8,5,6,0,14,60.0,200.0,26,1,0,7.230449
1,40001245,556,34,104,久岐の浜,若松,10,４ＬＤＫ,95,0,27,5,6,3,11,60.0,200.0,41,1,0,6.875061
2,40068753,563,34,492,古門戸町,中洲川端,4,２ＬＤＫ,50,0,31,9,6,0,1,80.0,500.0,18,2,0,7.041393
3,40009332,561,34,100,南王子町,黒崎,25,３ＬＤＫ,70,0,5,5,0,3,9,60.0,200.0,50,2,0,7.041393
4,40030654,568,34,494,百道浜,西新,16,３ＬＤＫ,120,0,14,5,0,3,1,80.0,400.0,55,2,0,8.0


In [107]:
test.head()

Unnamed: 0,ID,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_14,feature_15,feature_16,feature_17,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26
0,1000000,1101,北海道,札幌市中央区,旭ケ丘,円山公園,26,３ＬＤＫ,75,,昭和64年,ＲＣ,,住宅,第１種低層住居専用地域,40.0,60.0,2020年第２四半期,未改装,
1,1000056,1101,北海道,札幌市中央区,大通西,西１１丁目,1,２ＬＤＫ,55,,平成28年,ＲＣ,,住宅,商業地域,80.0,600.0,2020年第１四半期,未改装,
2,1000108,1101,北海道,札幌市中央区,大通西,西１８丁目,2,１Ｒ,15,,昭和64年,ＳＲＣ,,住宅,商業地域,80.0,400.0,2020年第２四半期,未改装,
3,1000109,1101,北海道,札幌市中央区,大通西,西１８丁目,2,１ＬＤＫ,45,,平成3年,ＳＲＣ,,住宅,商業地域,80.0,400.0,2020年第２四半期,改装済,
4,1000110,1101,北海道,札幌市中央区,大通西,西１８丁目,3,１Ｒ,20,,昭和56年,ＲＣ,,住宅,商業地域,80.0,400.0,2020年第２四半期,,


In [106]:
preds = model.predict(test_x)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: feature_4, feature_5, feature_6, feature_7, feature_8, feature_9, feature_14, feature_15, feature_16, feature_17, feature_21, feature_24, feature_25, feature_26