# boston価格データでモデルのテスト

boston価格データを利用してモデルのテストを行う。

クロスバリデーションなども利用できるか試してみよう。

In [45]:
import pandas as pd
import numpy as np
import pandas_profiling
import lightgbm as lgb
import optuna
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [5]:
# データを読み込む
from sklearn.datasets import load_boston
boston = load_boston()

In [6]:
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [7]:
# データフレームに変えて内容を確認する
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['住宅価格'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,住宅価格
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


説明変数の内容は次の通りである。

- CRIM:犯罪発生率
- ZN:住居区画の密集度
- INDUS:非小売業の土地割合
- CHAS:チャールズ川(1:川の周辺,0:それ以外)
- NOX:NOX濃度
- RM:平均部屋数
- AGE:1940年以前に建てられた物件割合
- DIS:5つのボストン市の雇用施設からの重み付き距離
- RAD:大きな道路へのアクセスのしやすさ
- TAX:10000ドルあたりの所得税率
- PTRATIO:教師当たりの生徒数
- B:黒人の比率
- LSTAT:低所得者の割合

In [33]:
# 今回は自前で訓練データとテストデータに分けていく
train = df.drop(['住宅価格'],axis = 1)
target = df['住宅価格'].values
target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [9]:
train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [10]:
target

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: 住宅価格, Length: 506, dtype: float64

In [46]:
# # 各foldのスコアを保存するリスト
# 正解率:回帰は使えない
scores_accuracy = []
# logloss:モデルの性能
scores_logloss = []
# mean_squared_error
scores_RMSE = []

In [47]:
# 実行する
# kfoldを用いる
kf = KFold(n_splits=4,shuffle=True,random_state=42)
for tr_idx, va_idx in kf.split(train):
    # 学習用データを学習データとバリデーションデータに分ける
    X_train, X_valid = train.iloc[tr_idx], train.iloc[va_idx]
    Y_train, Y_valid = target[tr_idx], target[va_idx]
    
    # モデルの学習
    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_eval = lgb.Dataset(X_valid, Y_valid, reference=lgb_train)
    
    # パラメータチューニングの箇所はここになる
    params = {
    'objective':'regression'
    }
    
    model = lgb.train(params, lgb_train,
                valid_sets = [lgb_train, lgb_eval],
                verbose_eval = 10,
                num_boost_round = 1000,
                early_stopping_rounds = 10)
    # 検証データ
    y_pred_valid = model.predict(X_valid,num_iteration=model.best_iteration)
    
    # 検証データの正解と検証で予測したものを利用する
    # logloss = log_loss(Y_valid, y_pred_valid, labels=[0, 1])
    # accuracy = accuracy_score(Y_valid, y_pred_valid)
    RMSE = mean_squared_error(Y_valid, y_pred_valid)
    
    # そのfoldのスコアを保存する
    # scores_logloss.append(logloss)
    # scores_accuracy.append(accuracy)
    scores_RMSE.append(RMSE)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 978
[LightGBM] [Info] Number of data points in the train set: 379, number of used features: 13
[LightGBM] [Info] Start training from score 22.907916
Training until validation scores don't improve for 10 rounds
[10]	training's l2: 22.1844	valid_1's l2: 23.7342
[20]	training's l2: 10.1506	valid_1's l2: 15.2763
[30]	training's l2: 7.22528	valid_1's l2: 13.4463
[40]	training's l2: 5.60223	valid_1's l2: 12.6277
[50]	training's l2: 4.57255	valid_1's l2: 11.676
[60]	training's l2: 3.85951	valid_1's l2: 11.4449
[70]	training's l2: 3.29463	valid_1's l2: 11.2262
[80]	training's l2: 2.85561	valid_1's l2: 11.1978
[90]	training's l2: 2.54064	valid_1's l2: 11.1571
[100]	training's l2: 2.23364	valid_1's l2: 10.9377
Early stopping, best iteration is:
[96]	training's l2: 2.33298	valid_1's l2: 10.8697
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 979
[LightGBM] [Info] Number of

[80]	training's l2: 2.92362	valid_1's l2: 11.9758
[90]	training's l2: 2.53584	valid_1's l2: 11.9276
[100]	training's l2: 2.22561	valid_1's l2: 11.7487
[110]	training's l2: 1.95623	valid_1's l2: 11.5596
[120]	training's l2: 1.72289	valid_1's l2: 11.2963
Early stopping, best iteration is:
[119]	training's l2: 1.74195	valid_1's l2: 11.276


In [48]:
scores_RMSE

[10.869676710787271,
 12.249426283306038,
 11.075232341050562,
 11.275972877919894]

In [49]:
# 訓練データでちゃんとモデルを作成するってこと？
# この後にパラメーターいじったモデルでテストデータの結果を出せばよいのか
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'objective'
}

model = lgb.train(params, lgb_train,
                valid_sets = [lgb_train, lgb_eval],
                verbose_eval = 10,
                num_boost_round = 1000,
                early_stopping_rounds = 10)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)


TypeError: 'set' object does not support item assignment