In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import lightgbm

from sklearn.metrics import mean_squared_error

## データの抽出

In [2]:
# Load the housing-price data set from the UCI repository.
# The file has no header row and its columns are separated by runs of whitespace,
# so the separator is a regular expression. It is written as a raw string because
# "\s" is not a valid escape sequence in an ordinary string literal.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    sep=r"\s+",
)

In [3]:
# Columns 0-12 form the feature matrix; column 13 is the value to predict.
X, y = df.iloc[:, :13].values, df.iloc[:, 13].values

# Show the features together with their per-column mean and standard deviation.
(X, np.mean(X, axis=0), np.std(X, axis=0))

(array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 array([3.61352356e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
        5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
        9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
        1.26530632e+01]),
 array([8.59304135e+00, 2.32993957e+01, 6.85357058e+00, 2.53742935e-01,
        1.15763115e-01, 7.01922514e-01, 2.81210326e+01, 2.10362836e+00,
        8.69865112e+00,

## データセットを訓練データとテストデータに分割

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [5]:
# Shapes of the four arrays produced by the split, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((404, 13), (102, 13), (404,), (102,))

## データ加工

In [6]:
# Standardize every feature to zero mean and unit standard deviation.
# The scaler is fitted on the training split only and then reused for the test split.
standard_scaler = StandardScaler()
X_train_std = standard_scaler.fit(X_train).transform(X_train)
X_test_std = standard_scaler.transform(X_test)

In [7]:
# Standardized training features with their per-column mean and standard deviation.
(X_train_std, np.mean(X_train_std, axis=0), np.std(X_train_std, axis=0))

(array([[-0.37257438, -0.49960763, -0.70492455, ..., -0.48463784,
          0.3716906 , -0.41100022],
        [-0.39709866, -0.49960763, -0.04487755, ...,  0.33649132,
          0.20501196, -0.38768057],
        [-0.402693  ,  0.77116771, -0.88675963, ..., -0.84958414,
          0.36660893, -0.18191902],
        ...,
        [-0.39805586, -0.49960763, -0.15941933, ..., -0.30216469,
          0.40342278, -0.33006734],
        [-0.38842357, -0.49960763, -0.60326872, ..., -0.25654641,
          0.38343489,  0.8359148 ],
        [-0.39951258, -0.49960763, -1.01275558, ..., -0.84958414,
          0.43041207,  0.27212814]]),
 array([-2.33174316e-16, -6.23813432e-17,  2.69916103e-15, -3.95723058e-17,
        -9.89857261e-16, -1.47571724e-15,  8.65918998e-16,  2.00884414e-16,
        -1.18716917e-16,  2.91296140e-17,  1.65098958e-14,  7.99759049e-15,
        -1.85852434e-15]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

In [8]:
# Standardized test features with their per-column mean and standard deviation.
# These were scaled with the statistics of the training split.
(X_test_std, np.mean(X_test_std, axis=0), np.std(X_test_std, axis=0))

(array([[-0.40835869, -0.49960763, -1.12872913, ..., -0.71272928,
          0.18547577, -0.73610347],
        [ 0.71925111, -0.49960763,  0.9988844 , ...,  0.79267419,
          0.0831649 , -0.4356916 ],
        [-0.40257488, -0.49960763,  0.39610829, ..., -0.94082071,
          0.39472748, -0.30263246],
        ...,
        [-0.3982601 ,  0.55937182, -0.85812418, ...,  0.56458276,
          0.41019833,  0.06087961],
        [-0.39934279, -0.49960763, -0.07637654, ...,  0.0627816 ,
          0.30517724, -0.45626776],
        [-0.40088071, -0.49960763, -0.36702631, ...,  1.1120022 ,
          0.41166637, -0.05983383]]),
 array([ 0.13866949, -0.09055086,  0.00947364, -0.00267538, -0.05133271,
        -0.12271014, -0.07993312,  0.05920719,  0.08369368,  0.15678111,
        -0.01548507, -0.11828163, -0.03615108]),
 array([1.25248977, 0.92992239, 0.90326538, 0.99544914, 1.02902695,
        1.05088058, 1.00607704, 1.05843705, 1.02463579, 0.92140548,
        0.93171334, 1.13590961, 0.88821615

## 予測モデルの指定

In [12]:
# LightGBM regressor constructed with no arguments, i.e. with the library's default hyperparameters.
model = lightgbm.LGBMRegressor()

## 訓練データと損失関数を用いたモデルの学習

In [13]:
# Train the regressor on the standardized training features and their targets.
model.fit(X=X_train_std, y=y_train)

## テストデータを用いたモデルの評価

In [14]:
# Predict on both splits so training error and test error can be compared.
y_train_pred, y_test_pred = map(model.predict, (X_train_std, X_test_std))

In [15]:
# Mean squared error computed by hand: (training split, test split).
(
    ((y_train - y_train_pred) ** 2).mean(),
    ((y_test - y_test_pred) ** 2).mean(),
)

(1.9212381080320244, 24.5098863728449)

In [16]:
# Same metric via scikit-learn: (training split, test split).
(
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

(1.9212381080320244, 24.5098863728449)

## モデルの保存と読み込み

In [17]:
# load libraries
import pickle

In [18]:
# Persist the trained regressor to disk.
# The context manager closes (and flushes) the file even if pickling fails;
# passing a bare open(...) to pickle.dump would leave the handle unclosed.
file_path = '../models/trained_LightGBMRegressor_model.pkl'
with open(file_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Delete the in-memory model so the following cells have to rely on the saved file.
del model

In [19]:
# Demonstration: `model` was deleted, so using it raises NameError.
# The error is caught and displayed so that the notebook still runs top to bottom
# (Restart & Run All) instead of stopping at this cell.
try:
    y_train_pred = model.predict(X_train_std)
    y_test_pred = model.predict(X_test_std)
except NameError as exc:
    print(f"{type(exc).__name__}: {exc}")

NameError: name 'model' is not defined

In [20]:
# Restore the regressor from the pickle file; the context manager closes the file afterwards.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created
# yourself or otherwise trust.
with open(file_path, 'rb') as model_file:
    model = pickle.load(model_file)

model

In [21]:
# Predict again on both splits with the current `model` object.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_std, X_test_std)
)

In [22]:
# Mean squared error on the (training, test) splits for the latest predictions.
tuple(
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

(1.9212381080320244, 24.5098863728449)

## モデルの保存と読み込み（標準化のモデルも共有する必要あり）

In [23]:
# Persist the fitted scaler as well: new inputs must be standardized with the same
# fitted scaler before they are passed to the model.
# The context manager closes (and flushes) the file even if pickling fails.
file_path = '../models/trained_LightGBMRegressor_standard_scaler.pkl'
with open(file_path, 'wb') as scaler_file:
    pickle.dump(standard_scaler, scaler_file)

# Delete the in-memory scaler so the next cell has to restore it from the file.
del standard_scaler

In [24]:
# Restore the fitted scaler from the pickle file; the context manager closes the file afterwards.
# NOTE: only unpickle files from a trusted source, since unpickling can execute arbitrary code.
with open(file_path, 'rb') as scaler_file:
    standard_scaler = pickle.load(scaler_file)

In [25]:
# Standardize the test split again using the current `standard_scaler` object.
X_test_std = standard_scaler.transform(X=X_test)