In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

## データの抽出

In [2]:
# Load the UCI Wine dataset. The file has no header row, so pandas assigns
# integer column labels; the cells below treat column 0 as the class label
# and the remaining columns as features.
WINE_DATA_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
)
df = pd.read_csv(WINE_DATA_URL, header=None)

In [3]:
# Column 0 is the class label; every remaining column is a feature.
# Both arrays cover the whole dataset at this point (no train/test split yet).
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()
# Show the raw features with their per-column mean and standard deviation.
X, X.mean(axis=0), X.std(axis=0)

(array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 array([1.30006180e+01, 2.33634831e+00, 2.36651685e+00, 1.94949438e+01,
        9.97415730e+01, 2.29511236e+00, 2.02926966e+00, 3.61853933e-01,
        1.59089888e+00, 5.05808988e+00, 9.57449438e-01, 2.61168539e+00,
        7.46893258e+02]),
 array([8.09542915e-01, 1.11400363e+00, 2.73572294e-01, 3.33016976e+00,
        1.42423077e+01, 6.24090564e-01, 9.96048950e-01, 1.24103260e-01,
        5.70748849e-01, 2.31176466e+00, 2.27928607e-01, 7.0

## データセットを訓練データとテストデータに分割

In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [5]:
# Sanity-check the shapes of the four arrays produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((142, 13), (36, 13), (142,), (36,))

## データ加工

In [6]:
# Standardize each feature to mean 0 and standard deviation 1.
# The scaling statistics are learned from the training split only and then
# reused unchanged for the test split.
standard_scaler = StandardScaler().fit(X_train)
X_train_std = standard_scaler.transform(X_train)
X_test_std = standard_scaler.transform(X_test)

In [7]:
# Scaled training features with their per-column mean and standard deviation.
(X_train_std, np.mean(X_train_std, axis=0), np.std(X_train_std, axis=0))

(array([[ 0.87668336,  0.79842885,  0.64412971, ...,  0.0290166 ,
         -1.06412236, -0.2059076 ],
        [-0.36659076, -0.7581304 , -0.39779858, ...,  0.0290166 ,
         -0.73083231, -0.81704676],
        [-1.69689407, -0.34424759, -0.32337513, ...,  0.90197362,
          0.51900537, -1.31256499],
        ...,
        [-0.70227477, -0.68615078, -0.65828065, ...,  0.46549511,
          0.51900537, -1.31256499],
        [ 1.13777093, -0.62316862, -0.91876272, ..., -0.18922266,
          1.03282752,  0.80164614],
        [ 1.4610222 ,  0.12361993,  0.42085937, ..., -1.45501034,
         -1.2168803 , -0.2719767 ]]),
 array([ 7.03662480e-16, -1.28799553e-15,  3.00932987e-15,  2.66707626e-15,
        -1.79043009e-16,  6.91934772e-16,  6.17659288e-16, -6.86461842e-16,
         8.63941156e-16, -2.17353522e-16, -3.59649712e-17,  1.60591415e-15,
        -9.46035113e-17]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

In [8]:
# Test features scaled with the scaler fitted on the training split, plus their
# per-column mean and standard deviation.
(X_test_std, np.mean(X_test_std, axis=0), np.std(X_test_std, axis=0))

(array([[ 9.38847070e-01, -6.32166068e-01, -4.35010303e-01,
         -9.19695615e-01,  1.26324041e+00,  5.59998633e-01,
          9.77754158e-01, -1.20637533e+00,  2.36680192e-02,
          3.39284695e-01, -1.45574805e-01,  8.52295413e-01,
          1.04940526e+00],
        [-2.42263344e-01,  2.67579163e-01,  4.20859365e-01,
          7.12764102e-01,  8.40672358e-01, -1.27747161e+00,
         -6.05828120e-01, -9.70634096e-01, -5.87397203e-01,
          2.42611713e+00, -2.06608025e+00, -1.55017035e+00,
         -8.66598582e-01],
        [-7.64438475e-01, -1.11802849e+00, -7.69915825e-01,
         -1.61767889e-01, -9.20027861e-01,  2.03653722e+00,
          1.18341419e+00, -1.36353615e+00,  4.48018868e-01,
         -2.50930538e-01,  1.16386073e+00,  3.94021597e-01,
         -1.06480588e+00],
        [ 7.15057728e-01, -5.78181354e-01,  3.46435916e-01,
          2.75498106e-01,  1.12238439e+00,  1.15061407e+00,
          8.54358136e-01, -1.28495574e+00,  1.43251284e+00,
          5.0791761

## 予測モデルの指定

In [9]:
# Support vector classifier with an RBF kernel and regularization parameter C = 1.0.
model = SVC(C=1.0, kernel="rbf")

## 訓練データと損失関数を用いたモデルの学習

In [10]:
# Fit the classifier on the standardized training features and their labels.
model.fit(X=X_train_std, y=y_train)

## テストデータを用いたモデルの評価

In [11]:
# Predict labels for both splits so training and test accuracy can be compared.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_std, X_test_std)
)

In [12]:
# (training accuracy, test accuracy)
accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred)

(0.9929577464788732, 1.0)

## モデルの保存と読み込み

In [13]:
# load libraries
import pickle

In [14]:
# NOTE(review): the file name mentions LightGBM, but the object saved here is the
# SVC held in `model`; the path is left unchanged so existing files still resolve.
file_path = '../models/trained_LightGBMClassifier_model.pkl'
# Use a context manager so the file handle is flushed and closed right after the dump.
with open(file_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Delete the trained model from the namespace so the following cells can show
# that it has to be restored from disk.
del model

In [15]:
# Demonstration: `model` was deleted in the previous cell, so predicting must fail.
# The expected NameError is caught and displayed so that "Restart & Run All"
# does not stop at this cell.
try:
    y_train_pred = model.predict(X_train_std)
    y_test_pred = model.predict(X_test_std)
except NameError as err:
    print(f"{type(err).__name__}: {err}")

NameError: name 'model' is not defined

In [16]:
# Restore the model from disk. pickle.load can execute arbitrary code, so only
# load files produced by this notebook or another trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(file_path, 'rb') as model_file:
    model = pickle.load(model_file)

model

In [17]:
# Re-run the predictions with the model restored from the pickle file.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_std, X_test_std)
)

In [18]:
# (training accuracy, test accuracy) for the reloaded model.
accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred)

(0.9929577464788732, 1.0)

## モデルの保存（標準化のモデルも共有する必要あり）と読み込み

In [19]:
# The fitted scaler has to be saved alongside the model: new data must be
# standardized with the same statistics before it is passed to the classifier.
file_path = '../models/trained_LightGBMClassifier_standard_scaler.pkl'
# Use a context manager so the file handle is flushed and closed right after the dump.
with open(file_path, 'wb') as scaler_file:
    pickle.dump(standard_scaler, scaler_file)

# Drop the in-memory scaler so the next cell has to restore it from disk.
del standard_scaler

In [20]:
# Restore the fitted scaler from disk. pickle.load can execute arbitrary code,
# so only load files produced by this notebook or another trusted source.
with open(file_path, 'rb') as scaler_file:
    standard_scaler = pickle.load(scaler_file)

In [21]:
# Standardize the test features again, this time with the scaler restored from disk.
X_test_std = standard_scaler.transform(X=X_test)