In [None]:
# Show CPU details of the machine running this notebook (IPython shell escape).
# NOTE(review): `lscpu` is a Linux utility; presumably this cell fails on other platforms.
!lscpu

In [None]:
# 深さ1のLightGBM回帰の可視化

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [None]:
# Load the housing dataset from the UCI repository.
# The file is whitespace-delimited and has no header row, so the 14 column
# names are supplied explicitly.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    sep=r"\s+",
    names=[
        "CRIM",
        "ZN",
        "INDUS",
        "CHAS",
        "NOX",
        "RM",
        "AGE",
        "DIS",
        "RAD",
        "TAX",
        "PTRATIO",
        "B",
        "LSTAT",
        "MEDV",
    ],
)
df.head()

In [None]:
# Choose the feature (RM) and the target (MEDV).
# `.loc` slicing is label-based and inclusive, so labels 0..99 are selected.
head_rows = slice(None, 99)
X_train, y_train = df.loc[head_rows, ["RM"]], df.loc[head_rows, "MEDV"]
print("X_train:", X_train.head(3))
print("y_train:", y_train.head(3))

In [None]:
# Hyperparameters for the depth-1 demonstration model
import lightgbm as lgb

lgb_train = lgb.Dataset(data=X_train, label=y_train)

# A single-split tree (max_depth=1) with the leaf/bin minimums set to 1;
# the seed is fixed and verbosity is set to -1.
params = dict(
    objective="mse",
    metric="mse",
    learning_rate=0.8,
    max_depth=1,
    min_data_in_leaf=1,
    min_data_in_bin=1,
    max_bin=100,
    seed=0,
    verbose=-1,
)

In [None]:
# Train the model: one boosting round, evaluated on the training set itself
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1,
    valid_sets=[lgb_train],
    valid_names=["train"],
)

In [None]:
# Predict on the training data and report the mean squared error
y_train_pred = model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print("MSE train: %.2f" % train_mse)

In [None]:
# Display the raw predictions for the training rows
# (the same `model.predict(X_train)` call as in the evaluation cell above).
model.predict(X_train)

In [None]:
# Visualize the tree at index 0 of the trained model
lgb.plot_tree(model, tree_index=0, figsize=(8, 6))

In [None]:
# Plot the training data together with the model's predictions
plt.figure(figsize=(8, 4))
X = X_train.values.flatten()
y = y_train.values

# Evaluate the model on a dense grid (step 0.01) spanning the observed RM
# values; the trailing np.newaxis turns the grid into a single-column 2-D array.
X_plt = np.arange(X.min(), X.max(), 0.01)[:, np.newaxis]
y_pred = model.predict(X_plt)

plt.scatter(X, y, color="blue", label="data")
plt.plot(X_plt, y_pred, color="red", label="LightGBM")
# The target column is named MEDV (the label previously read "MDEV").
plt.ylabel("Price in $1000s [MEDV]")
plt.xlabel("average number of rooms [RM]")
plt.title("Boston house-prices")
plt.legend(loc="upper right")
plt.show()

In [None]:
# LightGBM回帰の学習→予測→評価

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [None]:
# Load the housing dataset from the UCI repository.
# The file is whitespace-delimited and has no header row, so the 14 column
# names are supplied explicitly.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
    header=None,
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    sep=r"\s+",
    names=[
        "CRIM",
        "ZN",
        "INDUS",
        "CHAS",
        "NOX",
        "RM",
        "AGE",
        "DIS",
        "RAD",
        "TAX",
        "PTRATIO",
        "B",
        "LSTAT",
        "MEDV",
    ],
)
df.head()

In [None]:
# Separate the target column (MEDV) from the remaining feature columns
target_column = "MEDV"
y = df[target_column]
X = df.drop(columns=[target_column])
X.head()

In [None]:
# Split into training and test sets (test_size=0.2), shuffling with a fixed
# random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0
)
print("X_trainの形状:", X_train.shape)
print("y_trainの形状:", y_train.shape)
print("X_testの形状:", X_test.shape)
print("y_testの形状:", y_test.shape)

In [None]:
# Hyperparameters for the full-feature model
import lightgbm as lgb

lgb_train = lgb.Dataset(data=X_train, label=y_train)

# Squared-error objective, num_leaves set to 5, fixed seed, verbosity -1.
params = dict(
    objective="mse",
    num_leaves=5,
    seed=0,
    verbose=-1,
)

In [None]:
# Train the model for 50 boosting rounds, logging the training metric
# every 10 rounds.
log_every_10_rounds = lgb.log_evaluation(10)
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=50,
    valid_sets=[lgb_train],
    valid_names=["train"],
    callbacks=[log_every_10_rounds],
)

In [None]:
# Predict on the held-out test data and report the RMSE
y_test_pred = model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_test_pred)
print("RMSE test: %.2f" % test_rmse)

In [None]:
# Bar chart of gain-based feature importances, largest first
importances = model.feature_importance(importance_type="gain")
indices = np.flip(np.argsort(importances))  # ascending argsort, reversed

bar_positions = range(len(indices))
ranked_importances = importances[indices]
ranked_names = X.columns[indices]

plt.figure(figsize=(8, 4))
plt.bar(bar_positions, ranked_importances)
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.title("Feature importances")
plt.show()

In [None]:
# Visualize the tree at index 0 (the first tree of the trained model)
lgb.plot_tree(model, tree_index=0, figsize=(10, 10))

In [None]:
# Visualize the tree at index -1.
# NOTE(review): presumably a negative index selects the last boosted tree — confirm
# against the installed LightGBM version.
lgb.plot_tree(model, tree_index=-1, figsize=(10, 10))

In [None]:
# Create the SHAP explainer for the trained tree model.
# No background dataset is passed; feature_perturbation is set to
# "tree_path_dependent".
import shap

explainer = shap.TreeExplainer(model=model, feature_perturbation="tree_path_dependent")

In [None]:
# Compute SHAP values for every record in the test set
shap_values = explainer(X_test)

In [None]:
# Expected value of the explainer (the baseline that per-record contributions
# are added to in the sum check further down)
explainer.expected_value

In [None]:
y_test_pred

In [None]:
# SHAP explanation for the 15th test record (positional index 14)
shap_values[14]

In [None]:
# Per-feature contributions (SHAP values) of the 15th test record
shap_values.values[14]

In [None]:
# Expected (base) value plus the sum of the 15th record's contributions.
# NOTE(review): by SHAP additivity this is expected to match y_test_pred[14] — compare
# with the next cell.
shap_values[14].base_values + shap_values[14].values.sum()

In [None]:
# Model prediction for the 15th test record
y_test_pred[14]

In [None]:
# Waterfall plot of the SHAP values for the 15th test record
shap.plots.waterfall(shap_values[14])

In [None]:
# Waterfall plot of the SHAP values for the 11th test record
shap.plots.waterfall(shap_values[10])

In [None]:
# Bar plot of feature importance derived from the SHAP values of the test set
shap.plots.bar(shap_values=shap_values)