In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the dataset. The raw file has no header row, so the column names are
# handed to the reader directly instead of being assigned afterwards.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ],
)
df.head()

In [None]:
# Shape of the data as (rows, columns)
df.shape

In [None]:
# Check for missing data: number of null entries per column
# NOTE(review): isnull() only flags NaN/None; if the raw file marks missing
# entries with a placeholder string, they are not counted here — verify
# against the unique values printed later.
df.isnull().sum()

In [None]:
# Overview of the data (column dtypes and non-null counts)
df.info()

In [None]:
# EDA of the numeric variables
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().T

In [None]:
# Summary statistics of the categorical (non-numeric) columns, one row per column
df.describe(exclude="number").T

In [None]:
# データの前処理

In [None]:
# Remove half-width spaces from every object-dtype (string) column.
# cat_cols is reused by the following cell.
cat_cols = df.select_dtypes(include=["object"]).columns

df[cat_cols] = df[cat_cols].apply(lambda values: values.str.replace(" ", ""))

In [None]:
# Print the distinct values of each categorical column.
for name in cat_cols:
    distinct = df[name].unique().tolist()
    print(f"{name}: {distinct}")

In [None]:
# Narrow the records down to United-States only. After the filter the
# country column holds a single value, so it is dropped, and the index is
# renumbered from 0.
is_us = df["native-country"] == "United-States"
df = df.loc[is_us].drop(columns=["native-country"]).reset_index(drop=True)
df.shape

In [None]:
# Create the target label: 1 when income is ">50K", otherwise 0.
# The conversion is guarded so that re-running this cell is a no-op: once the
# column is numeric, comparing it with ">50K" again would silently turn every
# label into 0.
if not pd.api.types.is_numeric_dtype(df["income"]):
    df["income"] = (df["income"] == ">50K").astype("int64")

In [None]:
# Check the dataset after preprocessing: shape and first rows
print(df.shape)
df.head()

In [None]:
# Separate the target label (y) from the feature columns (X).
y = df["income"]
X = df.drop(columns="income")

In [None]:
# Split into training data and test data (80/20, stratified on the label).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=y,
)
# Report the shape of each split.
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name}の形状:", split.shape)

In [None]:
# Label-encode the categorical variables.
from sklearn.preprocessing import LabelEncoder

# Work on explicit copies: the frames returned by train_test_split are
# selections of X, and assigning columns into them directly triggers pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

cat_cols = X.select_dtypes(include=["object"]).columns
for col in cat_cols:
    # Fit on the training split only, then apply the same mapping to both
    # splits so the test data does not influence the encoding.
    le = LabelEncoder()
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    # NOTE: transform() raises ValueError if the test split contains a value
    # that never appears in the training split.
    X_test[col] = le.transform(X_test[col])

In [None]:
# Convert the categorical columns to the pandas "category" dtype.
# astype() with a column -> dtype mapping returns a new frame, which avoids
# assigning columns into the frames produced by train_test_split (a source of
# SettingWithCopyWarning).
category_dtypes = {col: "category" for col in cat_cols}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)
X_train.info()

In [None]:
# Build the LightGBM training dataset and set the hyperparameters.
import lightgbm as lgb

lgb_train = lgb.Dataset(data=X_train, label=y_train)

params = dict(
    objective="binary",
    num_leaves=5,
    seed=0,
    verbose=-1,
)

In [None]:
# Train the model. The training set itself is registered as the only
# validation set (named "train"), so the logged metric is the training metric.
log_callback = lgb.log_evaluation(100)
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_train],
    valid_names=["train"],
    callbacks=[log_callback],
)

In [None]:
# Predict on the test data and evaluate.
y_test_pred_proba = model.predict(X_test)
print(f"ラベル1の確率：{y_test_pred_proba}")
# Round the probabilities to obtain hard 0/1 labels.
y_test_pred = np.round(y_test_pred_proba)
print(f"予測ラベル：{y_test_pred}")

ac_score = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {ac_score:.2f}")
# Keep the result under its own name: assigning it to `f1_score` would shadow
# the imported sklearn function and make this cell fail with TypeError when it
# is run a second time.
f1 = f1_score(y_test, y_test_pred)
print(f"F1 Score: {f1:.2f}")

In [None]:
# Build the confusion matrix and draw it as an annotated heatmap.
cm = confusion_matrix(y_test, y_test_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
# Visualize the first tree of the trained model (tree_index=0)
lgb.plot_tree(model, tree_index=0, figsize=(20, 20))

In [None]:
# SHAPによる予測値の説明

In [None]:
# Predicted probability of the record that is explained below: the
# third-from-last test record.
# NOTE(review): the original note calls this the "5832nd" record; that number
# depends on the size of the test split — confirm against len(X_test).
y_test_pred_proba[-3]

In [None]:
# Create the explainer for the trained tree model.
import shap

explainer = shap.TreeExplainer(
    model,
    feature_perturbation="tree_path_dependent",
)

In [None]:
# Compute the SHAP values for the test data
shap_values = explainer(X_test)

In [None]:
# Expected value (base value) reported by the explainer
explainer.expected_value

In [None]:
# SHAP values of the third-from-last test record
shap_values[-3]

In [None]:
# Narrow shap_values down to label 1 (the positive class).
# Depending on the shap version, a binary LightGBM model yields either one
# set of contributions (2-D values, scalar expected value) or one set per
# class (3-D values, one expected value per class). Both layouts are handled,
# and the cell can be re-run safely.
if shap_values.values.ndim == 3:
    shap_values.values = shap_values.values[..., -1]  # contributions for label 1
# np.ravel turns a scalar or a per-class list into a 1-D array; the last entry
# is the expected value for label 1 in both layouts.
shap_values.base_values = np.ravel(explainer.expected_value)[-1]  # expected value

In [None]:
# SHAP values of the third-from-last record (label 1)
shap_values[-3]

In [None]:
# Per-feature contributions of the third-from-last record
shap_values.values[-3]

In [None]:
# Sum of the contributions of the third-from-last record
shap_values.values[-3].sum()

In [None]:
# Expected value + sum of the contributions of the third-from-last record
shap_values[-3].base_values + shap_values.values[-3].sum()

In [None]:
# Probability of label 1 for the third-from-last record
# The expected value plus the summed contributions is passed to sigmoid()
# below, i.e. it is treated as a logit.
logit = shap_values[-3].base_values + shap_values.values[-3].sum()


def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``.

    ``z`` may be a scalar or a NumPy array (applied element-wise).

    The value is computed as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp``,
    which is mathematically identical but never evaluates ``exp`` of a large
    positive number, so very negative inputs return 0.0 without an overflow
    RuntimeWarning.
    """
    return np.exp(-np.logaddexp(0, -z))


# Convert the logit of the third-from-last record into a probability
sigmoid(logit)

In [None]:
y_test_pred_proba[-3]  # probability of label 1 predicted by the model

In [None]:
# Visualize the SHAP values of the third-from-last record as a waterfall plot
shap.plots.waterfall(shap_values[-3])

In [None]:
# Visualize feature importance as a bar plot of the SHAP values
shap.plots.bar(shap_values=shap_values)