In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the UCI Adult dataset. The file is read without a header row
# (header=None), so the column names are supplied explicitly.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ],
)

In [None]:
# Remove every ASCII space from the string (object-dtype) columns
cat_cols = df.select_dtypes(include=["object"]).columns

df[cat_cols] = df[cat_cols].apply(lambda values: values.str.replace(" ", ""))

In [None]:
# Keep only the United-States records, drop the country column that is now
# constant, and renumber the rows from zero.
is_united_states = df["native-country"] == "United-States"
df = df.loc[is_united_states].drop(columns="native-country").reset_index(drop=True)
df.shape

In [None]:
# Build the binary target label: 1 for income ">50K", 0 for "<=50K".
# The mapping also accepts values that are already 0/1, so re-running this
# cell leaves the column unchanged; a plain `== ">50K"` comparison would turn
# every label into 0 on a second run. Any other value raises instead of being
# silently encoded as 0.
income_codes = {">50K": 1, "<=50K": 0, 1: 1, 0: 0}
income_encoded = df["income"].map(income_codes)
if income_encoded.isna().any():
    unexpected_labels = df.loc[income_encoded.isna(), "income"].unique().tolist()
    raise ValueError(f"Unexpected income labels: {unexpected_labels}")
df["income"] = income_encoded.astype("int64")

In [None]:
# Separate the target label from the feature columns
y = df["income"]
X = df.drop("income", axis=1)

In [None]:
# Hold out 20% of the rows as test data, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Report the shape of each split
for shape_label, split_part in (
    ("X_trainの形状:", X_train),
    ("y_trainの形状:", y_train),
    ("X_testの形状:", X_test),
    ("y_testの形状:", y_test),
):
    print(shape_label, split_part.shape)

In [None]:
# Label-encode the categorical (object-dtype) feature columns
from sklearn.preprocessing import LabelEncoder

# Work on independent copies: the frames returned by train_test_split are
# derived from X, and assigning columns to them can emit pandas'
# SettingWithCopyWarning on versions without copy-on-write.
X_train = X_train.copy()
X_test = X_test.copy()

cat_cols = X.select_dtypes(include=["object"]).columns
for col in cat_cols:
    # Fit on the training split only, then apply the same mapping to both splits.
    le = LabelEncoder()
    le.fit(X_train[col])

    # One explicit category list (every integer code learned from the training
    # split) shared by both frames, so train and test carry the same category
    # set even when a level is absent from the test split.
    codes = np.arange(len(le.classes_))

    # NOTE: LabelEncoder.transform raises ValueError if X_test contains a
    # label that was not present in X_train.
    X_train[col] = pd.Categorical(le.transform(X_train[col]), categories=codes)
    X_test[col] = pd.Categorical(le.transform(X_test[col]), categories=codes)

In [None]:
# Split off 20% of the training data as validation data, stratified on the label
X_tr, X_va, y_tr, y_va = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Report the shape of each split
for shape_label, split_part in (
    ("X_trの形状:", X_tr),
    ("y_trの形状:", y_tr),
    ("X_vaの形状:", X_va),
    ("y_vaの形状:", y_va),
):
    print(shape_label, split_part.shape)

In [None]:
import lightgbm as lgb

# Dictionary that will hold the metric history used for the error plot
evals_result = {}

# Booster parameters: binary objective, 5 leaves per tree, fixed seed
params = {
    "objective": "binary",
    "num_leaves": 5,
    "seed": 0,
    "verbose": -1,
}

# Training dataset, and a validation dataset that references the training one
lgb_train = lgb.Dataset(data=X_tr, label=y_tr)
lgb_valid = lgb.Dataset(data=X_va, label=y_va, reference=lgb_train)

In [None]:
# Train the model.
# Callbacks: early stopping with a patience of 10 rounds, evaluation logging
# every 100 rounds, and recording of the metric history into evals_result.
training_callbacks = [
    lgb.early_stopping(10),
    lgb.log_evaluation(100),
    lgb.record_evaluation(evals_result),
]
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=training_callbacks,
)

In [None]:
# Plot the metric history recorded in evals_result (error curves for the
# training and validation data).
lgb.plot_metric(evals_result)

In [None]:
# Show the booster's best_iteration attribute. This is the iteration later
# passed as num_iteration when predicting; presumably the best round chosen by
# early stopping rather than the round at which training actually halted.
model.best_iteration

In [None]:
# Predict on the validation data and evaluate.
# Probabilities for label 1 come from the best iteration; rounding them
# yields the predicted labels.
y_va_pred_proba = model.predict(X_va, num_iteration=model.best_iteration)
y_va_pred = np.round(y_va_pred_proba)

# Compute the metrics first, then report everything in one place
ac_score = accuracy_score(y_va, y_va_pred)
f1_sc = f1_score(y_va, y_va_pred)

print("ラベル1の予測確率:", y_va_pred_proba)
print("ラベル1の予測値:", y_va_pred)
print(f"Accuracy: {ac_score:.2f}")
print(f"F1 Score: {f1_sc:.2f}")

In [None]:
# Predict on the test data and evaluate.
# Probabilities for label 1 come from the best iteration; rounding them
# yields the predicted labels.
y_test_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)
y_test_pred = np.round(y_test_pred_proba)

# Compute the metrics first, then report everything in one place
accuracy_sc = accuracy_score(y_test, y_test_pred)
f1_sc = f1_score(y_test, y_test_pred)

print("ラベル1の予測確率:", y_test_pred_proba)
print("ラベル1の予測値:", y_test_pred)
print(f"Accuracy: {accuracy_sc:.2f}")
print(f"F1 Score: {f1_sc:.2f}")

In [None]:
# Display the confusion matrix of the test predictions as an annotated heatmap
cm = confusion_matrix(y_test, y_test_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()