In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the UCI Adult dataset. The raw file has no header row, so the column
# names are handed to the reader directly instead of being assigned afterwards.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ],
)
df.head()

In [None]:
# Shape of the data: (number of rows, number of columns)
df.shape

In [None]:
# Check for missing data: number of null values in each column.
# NOTE(review): only real nulls are counted here; placeholder strings would not
# show up — confirm how this dataset encodes missing values.
df.isnull().sum()

In [None]:
# Overview of the data (column dtypes and non-null counts)
df.info()

In [None]:
# EDA of the numeric variables
# Summary statistics of the numeric columns, transposed so that each row of
# the output corresponds to one column of df
df.describe().T

In [None]:
# Summary statistics of the categorical (non-numeric) columns, transposed so
# that each row of the output corresponds to one column of df
df.describe(exclude="number").T

In [None]:
# データの前処理

In [None]:
# Remove the half-width spaces from every string (object-dtype) column.
cat_cols = df.select_dtypes(include=["object"]).columns

# Apply the same replacement column by column in one assignment.
df[cat_cols] = df[cat_cols].apply(lambda values: values.str.replace(" ", ""))

In [None]:
# List the distinct values of each categorical column.
for col in cat_cols:
    levels = list(df[col].unique())
    print(f"{col}: {levels}")

In [None]:
# Keep only the United-States records. The country column is constant after
# filtering, so it is dropped, and the row index is renumbered from 0.
df = (
    df[df["native-country"] == "United-States"]
    .drop(columns=["native-country"])
    .reset_index(drop=True)
)
df.shape

In [None]:
# Create the target label: 1 when income is ">50K", otherwise 0.
# The conversion is skipped once the column is numeric, so re-running this
# cell cannot silently overwrite every label with 0 (an already-converted
# value never equals the string ">50K").
if not pd.api.types.is_numeric_dtype(df["income"]):
    df["income"] = (df["income"] == ">50K").astype("int64")

In [None]:
# Check the dataset after preprocessing: print its shape, then show the first rows
print(df.shape)
df.head()

In [None]:
# Separate the target label from the feature columns.
y = df["income"]
X = df.drop(columns="income")

In [None]:
# Summary statistics of the non-numeric feature columns, transposed so that
# each row of the output corresponds to one column of X
X.describe(exclude="number").T

In [None]:
# One-hot encode the categorical variables.
# A single get_dummies call replaces the repeated concat/get_dummies/drop
# sequence: the listed columns are removed, and their indicator columns
# (prefixed with the source column name) are appended after the remaining
# columns in the order given here. drop_first=True drops the first level of
# each variable.
X = pd.get_dummies(
    X,
    columns=[
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
    ],
    drop_first=True,
)
print(X.shape)

In [None]:
# Split into training and test data (test_size=0.2), stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=y,
)

# Report the shape of each of the four parts.
for label, part in (
    ("X_trainの形状:", X_train),
    ("y_trainの形状:", y_train),
    ("X_testの形状:", X_test),
    ("y_testの形状:", y_test),
):
    print(label, part.shape)

In [None]:
# Class counts of the label in the training and test parts.
for heading, labels in (
    ("y_trainの内訳:\n", y_train),
    ("y_testの内訳:\n", y_test),
):
    print(heading, labels.value_counts())

In [None]:
# Build an all-zero prediction vector (predicted label 0 for every test row).
# The length comes from y_test instead of a hard-coded row count, so it stays
# aligned with the test set if the filtering or the split changes.
y_test_zero = np.zeros(len(y_test))
y_test_zero

In [None]:
# Standardize the numeric features
from sklearn.preprocessing import StandardScaler

# X_train / X_test are row subsets derived from X. Work on explicit copies so
# the column assignments below write to independent frames; this avoids
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Select the int64/float64 columns only. NOTE(review): this relies on the
# one-hot indicator columns having a different dtype (uint8 or bool depending
# on the pandas version) so that they are left unscaled — confirm.
num_cols = X_train.select_dtypes(include=["float64", "int64"]).columns

# Fit on the training data only, then apply the same transform to the test
# data so that no test-set statistics enter the scaler.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
display(X_train.iloc[:2])

In [None]:
# Train the model
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression fitted with the liblinear solver.
# `multi_class="ovr"` is no longer passed: the parameter is deprecated in
# recent scikit-learn releases and makes no difference for the binary 0/1
# label used here.
model = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    C=0.1,
    penalty="l1",
    random_state=0,
)
model.fit(X_train, y_train)
model.get_params()

In [None]:
# Predicted class probabilities for every row of the test set
model.predict_proba(X_test)

In [None]:
# Predicted labels for every row of the test set
model.predict(X_test)

In [None]:
# True labels of the test set as an array, for comparison with the predictions
y_test.values

In [None]:
# Evaluate the predictions on the test set: accuracy and F1.
y_test_pred = model.predict(X_test)
ac_score = accuracy_score(y_test, y_test_pred)
print(f"accuracy_score: {ac_score:.2f}")

# Keep the result under its own name. Assigning it to `f1_score` would replace
# the imported scikit-learn function with a float, and re-running this cell
# would then fail with "object is not callable".
f1 = f1_score(y_test, y_test_pred)
print(f"f1_score: {f1:.2f}")

In [None]:
# Build the confusion matrix and draw it as an annotated heatmap.
cm = confusion_matrix(y_test, y_test_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
# パラメータによる予測値の解釈

In [None]:
# Print the fitted regression coefficients and the intercept.
# The number of weights shown in the label is read from the model instead of
# being hard-coded, so it stays correct if the feature set changes.
n_features = model.coef_.shape[1]
print(f"回帰係数 w = [w1, w2, ... , w{n_features}]: {model.coef_}")
print("")
print(f"定数項 w0: {model.intercept_}")

In [None]:
# Show the feature (column) names of X
X.columns

In [None]:
# Visualize the 30 largest regression coefficients.
importances = model.coef_[0]  # regression coefficients
# Sort ascending, reverse to descending, keep the first 30 positions.
indices = np.argsort(importances)[::-1][:30]
bar_positions = range(len(indices))

fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title("Top 30 Features by Coefficient")
ax.bar(bar_positions, importances[indices])
# Label each bar with the name of its feature.
ax.set_xticks(bar_positions)
ax.set_xticklabels(X.columns[indices], rotation=90)
plt.show()

In [None]:
# Predicted probabilities of class 0 and class 1 for the third-from-last test row
model.predict_proba(X_test)[-3]

In [None]:
# Feature values of the third-from-last test row
print(f"最後から3番目の特徴量 X = [x1, x2, ... ]: {X_test.values[-3]}")

In [None]:
# Logit (linear predictor) of the third-from-last test row: w · x + w0.
# Only the needed row is extracted, as an explicit float vector, instead of
# converting the whole frame with `.values`.
# NOTE(review): when the indicator columns are boolean, the frame-wide array
# is object-dtype; the explicit float conversion avoids doing the dot product
# on Python objects — confirm against the pandas version in use.
x_row = X_test.iloc[-3].to_numpy(dtype=float)
logit = model.coef_[0] @ x_row + model.intercept_[0]
logit

In [None]:
# Sigmoid function: converts a logit into a probability
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)) of ``z``.

    The value is computed from exp(-|z|), which never exceeds 1, so large
    negative inputs do not overflow (the direct formula evaluates exp(-z),
    which becomes inf and emits a RuntimeWarning).

    Parameters
    ----------
    z : scalar or array-like of numbers
        Logit value(s).

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray with the same
    shape as ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — same value.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Hand back a scalar (not a 0-d array) when a scalar was passed in.
    return out[()] if out.ndim == 0 else out


# Convert the logit to a probability
sigmoid(logit)