In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the Adult dataset from the UCI repository URL. The file is read with
# header=None, so the column names are handed to the reader via `names`.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ],
)
df.head()

In [None]:
# Shape of the loaded data as (rows, columns)
df.shape

In [None]:
# Check for missing data: number of null entries in each column
df.isna().sum()

In [None]:
# Overview of the data: print the DataFrame summary produced by info()
df.info()

In [None]:
# EDA of the numeric variables
# Summary statistics, transposed so that each variable is one row
df.describe().transpose()

In [None]:
# Histograms of the numeric columns.
# The size is passed to this figure directly instead of being written into the
# global plt.rcParams, so it does not leak into figures drawn by later cells.
df.hist(bins=20, figsize=(10, 6))
plt.tight_layout()
plt.show()

In [None]:
# EDA of the categorical variables
# Summary statistics of the object-dtype columns, one row per variable
cat_vars = df.select_dtypes(include="object").columns
df.loc[:, cat_vars].describe().transpose()

In [None]:
# Summary statistics of the categorical variables, selected here by excluding
# every numeric column rather than by listing the object-dtype ones
df.describe(exclude=["number"]).transpose()

In [None]:
# List the distinct values of every categorical variable.
# The columns are derived from the frame (object dtype) rather than typed out
# by hand, so the list cannot drift from the data and matches how the other
# cells of this notebook build `cat_cols`.
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    print(f"{col}: {list(df[col].unique())}")

In [None]:
# Bar charts of the value counts of every categorical (object-dtype) column.
cat_cols = df.select_dtypes(include=["object"]).columns

# Derive the grid from the number of columns instead of hard-coding 5 rows, and
# size this figure directly rather than through the global plt.rcParams.
n_grid_cols = 2
n_grid_rows = (len(cat_cols) + n_grid_cols - 1) // n_grid_cols  # ceiling division
# 4.8 per row reproduces the previous 24-unit height for a 5-row grid;
# max(..., 1) keeps the figure valid when there are no categorical columns.
fig = plt.figure(figsize=(12, 4.8 * max(n_grid_rows, 1)))
for i, col in enumerate(cat_cols):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # Title each panel with its column name so it can be read on its own.
    df[col].value_counts().plot(kind="bar", ax=ax, title=col)

fig.tight_layout()
plt.show()

In [None]:
# データの前処理

In [None]:
# Remove the half-width spaces from every object-dtype (string) column
cat_cols = df.select_dtypes(include=["object"]).columns

df[cat_cols] = df[cat_cols].apply(lambda values: values.str.replace(" ", ""))

In [None]:
# List the distinct values of each categorical variable
for col in cat_cols:
    distinct_values = df[col].unique()
    print(f"{col}: {list(distinct_values)}")

In [None]:
# Narrow the records down to native-country == "United-States", then drop that
# column and renumber the index from 0.
is_us = df["native-country"] == "United-States"
df = df.loc[is_us].drop(columns=["native-country"]).reset_index(drop=True)
df.shape

In [None]:
# Breakdown of the income counts after preprocessing
df.loc[:, "income"].value_counts()

In [None]:
# Visualise the income counts after preprocessing
fig, ax = plt.subplots(figsize=(6, 3))
sns.countplot(x="income", data=df, ax=ax)

In [None]:
# Build the target label: 1 where income is ">50K", 0 for everything else.
# The column is overwritten in place, so convert only while it still holds the
# raw strings. Without this guard, re-running the cell would compare the
# already-encoded 0/1 values with ">50K" and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["income"]):
    df["income"] = (df["income"] == ">50K").astype("int64")

In [None]:
# Check the dataset: print its shape, then display the first rows
print(df.shape)
df.head()

In [None]:
# 混同行列と正解率の検証

In [None]:
# Separate the features (every column except income) from the target label
y = df["income"]
X = df.drop(columns="income")

In [None]:
# Split into training and test data: 20% held out, shuffled with a fixed seed
# and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=0,
)

# Report the shape of each of the four pieces
shape_report = (
    ("X_trainの形状:", X_train),
    ("y_trainの形状:", y_train),
    ("X_testの形状:", X_test),
    ("y_testの形状:", y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

In [None]:
# Label counts in the training and the test target
for caption, labels in (("y_trainの内訳:\n", y_train), ("y_testの内訳:\n", y_test)):
    print(caption, labels.value_counts())

In [None]:
# Baseline prediction that assigns label 0 to every test sample.
# The length is taken from y_test instead of a hard-coded count, so the cell
# keeps working when the filtering or the split ratio changes; the dtype is int
# to match the integer 0/1 labels.
y_test_zero = np.zeros(len(y_test), dtype=int)
y_test_zero

In [None]:
# Confusion matrix for the all-zeros prediction, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_test_zero)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
# Evaluation metrics for the all-zeros prediction.
ac_score = accuracy_score(y_test, y_test_zero)
# No sample is predicted positive, so precision is 0/0. State the fallback
# value explicitly; the default ("warn") returns the same 0 but raises an
# UndefinedMetricWarning.
pr_score = precision_score(y_test, y_test_zero, zero_division=0)
rc_score = recall_score(y_test, y_test_zero)
f1 = f1_score(y_test, y_test_zero)

print(f"Accuracy: {ac_score:.2f}")
print(f"Precision: {pr_score:.2f}")
print(f"Recall: {rc_score:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
# Baseline prediction that assigns label 1 to every test sample.
# The length is taken from y_test instead of a hard-coded count, so the cell
# keeps working when the filtering or the split ratio changes; the dtype is int
# to match the integer 0/1 labels.
y_test_ones = np.ones(len(y_test), dtype=int)
y_test_ones

In [None]:
# Confusion matrix for the all-ones prediction (every sample predicted as 1),
# drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_test_ones)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
# Evaluation metrics for the all-ones prediction: each scorer is called with
# the same (true, predicted) pair and the results are unpacked in order.
ac_score, pr_score, rc_score, f1 = (
    scorer(y_test, y_test_ones)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {ac_score:.2f}")
print(f"Precision: {pr_score:.2f}")
print(f"Recall: {rc_score:.2f}")
print(f"F1 Score: {f1:.2f}")