In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [17]:
# =========================
# 1) Load
# =========================
# Column-name constants reused by the later cells.
ID_COL = "id"
TARGET_COL = "Heart Disease"

train_path = "/Users/uruma/Kaggle_Predicting-Heart-Disease/data/train.csv"
test_path = "/Users/uruma/Kaggle_Predicting-Heart-Disease/data/test.csv"
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Encode the label as an integer: "Absence" -> 0, "Presence" -> 1.
# Any other label maps to NaN, which makes the integer cast raise.
y = (
    train_df[TARGET_COL]
    .map({"Absence": 0, "Presence": 1})
    .astype(int)
)

# Feature frame: every training column except the target
# (the id column is still present at this point).
X = train_df.drop(columns=[TARGET_COL])

In [18]:
# =========================
# 2) Column split (baseline heuristic)
#    - object-dtype columns are categorical
#    - any other column with at most 12 distinct non-null values
#      is also treated as categorical; the rest are numerical
# =========================
feature_cols = [name for name in X.columns if name != ID_COL]

cat_cols = []
num_cols = []
for name in feature_cols:
    column = X[name]
    # Short-circuit: the unique count is only computed for non-object columns.
    is_categorical = column.dtype == "object" or column.nunique(dropna=True) <= 12
    (cat_cols if is_categorical else num_cols).append(name)

print("Categorical:", cat_cols)
print("Numerical  :", num_cols)

Categorical: ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results', 'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
Numerical  : ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']


In [19]:
# =========================
# 3) Preprocess + Model (baseline)
# =========================
# Numerical branch: fill missing values with the column median, then standardize.
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown="ignore" is passed so that a level not seen
# during fit does not raise at transform time.
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list to its branch; columns in neither list
# (e.g. the id column) are dropped via remainder="drop".
preprocess = ColumnTransformer(
    [
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ],
    remainder="drop",
)

model = LogisticRegression(max_iter=2000, solver="lbfgs", n_jobs=None)

# Full estimator: preprocessing followed by the classifier.
clf = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

In [20]:
# =========================
# 4) Simple holdout AUC
# =========================
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pipeline.fit returns the fitted pipeline, so fitting and scoring can be chained.
# Column 1 of predict_proba is the probability of the class encoded as 1.
valid_pred = clf.fit(X_train, y_train).predict_proba(X_valid)[:, 1]

auc = roc_auc_score(y_valid, valid_pred)
print(f"Holdout ROC-AUC = {auc:.6f}")

Holdout ROC-AUC = 0.953704


In [21]:
# =========================
# 5) (Option) KFold OOF AUC (a more reliable baseline metric than a single holdout)
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One out-of-fold prediction slot per training row. The buffer is sized from X,
# the frame actually being split, so this cell only depends on names that the
# notebook itself defines.
oof = np.zeros(len(X), dtype=float)

for tr_idx, va_idx in skf.split(X, y):
    # split() yields positional indices, hence .iloc.
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr = y.iloc[tr_idx]

    # Refit the whole pipeline on the training part of the fold so that
    # imputation, scaling and encoding are learned without the validation rows.
    clf.fit(X_tr, y_tr)
    oof[va_idx] = clf.predict_proba(X_va)[:, 1]

# Each row is predicted exactly once, by a model that did not train on it.
oof_auc = roc_auc_score(y, oof)
print(f"5-Fold OOF ROC-AUC = {oof_auc:.6f}")

5-Fold OOF ROC-AUC = 0.952876


In [22]:
# Refit the pipeline on every training row before scoring the test set.
clf.fit(X, y)

# test_df is passed whole: the pipeline's ColumnTransformer picks num_cols and
# cat_cols by name and drops everything else, including the id column.
test_pred = clf.predict_proba(test_df)[:, 1]

# Name the prediction column with TARGET_COL rather than repeating the literal,
# so the submission header cannot drift from the training target name.
submission = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET_COL: test_pred})
submission.to_csv("/Users/uruma/Kaggle_Predicting-Heart-Disease/submission.csv", index=False)
print("Saved!")

Saved!
