In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold

# -----------------------------
# 1) 데이터 로드 (Kaggle 경로 우선, 실패하면 현재 폴더의 train.csv)
# -----------------------------
def load_titanic_train(paths=None):
    """Load the Titanic training set from the first readable candidate path.

    Parameters
    ----------
    paths : str, path-like, or iterable of those, optional
        Candidate CSV locations, tried in order. A single path is treated as
        a one-element list. When omitted, the Kaggle input path is tried
        first and then ``train.csv`` in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The contents of the first candidate that ``pd.read_csv`` could read.

    Raises
    ------
    FileNotFoundError
        If no candidate could be read. The exception raised for the last
        candidate that was tried is attached as ``__cause__``.
    """
    import os  # local import: only needed to recognise a single path-like argument

    if paths is None:
        paths = [
            "/kaggle/input/titanic/train.csv",  # classic Kaggle Titanic location
            "train.csv",                        # local / current folder
        ]
    elif isinstance(paths, (str, os.PathLike)):
        # A bare string would otherwise be iterated character by character.
        paths = [paths]

    last_err = None
    for p in paths:
        try:
            df = pd.read_csv(p)
        except Exception as e:
            # Deliberately broad: any failure on one candidate falls through
            # to the next one; the last error is chained onto the final raise.
            last_err = e
            continue
        print(f"Loaded: {p}  shape={df.shape}")
        return df

    raise FileNotFoundError(
        "train.csv를 찾지 못했습니다. Kaggle이면 /kaggle/input/titanic/train.csv 경로를 확인하거나, "
        "로컬이면 현재 폴더에 train.csv가 있어야 합니다."
    ) from last_err

# Load the training data once at notebook level; later cells read `df`.
df = load_titanic_train()


Loaded: train.csv  shape=(891, 12)


In [3]:
# Target vector: the Survived column coerced to integers.
y = df.loc[:, "Survived"].astype(int).to_numpy()

# The splitters require an X argument, but when the goal is only to compare
# label distributions its contents do not matter (the row count must match).
# A single column of row positions is used here.
X = np.arange(len(df))[:, np.newaxis]


In [None]:

# -----------------------------
# 2) fold별 분포 계산 함수
# -----------------------------
def fold_distribution(splitter, X, y, method_name: str) -> pd.DataFrame:
    """Summarise the class balance of every validation fold of a splitter.

    Parameters
    ----------
    splitter : object
        Cross-validation splitter exposing ``split(X, y)`` that yields
        ``(train_indices, validation_indices)`` pairs.
    X : array-like
        Passed through to ``splitter.split``.
    y : array-like
        Binary labels (0/1). Converted to a NumPy array so that the
        validation indices are always applied positionally.
    method_name : str
        Label stored in the ``method`` column. It is purely descriptive and
        does not influence how the splitter is called.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``method``, ``fold``, ``n_valid``,
        ``neg(0)``, ``pos(1)`` and ``pos_rate``. The columns are present even
        when the splitter yields no folds.
    """
    columns = ["method", "fold", "n_valid", "neg(0)", "pos(1)", "pos_rate"]
    y = np.asarray(y)

    rows = []
    # Always pass y. Stratified splitters need it, and the KFold usage in this
    # notebook is also called as split(X, y), so the call no longer has to be
    # chosen from the display label.
    for fold, (_, va_idx) in enumerate(splitter.split(X, y)):
        y_va = y[va_idx]
        n = len(va_idx)
        pos = int((y_va == 1).sum())
        neg = int((y_va == 0).sum())
        rows.append({
            "method": method_name,
            "fold": fold,
            "n_valid": n,
            "neg(0)": neg,
            "pos(1)": pos,
            # An empty validation fold has no defined rate; avoid ZeroDivisionError.
            "pos_rate": pos / n if n else float("nan"),
        })
    return pd.DataFrame(rows, columns=columns)


In [14]:

# -----------------------------
# 3) KFold vs StratifiedKFold 비교
# -----------------------------
# Both splitters share the fold count, shuffling and seed, so stratification
# is the only difference between them.
n_splits, seed = 5, 42

skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)

# Peek at the first fold of the plain KFold split.
train_idx, valid_idx = next(iter(kf.split(X, y)))
print(train_idx.size, valid_idx.size)
print(valid_idx[:10])  # first 10 validation indices

712 179
[ 5 10 23 25 30 31 33 39 44 49]


In [None]:


# Per-fold class balance for each splitter, stacked into one table.
res_kf = fold_distribution(kf, X, y, "KFold")
res_skf = fold_distribution(skf, X, y, "StratifiedKFold")
res = pd.concat([res_kf, res_skf], ignore_index=True)

# Positive-class rate of the full (unsplit) target, as the reference value.
overall_pos_rate = np.mean(y == 1)
print(f"Overall pos_rate (Survived=1): {overall_pos_rate:.4f}  (n={len(y)})\n")

# Per-fold table, ordered by method and then fold.
print(res.sort_values(by=["method", "fold"]).to_string(index=False))

# Per-method summary: compares how much pos_rate varies across folds.
summary = res.groupby("method")["pos_rate"].agg(["mean", "std", "min", "max"])
summary = summary.sort_index()
print("\n[Summary: pos_rate across folds]")
print(summary.to_string())
