In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# 1) 데이터 로드 (Kaggle Titanic 기준)
## train = pd.read_csv("/kaggle/input/titanic/train.csv")
## test  = pd.read_csv("/kaggle/input/titanic/test.csv")
train = pd.read_csv("./train.csv")
test  = pd.read_csv("./test.csv")


In [None]:

# 2) X, y 분리
y = train["Survived"]
X = train.drop(columns=["Survived"])
X_test = test.copy()

# 3) 컬럼 타입 분리 (가장 단순한 자동 규칙)
num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(exclude=["number"]).columns


Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64


In [5]:
# 4) 전처리 파이프(수치형/범주형)
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [11]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",  # 지정 안 된 컬럼은 버림
)

In [12]:

# 5) 모델 + 전체 파이프라인
model = LogisticRegression(max_iter=1000)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model),
])


In [13]:

# 6) 고정된 CV (StratifiedKFold: 클래스 비율 유지)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
print("CV accuracy: %.4f ± %.4f" % (scores.mean(), scores.std()))

CV accuracy: 0.8070 ± 0.0332


In [14]:

# 7) 최종 학습 + 예측 + 제출 파일 생성
clf.fit(X, y)
pred = clf.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": pred.astype(int)
})
submission.to_csv("submission.csv", index=False)
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
