In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

## Locate the Titanic CSVs: use the Kaggle input mount when it exists,
## otherwise fall back to the files sitting next to the notebook.
KAGGLE_DATA_DIR = Path("/kaggle/input/titanic")
DATA_DIR = KAGGLE_DATA_DIR if KAGGLE_DATA_DIR.is_dir() else Path(".")

train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

## Separate the target column from the predictor columns.
y = train["Survived"].astype(int)
X = train.drop(columns=["Survived"])

print("X:", X.shape, "y:", y.shape, "test:", test.shape)
display(X.head(3))

In [None]:
# Draw missingno's nullity matrix for the feature frame X.
msno.matrix(X)

In [None]:
def profile_columns(df, n_examples=3):
    """Summarise every column of ``df`` in a one-row-per-column table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. A frame without any columns yields an empty profile
        that still carries the full set of output columns.
    n_examples : int, default 3
        Number of leading non-missing values (rendered as strings) to show per
        column.

    Returns
    -------
    pandas.DataFrame
        Columns ``col``, ``dtype``, ``missing_cnt``, ``missing_pct``,
        ``n_unique`` and ``examples``, sorted by ``missing_pct`` and then
        ``n_unique``, both descending.
    """
    out_columns = ["col", "dtype", "missing_cnt", "missing_pct", "n_unique", "examples"]
    rows = []
    for col in df.columns:
        s = df[col]
        missing = s.isna()
        rows.append({
            "col": col,
            "dtype": str(s.dtype),
            "missing_cnt": int(missing.sum()),
            "missing_pct": float(missing.mean() * 100),
            "n_unique": int(s.nunique(dropna=True)),
            "examples": s.dropna().astype(str).head(n_examples).tolist(),
        })
    # Passing the schema explicitly keeps the sort keys present even when
    # `rows` is empty, so a column-less input no longer raises KeyError.
    prof = pd.DataFrame(rows, columns=out_columns)
    return prof.sort_values(["missing_pct", "n_unique"], ascending=[False, False])

# Profile the feature frame X and render the resulting summary table.
prof = profile_columns(X)
display(prof)


In [None]:
## Inspect "Cabin": fill missing cabins with "U", keep only the first character
## of each value, and list the most frequent resulting codes.
## Alternative quick look: X["Cabin"].isna().mean(), X["Cabin"].dropna().head(10).tolist()
deck = X["Cabin"].fillna("U").astype(str).str.get(0)
deck.value_counts().iloc[:20]


In [None]:
## Peek at a few "Name" values; the fixed seed keeps the sample identical
## across kernel restarts.
X["Name"].sample(10, random_state=42).tolist()


In [None]:
## "Ticket": number of distinct values together with the first ten entries.
## Observed: 891 rows in total, 681 unique values, 0 missing -> many duplicated tickets.
(X["Ticket"].nunique(), X["Ticket"].iloc[:10].tolist())


In [None]:
## Fraction of missing values in Age, Embarked and Fare, largest first.
(
    X.loc[:, ["Age", "Embarked", "Fare"]]
    .isna()
    .mean()
    .sort_values(ascending=False)
)


In [None]:
## Cabin & Name: copy of the training frame with two derived columns.
##   Deck  - first character of Cabin ("U" when Cabin is missing)
##   Title - text after a comma up to the next period in Name ("Unknown" if no match)
tmp = train.assign(
    Deck=train["Cabin"].fillna("U").astype(str).str[0],
    Title=train["Name"].str.extract(r",\s*([^\.]+)\.")[0].fillna("Unknown"),
)
## Quick check: tmp["Title"].nunique(), tmp["Title"].sample(10).tolist()

def survival_rate_by(col, topn=20, df=None):
    """Mean of ``Survived`` and group size for each value of ``col``.

    Parameters
    ----------
    col : str or list of str
        Column(s) to group by.
    topn : int, default 20
        Maximum number of groups to return, largest groups first.
    df : pandas.DataFrame, optional
        Frame holding ``col`` and ``Survived``. When omitted, the notebook-level
        ``tmp`` frame is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by group, with columns ``mean`` and ``count``, sorted by
        ``count`` in descending order and truncated to ``topn`` rows.
    """
    # Resolve the global lazily so callers passing `df` never depend on `tmp`.
    frame = tmp if df is None else df
    stats = frame.groupby(col)["Survived"].agg(["mean", "count"])
    return stats.sort_values("count", ascending=False).head(topn)

# Render the survival-rate table for each grouping column in turn.
for group_col in ("Sex", "Pclass", "Title", "Deck"):
    display(survival_rate_by(group_col))



In [None]:
# Passenger counts for every (Deck, Pclass) pair; combinations that never occur become 0.
deck_pclass_dist = tmp.groupby(['Deck', 'Pclass']).size().unstack().fillna(0)

# Draw the count table as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(deck_pclass_dist, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title("Relationship between Deck and Pclass")
plt.show()

In [None]:
import re
from sklearn.base import BaseEstimator, TransformerMixin

class TitanicFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive additional Titanic features from the raw passenger columns.

    ``transform`` adds ``Title``, ``FamilySize``, ``IsAlone``, ``Deck``,
    ``TicketGroupSize`` and ``FarePerPerson``, converts ``Pclass`` to strings,
    and drops the raw ``Name``, ``Cabin`` and ``Ticket`` columns.

    Attributes
    ----------
    ticket_counts_ : pandas.Series or None
        Created by ``fit``. Occurrence count of each ``Ticket`` value in the
        fitting data, or ``None`` when that data has no ``Ticket`` column.
    """

    def __init__(self):
        """The transformer has no hyperparameters."""
        # Learned state (`ticket_counts_`) is deliberately NOT created here:
        # scikit-learn expects trailing-underscore attributes to appear only
        # after fit(), and check_is_fitted() relies on that.

    def fit(self, X, y=None):
        """Learn how often each ticket occurs in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Fitting data; the ``Ticket`` column is optional.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        TitanicFeatureEngineer
            ``self``.
        """
        # Only reads from X, so no defensive copy is needed.
        if "Ticket" in X.columns:
            self.ticket_counts_ = X["Ticket"].value_counts(dropna=False)
        else:
            self.ticket_counts_ = None
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features applied.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Name``, ``SibSp``, ``Parch``, ``Cabin``, ``Fare`` and
            ``Pclass``; ``Ticket`` is read only when ticket counts were learned.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is not modified.
        """
        X_ = X.copy()

        # Title: text after a comma up to the next period; anything outside
        # the four common titles (including "Unknown") is bucketed as "Rare".
        title = X_["Name"].astype(str).str.extract(r",\s*([^\.]+)\.")[0].fillna("Unknown")
        common = {"Mr", "Miss", "Mrs", "Master"}
        X_["Title"] = title.where(title.isin(common), "Rare")

        # Family: siblings/spouses + parents/children + the passenger.
        X_["FamilySize"] = X_["SibSp"].fillna(0) + X_["Parch"].fillna(0) + 1
        X_["IsAlone"] = (X_["FamilySize"] == 1).astype(int)

        # Deck: first character of Cabin, "U" when Cabin is missing.
        X_["Deck"] = X_["Cabin"].fillna("U").astype(str).str[0]

        # TicketGroupSize (fold-safe: uses counts learned in fit). Tickets not
        # seen during fit get 1; so does everything when no counts are
        # available (never fitted, or fitted without a Ticket column).
        ticket_counts = getattr(self, "ticket_counts_", None)
        if ticket_counts is not None:
            X_["TicketGroupSize"] = X_["Ticket"].map(ticket_counts).fillna(1).astype(int)
        else:
            X_["TicketGroupSize"] = 1

        # FarePerPerson: fare split across the family; a zero denominator is
        # replaced by 1 to avoid division by zero.
        denom = X_["FamilySize"].replace(0, 1)
        X_["FarePerPerson"] = X_["Fare"] / denom

        # Treat Pclass as categorical by converting it to strings.
        X_["Pclass"] = X_["Pclass"].astype("Int64").astype(str)

        # Drop raw high-cardinality columns.
        X_ = X_.drop(columns=["Name", "Cabin", "Ticket"], errors="ignore")

        return X_


In [None]:
# Run the feature engineer once on the full training features to discover the
# engineered column names and dtypes.
fe = TitanicFeatureEngineer().fit(X)
X_fe = fe.transform(X)

# PassengerId is an identifier column, not a predictor. It is numeric, so it
# would otherwise end up in num_cols and be fed to the model; keep it out of
# both column lists.
ID_COLS = ["PassengerId"]
num_cols = [c for c in X_fe.select_dtypes(include=[np.number]).columns if c not in ID_COLS]
cat_cols = [c for c in X_fe.select_dtypes(exclude=[np.number]).columns if c not in ID_COLS]

print("Numeric:", num_cols)
print("Categorical:", cat_cols)
display(X_fe.head(3))


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Request dense one-hot output. Try the `sparse_output` keyword first and fall
# back to `sparse` when the installed OneHotEncoder rejects it with TypeError.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Numeric columns: median imputation only.
numeric_pipe = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: most-frequent imputation followed by one-hot encoding.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe),
])

# Route each column list to its own sub-pipeline.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=600,
    max_features="sqrt",
    min_samples_split=4,
    min_samples_leaf=2,
)

# Full chain: feature engineering -> preprocessing -> classifier.
pipe = Pipeline(steps=[
    ("feat", TitanicFeatureEngineer()),
    ("prep", preprocess),
    ("model", model),
])

# 5-fold stratified cross-validation; report mean and standard deviation of accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(estimator=pipe, X=X, y=y, cv=cv, scoring="accuracy")
print(scores.mean(), scores.std())
