In [None]:
# -*- coding: utf-8 -*-
# Titanic: WCG + XGBoost (Men & Solo Women) — Python port of the R notebook
# Requires: pandas, numpy, scikit-learn, xgboost

import os
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

RANDOM_STATE = 300  # counterpart of set.seed in the R notebook

: 

In [None]:
# Read the train and test CSVs. The test frame gets an all-NaN "Survived" column so
# both frames can be stacked into a single table with a fresh 0..n-1 index.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv").assign(Survived=np.nan)
data = pd.concat([train, test], ignore_index=True)


In [None]:
def make_title(name, sex):
    """Classify a passenger as "boy", "woman" or "man".

    A passenger whose ``name`` is a string containing "Master" is a "boy",
    regardless of ``sex``. Otherwise the result is "woman" when ``sex`` equals
    "female" and "man" for anything else (including a non-string ``name``).
    """
    has_master = isinstance(name, str) and "Master" in name
    if has_master:
        return "boy"
    return "woman" if sex == "female" else "man"

# Derive the boy / woman / man label for every passenger from name and sex.
data["Title"] = list(map(make_title, data["Name"], data["Sex"]))

In [None]:
def impute_by_tree(df, target_col, feature_cols):
    """Fill the missing values of ``target_col`` in place with decision-tree predictions.

    A ``DecisionTreeRegressor`` (``max_depth=6``, seeded with ``RANDOM_STATE``) is
    fitted on the rows where ``target_col`` is present and then used to predict the
    rows where it is missing. Non-numeric feature columns are one-hot encoded
    (unknown categories are ignored at predict time); numeric feature columns are
    passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    target_col : str
        Column whose NaN entries are replaced by predictions.
    feature_cols : list of str
        Columns of ``df`` used as predictors.

    Returns
    -------
    None
        ``df`` is updated in place.

    Raises
    ------
    ValueError
        If every value of ``target_col`` is missing, so there is nothing to fit on.
    """
    y = df[target_col]
    missing = y.isna()

    # Nothing to impute: return before fitting. Without this guard the predict call
    # below would be handed zero rows, which scikit-learn rejects with an error.
    if not missing.any():
        return
    if missing.all():
        raise ValueError(
            f"cannot impute {target_col!r}: the column has no observed values to fit on"
        )

    X = df[feature_cols]

    # Split by "numeric or not" rather than by `dtype == "object"`, so string or
    # categorical columns stored under a non-object dtype are still one-hot encoded
    # instead of being passed raw to the tree.
    num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in feature_cols if c not in num_cols]

    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ]
    )
    pipe = Pipeline(
        steps=[
            ("pre", pre),
            ("reg", DecisionTreeRegressor(random_state=RANDOM_STATE, max_depth=6)),
        ]
    )

    # Fit on the observed rows, then overwrite only the missing ones.
    observed = ~missing
    pipe.fit(X[observed], y[observed])
    df.loc[missing, target_col] = pipe.predict(X[missing])

# Age: predicted from the title, class and family counts.
impute_by_tree(data, "Age", ["Title", "Pclass", "SibSp", "Parch"])

# Fare
# Embarked can contain missing values, so fill them with the most frequent value
# before using the column as a predictor.
most_common_port = data["Embarked"].mode().iloc[0]
data["Embarked"] = data["Embarked"].fillna(most_common_port)
impute_by_tree(data, "Fare", ["Title", "Pclass", "Embarked", "Sex", "Age"])

In [None]:
# Number of passengers sharing each ticket, family size including the passenger,
# and the fare split evenly across the ticket holders.
data["TicketFreq"] = data.groupby("Ticket")["Ticket"].transform("count")
data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
data["FareAdj"] = data["Fare"].div(data["TicketFreq"])

# Model inputs: x1 = Fare / (TicketFreq * 10), x2 = FamilySize + Age / 70
ticket_freq_x10 = data["TicketFreq"].mul(10.0)
data["x1"] = data["Fare"].div(ticket_freq_x10)
data["x2"] = data["FamilySize"].add(data["Age"].div(70.0))

In [None]:
# Rows with PassengerId <= 891 are the training rows; later PassengerIds are the test rows.
is_train = data["PassengerId"] <= 891
men_train = is_train & (data["Title"] == "man")

# Select features and labels with one shared mask so X and y can never get out of
# step (X was previously built twice, the first result being discarded).
men_fit_rows = men_train & data["Survived"].notna()
X_men = data.loc[men_fit_rows, ["x1", "x2"]].to_numpy()
y_men = data.loc[men_fit_rows, "Survived"].astype(int).to_numpy()

xgb_men = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=5,
    learning_rate=0.1,
    gamma=0.1,
    colsample_bytree=1.0,
    min_child_weight=1.0,
    n_estimators=500,
    subsample=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_men.fit(X_men, y_men)

# Score the men in the test rows and keep the PassengerIds whose predicted
# survival probability is at least 0.90.
men_test = (data["PassengerId"] > 891) & (data["Title"] == "man")
X_men_test = data.loc[men_test, ["x1", "x2"]].to_numpy()
p_men = xgb_men.predict_proba(X_men_test)[:, 1]
men_live_pid = data.loc[men_test].assign(p=p_men).query("p >= 0.90")["PassengerId"].tolist()

In [None]:
def ticket_last_to_X(s):
    """Return ``s`` as a stripped string with its last character replaced by "X".

    The input is converted with ``str`` and stripped of surrounding whitespace
    first; if nothing is left, the empty string is returned unchanged.
    """
    text = str(s).strip()
    if not text:
        return text
    return f"{text[:-1]}X"

def fare_str(x):
    """Format a fare with exactly four decimals, or return "NA" when it is missing."""
    if pd.isna(x):
        return "NA"
    return format(float(x), ".4f")

# Components of the group key: surname (text before the first comma), ticket with
# its last character masked, fare formatted to four decimals, and embarkation port.
data["Surname"] = data["Name"].str.split(",").str.get(0)
data["TicketX"] = data["Ticket"].map(ticket_last_to_X)
data["FareStr"] = data["Fare"].map(fare_str)
data["EmbarkedStr"] = data["Embarked"].fillna("NA").astype(str)

# GroupId = Surname-Pclass-TicketX-FareStr-EmbarkedStr
group_key_parts = [
    data["Surname"].astype(str),
    data["Pclass"].astype(str),
    data["TicketX"].astype(str),
    data["FareStr"],
    data["EmbarkedStr"],
]
group_key = group_key_parts[0]
for part in group_key_parts[1:]:
    group_key = group_key + "-" + part
data["GroupId"] = group_key

# Men never belong to a group.
data.loc[data["Title"].eq("man"), "GroupId"] = "noGroup"

# Wilkes (893) borrows the GroupId of Hocking (775) (special case from the R notebook).
gid_775 = data.loc[data["PassengerId"] == 775, "GroupId"]
if len(gid_775) > 0:
    data.loc[data["PassengerId"] == 893, "GroupId"] = gid_775.iloc[0]

# A group key that occurs at most once is not a group.
grp_freq = data["GroupId"].value_counts()
is_singleton = data["GroupId"].map(grp_freq).le(1)
data.loc[is_singleton, "GroupId"] = "noGroup"

# TicketId (used to attach nannies / relatives): Pclass-TicketX-FareStr-EmbarkedStr
ticket_key_parts = [
    data["Pclass"].astype(str),
    data["TicketX"].astype(str),
    data["FareStr"],
    data["EmbarkedStr"],
]
ticket_key = ticket_key_parts[0]
for part in ticket_key_parts[1:]:
    ticket_key = ticket_key + "-" + part
data["TicketId"] = ticket_key

# Nanny / relative merge: a non-man without a group joins an existing group that
# shares the same TicketId (the first such group per TicketId is used).
grouped_rows = data.loc[data["GroupId"].ne("noGroup"), ["TicketId", "GroupId"]]
first_group_per_ticket = (
    grouped_rows.drop_duplicates().groupby("TicketId")["GroupId"].first()
)
ticket_to_group = first_group_per_ticket.to_dict()

mask_cand = (
    data["Title"].ne("man")
    & data["GroupId"].eq("noGroup")
    & data["TicketId"].isin(ticket_to_group)
)
data.loc[mask_cand, "GroupId"] = data.loc[mask_cand, "TicketId"].map(ticket_to_group)

# GroupSurvival: mean of Survived per GroupId over the training rows, then
# propagated to every row that carries the same GroupId.
group_surv_train = data[is_train].groupby("GroupId")["Survived"].mean()
data["GroupSurvival"] = data["GroupId"].map(group_surv_train)

# Groups not seen in the training rows fall back on Pclass: 0.0 for 3rd class,
# 1.0 for every other class.
unk = data["GroupSurvival"].isna()
third_class = data["Pclass"] == 3
data.loc[unk & third_class, "GroupSurvival"] = 0.0
data.loc[unk & ~third_class, "GroupSurvival"] = 1.0

# Base prediction: female=1, male=0
data["Predict"] = data["Sex"].eq("female").astype("int64")

# WCG override: a woman whose group entirely died is predicted to die, and a boy
# whose group entirely survived is predicted to live.
group_all_died = data["GroupSurvival"].eq(0.0)
group_all_lived = data["GroupSurvival"].eq(1.0)
data.loc[data["Title"].eq("woman") & group_all_died, "Predict"] = 0
data.loc[data["Title"].eq("boy") & group_all_lived, "Predict"] = 1

# Test-row males predicted to live and females predicted to die (for reference).
boys_live = data.query("PassengerId > 891 & Sex == 'male' & Predict == 1")["PassengerId"].to_list()
fem_die = data.query("PassengerId > 891 & Sex == 'female' & Predict == 0")["PassengerId"].to_list()

In [None]:
# Training rows for the solo-women model: women in the training rows travelling
# alone (FamilySize == 1), restricted to rows with a known Survived label.
solo_w_train = is_train & (data["Title"] == "woman") & (data["FamilySize"] == 1)
solo_w_labelled = solo_w_train & data["Survived"].notna()

# Features for this model: x1 = FareAdj / 10, x2 = Age / 15.
solo_w_frame = data.loc[solo_w_labelled].assign(
    x1=lambda d: d["FareAdj"] / 10.0,
    x2=lambda d: d["Age"] / 15.0,
)
X_w = solo_w_frame[["x1", "x2"]].to_numpy()
y_w = data.loc[solo_w_labelled, "Survived"].astype(int).to_numpy()

xgb_w_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=5,
    learning_rate=0.1,
    gamma=0.1,
    colsample_bytree=1.0,
    min_child_weight=1.0,
    n_estimators=500,
    subsample=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_w = XGBClassifier(**xgb_w_params)
xgb_w.fit(X_w, y_w)

In [None]:
# The WCG helpers (ticket_last_to_X, fare_str) and the GroupId / TicketId /
# GroupSurvival / Predict columns are defined once, in the WCG cell above. They are
# derived only from base columns that nothing in between modifies, so they are
# reused here rather than recomputed.

# ---------------------------------------------------------
# Submission 1: WCG + (adult men predicted to survive by XGB) → R: WCG_XGBoost1.csv
# ---------------------------------------------------------
# Take PassengerId and Predict from the same rows so each prediction stays attached
# to its own passenger, instead of pairing Predict with a hard-coded id range.
sub1 = (
    data.loc[data["PassengerId"] > 891, ["PassengerId", "Predict"]]
        .rename(columns={"Predict": "Survived"})
        .astype({"Survived": int})
        .reset_index(drop=True)
)
# Adult men with p >= 0.90 are overwritten as survivors (=1).
sub1.loc[sub1["PassengerId"].isin(men_live_pid), "Survived"] = 1
sub1.to_csv("WCG_XGBoost1_py.csv", index=False)

# =========================================================
# 6) Solo women XGB (FamilySize==1), threshold p<=0.08 -> die
# =========================================================
# `xgb_w` is fitted in the solo-women training cell above on the labelled training
# women with FamilySize == 1 (features x1 = FareAdj / 10, x2 = Age / 15). Refitting
# here would use the same rows, features, hyper-parameters and seed, so the
# already-fitted model is reused for the predictions below.

# Prediction targets: solo women in the test rows, excluding passengers already
# decided by WCG (GroupSurvival exactly 0 or 1).
# NOTE: keep the `~wcg_test_mask` term. Rebuilding `solo_w_test` without it would let
# the XGB threshold below overturn a WCG decision for those passengers.
wcg_test_mask = (data["PassengerId"] > 891) & (data["GroupSurvival"].isin([0.0, 1.0]))
solo_w_test = (
    (data["PassengerId"] > 891)
    & (data["Title"] == "woman")
    & (data["FamilySize"] == 1)
    & (~wcg_test_mask)
)

if solo_w_test.any():
    # Same feature scaling as used to train xgb_w: x1 = FareAdj / 10, x2 = Age / 15.
    Xw_test = data.loc[solo_w_test].copy()
    Xw_test["x1"] = Xw_test["FareAdj"] / 10.0
    Xw_test["x2"] = Xw_test["Age"] / 15.0
    Xw_test = Xw_test[["x1", "x2"]].to_numpy()

    p_w = xgb_w.predict_proba(Xw_test)[:, 1]
    solo_w_pid = data.loc[solo_w_test, "PassengerId"].to_list()
else:
    # No candidates: leave both empty so the threshold step below selects nobody.
    # (Predicting is done only inside the branch above; doing it again after the
    # if/else would fail here because Xw_test is never created.)
    p_w = np.array([])
    solo_w_pid = []

# Threshold: p <= 0.08 means predicted to die (=0).
solo_w_die_pid = [pid for pid, prob in zip(solo_w_pid, p_w) if prob <= 0.08]

# ---------------------------------------------------------
# Submission 2: WCG + (solo women predicted to die by XGB) → R: WCG_XGBoost2.csv
# ---------------------------------------------------------
# Start from submission 1 and set the flagged solo women to 0.
sub2 = sub1.copy()
flagged_to_die = sub2["PassengerId"].isin(solo_w_die_pid)
sub2.loc[flagged_to_die, "Survived"] = 0
sub2.to_csv("WCG_XGBoost2_py.csv", index=False)


