In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# Inputs are read from ../data; engineered outputs are written to ../data/processed.
data_dir = Path("..") / "data"
proc_dir = data_dir.joinpath("processed")
# Create the output folder (and any missing parents); a no-op if it already exists.
proc_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the untouched train/test CSVs from data_dir; cleaning happens in later cells.
train_raw, test_raw = (
    pd.read_csv(data_dir / csv_name) for csv_name in ("train.csv", "test.csv")
)

In [5]:
# Column groups shared by the cleaning and feature-building cells below.

# Per-amenity spending amounts (summed into TotalSpend later).
spend_cols = [
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]
# Categorical columns that are present in the raw data.
cat_cols_base = [
    "HomePlanet",
    "Destination",
]
# Columns cast to pandas' nullable boolean dtype during cleaning.
bool_cols = [
    "CryoSleep",
    "VIP",
]

In [23]:
def split_cabin(df: pd.DataFrame) -> pd.DataFrame:
    """Split ``Cabin`` ("Deck/Num/Side") into ``Deck``, ``Side`` and ``CabinNum``.

    Missing cabins are kept as missing in all three derived columns instead of
    being replaced by a sentinel such as ``"Unknown/9999/U"``. A sentinel makes
    the ``*_missing`` flags for these columns always 0 and injects a fake
    cabin number (9999) that later median imputation can never replace.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``Cabin`` column of "/"-separated strings (or NaN).

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with ``Deck`` and ``Side`` as
        strings and ``CabinNum`` as a numeric column; unparsable or missing
        parts are NaN.
    """
    parts = df["Cabin"].str.split("/", expand=True)
    # Guarantee exactly three positional columns even if some rows have fewer
    # (or more) separators than expected; absent parts become NaN.
    parts = parts.reindex(columns=[0, 1, 2])
    return df.assign(
        Deck=parts[0],
        Side=parts[2],
        CabinNum=pd.to_numeric(parts[1], errors="coerce"),
    )

In [8]:
def add_group_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``Group`` column taken from ``PassengerId``.

    The group is the text before the first underscore of the id,
    e.g. ``'0001_01'`` -> ``'0001'``.
    """
    ids_as_text = df["PassengerId"].astype(str)
    group_ids = ids_as_text.str.split("_").str.get(0)
    return df.assign(Group=group_ids)

In [9]:
def engineer_spend(df: pd.DataFrame, cols=None) -> pd.DataFrame:
    """Add ``TotalSpend`` and ``*_log1p`` columns derived from the spend columns.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the spend columns.
    cols : sequence of str, optional
        Spend columns to use. Defaults to the notebook-level ``spend_cols``.

    Returns
    -------
    pd.DataFrame
        New frame where each spend column is coerced to numeric (unparsable
        values become NaN), ``TotalSpend`` is the row-wise sum ignoring NaN,
        and ``<col>_log1p`` holds ``log1p`` of each spend column and of
        ``TotalSpend`` with NaN treated as 0.
    """
    cols = list(spend_cols) if cols is None else list(cols)
    out = df.copy()
    for c in cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    out["TotalSpend"] = out[cols].sum(axis=1, skipna=True)
    # log1p versions - mitigates the right-skewness inherent in spending metrics
    for c in cols + ["TotalSpend"]:
        out[f"{c}_log1p"] = np.log1p(out[c].fillna(0))
    return out

In [10]:
def add_missing_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with 0/1 ``<col>_missing`` indicator columns.

    A flag is added for every column in the fixed list below that is present
    in ``df``; absent columns are skipped. The flag is 1 where the value is
    NaN/NA at call time, so this must run before any imputation of those
    columns. The input frame is not modified.
    """
    cols_to_flag = [
        "HomePlanet", "CryoSleep", "Destination", "Age", "VIP",
        "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
        "Name", "Deck", "Side", "CabinNum"
    ]
    flags = {
        f"{c}_missing": df[c].isna().astype(int)
        for c in cols_to_flag
        if c in df.columns
    }
    # assign() builds a new frame, keeping the flags in cols_to_flag order.
    return df.assign(**flags)

In [58]:
def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the row-independent cleaning steps to a raw frame and return a new frame.

    Steps, in order:
      1. cast the columns in ``bool_cols`` (when present) to nullable ``boolean``;
      2. cast ``Transported`` (present on the training frame only) to int;
      3. run ``split_cabin``, ``add_group_features``, ``engineer_spend`` and
         ``add_missing_flags``.

    The input is copied first, so ``df`` itself is left unchanged.
    """
    cleaned = df.copy()

    # Nullable boolean dtype keeps missing values as <NA> instead of object NaN.
    bool_dtypes = {name: "boolean" for name in bool_cols if name in cleaned.columns}
    cleaned = cleaned.astype(bool_dtypes)

    # The target only exists on the training frame.
    if "Transported" in cleaned.columns:
        cleaned["Transported"] = cleaned["Transported"].astype(int)

    # Core transforms; missing flags go last so they also cover the cabin parts.
    return (
        cleaned
        .pipe(split_cabin)
        .pipe(add_group_features)
        .pipe(engineer_spend)
        .pipe(add_missing_flags)
    )

In [67]:
# Clean each split, tag its origin with is_train, then stack train on top of test
# so later imputation and encoding are applied identically to both.
train = basic_clean(train_raw).assign(is_train=1)
test = basic_clean(test_raw).assign(is_train=0)
both = pd.concat([train, test], axis=0, ignore_index=True)

In [68]:
# GroupSize: number of rows (train and test combined) sharing each Group id.
group_sizes = both["Group"].value_counts()
both["GroupSize"] = both["Group"].map(group_sizes)

In [69]:
# Impute missing ages with the median age of the training rows only, so that
# no test-set statistic leaks into the fill value.
age_median = both.loc[both["is_train"].eq(1), "Age"].median()
both["Age"] = both["Age"].where(both["Age"].notna(), age_median)

In [70]:
# Spending: treat a missing amount as 0, then recompute the total from the filled columns.
both[spend_cols] = both[spend_cols].fillna(0.0)
both["TotalSpend"] = both[spend_cols].sum(axis=1)

In [71]:
# Categorical columns: replace missing values with an explicit "Unknown" level,
# then store the column as a pandas category.
for cat_name in [*cat_cols_base, "Deck", "Side"]:
    filled = both[cat_name].fillna("Unknown")
    both[cat_name] = filled.astype("category")

In [74]:
# CabinNum: fill any remaining NaN with the median cabin number of the training rows.
cabin_num_median = both.loc[both["is_train"] == 1, "CabinNum"].median()
both["CabinNum"] = both["CabinNum"].fillna(cabin_num_median)

In [75]:
# Column-name lists for the three feature spaces: naive, basic and enhanced.
# Each space extends the previous one.

# Naive: age, the individual spend columns, the booleans and the raw categoricals.
naive_nums = ["Age", *spend_cols]
naive_bools = bool_cols
naive_cats = cat_cols_base

# Basic: naive + total spend, cabin deck/side, and missing flags for age and spend.
basic_nums = [*naive_nums, "TotalSpend"]
basic_bools = naive_bools
basic_cats = [*naive_cats, "Deck", "Side"]
basic_extras = ["Age_missing", *(f"{c}_missing" for c in spend_cols)]

# Enhanced: basic + cabin number, group size, log1p spend columns, the Group id
# and the SpendPerAge ratio.
enh_nums = [
    *basic_nums,
    "CabinNum",
    "GroupSize",
    *(f"{c}_log1p" for c in [*spend_cols, "TotalSpend"]),
]
enh_bools = basic_bools
# NOTE(review): Group is high-cardinality. Tree/booster models can cope with it,
# but build_set one-hot encodes every categorical, so Group expands into one
# dummy column per group id (the enhanced set is saved with 9000+ columns).
# Confirm this is intended.
enh_cats = [*basic_cats, "Group"]
enh_extras = [*basic_extras, "SpendPerAge"]

# SpendPerAge = TotalSpend / Age. Age 0 is mapped to NaN first so the division
# yields NaN rather than inf for those rows.
both["SpendPerAge"] = both["TotalSpend"].div(both["Age"].replace(0, np.nan))

In [77]:
def boolean_to_int(series: pd.Series, fillna_value: int=0) -> pd.Series:
    """Map a boolean-like Series to plain integers.

    True -> 1, False -> 0, missing -> ``fillna_value``.
    """
    # Go through the nullable dtypes so missing values survive until they are filled.
    nullable_bool = series.astype("boolean")
    nullable_int = nullable_bool.astype("Int8")
    filled = nullable_int.fillna(fillna_value)
    return filled.astype(int)

In [78]:
def build_set(df: pd.DataFrame, num_cols, bool_cols, cat_cols, extra_cols=(), one_hot=True):
    """Assemble a feature matrix from the requested columns of ``df``.

    Columns that are listed but absent from ``df`` are skipped silently.
    Output column order is: numerics, booleans, extras, then categoricals.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    num_cols, extra_cols : iterable of str
        Columns coerced to numeric; unparsable or missing values become 0.
    bool_cols : iterable of str
        Columns converted with ``boolean_to_int`` (missing -> 0).
    cat_cols : iterable of str
        Columns cast to ``category``.
    one_hot : bool, default True
        If True, categoricals are expanded with ``pd.get_dummies``
        (no dummy column for NaN). If False, they are appended as
        ``category`` columns rather than being dropped.

    Returns
    -------
    pd.DataFrame
        Feature frame sharing ``df``'s index.
    """
    out = pd.DataFrame(index=df.index)
    # numerics
    for c in num_cols:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    # booleans
    for c in bool_cols:
        if c in df.columns:
            out[c] = boolean_to_int(df[c], fillna_value=0)
    # extras - usually numerics
    for c in extra_cols:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    # categoricals
    cat_df = pd.DataFrame(index=df.index)
    for c in cat_cols:
        if c in df.columns:
            cat_df[c] = df[c].astype("category")
    if cat_df.shape[1] > 0:
        if one_hot:
            cat_df = pd.get_dummies(cat_df, dummy_na=False)
        # Attach the categoricals in both modes; with one_hot=False they used
        # to be built and then discarded.
        out = pd.concat([out, cat_df], axis=1)
    return out

In [81]:
# Build each feature space on the stacked train+test frame so that the one-hot
# columns are identical for both splits.
X_naive_all = build_set(
    both,
    num_cols=naive_nums,
    bool_cols=naive_bools,
    cat_cols=naive_cats,
    extra_cols=[],
)
X_basic_all = build_set(
    both,
    num_cols=basic_nums,
    bool_cols=basic_bools,
    cat_cols=basic_cats,
    extra_cols=basic_extras,
)
X_enh_all = build_set(
    both,
    num_cols=enh_nums,
    bool_cols=enh_bools,
    cat_cols=enh_cats,
    extra_cols=enh_extras,
)

In [86]:
# Row mask that recovers the training rows from the stacked frame, plus the
# target vector taken from the cleaned training frame.
mask_tr = both["is_train"].eq(1)
y = train["Transported"].astype(int).to_numpy()  # target

In [87]:
def split_save(name, X_all):
    """Split a stacked feature matrix back into train/test and write both as CSV.

    Uses the notebook-level ``mask_tr`` (training-row mask), ``y`` (target
    values) and ``proc_dir`` (output folder). The training file gets an extra
    ``Transported`` column; the test file holds features only. Files are named
    ``train<name>.csv`` and ``test<name>.csv`` and written without the index.
    """
    # Boolean-mask selection yields new frames, so X_all itself is not modified.
    train_out = X_all.loc[mask_tr].assign(Transported=y)
    X_test = X_all.loc[~mask_tr].copy()

    train_path = proc_dir / f"train{name}.csv"
    test_path = proc_dir / f"test{name}.csv"
    for frame, path in ((train_out, train_path), (X_test, test_path)):
        frame.to_csv(path, index=False)

    print(f"Save: {train_path.name}, {test_path.name} | shapes: {train_out.shape}, {X_test.shape}")

In [88]:
# Write one train/test CSV pair per feature space.
for space_name, features_all in (
    ("naive", X_naive_all),
    ("basic", X_basic_all),
    ("enhanced", X_enh_all),
):
    split_save(space_name, features_all)

Save: trainnaive.csv, testnaive.csv | shapes: (8693, 17), (4277, 16)
Save: trainbasic.csv, testbasic.csv | shapes: (8693, 36), (4277, 35)
Save: trainenhanced.csv, testenhanced.csv | shapes: (8693, 9325), (4277, 9324)
