In [60]:
import pandas as pd
import numpy as np
from pathlib import Path

In [61]:
data_dir = Path("../data")
proc_dir = data_dir / "processed"
proc_dir.mkdir(parents=True, exist_ok=True)

In [62]:
train_raw = pd.read_csv(data_dir / "train.csv")
test_raw = pd.read_csv(data_dir / "test.csv")

In [63]:
spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
cat_cols_base = ["HomePlanet", "Destination", "Deck", "Side"]
bool_cols = ["CryoSleep", "VIP"]

In [64]:
# Spending distribution function helpers

def _spend_shares(df: pd.DataFrame):
    total = df[spend_cols].sum(axis=1)
    shares = (df[spend_cols].div(total.replace(0, np.nan), axis=0)).fillna(0.0) # Calculate % spends taking care of NaNs
    shares.columns = [f"{c}_share" for c in spend_cols]
    return shares, total

def _entropy_from_shares(shares:pd.DataFrame, eps=1e-12):
    p = shares.clip(lower=0) + eps
    ent = -(p * np.log(p)).sum(axis=1)
    return (ent / np.log(shares.shape[1])).rename("SpendEntropy") # Normalised

def _herfindahl(shares: pd.DataFrame):
    return (shares**2).sum(axis=1).rename("SpendHHI")

def _gini_from_values(vals:pd.DataFrame, eps=1e-9):
    arr = vals[spend_cols].to_numpy()
    arr = np.sort(arr, axis=1)
    n = arr.shape[1]
    cum = np.cumsum(arr, axis=1)
    denom = arr.sum(axis=1) + eps
    gini = 1 - 2 * (cum.sum(axis=1) / (n * denom))
    return pd.Series(gini, index=vals.index, name="SpendGini")

In [65]:
def add_spend_shape_features(df: pd.DataFrame) -> pd.DataFrame:
    shares, total = _spend_shares(df)
    # dominant spend category and share
    dominant_idx = df[spend_cols].values.argmax(axis=1)
    dominant_cat = pd.Categorical.from_codes(dominant_idx, spend_cols)
    # ensure CryoSleep is boolean, but avoid NA -> int errors by filling NA as False for these two flags
    cryo = df["CryoSleep"].astype("boolean")
    cryo_filled = cryo.fillna(False)
    features = pd.concat([
        shares,
        _entropy_from_shares(shares),
        _herfindahl(shares),
        _gini_from_values(df),
        pd.Series((df[spend_cols] > 0).sum(axis=1), index=df.index, name="SpendNonZeroCount"),
        pd.Series((total > 0).astype(int), index=df.index, name="AnySpend"),
        pd.Series(((cryo_filled) & (total > 0)).astype(int), index=df.index, name="CryoMismatch"),
        pd.Series(((~cryo_filled) & (total == 0)).astype(int), index=df.index, name="ZeroSpendButNotCryo"),
        pd.Series(dominant_cat, index=df.index, name="DominantSpendCategory"),
        pd.Series(shares.max(axis=1), index=df.index, name="DominantSpendShare"),
        pd.Series(df[spend_cols].std(axis=1) / (df[spend_cols].mean(axis=1)+1e-9), index=df.index, name="SpendCV"),
    ], axis=1)
    return pd.concat([df, features], axis=1)

In [66]:
# Add group aggregate information

def add_group_aggregates(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy() 
    # prerequisites
    if "Group" not in df.columns:
        df["Group"] = df["PassengerId"].astype(str).str.split("_").str[0]
    if "GroupSize" not in df.columns:
        df["GroupSize"] = df.groupby("Group")["Group"].transform("size")
    # passenger index and solo/first flags
    df["PassengerIdx"] = df["PassengerId"].astype(str).str.split("_").str[1].astype(int)
    df["IsFirstInGroup"] = (df["PassengerIdx"] == 1).astype(int)
    df["IsSolo"] = (df["GroupSize"] == 1).astype(int)
    grp = df.groupby("Group", observed=True)
    # spend aggregates
    grp_total_spend = grp[spend_cols].sum().sum(axis=1)
    df["GroupTotalSpend"] = df["Group"].map(grp_total_spend).astype(float)
    df["GroupMeanSpend"] = df["GroupTotalSpend"] / df["GroupSize"].clip(lower=1)
    df["GroupSpendPerMember"] = df["GroupMeanSpend"] 
    # age aggregates
    df["GroupAgeMean"] = grp["Age"].transform("mean")
    df["GroupAgeStd"] = grp["Age"].transform("std").fillna(0)
    # consistency flags
    for col in ["HomePlanet", "Destination", "Deck", "Side"]:
        nunique = grp[col].transform(lambda s: s.nunique(dropna=True))
        df[f"Group{col}NUnique"] = nunique
        df[f"GroupAllSame{col}"] = (nunique == 1).astype(int)
    # cryo proportion (feature-only)
    df["GroupCryoShare"] = grp["CryoSleep"].transform(lambda s: (s==True).mean())
    return df

In [85]:
# Cabin and interactions

deck_order = {"Unknown":0, "A":1, "B":2, "C":3, "D":4, "E":5, "F":6, "G":7, "T":8}

def add_cabin_extras(df: pd.DataFrame, n_bins=10) -> pd.DataFrame:
    df = df.copy()
    df["DeckOrdinal"] = df["Deck"].map(deck_order).fillna(0).astype(int)
    df["SideBinary"] = df["Side"].map({"P":0, "S":1}).fillna(0).astype(int)
    # robust qcut: if too few uniques, fallback to cut
    try:
        df["CabinNumBin"] = pd.qcut(df["CabinNum"].rank(method="first"), q=n_bins, labels=False, duplicates="drop")
    except Exception:
        df["CabinNumBin"] = pd.cut(df["CabinNum"], bins=n_bins, labels=False, include_lowest=True)
    df["CabinNumBin"] = df["CabinNumBin"].fillna(-1).astype(int)
    return df

def add_small_interactions(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # ensure CryoSleep is boolean, but avoid NA -> int errors by filling NA as False for these two flags
    cryo = df["CryoSleep"].astype("boolean")
    cryo_filled = cryo.fillna(False)      
    df["Cryo_x_AnySpend"] = ((cryo_filled) & df["AnySpend"] == 1).astype(int)
    df["VIP_x_TotalSpend"] = (df["VIP"].fillna(False).astype(bool).astype(int) * np.log1p(df["TotalSpend"])).astype(float)
    # compact one-hot for small-cardinality interactions
    df["HPxDest"] = df["HomePlanet"].astype(str) + "|" + df["Destination"].astype(str)
    hp_dest_dum = pd.get_dummies(df["HPxDest"], prefix="HPxDest", dummy_na=False)
    return pd.concat([df.drop(columns=["HPxDest"]), hp_dest_dum], axis=1)

In [86]:
def split_cabin(df: pd.DataFrame) -> pd.DataFrame:
    cab = df["Cabin"].fillna("Unknown/9999/U").str.split("/", expand=True)
    cab.columns = ["Deck", "CabinNum", "Side"]
    df = df.assign(Deck=cab["Deck"], Side=cab["Side"])
    df["CabinNum"] = pd.to_numeric(cab["CabinNum"], errors="coerce")
    return df

In [87]:
def add_group_features(df: pd.DataFrame) -> pd.DataFrame:
    # PassengerId like '0001_01' -> group '0001'
    df = df.copy()
    if "Group" not in df.columns:
        df["Group"] = df["PassengerId"].astype(str).str.split("_").str[0]
    return df

In [88]:
def engineer_spend(df: pd.DataFrame) -> pd.DataFrame:
    for c in spend_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df["TotalSpend"] = df[spend_cols].sum(axis=1, skipna=True)
    # log1p versions - mitigates the right-skewness inherent in spending metrics
    for c in spend_cols + ["TotalSpend"]:
        df[f"{c}_log1p"] = np.log1p(df[c].fillna(0))
    return df

In [89]:
def add_missing_flags(df: pd.DataFrame) -> pd.DataFrame:
    cols_to_flag = [
        "HomePlanet", "CryoSleep", "Destination", "Age", "VIP", 
        "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
        "Name", "Deck", "Side", "CabinNum"
    ]
    for c in cols_to_flag:
        if c in df.columns:
            df[f"{c}_missing"] = df[c].isna().astype(int)
    return df

In [90]:
def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # booleans
    for c in bool_cols:
        if c in out.columns:
            out[c] = out[c].astype("boolean")
    # target (train only)
    if "Transported" in out.columns:
        out["Transported"] = out["Transported"].astype(int)
    # core transforms
    out = split_cabin(out)
    out = add_group_features(out)
    out = engineer_spend(out)
    out = add_missing_flags(out)
    out = add_spend_shape_features(out)
    out = add_group_aggregates(out)
    out = add_cabin_extras(out)
    out = add_small_interactions(out)    
    return out 

In [91]:
# build a single concatenated frame to ensure consistent mapping
train = basic_clean(train_raw)
test = basic_clean(test_raw)
train["is_train"] = 1
test["is_train"] = 0
both = pd.concat([train, test], axis=0, ignore_index=True)

In [92]:
# GroupSize
both["GroupSize"] = both.groupby("Group")["Group"].transform("size")

In [93]:
# For missing age data, use median age (on training data only to avoid leakage)
age_median = both.loc[both["is_train"]==1, "Age"].median()
both["Age"] = both["Age"].fillna(age_median)

In [94]:
# Spending: fill NA with zeroes
for c in spend_cols:
    both[c] = both[c].fillna(0.0)
both["TotalSpend"] = both[spend_cols].sum(axis=1)

In [77]:
# Categorial columns: fill NA with "Unknown"
for c in cat_cols_base + ["Deck", "Side"]:
    both[c] = both[c].fillna("Unknown").astype("category")

TypeError: Cannot setitem on a Categorical with a new category (Unknown), set the categories first

In [95]:
# A convenience numeric
both["CabinNum"] = both["CabinNum"].fillna(both.loc[both["is_train"]==1, "CabinNum"].median())

In [96]:
# Define feature sets for naive, basic and enhanced feature spaces

# Minimal columns for naive
naive_nums = ["Age"] + spend_cols
naive_bools = bool_cols
naive_cats = ["HomePlanet", "Destination"]

# Basic feature space: add cabin split, total spend and missing flags
basic_nums = naive_nums + ["TotalSpend"] + ["CabinNum"]
basic_bools = naive_bools
basic_cats = cat_cols_base
basic_extras = ["Age_missing"] + [f"{c}_missing" for c in spend_cols] + ["AnySpend"]

# Enhanced: add group features, logs, ratios and CabinNum
enh_nums = basic_nums + [
    "GroupSize", "GroupTotalSpend", "GroupMeanSpend", "GroupMeanSpendPerMember",
    "GroupAgeMean", "GroupAgeStd", 
    "DominantSpendShare", "SpendNonZeroCount", "SpendEntropy", "SpendHHI", "SpendGini", "SpendCV",
    "CabinNumBin", "DeckOrdinal", "SideBinary",
    "PassengerIdx", "IsFirstInGroup", "IsSolo",
    "SpendPerAge"
] + ["f{c}_log1p" for c in spend_cols + ["TotalSpend"]]
enh_bools = basic_bools
enh_cats = ["DominantSpendCategory"] 
enh_extras = basic_extras + [
    "CryoMismatch", "ZeroSpendButNotCryo",
    "GroupHomePlanetNUnique", "GroupDestinationNUnique", "GroupDeckNUnique", "GroupSideNUnique", 
    "GroupAllSameHomePlanet", "GroupAllSameDestination", "GroupAllSameDeck", "GroupAllSameSide"
]

In [97]:
def boolean_to_int(series: pd.Series, fillna_value: int=0) -> pd.Series:
    # convert pandas boolean dtype to integers
    # True -> 1, False -> 0, NA -> fillna_value
    return series.astype("boolean").astype("Int8").fillna(fillna_value).astype(int)

In [98]:
def build_set(df: pd.DataFrame, num_cols, bool_cols, cat_cols, extra_cols=(), one_hot=True):
    out = pd.DataFrame(index=df.index)
    # numerics
    for c in num_cols:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    # booleans
    for c in bool_cols:
        if c in df.columns:
            out[c] = boolean_to_int(df[c], fillna_value=0)
    # extras - usually numerics
    for c in extra_cols:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    # categoricals
    cat_df = pd.DataFrame(index=df.index)
    for c in cat_cols:
        if c in df.columns:
            cat_df[c] = df[c].astype("category")
    if one_hot and cat_df.shape[1] > 0:
        cat_df = pd.get_dummies(cat_df, dummy_na=False)
        out = pd.concat([out, cat_df], axis=1)

    return out

In [99]:
# Build all three feature spaces on the concatenated frame (ensures columns are aligned)
X_naive_all = build_set(both, naive_nums, naive_bools, naive_cats, extra_cols=[])
X_basic_all = build_set(both, basic_nums, basic_bools, basic_cats, extra_cols=basic_extras)
X_enh_all = build_set(both, enh_nums, enh_bools, enh_cats, extra_cols=enh_extras)

In [100]:
# Re-split to train/test
mask_tr = both["is_train"] == 1
y = train["Transported"].astype(int).values # target

In [103]:
def split_save(name, X_all):
    X_train = X_all.loc[mask_tr].copy()
    X_test = X_all.loc[~mask_tr].copy()
    # Attach target for train
    train_out = X_train.copy()
    train_out["Transported"] = y
    # Save
    train_path = proc_dir / f"train_{name}.csv"
    test_path = proc_dir / f"test_{name}.csv"
    train_out.to_csv(train_path, index=False)
    X_test.to_csv(test_path, index=False)
    print(f"Save: {train_path.name}, {test_path.name} | shapes: {train_out.shape}, {X_test.shape}")

In [104]:
split_save("naive", X_naive_all)
split_save("basic", X_basic_all)
split_save("enhanced", X_enh_all)

Save: train_naive.csv, test_naive.csv | shapes: (8693, 15), (4277, 14)
Save: train_basic.csv, test_basic.csv | shapes: (8693, 36), (4277, 35)
Save: train_enhanced.csv, test_enhanced.csv | shapes: (8693, 50), (4277, 49)
