In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
## add TotalSpend numeric feature and NoSpend numeric feature

for df in (df_train, df_test):
    # Sum all the spending columns
    df["TotalSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )

    # Optional: flag for passengers who spent nothing
    df["NoSpend"] = (df["TotalSpend"] == 0).astype(int)


In [3]:
# split cabin feature into three, for deck, floor and side

for df in (df_train, df_test):
    cabin_split = df["Cabin"].str.split("/", expand=True)

    df["CabinDeck"] = cabin_split[0]                      # e.g. "B"
    df["CabinNum"] = pd.to_numeric(cabin_split[1], errors="coerce")  # e.g. 45
    df["CabinSide"] = cabin_split[2]                      # e.g. "P"

In [4]:
## exploit groups

for df in (df_train, df_test):
    df["GroupId"] = df["PassengerId"].str.split("_").str[0]
    group_sizes = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = group_sizes
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)

In [5]:
## BELOW WE WILL DEFINE THE COLUMNS FROM THE DATA SET 
## AND CREATE FIT_PREPROCESSING
target_col = "Transported"
drop_cols = ["PassengerId", "Name", "Cabin"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend", "NoSpend", "CabinNum", "GroupSize", "IsAlone"]
categorical_cols = ["HomePlanet", "CryoSleep", "CabinDeck", "CabinSide", "Destination", "VIP"]

def fit_preprocessing (df_train):
    y = df_train[target_col].astype(int)
    X = df_train.drop(columns=drop_cols + [target_col])

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    numeric_medians = X_numeric.median()
    cat_modes = X_cat.mode().iloc[0]
    
    ## add in median for NaNs on numerical cols
    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: not null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: not null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)  
    
    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
    print(X_prepared.shape)

    return X_prepared, numeric_medians, cat_modes, y, X_prepared.columns

In [6]:
def apply_preprocessing(df, numeric_medians, cat_modes, train_cols):
    X = df.drop(columns=drop_cols)

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)

    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)

    X_prepared = X_prepared.reindex(columns=train_cols, fill_value=0)

    return X_prepared

In [7]:
X_prepared, numeric_medians, cat_modes, y, train_cols = fit_preprocessing(df_train)
X_test_prepared = apply_preprocessing(df_test, numeric_medians, cat_modes, train_cols)

(8693, 29)


  X_cat[col] = X_cat[col].fillna(cat_modes[col])
  X_cat[col] = X_cat[col].fillna(cat_modes[col])


In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

groups = df_train["GroupId"]

gkf = GroupKFold(n_splits=5)

thresholds = np.linspace(0.4, 0.6, 21)  # 0.40, 0.41, ..., 0.60

best_thr = None
best_acc = -1

for thr in thresholds:
    fold_accs = []

    for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
        X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

        gb = HistGradientBoostingClassifier(
            learning_rate=0.05,
            max_depth=None,
            max_iter=300,
            min_samples_leaf=20,
            random_state=42,
        )
        gb.fit(X_tr, y_tr)
        proba_va = gb.predict_proba(X_va)[:, 1]

        preds_va = (proba_va >= thr).astype(int)
        acc = accuracy_score(y_va, preds_va)
        fold_accs.append(acc)

    mean_acc = np.mean(fold_accs)
    print(f"thr={thr:.2f} -> mean_acc={mean_acc:.5f}")

    if mean_acc > best_acc:
        best_acc = mean_acc
        best_thr = thr

print("Best threshold:", best_thr, "with CV acc:", best_acc)


thr=0.40 -> mean_acc=0.80916
thr=0.41 -> mean_acc=0.80996
thr=0.42 -> mean_acc=0.81077
thr=0.43 -> mean_acc=0.81111
thr=0.44 -> mean_acc=0.81042
thr=0.45 -> mean_acc=0.81054
thr=0.46 -> mean_acc=0.81123
thr=0.47 -> mean_acc=0.81123
thr=0.48 -> mean_acc=0.81134
thr=0.49 -> mean_acc=0.81065
thr=0.50 -> mean_acc=0.80996
thr=0.51 -> mean_acc=0.80766
thr=0.52 -> mean_acc=0.80697
thr=0.53 -> mean_acc=0.80789
thr=0.54 -> mean_acc=0.80720
thr=0.55 -> mean_acc=0.80525
thr=0.56 -> mean_acc=0.80548
thr=0.57 -> mean_acc=0.80617
thr=0.58 -> mean_acc=0.80674
thr=0.59 -> mean_acc=0.80582
thr=0.60 -> mean_acc=0.80364
Best threshold: 0.48 with CV acc: 0.8113416503936298


In [10]:
import os

# 1. Train final HGB on all training data
gb_final = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=None,
    max_iter=300,
    min_samples_leaf=20,
    random_state=42,
)
gb_final.fit(X_prepared, y)

# 2. Predict probabilities on test
proba_test = gb_final.predict_proba(X_test_prepared)[:, 1]

# 3. Use tuned threshold instead of 0.5
best_thr = 0.48  # <-- replace with the value you found from the sweep
pred_test = (proba_test >= best_thr).astype(bool)

submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": pred_test,
})

os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/06_hgb_threshold_tuned.csv", index=False)
