In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
## add TotalSpend numeric feature and NoSpend numeric feature

for df in (df_train, df_test):
    # Sum all the spending columns
    df["TotalSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )

    # Optional: flag for passengers who spent nothing
    df["NoSpend"] = (df["TotalSpend"] == 0).astype(int)


In [3]:
# split cabin feature into three, for deck, floor and side

for df in (df_train, df_test):
    cabin_split = df["Cabin"].str.split("/", expand=True)

    df["CabinDeck"] = cabin_split[0]                      # e.g. "B"
    df["CabinNum"] = pd.to_numeric(cabin_split[1], errors="coerce")  # e.g. 45
    df["CabinSide"] = cabin_split[2]                      # e.g. "P"

In [4]:
## exploit groups

for df in (df_train, df_test):
    df["GroupId"] = df["PassengerId"].str.split("_").str[0]
    group_sizes = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = group_sizes
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)

In [5]:
## BELOW WE WILL DEFINE THE COLUMNS FROM THE DATA SET 
## AND CREATE FIT_PREPROCESSING
target_col = "Transported"
drop_cols = ["PassengerId", "Name", "Cabin"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend", "NoSpend", "CabinNum", "GroupSize", "IsAlone"]
categorical_cols = ["HomePlanet", "CryoSleep", "CabinDeck", "CabinSide", "Destination", "VIP"]

def fit_preprocessing (df_train):
    y = df_train[target_col].astype(int)
    X = df_train.drop(columns=drop_cols + [target_col])

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    numeric_medians = X_numeric.median()
    cat_modes = X_cat.mode().iloc[0]
    
    ## add in median for NaNs on numerical cols
    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: not null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: not null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)  
    
    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
    print(X_prepared.shape)

    return X_prepared, numeric_medians, cat_modes, y, X_prepared.columns

In [6]:
def apply_preprocessing(df, numeric_medians, cat_modes, train_cols):
    X = df.drop(columns=drop_cols)

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)

    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)

    X_prepared = X_prepared.reindex(columns=train_cols, fill_value=0)

    return X_prepared

In [7]:
X_prepared, numeric_medians, cat_modes, y, train_cols = fit_preprocessing(df_train)
X_test_prepared = apply_preprocessing(df_test, numeric_medians, cat_modes, train_cols)

(8693, 29)


  X_cat[col] = X_cat[col].fillna(cat_modes[col])
  X_cat[col] = X_cat[col].fillna(cat_modes[col])


In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
groups = df_train["GroupId"]  # if you removed GroupId, just recreate it from PassengerId

gkf = GroupKFold(n_splits=5)

acc_hgb = []
acc_rf = []
acc_ens = []

for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # HGB using the best params you had for the 0.80196 score
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_iter=300,
        min_samples_leaf=20,
        random_state=42,
    )
    hgb.fit(X_tr, y_tr)
    proba_hgb = hgb.predict_proba(X_va)[:, 1]
    pred_hgb = (proba_hgb >= 0.5).astype(int)
    acc_hgb.append(accuracy_score(y_va, pred_hgb))

    # Random Forest – a decent starting config
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=1,
        max_features="sqrt",
        n_jobs=-1,
        random_state=42,
    )
    rf.fit(X_tr, y_tr)
    proba_rf = rf.predict_proba(X_va)[:, 1]
    pred_rf = (proba_rf >= 0.5).astype(int)
    acc_rf.append(accuracy_score(y_va, pred_rf))

    # Simple ensemble: average probs (you can tweak weights later)
    proba_ens = 0.5 * proba_hgb + 0.5 * proba_rf
    pred_ens = (proba_ens >= 0.5).astype(int)
    acc_ens.append(accuracy_score(y_va, pred_ens))

print("HGB   CV mean acc:", np.mean(acc_hgb), "std:", np.std(acc_hgb))
print("RF    CV mean acc:", np.mean(acc_rf), "std:", np.std(acc_rf))
print("Ensem CV mean acc:", np.mean(acc_ens), "std:", np.std(acc_ens))


HGB   CV mean acc: 0.8099618777507278 std: 0.006627149254404563
RF    CV mean acc: 0.8052463255802873 std: 0.00745846099140698
Ensem CV mean acc: 0.8108819467559032 std: 0.007905327803613936


In [10]:
# Train final HGB on all training data
hgb_final = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=None,
    max_iter=300,
    min_samples_leaf=20,
    random_state=42,
)
hgb_final.fit(X_prepared, y)

# Train final RF on all training data
rf_final = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=1,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)
rf_final.fit(X_prepared, y)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
import os

# Base probabilities on test
proba_hgb_test = hgb_final.predict_proba(X_test_prepared)[:, 1]
proba_rf_test  = rf_final.predict_proba(X_test_prepared)[:, 1]

# Simple average ensemble
proba_ens_test = 0.5 * proba_hgb_test + 0.5 * proba_rf_test

# 0.5 threshold (you can do threshold tuning later if you want to squeeze more)
pred_ens_test = (proba_ens_test >= 0.5).astype(bool)

submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": pred_ens_test,
})

os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/05_hgb_rf_ensemble.csv", index=False)