In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
## add TotalSpend numeric feature and NoSpend numeric feature

for df in (df_train, df_test):
    # Sum all the spending columns
    df["TotalSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )

    # Optional: flag for passengers who spent nothing
    df["NoSpend"] = (df["TotalSpend"] == 0).astype(int)


In [3]:
# split cabin feature into three, for deck, floor and side

for df in (df_train, df_test):
    cabin_split = df["Cabin"].str.split("/", expand=True)

    df["CabinDeck"] = cabin_split[0]                      # e.g. "B"
    df["CabinNum"] = pd.to_numeric(cabin_split[1], errors="coerce")  # e.g. 45
    df["CabinSide"] = cabin_split[2]                      # e.g. "P"

In [4]:
## exploit groups

for df in (df_train, df_test):
    df["GroupId"] = df["PassengerId"].str.split("_").str[0]
    group_sizes = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = group_sizes
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)

In [5]:
## BELOW WE WILL DEFINE THE COLUMNS FROM THE DATA SET 
## AND CREATE FIT_PREPROCESSING
target_col = "Transported"
drop_cols = ["PassengerId", "Name", "Cabin"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend", "NoSpend", "CabinNum", "GroupSize", "IsAlone"]
categorical_cols = ["HomePlanet", "CryoSleep", "CabinDeck", "CabinSide", "Destination", "VIP"]

def fit_preprocessing (df_train):
    y = df_train[target_col].astype(int)
    X = df_train.drop(columns=drop_cols + [target_col])

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    numeric_medians = X_numeric.median()
    cat_modes = X_cat.mode().iloc[0]
    
    ## add in median for NaNs on numerical cols
    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: not null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: not null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)  
    
    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
    print(X_prepared.shape)

    return X_prepared, numeric_medians, cat_modes, y, X_prepared.columns

In [6]:
def apply_preprocessing(df, numeric_medians, cat_modes, train_cols):
    X = df.drop(columns=drop_cols)

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)

    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)

    X_prepared = X_prepared.reindex(columns=train_cols, fill_value=0)

    return X_prepared

In [7]:
X_prepared, numeric_medians, cat_modes, y, train_cols = fit_preprocessing(df_train)
X_test_prepared = apply_preprocessing(df_test, numeric_medians, cat_modes, train_cols)

(8693, 29)


  X_cat[col] = X_cat[col].fillna(cat_modes[col])
  X_cat[col] = X_cat[col].fillna(cat_modes[col])


In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

groups = df_train["GroupId"]

gkf = GroupKFold(n_splits=5)

thresholds = np.linspace(0.4, 0.6, 21)  # 0.40, 0.41, ..., 0.60

best_thr = None
best_acc = -1

for thr in thresholds:
    fold_accs = []

    for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
        X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

        gb = HistGradientBoostingClassifier(
            learning_rate=0.05,
            max_depth=None,
            max_iter=300,
            min_samples_leaf=20,
            random_state=42,
        )
        gb.fit(X_tr, y_tr)
        proba_va = gb.predict_proba(X_va)[:, 1]

        preds_va = (proba_va >= thr).astype(int)
        acc = accuracy_score(y_va, preds_va)
        fold_accs.append(acc)

    mean_acc = np.mean(fold_accs)
    print(f"thr={thr:.2f} -> mean_acc={mean_acc:.5f}")

    if mean_acc > best_acc:
        best_acc = mean_acc
        best_thr = thr

print("Best threshold:", best_thr, "with CV acc:", best_acc)


thr=0.40 -> mean_acc=0.80916
thr=0.41 -> mean_acc=0.80996
thr=0.42 -> mean_acc=0.81077
thr=0.43 -> mean_acc=0.81111
thr=0.44 -> mean_acc=0.81042
thr=0.45 -> mean_acc=0.81054
thr=0.46 -> mean_acc=0.81123
thr=0.47 -> mean_acc=0.81123
thr=0.48 -> mean_acc=0.81134
thr=0.49 -> mean_acc=0.81065
thr=0.50 -> mean_acc=0.80996
thr=0.51 -> mean_acc=0.80766
thr=0.52 -> mean_acc=0.80697
thr=0.53 -> mean_acc=0.80789
thr=0.54 -> mean_acc=0.80720
thr=0.55 -> mean_acc=0.80525
thr=0.56 -> mean_acc=0.80548
thr=0.57 -> mean_acc=0.80617
thr=0.58 -> mean_acc=0.80674
thr=0.59 -> mean_acc=0.80582
thr=0.60 -> mean_acc=0.80364
Best threshold: 0.48 with CV acc: 0.8113416503936298


In [10]:
import os

# 1. Train final HGB on all training data
gb_final = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=None,
    max_iter=300,
    min_samples_leaf=20,
    random_state=42,
)
gb_final.fit(X_prepared, y)

# 2. Predict probabilities on test
proba_test = gb_final.predict_proba(X_test_prepared)[:, 1]

# 3. Use tuned threshold instead of 0.5
best_thr = 0.48  # <-- replace with the value you found from the sweep
pred_test = (proba_test >= best_thr).astype(bool)

submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": pred_test,
})

# os.makedirs("submissions", exist_ok=True)
# submission.to_csv("submissions/06_hgb_threshold_tuned.csv", index=False)


In [11]:
from xgboost import XGBClassifier


groups = df_train["GroupId"]
gkf = GroupKFold(n_splits=5)

acc_hgb = []
acc_xgb = []
acc_ens = []

for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # HGB (your best config)
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_iter=300,
        min_samples_leaf=20,
        random_state=42,
    )
    hgb.fit(X_tr, y_tr)
    proba_hgb = hgb.predict_proba(X_va)[:, 1]

    

    # XGBoost (decent starting point)

# Best config:
# learning_rate         0.050000
# n_estimators        200.000000
# max_depth             5.000000
# subsample             0.800000
# colsample_bytree      1.000000
# cv_mean_acc           0.812608
# cv_std_acc            0.005153
# Name: 5, dtype: float64
    
    xgb = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42,
    )
    xgb.fit(X_tr, y_tr)
    proba_xgb = xgb.predict_proba(X_va)[:, 1]

    # Threshold 0.48 to match your tuned best
    thr = 0.48

    pred_hgb = (proba_hgb >= thr).astype(int)
    pred_xgb = (proba_xgb >= thr).astype(int)
    proba_ens = 0.5 * proba_hgb + 0.5 * proba_xgb
    pred_ens = (proba_ens >= thr).astype(int)

    acc_hgb.append(accuracy_score(y_va, pred_hgb))
    acc_xgb.append(accuracy_score(y_va, pred_xgb))
    acc_ens.append(accuracy_score(y_va, pred_ens))

print("HGB   CV mean:", np.mean(acc_hgb))
print("XGB   CV mean:", np.mean(acc_xgb))
print("Ensem CV mean:", np.mean(acc_ens))


HGB   CV mean: 0.8113416503936298
XGB   CV mean: 0.8111124272180023
Ensem CV mean: 0.8123772574082297


In [12]:
groups = df_train["GroupId"]

# Define a small-ish grid to start (you can expand later)
learning_rates = [0.05, 0.03]
n_estimators_list = [200, 400]
max_depths = [4, 5, 6]
subsamples = [0.8, 1.0]
colsample_bytrees = [0.8, 1.0]

gkf = GroupKFold(n_splits=5)
results = []

for lr in learning_rates:
    for n_est in n_estimators_list:
        for depth in max_depths:
            for subs in subsamples:
                for colsample in colsample_bytrees:
                    fold_accs = []

                    for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
                        X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
                        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

                        xgb = XGBClassifier(
                            learning_rate=lr,
                            n_estimators=n_est,
                            max_depth=depth,
                            subsample=subs,
                            colsample_bytree=colsample,
                            objective="binary:logistic",
                            eval_metric="logloss",
                            n_jobs=-1,
                            random_state=42,
                        )

                        xgb.fit(X_tr, y_tr)
                        proba_va = xgb.predict_proba(X_va)[:, 1]

                        # using 0.5 threshold for search; you can retune later if needed
                        preds_va = (proba_va >= 0.5).astype(int)
                        acc = accuracy_score(y_va, preds_va)
                        fold_accs.append(acc)

                    mean_acc = np.mean(fold_accs)
                    std_acc = np.std(fold_accs)

                    print(
                        f"lr={lr}, n_est={n_est}, depth={depth}, "
                        f"subsample={subs}, colsample={colsample} "
                        f"-> mean_acc={mean_acc:.5f} (std={std_acc:.4f})"
                    )

                    results.append({
                        "learning_rate": lr,
                        "n_estimators": n_est,
                        "max_depth": depth,
                        "subsample": subs,
                        "colsample_bytree": colsample,
                        "cv_mean_acc": mean_acc,
                        "cv_std_acc": std_acc,
                    })

results_df = pd.DataFrame(results)
display(results_df.sort_values("cv_mean_acc", ascending=False))

best_idx = results_df["cv_mean_acc"].idxmax()
best_row = results_df.loc[best_idx]

print("\nBest config:")
print(best_row)


lr=0.05, n_est=200, depth=4, subsample=0.8, colsample=0.8 -> mean_acc=0.80835 (std=0.0078)
lr=0.05, n_est=200, depth=4, subsample=0.8, colsample=1.0 -> mean_acc=0.80732 (std=0.0082)
lr=0.05, n_est=200, depth=4, subsample=1.0, colsample=0.8 -> mean_acc=0.80870 (std=0.0090)
lr=0.05, n_est=200, depth=4, subsample=1.0, colsample=1.0 -> mean_acc=0.80847 (std=0.0096)
lr=0.05, n_est=200, depth=5, subsample=0.8, colsample=0.8 -> mean_acc=0.80847 (std=0.0084)
lr=0.05, n_est=200, depth=5, subsample=0.8, colsample=1.0 -> mean_acc=0.81261 (std=0.0052)
lr=0.05, n_est=200, depth=5, subsample=1.0, colsample=0.8 -> mean_acc=0.80720 (std=0.0055)
lr=0.05, n_est=200, depth=5, subsample=1.0, colsample=1.0 -> mean_acc=0.80916 (std=0.0080)
lr=0.05, n_est=200, depth=6, subsample=0.8, colsample=0.8 -> mean_acc=0.80973 (std=0.0082)
lr=0.05, n_est=200, depth=6, subsample=0.8, colsample=1.0 -> mean_acc=0.80973 (std=0.0057)
lr=0.05, n_est=200, depth=6, subsample=1.0, colsample=0.8 -> mean_acc=0.80950 (std=0.0062)

Unnamed: 0,learning_rate,n_estimators,max_depth,subsample,colsample_bytree,cv_mean_acc,cv_std_acc
5,0.05,200,5,0.8,1.0,0.812608,0.005153
41,0.03,400,5,0.8,1.0,0.811458,0.005394
17,0.05,400,5,0.8,1.0,0.811227,0.006204
14,0.05,400,4,1.0,0.8,0.811112,0.006093
43,0.03,400,5,1.0,1.0,0.810306,0.008756
45,0.03,400,6,0.8,1.0,0.810191,0.007462
13,0.05,400,4,0.8,1.0,0.809962,0.006146
9,0.05,200,6,0.8,1.0,0.809732,0.005657
8,0.05,200,6,0.8,0.8,0.809731,0.00822
10,0.05,200,6,1.0,0.8,0.809501,0.006196



Best config:
learning_rate         0.050000
n_estimators        200.000000
max_depth             5.000000
subsample             0.800000
colsample_bytree      1.000000
cv_mean_acc           0.812608
cv_std_acc            0.005153
Name: 5, dtype: float64


In [13]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

groups = df_train["GroupId"]
gkf = GroupKFold(n_splits=5)

n_samples = len(X_prepared)
oof_hgb = np.zeros(n_samples)
oof_xgb = np.zeros(n_samples)
oof_y   = y.values.copy()

# 1) Build OOF probabilities for HGB and XGB
for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # HGB (best config you found)
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_iter=300,
        min_samples_leaf=20,
        random_state=42,
    )
    hgb.fit(X_tr, y_tr)
    oof_hgb[val_idx] = hgb.predict_proba(X_va)[:, 1]

    # XGB (best config you found)
    xgb = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42,
    )
    xgb.fit(X_tr, y_tr)
    oof_xgb[val_idx] = xgb.predict_proba(X_va)[:, 1]

# 2) Search over ensemble weights and thresholds
weights = np.linspace(0.0, 1.0, 21)      # 0.00, 0.05, ..., 1.00
thresholds = np.linspace(0.40, 0.60, 21) # 0.40, 0.41, ..., 0.60

best_w = None
best_thr = None
best_acc = -1.0

for w in weights:
    ens_proba_base = w * oof_hgb + (1.0 - w) * oof_xgb  # combine once per w

    for thr in thresholds:
        preds = (ens_proba_base >= thr).astype(int)
        acc = accuracy_score(oof_y, preds)

        if acc > best_acc:
            best_acc = acc
            best_w = w
            best_thr = thr

print(f"Best weight w (HGB): {best_w:.3f}")
print(f"Weight on XGB: {1 - best_w:.3f}")
print(f"Best threshold: {best_thr:.3f}")
print(f"Best OOF CV accuracy: {best_acc:.5f}")


Best weight w (HGB): 0.400
Weight on XGB: 0.600
Best threshold: 0.470
Best OOF CV accuracy: 0.81399


In [14]:
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
import pandas as pd
import os

# 1. Train final HGB on all training data
hgb_final = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=None,
    max_iter=300,
    min_samples_leaf=20,
    random_state=42,
)
hgb_final.fit(X_prepared, y)

# 2. Train final XGB on all training data
xgb_final = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42,
)
xgb_final.fit(X_prepared, y)

# 3. Get test probabilities from both
proba_hgb_test = hgb_final.predict_proba(X_test_prepared)[:, 1]
proba_xgb_test = xgb_final.predict_proba(X_test_prepared)[:, 1]

# 4. Ensemble with learned weights
w = 0.4              # HGB weight
thr = 0.47           # best threshold from search
proba_ens_test = w * proba_hgb_test + (1 - w) * proba_xgb_test

pred_ens_test = (proba_ens_test >= thr).astype(bool)

# 5. Build submission
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": pred_ens_test,
})

os.makedirs("submissions", exist_ok=True)
submission_path = "submissions/07_hgb_xgb_ens_w0p4_thr047.csv"
submission.to_csv(submission_path, index=False)
submission_path


'submissions/07_hgb_xgb_ens_w0p4_thr047.csv'