In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
## add TotalSpend numeric feature and NoSpend numeric feature

for df in (df_train, df_test):
    # Sum all the spending columns
    df["TotalSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )

    # Optional: flag for passengers who spent nothing
    df["NoSpend"] = (df["TotalSpend"] == 0).astype(int)


In [3]:
# split cabin feature into three, for deck, floor and side

for df in (df_train, df_test):
    cabin_split = df["Cabin"].str.split("/", expand=True)

    df["CabinDeck"] = cabin_split[0]                      # e.g. "B"
    df["CabinNum"] = pd.to_numeric(cabin_split[1], errors="coerce")  # e.g. 45
    df["CabinSide"] = cabin_split[2]                      # e.g. "P"

In [4]:
## exploit groups

for df in (df_train, df_test):
    df["GroupId"] = df["PassengerId"].str.split("_").str[0]
    group_sizes = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = group_sizes
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)

        # Make a numeric version of CryoSleep for group aggregates
    df["CryoSleepNum"] = df["CryoSleep"].map({True: 1, False: 0})
    
    # Group-level spend + cryo rate
    df["GroupTotalSpend"] = df.groupby("GroupId")["TotalSpend"].transform("sum")
    df["GroupMeanSpend"] = df.groupby("GroupId")["TotalSpend"].transform("mean")
    df["GroupCryoRate"] = df.groupby("GroupId")["CryoSleepNum"].transform("mean")

In [5]:
df_train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,CabinDeck,CabinNum,CabinSide,GroupId,GroupSize,IsAlone,CryoSleepNum,GroupTotalSpend,GroupMeanSpend,GroupCryoRate
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,B,0.0,P,1,1,1,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,F,0.0,S,2,1,1,0.0,736.0,736.0,0.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,A,0.0,S,3,2,0,0.0,15559.0,7779.5,0.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,A,0.0,S,3,2,0,0.0,15559.0,7779.5,0.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,F,1.0,S,4,1,1,0.0,1091.0,1091.0,0.0


In [6]:
df_train.info()
df_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      8693 non-null   object 
 1   HomePlanet       8492 non-null   object 
 2   CryoSleep        8476 non-null   object 
 3   Cabin            8494 non-null   object 
 4   Destination      8511 non-null   object 
 5   Age              8514 non-null   float64
 6   VIP              8490 non-null   object 
 7   RoomService      8512 non-null   float64
 8   FoodCourt        8510 non-null   float64
 9   ShoppingMall     8485 non-null   float64
 10  Spa              8510 non-null   float64
 11  VRDeck           8505 non-null   float64
 12  Name             8493 non-null   object 
 13  Transported      8693 non-null   bool   
 14  TotalSpend       7785 non-null   float64
 15  NoSpend          8693 non-null   int64  
 16  CabinDeck        8494 non-null   object 
 17  CabinNum      

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,NoSpend,CabinNum,GroupSize,IsAlone,CryoSleepNum,GroupTotalSpend,GroupMeanSpend,GroupCryoRate
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,7785.0,8693.0,8494.0,8693.0,8693.0,8476.0,8693.0,8156.0,8584.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,1484.601541,0.373519,600.367671,2.035546,0.552744,0.358306,2764.118486,1492.374153,0.359531
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,2845.288241,0.483766,511.867226,1.596347,0.497239,0.479531,5211.1456,2408.29307,0.404533
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,167.25,1.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,736.0,0.0,427.0,1.0,1.0,0.0,885.0,784.0,0.25
75%,38.0,47.0,76.0,27.0,59.0,46.0,1486.0,1.0,999.0,3.0,1.0,1.0,2599.0,1618.0,0.666667
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,35987.0,1.0,1894.0,8.0,1.0,1.0,52668.0,35987.0,1.0


In [7]:
## BELOW WE WILL DEFINE THE COLUMNS FROM THE DATA SET 
## AND CREATE FIT_PREPROCESSING
target_col = "Transported"
drop_cols = ["PassengerId", "Name", "Cabin"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend", "NoSpend", "CabinNum", "GroupSize", "IsAlone", "GroupTotalSpend", "GroupMeanSpend", "GroupCryoRate"]
categorical_cols = ["HomePlanet", "CryoSleep", "CabinDeck", "CabinSide", "Destination", "VIP"]

def fit_preprocessing (df_train):
    y = df_train[target_col].astype(int)
    X = df_train.drop(columns=drop_cols + [target_col])

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    numeric_medians = X_numeric.median()
    cat_modes = X_cat.mode().iloc[0]
    
    ## add in median for NaNs on numerical cols
    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: not null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: not null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)  
    
    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
    print(X_prepared.shape)

    return X_prepared, numeric_medians, cat_modes, y, X_prepared.columns

In [8]:
def apply_preprocessing(df, numeric_medians, cat_modes, train_cols):
    X = df.drop(columns=drop_cols)

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)

    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)

    X_prepared = X_prepared.reindex(columns=train_cols, fill_value=0)

    return X_prepared

In [9]:
X_prepared, numeric_medians, cat_modes, y, train_cols = fit_preprocessing(df_train)
X_test_prepared = apply_preprocessing(df_test, numeric_medians, cat_modes, train_cols)

(8693, 32)


  X_cat[col] = X_cat[col].fillna(cat_modes[col])
  X_cat[col] = X_cat[col].fillna(cat_modes[col])


In [10]:
X_prepared.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,NoSpend,CabinNum,GroupSize,...,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1,...,False,False,False,False,False,True,False,False,False,True
1,24.0,109.0,9.0,25.0,549.0,44.0,736.0,0,0.0,1,...,False,False,True,False,False,False,True,False,False,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,0,0.0,2,...,False,False,False,False,False,False,True,False,False,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,0,0.0,2,...,False,False,False,False,False,False,True,False,False,True
4,16.0,303.0,70.0,151.0,565.0,2.0,1091.0,0,1.0,1,...,False,False,True,False,False,False,True,False,False,True


In [11]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
## HERE WE GET THE BASE MODEL SCORE OVER WHICH WE WANT TO SEE IMPROVEMENTS

y_val_pred = rf.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("Validition accuracy:", val_acc)

Validition accuracy: 0.8033352501437608


Below I'll run experiments on the models and see what performs the best.

In [14]:
rf_v2 = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf_v2.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
y_val_pred_v2 = rf_v2.predict(X_val)
val_acc_v2 = accuracy_score(y_val, y_val_pred_v2)
print("Validition accuracy:", val_acc_v2)

Validition accuracy: 0.8033352501437608


After manual testing with various max_depth, min_leaf_samples, and nodes up to n = 1000, the original model is still the one that performs best. This will be our baseline moving forward still.

In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier

gb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=8,   # or e.g. 6
    max_iter=400,     # number of boosting rounds
    random_state=42
)

gb.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.01
,max_iter,400
,max_leaf_nodes,31
,max_depth,8
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [17]:
y_val_pred = gb.predict(X_val)
print("Val accuracy:", accuracy_score(y_val, y_val_pred))

Val accuracy: 0.8021851638872916


In [18]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
import numpy as np

groups = df_train["GroupId"]

learning_rates = [0.05, 0.02, 0.01]
max_depths = [None, 6, 8]
max_iters = [200, 300]
min_samples_leafs = [20, 50]

results = []

gkf = GroupKFold(n_splits=5)

for lr in learning_rates:
    for depth in max_depths:
        for n_iter in max_iters:
            for leaf in min_samples_leafs:
                fold_accs = []

                for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
                    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
                    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

                    gb = HistGradientBoostingClassifier(
                        learning_rate=lr,
                        max_depth=depth,
                        max_iter=n_iter,
                        min_samples_leaf=leaf,
                        random_state=42,
                    )

                    gb.fit(X_tr, y_tr)
                    y_pred = gb.predict(X_va)
                    fold_accs.append(accuracy_score(y_va, y_pred))

                mean_acc = np.mean(fold_accs)
                std_acc = np.std(fold_accs)

                print(
                    f"lr={lr}, depth={depth}, iter={n_iter}, leaf={leaf} "
                    f"-> mean_acc={mean_acc:.5f} (std={std_acc:.4f})"
                )

                results.append({
                    "learning_rate": lr,
                    "max_depth": depth,
                    "max_iter": n_iter,
                    "min_samples_leaf": leaf,
                    "cv_mean_acc": mean_acc,
                    "cv_std_acc": std_acc,
                })

results_df = pd.DataFrame(results)
display(results_df.sort_values("cv_mean_acc", ascending=False))

best_idx = results_df["cv_mean_acc"].idxmax()
best_row = results_df.loc[best_idx]
print("\nBest config:")
print(best_row)

lr=0.05, depth=None, iter=200, leaf=20 -> mean_acc=0.81054 (std=0.0080)
lr=0.05, depth=None, iter=200, leaf=50 -> mean_acc=0.80996 (std=0.0043)
lr=0.05, depth=None, iter=300, leaf=20 -> mean_acc=0.80755 (std=0.0062)
lr=0.05, depth=None, iter=300, leaf=50 -> mean_acc=0.81100 (std=0.0062)
lr=0.05, depth=6, iter=200, leaf=20 -> mean_acc=0.81169 (std=0.0081)
lr=0.05, depth=6, iter=200, leaf=50 -> mean_acc=0.81100 (std=0.0085)
lr=0.05, depth=6, iter=300, leaf=20 -> mean_acc=0.80996 (std=0.0065)
lr=0.05, depth=6, iter=300, leaf=50 -> mean_acc=0.81077 (std=0.0071)
lr=0.05, depth=8, iter=200, leaf=20 -> mean_acc=0.81008 (std=0.0084)
lr=0.05, depth=8, iter=200, leaf=50 -> mean_acc=0.81203 (std=0.0073)
lr=0.05, depth=8, iter=300, leaf=20 -> mean_acc=0.80801 (std=0.0064)
lr=0.05, depth=8, iter=300, leaf=50 -> mean_acc=0.80996 (std=0.0065)
lr=0.02, depth=None, iter=200, leaf=20 -> mean_acc=0.80663 (std=0.0057)
lr=0.02, depth=None, iter=200, leaf=50 -> mean_acc=0.80812 (std=0.0079)
lr=0.02, depth=N

Unnamed: 0,learning_rate,max_depth,max_iter,min_samples_leaf,cv_mean_acc,cv_std_acc
23,0.02,8.0,300,50,0.812032,0.00653
9,0.05,8.0,200,50,0.812032,0.007263
4,0.05,6.0,200,20,0.811688,0.008097
3,0.05,,300,50,0.810997,0.006178
5,0.05,6.0,200,50,0.810997,0.008533
14,0.02,,300,20,0.810881,0.006749
7,0.05,6.0,300,50,0.810767,0.007119
15,0.02,,300,50,0.810651,0.006464
0,0.05,,200,20,0.810536,0.008026
22,0.02,8.0,300,20,0.810076,0.00738



Best config:
learning_rate         0.020000
max_depth             8.000000
max_iter            300.000000
min_samples_leaf     50.000000
cv_mean_acc           0.812032
cv_std_acc            0.006530
Name: 23, dtype: float64


In [24]:
# 1. Get base per-passenger probabilities from your final HGB model
best_gb = HistGradientBoostingClassifier(
                        learning_rate=0.02,
                        max_depth=8,
                        max_iter=300,
                        min_samples_leaf=50,
                        random_state=42,
                    )

best_gb.fit(X_prepared, y)

test_proba = best_gb.predict_proba(X_test_prepared)[:, 1]

# 2. Attach PassengerId, GroupId, and prob into one DataFrame
test_meta = df_test[["PassengerId", "GroupId"]].copy()
test_meta["proba"] = test_proba

# 3. Compute average probability per group
test_meta["group_proba"] = test_meta.groupby("GroupId")["proba"].transform("mean")

# 4. Turn group-level probabilities into final predictions
#    Everyone in the same GroupId shares the same final label
test_meta["Transported"] = (test_meta["group_proba"] >= 0.5).astype(bool)

# 5. Build submission file
submission = test_meta[["PassengerId", "Transported"]]
submission.to_csv("submissions/05_hgb_group_consistent.csv", index=False)


In [25]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

groups = df_train["GroupId"]

gkf = GroupKFold(n_splits=5)

raw_accs = []
group_accs = []

for train_idx, val_idx in gkf.split(X_prepared, y, groups=groups):
    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    groups_va = groups.iloc[val_idx]

    gb = HistGradientBoostingClassifier(
        learning_rate=0.02,
        max_depth=8,
        max_iter=300,
        min_samples_leaf=50,
        random_state=42,
    )
    gb.fit(X_tr, y_tr)

    # base probabilities on validation set
    proba_va = gb.predict_proba(X_va)[:, 1]

    # raw 0.5-threshold predictions
    raw_pred = (proba_va >= 0.5).astype(int)
    raw_acc = accuracy_score(y_va, raw_pred)
    raw_accs.append(raw_acc)

    # group-consistent predictions on validation
    val_meta = pd.DataFrame({
        "GroupId": groups_va.values,
        "y": y_va.values,
        "proba": proba_va,
    })
    val_meta["group_proba"] = val_meta.groupby("GroupId")["proba"].transform("mean")
    group_pred = (val_meta["group_proba"] >= 0.5).astype(int)
    group_acc = accuracy_score(val_meta["y"], group_pred)
    group_accs.append(group_acc)

print("Raw preds  CV mean acc:", np.mean(raw_accs), "std:", np.std(raw_accs))
print("Group-cons CV mean acc:", np.mean(group_accs), "std:", np.std(group_accs))


Raw preds  CV mean acc: 0.8120319668394 std: 0.006530319968391462
Group-cons CV mean acc: 0.7538243014946489 std: 0.012987959442774463
