In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [2]:
## add TotalSpend numeric feature and NoSpend numeric feature

for df in (df_train, df_test):
    # Sum all the spending columns
    df["TotalSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )

    # Optional: flag for passengers who spent nothing
    df["NoSpend"] = (df["TotalSpend"] == 0).astype(int)


In [3]:
# split cabin feature into three, for deck, floor and side

for df in (df_train, df_test):
    cabin_split = df["Cabin"].str.split("/", expand=True)

    df["CabinDeck"] = cabin_split[0]                      # e.g. "B"
    df["CabinNum"] = pd.to_numeric(cabin_split[1], errors="coerce")  # e.g. 45
    df["CabinSide"] = cabin_split[2]                      # e.g. "P"

In [5]:
## exploit groups

for df in (df_train, df_test):
    df["GroupId"] = df["PassengerId"].str.split("_").str[0]
    group_sizes = df.groupby("GroupId")["GroupId"].transform("count")
    df["GroupSize"] = group_sizes
    df["IsAlone"] = (df["GroupSize"] == 1).astype(int)

In [6]:
df_train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,TotalSpend,NoSpend,CabinDeck,CabinNum,CabinSide,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Maham Ofracculy,False,0.0,1,B,0.0,P,1,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Juanna Vines,True,736.0,0,F,0.0,S,2,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Altark Susent,False,10383.0,0,A,0.0,S,3,2,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Solam Susent,False,5176.0,0,A,0.0,S,3,2,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Willy Santantines,True,1091.0,0,F,1.0,S,4,1,1


In [10]:
df_train.info()
df_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  TotalSpend    7785 non-null   float64
 15  NoSpend       8693 non-null   int64  
 16  CabinDeck     8494 non-null   object 
 17  CabinNum      8494 non-null   float64
 18  CabinSide     8494 non-null 

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,NoSpend,CabinNum,GroupSize,IsAlone
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,7785.0,8693.0,8494.0,8693.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,1484.601541,0.373519,600.367671,2.035546,0.552744
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,2845.288241,0.483766,511.867226,1.596347,0.497239
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,167.25,1.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,736.0,0.0,427.0,1.0,1.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,1486.0,1.0,999.0,3.0,1.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,35987.0,1.0,1894.0,8.0,1.0


In [7]:
## BELOW WE WILL DEFINE THE COLUMNS FROM THE DATA SET 
## AND CREATE FIT_PREPROCESSING
target_col = "Transported"
drop_cols = ["PassengerId", "Name", "Cabin"]
numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend", "NoSpend", "CabinNum", "GroupSize", "IsAlone"]
categorical_cols = ["HomePlanet", "CryoSleep", "CabinDeck", "CabinSide", "Destination", "VIP"]

def fit_preprocessing (df_train):
    y = df_train[target_col].astype(int)
    X = df_train.drop(columns=drop_cols + [target_col])

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    numeric_medians = X_numeric.median()
    cat_modes = X_cat.mode().iloc[0]
    
    ## add in median for NaNs on numerical cols
    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: not null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: not null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)  
    
    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)
    print(X_prepared.shape)

    return X_prepared, numeric_medians, cat_modes, y, X_prepared.columns

In [8]:
def apply_preprocessing(df, numeric_medians, cat_modes, train_cols):
    X = df.drop(columns=drop_cols)

    X_numeric = X[numeric_cols].copy()
    X_cat = X[categorical_cols].copy()

    for col in numeric_cols:
        X_numeric[col] = X_numeric[col].fillna(numeric_medians[col])

    for col in categorical_cols:
        X_cat[col] = X_cat[col].fillna(cat_modes[col])

    if X_numeric.isna().sum().sum() != 0:
        print("Error: null values in numeric cols still unfilled")
        return

    if X_cat.isna().sum().sum() != 0:
        print("Error: null values still in cat. calls still unfilled")
        return

    X_cat_encoded = pd.get_dummies(X_cat, drop_first=False)

    X_prepared = pd.concat([X_numeric, X_cat_encoded], axis=1)

    X_prepared = X_prepared.reindex(columns=train_cols, fill_value=0)

    return X_prepared

In [9]:
X_prepared, numeric_medians, cat_modes, y, train_cols = fit_preprocessing(df_train)
X_test_prepared = apply_preprocessing(df_test, numeric_medians, cat_modes, train_cols)

(8693, 29)


  X_cat[col] = X_cat[col].fillna(cat_modes[col])
  X_cat[col] = X_cat[col].fillna(cat_modes[col])


In [11]:
X_prepared.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpend,NoSpend,CabinNum,GroupSize,...,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1,...,False,False,False,False,False,True,False,False,False,True
1,24.0,109.0,9.0,25.0,549.0,44.0,736.0,0,0.0,1,...,False,False,True,False,False,False,True,False,False,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,0,0.0,2,...,False,False,False,False,False,False,True,False,False,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,0,0.0,2,...,False,False,False,False,False,False,True,False,False,True
4,16.0,303.0,70.0,151.0,565.0,2.0,1091.0,0,1.0,1,...,False,False,True,False,False,False,True,False,False,True


In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X_prepared, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
## HERE WE GET THE BASE MODEL SCORE OVER WHICH WE WANT TO SEE IMPROVEMENTS

y_val_pred = rf.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("Validition accuracy:", val_acc)

Validition accuracy: 0.8039102932719954


Below I'll run experiments on the models and see what performs the best.

In [46]:
rf_v2 = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf_v2.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
y_val_pred_v2 = rf_v2.predict(X_val)
val_acc_v2 = accuracy_score(y_val, y_val_pred_v2)
print("Validition accuracy:", val_acc_v2)

Validition accuracy: 0.7952846463484762


After manual testing with various max_depth, min_leaf_samples, and nodes up to n = 1000, the original model is still the one that performs best. This will be our baseline moving forward still.

In [16]:
from sklearn.ensemble import HistGradientBoostingClassifier

gb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=8,   # or e.g. 6
    max_iter=400,     # number of boosting rounds
    random_state=42
)

gb.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.01
,max_iter,400
,max_leaf_nodes,31
,max_depth,8
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [20]:
y_val_pred = gb.predict(X_val)
print("Val accuracy:", accuracy_score(y_val, y_val_pred))

Val accuracy: 0.8027602070155262


In [17]:

# 1. Define the grid you want to search over
learning_rates = [0.05, 0.02, 0.01]
max_depths = [None, 6, 8]
max_iters = [200, 300]
min_samples_leafs = [20, 50]

results = []

# 2. Loop over all combinations
for lr in learning_rates:
    for depth in max_depths:
        for n_iter in max_iters:
            for leaf in min_samples_leafs:
                gb = HistGradientBoostingClassifier(
                    learning_rate=lr,
                    max_depth=depth,
                    max_iter=n_iter,
                    min_samples_leaf=leaf,
                    random_state=42,
                )

                gb.fit(X_train, y_train)
                y_val_pred = gb.predict(X_val)
                acc = accuracy_score(y_val, y_val_pred)

                print(
                    f"lr={lr}, depth={depth}, iter={n_iter}, leaf={leaf} "
                    f"-> val_acc={acc:.5f}"
                )

                results.append({
                    "learning_rate": lr,
                    "max_depth": depth,
                    "max_iter": n_iter,
                    "min_samples_leaf": leaf,
                    "val_acc": acc,
                    "model": gb,  # keep the fitted model if you want to reuse best
                })

# 3. Turn results into a DataFrame for easy sorting / inspection
results_df = pd.DataFrame([
    {k: v for k, v in r.items() if k != "model"}  # drop model objects
    for r in results
])

display(results_df.sort_values("val_acc", ascending=False))

# 4. Grab the best model + its params
best_idx = results_df["val_acc"].idxmax()
best_row = results_df.loc[best_idx]
best_model = results[best_idx]["model"]

print("\nBest config:")
print(best_row)

lr=0.05, depth=None, iter=200, leaf=20 -> val_acc=0.81139
lr=0.05, depth=None, iter=200, leaf=50 -> val_acc=0.81541
lr=0.05, depth=None, iter=300, leaf=20 -> val_acc=0.82289
lr=0.05, depth=None, iter=300, leaf=50 -> val_acc=0.81484
lr=0.05, depth=6, iter=200, leaf=20 -> val_acc=0.81139
lr=0.05, depth=6, iter=200, leaf=50 -> val_acc=0.81139
lr=0.05, depth=6, iter=300, leaf=20 -> val_acc=0.81254
lr=0.05, depth=6, iter=300, leaf=50 -> val_acc=0.80909
lr=0.05, depth=8, iter=200, leaf=20 -> val_acc=0.81599
lr=0.05, depth=8, iter=200, leaf=50 -> val_acc=0.80966
lr=0.05, depth=8, iter=300, leaf=20 -> val_acc=0.81541
lr=0.05, depth=8, iter=300, leaf=50 -> val_acc=0.81369
lr=0.02, depth=None, iter=200, leaf=20 -> val_acc=0.80161
lr=0.02, depth=None, iter=200, leaf=50 -> val_acc=0.79816
lr=0.02, depth=None, iter=300, leaf=20 -> val_acc=0.80161
lr=0.02, depth=None, iter=300, leaf=50 -> val_acc=0.80794
lr=0.02, depth=6, iter=200, leaf=20 -> val_acc=0.80794
lr=0.02, depth=6, iter=200, leaf=50 -> va

Unnamed: 0,learning_rate,max_depth,max_iter,min_samples_leaf,val_acc
2,0.05,,300,20,0.822887
8,0.05,8.0,200,20,0.815986
10,0.05,8.0,300,20,0.815411
1,0.05,,200,50,0.815411
3,0.05,,300,50,0.814836
11,0.05,8.0,300,50,0.813686
6,0.05,6.0,300,20,0.812536
18,0.02,6.0,300,20,0.811961
19,0.02,6.0,300,50,0.811386
17,0.02,6.0,200,50,0.811386



Best config:
learning_rate         0.050000
max_depth                  NaN
max_iter            300.000000
min_samples_leaf     20.000000
val_acc               0.822887
Name: 2, dtype: float64


In [18]:
gb_final = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=None,   # or e.g. 6
    max_iter=300,     # number of boosting rounds
    min_samples_leaf=20,
    random_state=42
)

gb_final.fit(X_prepared, y)

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,max_iter,300
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [19]:
test_preds = gb_final.predict(X_test_prepared)

submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": test_preds.astype(bool),
})

submission.to_csv("03_submission_hgb_tuned_v2.csv", index=False)

In [20]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accs = []
for train_idx, val_idx in skf.split(X_prepared, y):
    X_tr, X_va = X_prepared.iloc[train_idx], X_prepared.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    gb = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_iter=300,
        min_samples_leaf=20,
        random_state=42,
    )
    gb.fit(X_tr, y_tr)
    y_pred = gb.predict(X_va)
    accs.append(accuracy_score(y_va, y_pred))

print("CV mean acc:", np.mean(accs))
print("CV std acc:", np.std(accs))


CV mean acc: 0.8103062418979465
CV std acc: 0.0063640079549921215
