# Spaceship Titanic GRANDE

In [1]:
import os
# GPU settings are applied before the GPU-backed libraries below are imported.
os.environ['CUDA_VISIBLE_DEVICES']      = '0'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from GRANDE import GRANDE

## Fitting the GRANDE model

The model setup below is adapted from the usage example in the [GRANDE GitHub repository](https://github.com/s-marton/GRANDE).

In [2]:
# 1. Load the already-preprocessed train and test tables from disk
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_processed.csv', 'data/test_processed.csv')
)

In [3]:
# Inspect the column dtypes of the training frame (note the mix of bool, int64 and float64 columns).
print(df.dtypes)

PassengerId                   object
Age                          float64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
Transported                     bool
CabinNum                       int64
HomePlanet_Europa               bool
HomePlanet_Mars                 bool
CryoSleep_True                  bool
Destination_PSO J318.5-22       bool
Destination_TRAPPIST-1e         bool
VIP_True                        bool
Deck_B                          bool
Deck_C                          bool
Deck_D                          bool
Deck_E                          bool
Deck_F                          bool
Deck_G                          bool
Deck_T                          bool
Side_S                          bool
dtype: object


In [4]:
# Inspect the column dtypes of the test frame; it should match the training frame minus the target column.
print(test_df.dtypes)

PassengerId                   object
Age                          float64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
CabinNum                       int64
HomePlanet_Europa               bool
HomePlanet_Mars                 bool
CryoSleep_True                  bool
Destination_PSO J318.5-22       bool
Destination_TRAPPIST-1e         bool
VIP_True                        bool
Deck_B                          bool
Deck_C                          bool
Deck_D                          bool
Deck_E                          bool
Deck_F                          bool
Deck_G                          bool
Deck_T                          bool
Side_S                          bool
dtype: object


In [6]:
# Single seed shared by the data split and the model so the run is reproducible.
SEED = 42

# 1b. Build the feature matrix and the target.
# The frame mixes bool, int64 and float64 columns, so a bare `.values` would
# yield an object-dtype array. Cast every feature to float up front so the model
# receives a purely numeric matrix (booleans become 0.0 / 1.0).
X = df.drop(columns=['PassengerId', 'Transported']).astype(float)
y = df['Transported'].astype(int)

# 2. Train/Validation split (stratified on the target so both parts keep the class balance)
X_train, X_valid, y_train, y_valid = train_test_split(
    X.values, y.values,
    test_size=0.2,
    random_state=SEED,
    stratify=y.values
)

# No column is declared categorical: the categorical variables arrive already
# one-hot encoded (HomePlanet_*, Deck_*, ...), so every feature is treated as numeric.
categorical_feature_indices = []

# 3. Model hyperparameters
params = {
        'depth': 5, # tree depth
        'n_estimators': 258, # number of estimators / trees

        'learning_rate_weights': 0.005, # learning rate for leaf weights
        'learning_rate_index': 0.01, # learning rate for split indices
        'learning_rate_values': 0.01, # learning rate for split values
        'learning_rate_leaf': 0.01, # learning rate for leafs (logits)

        'optimizer': 'adam', # optimizer
        'cosine_decay_steps': 0, # decay steps for lr schedule (CosineDecayRestarts)

        'loss': 'crossentropy', # loss function (default 'crossentropy' for binary & multi-class classification and 'mse' for regression)
        'focal_loss': False, # use focal loss {True, False}
        'temperature': 0.0, # temperature for stochastic re-weighted GD (0.0, 1.0)

        'from_logits': True, # use logits for weighting {True, False}
        'use_class_weights': True, # use class weights for training {True, False}

        'dropout': 0.0, # dropout rate (here, dropout randomly disables individual estimators of the ensemble during training)

        'selected_variables': 0.8, # feature subset percentage (0.0, 1.0)
        'data_subset_fraction': 1.0, # data subset percentage (0.0, 1.0)
}

# 4. Training arguments
args = {
    'epochs': 100, # maximum number of training epochs
    'early_stopping_epochs': 25, # patience for early stopping (best weights are restored)
    'batch_size': 64,  # batch size for training

    'cat_idx': categorical_feature_indices, # put list of categorical indices
    'objective': 'binary', # objective / task {'binary', 'classification', 'regression'}

    'random_seed': SEED,
    'verbose': 1,
}

# 5. Fit, using the validation split for early stopping
model_grande = GRANDE(params=params, args=args)

model_grande.fit(X_train=X_train,
          y_train=y_train,
          X_val=X_valid,
          y_val=y_valid)


Epoch 1/100


  X_train[num_columns] = X_train[num_columns].fillna(self.mean_train_num)
  X_val[num_columns] = X_val[num_columns].fillna(self.mean_train_num)


[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - loss: 0.6544 - val_loss: 0.5561
Epoch 2/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.5428 - val_loss: 0.4883
Epoch 3/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4904 - val_loss: 0.4499
Epoch 4/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4625 - val_loss: 0.4294
Epoch 5/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4471 - val_loss: 0.4193
Epoch 6/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4380 - val_loss: 0.4079
Epoch 7/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4300 - val_loss: 0.4033
Epoch 8/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 0.4247 - val_loss: 0.3989
Epoch 9/100
[1m109/109[0m [32m━━━

## Prediction

In [None]:
# Evaluate the fitted model on the hold-out validation split.
# predict() is indexed below as a 2-D array with one column per class, so the
# predicted label is the column with the highest score.
y_proba = model_grande.predict(X_valid)
y_pred = np.argmax(y_proba, axis=1)

accuracy = accuracy_score(y_valid, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8062


  X[self.num_columns] = X[self.num_columns].fillna(self.mean_train_num)


In [12]:
# Load the preprocessed test set
test_df = pd.read_csv('data/test_processed.csv')

# Select the training feature columns by name so the column order is guaranteed
# to match what the model was fitted on (a missing column raises KeyError instead
# of silently shifting features), and cast to float so the mixed bool/numeric
# frame does not turn into an object-dtype array.
X_test = test_df[X.columns].astype(float).values

# Generate predictions
preds = model_grande.predict(X_test)

# For binary, preds[:,1] is the prob. of True; threshold it at 0.5
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': preds[:, 1] > 0.5,
})
submission.to_csv('submission_grande.csv', index=False)

  X[self.num_columns] = X[self.num_columns].fillna(self.mean_train_num)
