# Spaceship Titanic: TABM by Elias Ruud Aronsen

Project: [Spaceship Titanic](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In this notebook, I will train a TABM model for the individual part of the project.

TABM Github repository can be found here: [TABM GitHub](https://github.com/yandex-research/tabm)

In [6]:
import pandas as pd
import random as rand
import torch
from torch import nn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from src.tabm_reference import Model # This is our TABM model
from torch.utils.data import DataLoader, TensorDataset

# Read in preprocessed data, shown in 1-EDA-and-preprocessing.ipynb
train = pd.read_csv('data/processed_train.csv')
test = pd.read_csv('data/processed_test.csv')

df_Y = train['Transported']
df_X = train.drop(columns=['Transported'])

## TABM

In [2]:
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 46 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   RoomService                8693 non-null   float64
 1   FoodCourt                  8693 non-null   float64
 2   ShoppingMall               8693 non-null   float64
 3   Spa                        8693 non-null   float64
 4   VRDeck                     8693 non-null   float64
 5   Group                      8693 non-null   int64  
 6   Id                         8693 non-null   int64  
 7   Num                        8693 non-null   float64
 8   FamilySize                 8693 non-null   float64
 9   GroupSize                  8693 non-null   int64  
 10  HomePlanet_Earth           8693 non-null   float64
 11  HomePlanet_Europa          8693 non-null   float64
 12  HomePlanet_Mars            8693 non-null   float64
 13  HomePlanet_nan             8693 non-null   float

In [18]:
#### MODEL DEFINITION AND SETUP

train_x, val_x, train_y, val_y = train_test_split( df_X, df_Y, test_size=0.2, random_state=42, stratify=df_Y) # split our dataset

# we turn data int tensors as we will be using pytorch.
train_x_tensor = torch.tensor(train_x.values, dtype=torch.float32)
val_x_tensor = torch.tensor(val_x.values, dtype=torch.float32)
train_y_tensor = torch.tensor(train_y.values, dtype=torch.long)
val_y_tensor = torch.tensor(val_y.values, dtype=torch.long)

# create the tensor datasets
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
val_dataset = TensorDataset(val_x_tensor, val_y_tensor)

# create dataloaders for batches, speeds up training, self regularization, more stable updates etc.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512)


### Model setup
n_num_features = train_x_tensor.shape[1]
cat_cardinalities = []  # no categorical features left (everything is one-hot or numerical)
n_classes = 2 # binary

def setup(k = 5, n_blocks= 5, d_block= 20, dropout=0.1, learning_rate=1e-3, weight_decay= 1e-4, activation='RELU'):

    model = Model(
        n_num_features=n_num_features,
        cat_cardinalities=cat_cardinalities,
        n_classes=n_classes,
        backbone=dict( # the structure of the underlying MLPs
            type='MLP',
            n_blocks=n_blocks,
            d_block=d_block,
            dropout=dropout,
            activation=activation,),
        bins=None,
        num_embeddings=None,
        arch_type='tabm',
        k=k, # number of ensemble models
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_function = nn.CrossEntropyLoss() # cross entropy is standard for binary classification

    return model, optimizer, loss_function


In [5]:
##### Training the TABM MODEL
def run_tabm(k=5, n_blocks=5, d_block=20, dropout=0.1, learning_rate=1e-3, weight_decay=1e-4, activation='ReLU', n_epochs=200, patience=20):
    # define our model optimizer and loss function parameters
    model, optimizer, loss_function = setup(k=k, n_blocks=n_blocks, d_block=d_block, dropout=dropout, learning_rate=learning_rate, weight_decay=weight_decay, activation=activation)

    best_val_loss = float('inf') # tracks current best validation loss reached
    patience_counter = 0 # goes up when the model steps into a worse val loss.
    
    #Training loop with validation monitoring and early stopping.
    for epoch in range(1, n_epochs + 1):
        #Training
        model.train()
        train_losses = []
        train_preds_all = []
        train_targets_all = []

        for xb, yb in train_loader: # loading in batches
            optimizer.zero_grad()
            outputs = model(xb, None) # forward propogation
            loss = loss_function(outputs.mean(dim=1), yb) # calculate loss
            loss.backward() # backward propegation
            optimizer.step()

            # adding batch train loss to the batch total
            train_losses.append(loss.item())
            train_preds_all.append(outputs.mean(dim=1).argmax(dim=1))
            train_targets_all.append(yb)
            
        # calculating accuracy for epoch
        train_preds_all = torch.cat(train_preds_all)
        train_targets_all = torch.cat(train_targets_all)
        train_acc = accuracy_score(train_targets_all.cpu(), train_preds_all.cpu())

        # Validation
        model.eval()
        val_losses = []
        val_preds_all = []
        val_targets_all = []

        with torch.no_grad():
            for xb, yb in val_loader:
                val_outputs = model(xb, None)
                val_loss = loss_function(val_outputs.mean(dim=1), yb)

                val_losses.append(val_loss.item())
                val_preds_all.append(val_outputs.mean(dim=1).argmax(dim=1))
                val_targets_all.append(yb)

        # calculating validation accuracy
        val_preds_all = torch.cat(val_preds_all)
        val_targets_all = torch.cat(val_targets_all)
        val_acc = accuracy_score(val_targets_all.cpu(), val_preds_all.cpu())
        val_loss_epoch = sum(val_losses) / len(val_losses)

        print(f"Epoch {epoch}: Train Loss={sum(train_losses)/len(train_losses):.4f}, Train Acc={train_acc:.4f} ||| Val Loss={val_loss_epoch:.4f}, Val Acc={val_acc:.4f}")

        # early stopping check
        if val_loss_epoch < best_val_loss:
            best_val_loss = val_loss_epoch
            patience_counter = 0 # we reset counter each time we find better
            best_model_state = model.state_dict() # save state
        else: # increase patience if not better
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch}")
                break
            
    # Load back the best found model
    model.load_state_dict(best_model_state)
    return model
    

### Tuning

In [11]:
## The parameter space based recommendations from: https://github.com/yandex-research/tabm/tree/main
param_space = {
    'k': [4, 8, 12],
    'n_blocks': [2, 3, 4],
    'd_block': [128, 256, 512],
    'dropout': [0.0, 0.1, 0.2],
    'learning_rate': [1e-3, 5e-4, 3e-4],
    'weight_decay': [0.0, 1e-4, 1e-3],
    'activation': ['ReLU', 'GELU']
}

In [12]:
n_trials = 30  # how many random combinations to try
results = []

for trial in range(n_trials):
    # we randomly sample one value for each parameter
    params = {key: rand.choice(values) for key, values in param_space.items()}
    print(f"\n Trial {trial+1} with params: {params}")

    # train the model with the chosen parameters
    model = run_tabm(**params)

    # after we train, we evaluate the final val accuracy
    model.eval()  # set model to eval mode. turns off dropout etc.

    val_preds = []    # predictions
    val_targets = []  # targets

    with torch.no_grad():  # disables gradient tracking, its faster
        for xb, yb in val_loader:  # go over val batches
            outputs = model(xb, None)  # forward pass
            preds = outputs.mean(dim=1).argmax(dim=1)  # ensemble average, then pick predicted class
            val_preds.append(preds)    # predictions
            val_targets.append(yb)     # targets

    val_preds = torch.cat(val_preds)
    val_targets = torch.cat(val_targets)
    
    val_acc = accuracy_score(val_targets.cpu(), val_preds.cpu()) # accuracy calcuulation
    print(f"Validation Accuracy: {val_acc:.4f}")

    # saves the result
    results.append((params, val_acc))


 Trial 1 with params: {'k': 4, 'n_blocks': 4, 'd_block': 256, 'dropout': 0.2, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'activation': 'ReLU'}
Epoch 1: Train Loss=1.4369, Train Acc=0.6372 ||| Val Loss=0.5893, Val Acc=0.7027
Epoch 2: Train Loss=0.6905, Train Acc=0.6800 ||| Val Loss=0.5592, Val Acc=0.7188
Epoch 3: Train Loss=0.6116, Train Acc=0.7039 ||| Val Loss=0.5462, Val Acc=0.7269
Epoch 4: Train Loss=0.5688, Train Acc=0.7274 ||| Val Loss=0.5486, Val Acc=0.7286
Epoch 5: Train Loss=0.5560, Train Acc=0.7383 ||| Val Loss=0.5216, Val Acc=0.7890
Epoch 6: Train Loss=0.5423, Train Acc=0.7473 ||| Val Loss=0.5195, Val Acc=0.7849
Epoch 7: Train Loss=0.5346, Train Acc=0.7620 ||| Val Loss=0.5084, Val Acc=0.7895
Epoch 8: Train Loss=0.5244, Train Acc=0.7680 ||| Val Loss=0.5043, Val Acc=0.7930
Epoch 9: Train Loss=0.5179, Train Acc=0.7656 ||| Val Loss=0.4965, Val Acc=0.7936
Epoch 10: Train Loss=0.5183, Train Acc=0.7637 ||| Val Loss=0.4974, Val Acc=0.7936
Epoch 11: Train Loss=0.5141, Train Acc=0.7

In [16]:
# sort the results based on performance and show best parameter combinations
best_params = sorted(results, key=lambda x: x[1], reverse=True)

print("\nTop 5 parameter combinations:")
for i, (params, acc) in enumerate(best_params[:5], start=1):
    print(f"{i}. Val Accuracy: {acc:.4f} | Params: {params}")


Top 5 parameter combinations:
1. Val Accuracy: 0.8051 | Params: {'k': 4, 'n_blocks': 3, 'd_block': 256, 'dropout': 0.2, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'activation': 'ReLU'}
2. Val Accuracy: 0.8005 | Params: {'k': 4, 'n_blocks': 4, 'd_block': 256, 'dropout': 0.2, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'activation': 'ReLU'}
3. Val Accuracy: 0.7993 | Params: {'k': 8, 'n_blocks': 3, 'd_block': 128, 'dropout': 0.2, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'activation': 'GELU'}
4. Val Accuracy: 0.7987 | Params: {'k': 8, 'n_blocks': 4, 'd_block': 128, 'dropout': 0.1, 'learning_rate': 0.0005, 'weight_decay': 0.0, 'activation': 'ReLU'}
5. Val Accuracy: 0.7987 | Params: {'k': 8, 'n_blocks': 3, 'd_block': 256, 'dropout': 0.2, 'learning_rate': 0.0003, 'weight_decay': 0.001, 'activation': 'ReLU'}


## Results

- After 30 trials the best accuracy attained was 0.8051, which is on par with our other ensemble models like CatBoost and XGBoost, but not quite as good as I expected, but we still have not trained the model on the full dataset which will likely increase it's accuracy.

From here we train this model on the full training set, generate our predictions on the givne test data and submit.

In [19]:
best = best_params[0][0] # our best found parameters
print("Our parameters: ", best)


X_tensor = torch.tensor(df_X.values, dtype=torch.float32)
Y_tensor = torch.tensor(df_Y.values, dtype=torch.long)

train_dataset = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) # calling train loader since run_tabm uses it

final_model = run_tabm(**best)

Our parameters:  {'k': 4, 'n_blocks': 3, 'd_block': 256, 'dropout': 0.2, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'activation': 'ReLU'}
Epoch 1: Train Loss=3.3993, Train Acc=0.6260 ||| Val Loss=0.7554, Val Acc=0.7223
Epoch 2: Train Loss=1.2636, Train Acc=0.6481 ||| Val Loss=0.5628, Val Acc=0.7355
Epoch 3: Train Loss=0.8432, Train Acc=0.6598 ||| Val Loss=0.5815, Val Acc=0.6561
Epoch 4: Train Loss=0.6990, Train Acc=0.6776 ||| Val Loss=0.5699, Val Acc=0.7734
Epoch 5: Train Loss=0.6288, Train Acc=0.6929 ||| Val Loss=0.5689, Val Acc=0.7861
Epoch 6: Train Loss=0.5947, Train Acc=0.7080 ||| Val Loss=0.5414, Val Acc=0.7493
Epoch 7: Train Loss=0.5816, Train Acc=0.7191 ||| Val Loss=0.5356, Val Acc=0.7878
Epoch 8: Train Loss=0.5659, Train Acc=0.7245 ||| Val Loss=0.5255, Val Acc=0.7878
Epoch 9: Train Loss=0.5561, Train Acc=0.7320 ||| Val Loss=0.5161, Val Acc=0.7849
Epoch 10: Train Loss=0.5454, Train Acc=0.7452 ||| Val Loss=0.5071, Val Acc=0.7844
Epoch 11: Train Loss=0.5415, Train Acc=0.7445 ||

#### Making the submission

In [20]:


# convert test to tensor so that we can feed it into model
test_x_tensor = torch.tensor(test.values, dtype=torch.float32)

# Predicting
final_model.eval()
with torch.no_grad():
    test_outputs = final_model(test_x_tensor, None)  # generate predictions for the k models
    test_preds = test_outputs.mean(dim=1).argmax(dim=1)  # we take the average prediction over the k ensemble models

# we get back the passengerIds (removed in encoding)
test_ids = pd.read_csv('data/test.csv')['PassengerId']

#  setup the submission df
final_submission_tabm = pd.DataFrame({
    'PassengerId': test_ids,
    'Transported': test_preds.cpu().numpy().astype(bool)  # convert predictions to bool True/False
})

# saves the submission df to csv. Ready for kaggle.
final_submission_tabm.to_csv('submissions/final_submission_tabm.csv', index=False)

- The final score from the leaderboard for the TabM model was 0.79728.