In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import optuna

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold

import optuna
from optuna.samplers import TPESampler

In [2]:
%%capture
!pip install pytorch_tabnet

In [3]:
from pytorch_tabnet.tab_model import TabNetRegressor

## Read data

In [4]:
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
train.drop(columns=['id'], axis=1, inplace=True)
test.drop(columns=['id'], axis=1, inplace=True)

## Preprocess data

In [5]:
encoder = LabelEncoder()
scaler = MinMaxScaler()

train['Sex'] = encoder.fit_transform(train['Sex'])
test['Sex'] = encoder.transform(test['Sex'])

cat_cols = ['Sex']
target = 'Rings'
continuous_cols = [col for col in train.columns if col not in cat_cols + [target]]

train[continuous_cols] = scaler.fit_transform(train[continuous_cols])
test[continuous_cols] = scaler.transform(test[continuous_cols])

X = train.drop(columns=['Rings'], axis=1).values
y = train['Rings'].values.reshape(-1, 1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.27, random_state=24)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create TensorDataset instances for training and validation
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
batch_size = 97
# Create DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## Define network

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [10]:
params = {'n_d': 13, 'n_a': 16, 
          'n_steps': 7, 
          'gamma': 1.7530486543675008, 
          'n_independent': 2, 'n_shared': 1, 
          'momentum': 0.18408553079911108, 
          'clip_value': 0.8420021070703481, 
          'lambda_sparse': 0.0005891219059313855}
model = TabNetRegressor(**params)

## Define training loop

In [13]:
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    patience=3,  # Adjust patience as needed
    max_epochs=20,  # Set maximum number of epochs
)

epoch 0  | loss: 8.64175 | val_0_mse: 8.6619  |  0:00:05s
epoch 1  | loss: 4.5031  | val_0_mse: 8.35007 |  0:00:11s
epoch 2  | loss: 4.17191 | val_0_mse: 6.37626 |  0:00:17s
epoch 3  | loss: 4.04824 | val_0_mse: 6.29605 |  0:00:23s
epoch 4  | loss: 3.90284 | val_0_mse: 5.26967 |  0:00:29s
epoch 5  | loss: 3.87069 | val_0_mse: 5.03647 |  0:00:35s
epoch 6  | loss: 3.80008 | val_0_mse: 4.07813 |  0:00:41s
epoch 7  | loss: 3.79443 | val_0_mse: 4.26017 |  0:00:47s
epoch 8  | loss: 3.73569 | val_0_mse: 3.84917 |  0:00:53s
epoch 9  | loss: 3.79702 | val_0_mse: 3.68383 |  0:00:58s
epoch 10 | loss: 3.78948 | val_0_mse: 3.68223 |  0:01:04s
epoch 11 | loss: 3.71858 | val_0_mse: 3.59184 |  0:01:10s
epoch 12 | loss: 3.69564 | val_0_mse: 3.63725 |  0:01:16s
epoch 13 | loss: 3.69753 | val_0_mse: 3.68556 |  0:01:22s
epoch 14 | loss: 3.66446 | val_0_mse: 3.4993  |  0:01:28s
epoch 15 | loss: 3.68347 | val_0_mse: 3.64818 |  0:01:34s
epoch 16 | loss: 3.68848 | val_0_mse: 3.64977 |  0:01:40s
epoch 17 | los



## Inference

In [17]:
# Inference

# Perform inference on the test data
test_preds = model.predict(test.values)

# Assuming you want to convert the predictions to a NumPy array
test_preds_np = test_preds
test_preds_np

array([[ 9.932756],
       [ 9.775032],
       [10.297734],
       ...,
       [12.535048],
       [13.190065],
       [ 7.954035]], dtype=float32)

## Hyperparameter tune 

In [None]:
def train_model(X_train_tensor, y_train_tensor, params):
    """
    Train the TabNet model with the given hyperparameters.
    
    Args:
        X_train_tensor (torch.Tensor): Input features tensor.
        y_train_tensor (torch.Tensor): Target tensor.
        params (dict): Hyperparameters for the model.
        
    Returns:
        float: Average validation RMSE across folds.
    """
    # Unpack the hyperparameters
    input_dim = X_train_tensor.shape[1]
    n_d = params['n_d']
    n_a = params['n_a']
    n_steps = params['n_steps']
    gamma = params['gamma']
    cat_idxs = []
    cat_dims = []
    cat_emb_dim = []
    n_independent = params['n_independent']
    n_shared = params['n_shared']
    momentum = params['momentum']
    clip_value = params['clip_value']
    lambda_sparse = params['lambda_sparse']
    num_epochs = params['num_epochs']
    batch_size = params['batch_size']
    patience = params['patience']
    virtual_batch_size = params['virtual_batch_size']
    
    # Create the TabNet model
    model = TabNetRegressor(
        input_dim=input_dim,
        output_dim=1,
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=cat_emb_dim,
        n_independent=n_independent,
        n_shared=n_shared,
        momentum=momentum,
        clip_value=clip_value,
        lambda_sparse=lambda_sparse,
        device_name='cuda'
    )
    
    # Create KFold cross-validator
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    best_val_rmse_folds = []
    
    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_tensor)):
        print(f"Fold [{fold+1}/3]")
        
        # Train the model
        model.fit(
            X_train=X_train_tensor[train_idx].numpy(),  # Convert to NumPy array
            y_train=y_train_tensor[train_idx].numpy(),  # Convert to NumPy array
            eval_set=[(X_train_tensor[val_idx].numpy(), y_train_tensor[val_idx].numpy())],  # Convert to NumPy array
            max_epochs=num_epochs,
            patience=patience,
            batch_size=batch_size,
            virtual_batch_size=virtual_batch_size,
            eval_metric=['rmse']
        )
        
        # Make predictions on the validation set
        val_predictions = model.predict(X_train_tensor[val_idx].numpy())  # Convert to NumPy array
        
        # Calculate validation RMSE
        val_rmse = np.sqrt(mean_squared_error(y_train_tensor[val_idx].numpy(), val_predictions.squeeze()))  # Convert to NumPy array
        print(f"Validation RMSE: {val_rmse:.4f}")
        
        # Store the validation RMSE for the current fold
        best_val_rmse_folds.append(val_rmse)
    
    # Calculate the average validation RMSE across all folds
    avg_val_rmse = np.mean(best_val_rmse_folds)
    print(f"Average Validation RMSE across folds: {avg_val_rmse:.4f}")
    
    return avg_val_rmse

def objective(trial):
    """
    Objective function for hyperparameter optimization.
    
    Args:
        trial (optuna.trial.Trial): Optuna trial object.
        
    Returns:
        float: Average validation RMSE.
    """
    params = {
        'n_d': trial.suggest_int('n_d', 8, 64),
        'n_a': trial.suggest_int('n_a', 8, 64),
        'n_steps': trial.suggest_int('n_steps', 3, 10),
        'gamma': trial.suggest_float('gamma', 1.0, 2.0),
        'n_independent': trial.suggest_int('n_independent', 1, 5),
        'n_shared': trial.suggest_int('n_shared', 1, 5),
        'momentum': trial.suggest_float('momentum', 0.01, 0.4),
        'clip_value': trial.suggest_float('clip_value', 0.01, 2.0),
        'lambda_sparse': trial.suggest_float('lambda_sparse', 0.0001, 0.01, log=True),
        'num_epochs': 200,
        'batch_size': trial.suggest_int('batch_size', 16, 1024),
        'patience': 10,
        'virtual_batch_size': trial.suggest_int('virtual_batch_size', 16, 128)
    }
    
    avg_val_rmse = train_model(X_train_tensor, y_train_tensor, params)  # Pass X_train_tensor and y_train_tensor
    return avg_val_rmse

# Create an Optuna study
study = optuna.create_study(direction='minimize', sampler=TPESampler(multivariate=True))

# Optimize the hyperparameters
study.optimize(objective, timeout=3600 * 10)

# Print the best hyperparameters and validation RMSE
print("Best hyperparameters:", study.best_params)
print("Best validation RMSE:", study.best_value)


[I 2024-04-17 12:08:26,419] A new study created in memory with name: no-name-7dc7740d-bc9e-46cc-8f20-4b7261a967bf


Fold [1/3]
epoch 0  | loss: 5.99872 | val_0_rmse: 2.418560028076172|  0:00:21s
epoch 1  | loss: 4.46499 | val_0_rmse: 1.9803099632263184|  0:00:42s
epoch 2  | loss: 4.30144 | val_0_rmse: 2.0053000450134277|  0:01:03s
epoch 3  | loss: 4.12455 | val_0_rmse: 1.9707200527191162|  0:01:24s
epoch 4  | loss: 4.17529 | val_0_rmse: 1.938920021057129|  0:01:45s
epoch 5  | loss: 4.07522 | val_0_rmse: 1.9376399517059326|  0:02:06s
epoch 6  | loss: 4.04532 | val_0_rmse: 1.9462300539016724|  0:02:27s
epoch 7  | loss: 4.0765  | val_0_rmse: 1.9309200048446655|  0:02:48s
epoch 8  | loss: 4.06111 | val_0_rmse: 1.989150047302246|  0:03:09s
epoch 9  | loss: 3.98008 | val_0_rmse: 1.9353400468826294|  0:03:30s
epoch 10 | loss: 4.02081 | val_0_rmse: 2.0060300827026367|  0:03:51s
epoch 11 | loss: 3.98848 | val_0_rmse: 1.9197800159454346|  0:04:12s
epoch 12 | loss: 3.96786 | val_0_rmse: 1.9210200309753418|  0:04:33s
epoch 13 | loss: 3.97463 | val_0_rmse: 1.9080100059509277|  0:04:54s
epoch 14 | loss: 4.00102 |



Validation RMSE: 1.8897
Fold [2/3]
epoch 0  | loss: 5.90405 | val_0_rmse: 2.2973899841308594|  0:00:22s
epoch 1  | loss: 4.44973 | val_0_rmse: 2.0359199047088623|  0:00:44s
epoch 5  | loss: 4.14287 | val_0_rmse: 1.9529600143432617|  0:02:12s
epoch 6  | loss: 4.14098 | val_0_rmse: 2.0566699504852295|  0:02:33s
epoch 7  | loss: 4.21602 | val_0_rmse: 2.0975399017333984|  0:02:55s
epoch 8  | loss: 4.1228  | val_0_rmse: 1.9693700075149536|  0:03:16s
epoch 9  | loss: 4.07529 | val_0_rmse: 1.9599599838256836|  0:03:38s
epoch 10 | loss: 4.09453 | val_0_rmse: 1.9906100034713745|  0:03:59s
epoch 11 | loss: 4.06394 | val_0_rmse: 1.9482899904251099|  0:04:20s
epoch 12 | loss: 4.02315 | val_0_rmse: 1.9691400527954102|  0:04:42s
epoch 13 | loss: 4.05814 | val_0_rmse: 2.018620014190674|  0:05:03s
epoch 14 | loss: 4.08172 | val_0_rmse: 2.148710012435913|  0:05:25s
epoch 15 | loss: 3.99589 | val_0_rmse: 1.9460500478744507|  0:05:47s
epoch 16 | loss: 3.979   | val_0_rmse: 1.9655799865722656|  0:06:09s
e



Validation RMSE: 1.9234
Fold [3/3]
epoch 0  | loss: 5.86462 | val_0_rmse: 2.5415499210357666|  0:00:22s
epoch 1  | loss: 4.36388 | val_0_rmse: 2.001460075378418|  0:00:44s
epoch 2  | loss: 4.26707 | val_0_rmse: 2.0100200176239014|  0:01:06s
epoch 3  | loss: 4.15779 | val_0_rmse: 2.0599000453948975|  0:01:28s
epoch 4  | loss: 4.12893 | val_0_rmse: 1.9616400003433228|  0:01:49s
epoch 5  | loss: 4.16252 | val_0_rmse: 1.9894399642944336|  0:02:10s
epoch 6  | loss: 4.11857 | val_0_rmse: 1.9378999471664429|  0:02:31s
epoch 7  | loss: 4.03833 | val_0_rmse: 1.9389699697494507|  0:02:53s
epoch 8  | loss: 3.98361 | val_0_rmse: 2.004319906234741|  0:03:14s
epoch 9  | loss: 3.99602 | val_0_rmse: 1.9583499431610107|  0:03:35s
epoch 10 | loss: 3.95483 | val_0_rmse: 1.9196300506591797|  0:03:57s
epoch 11 | loss: 3.94651 | val_0_rmse: 1.9544800519943237|  0:04:18s
epoch 12 | loss: 3.93076 | val_0_rmse: 1.9359099864959717|  0:04:40s
epoch 13 | loss: 3.94538 | val_0_rmse: 1.9246900081634521|  0:05:01s
e

[I 2024-04-17 12:40:48,092] Trial 0 finished with value: 1.9109134674072266 and parameters: {'n_d': 13, 'n_a': 16, 'n_steps': 7, 'gamma': 1.7530486543675008, 'n_independent': 2, 'n_shared': 1, 'momentum': 0.18408553079911108, 'clip_value': 0.8420021070703481, 'lambda_sparse': 0.0005891219059313855, 'batch_size': 97, 'virtual_batch_size': 95}. Best is trial 0 with value: 1.9109134674072266.


Validation RMSE: 1.9196
Average Validation RMSE across folds: 1.9109
Fold [1/3]




epoch 0  | loss: 9.23799 | val_0_rmse: 2.8239400386810303|  0:00:29s
epoch 1  | loss: 4.82595 | val_0_rmse: 2.5552499294281006|  0:00:59s
epoch 2  | loss: 4.61314 | val_0_rmse: 2.2804598808288574|  0:01:30s
epoch 3  | loss: 4.35507 | val_0_rmse: 2.058799982070923|  0:02:00s


### Hyperparameter search space is too long but the best results obtained were:
{'num_heads': 7, 'hidden_dim': 122, 'num_layers': 4, 'dropout': 0.3672614749403933, 'batch_size': 85, 'learning_rate': 0.0012365343304592232}. Best is trial 1 with value: 1.9047857522964478.