In [324]:
import math
from typing import List, Optional, Callable, cast, Dict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from copy import deepcopy
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import optuna
from optuna.trial import TrialState
# import typing as ty
from torch import Tensor
import torch.nn.init as nn_init
import statistics
from sklearn.model_selection import KFold
import time
from utils import calculate_metric

In [325]:
# sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

In [326]:
# set random seeds
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f01bf92e930>

In [373]:
INPUT_FILE = "../data/data_removing_na.xlsx"
MODEL_PATH = '../output/transformer'
BATCH_SIZE = 32

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
DEVICE_LIST = [0, 1, 2, 3]

In [328]:
DEVICE

device(type='cuda', index=0)

In [329]:
df = pd.read_excel(INPUT_FILE)

In [330]:
df.head()

Unnamed: 0,rr1_30,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
0,0.259908,USD,Senior Subordinated Unsecured,9.0,United States,United States,Consumer Discretionary,Retail & Whsle - Discretionary,E-Commerce Discretionary,Bankruptcy Filing,...,0.396731,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,False
1,0.032729,USD,Senior Subordinated Unsecured,5.75,United States,United States,Health Care,Health Care,Health Care Facilities & Svcs,Default Corp Action,...,0.957454,0.957467,0.95748,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,False
2,0.9724,USD,Unsecured,5.675,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
3,1.047416,CHF,Unsecured,0.125,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
4,0.848872,JPY,Unsecured,1.75,Japan,Japan,Industrials,Industrial Products,Electrical Equipment,Bankruptcy Filing,...,0.130285,0.130688,0.131081,0.131465,0.13184,0.132206,-0.768857,-0.028058,-1.946507,False


In [331]:
df.shape

(1725, 165)

In [332]:
labels = df['rr1_30']
features = df.drop(columns=['rr1_30', 'rr1_7', 'rr2_7', 'rr2_30'], errors="ignore") #drop labels

In [333]:
features.dtypes

currency                       object
seniorioty_adj                 object
coupon rate                   float64
domicile_country               object
exchange_country               object
                               ...   
PD_60_pd                      float64
DTD                           float64
NI_Over_TA                    float64
Size                          float64
defaulted_in_last_6_months       bool
Length: 164, dtype: object

In [334]:
feature_list = features.columns
category_features = list(features.select_dtypes(include=['object', 'bool']).columns)
non_category_features = [i for i in feature_list if i not in category_features]

In [335]:
print(len(non_category_features))
print(len(category_features))

153
11


In [336]:
# enforce all categories to have type string
features[category_features] = features[category_features].astype(str)

In [337]:
# transform categorical features in features
label_encoders = {}
mappings = {}

for column in category_features:
    le = LabelEncoder()
    features[column] = le.fit_transform(features[column])
    label_encoders[column] = le
    mappings[column] = {index: label for index, label in enumerate(le.classes_)}

In [338]:
mappings

{'currency': {0: 'CAD',
  1: 'CHF',
  2: 'CNY',
  3: 'EUR',
  4: 'GBP',
  5: 'HKD',
  6: 'INR',
  7: 'ISK',
  8: 'JPY',
  9: 'MYR',
  10: 'NOK',
  11: 'SEK',
  12: 'SGD',
  13: 'THB',
  14: 'TWD',
  15: 'USD'},
 'seniorioty_adj': {0: 'Junior Unsecured or Junior Subordinated Unsecured',
  1: 'Secured',
  2: 'Senior Secured',
  3: 'Senior Subordinated Unsecured',
  4: 'Senior Unsecured',
  5: 'Subordinated Unsecured',
  6: 'Unsecured'},
 'domicile_country': {0: 'Argentina',
  1: 'Australia',
  2: 'Bahamas',
  3: 'Belgium',
  4: 'Bermuda',
  5: 'Canada',
  6: 'Cayman Islands',
  7: 'China',
  8: 'Czech Republic',
  9: 'Greece',
  10: 'Hong Kong',
  11: 'Iceland',
  12: 'India',
  13: 'Indonesia',
  14: 'Japan',
  15: 'Luxembourg',
  16: 'Malaysia',
  17: 'Mongolia',
  18: 'Philippines',
  19: 'Poland',
  20: 'Singapore',
  21: 'South Africa',
  22: 'South Korea',
  23: 'Taiwan',
  24: 'Thailand',
  25: 'United Kingdom',
  26: 'United States'},
 'exchange_country': {0: 'Australia',
  1: 'C

In [339]:
categories = [len(le.classes_) for le in label_encoders.values()]
d_numerical = len(non_category_features)

In [340]:
# split data into training and test set
test_size = 0.25
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=test_size, random_state=42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1293, 164)
Training Labels Shape: (1293,)
Testing Features Shape: (432, 164)
Testing Labels Shape: (432,)


In [341]:
train_features.head()

Unnamed: 0,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,event_type_subcategory_sum,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
755,15,4,7.5,26,12,3,10,32,1,5,...,0.691011,0.691382,0.691738,0.69208,0.692408,0.692725,-1.084433,-0.052027,-2.074964,0
588,15,6,6.0,13,4,7,8,31,1,1,...,0.183801,0.184996,0.186167,0.187313,0.188437,0.189539,-0.540409,0.017209,0.864692,0
585,15,2,11.0,26,12,7,8,31,1,1,...,0.278774,0.280216,0.281615,0.282972,0.28429,0.285571,0.754647,-0.010395,-0.342209,0
1329,15,2,9.125,26,12,3,10,33,1,4,...,0.190375,0.191471,0.192544,0.193594,0.194622,0.195628,-0.24208,-0.022618,-2.808528,0
973,15,2,9.25,26,12,3,10,32,1,9,...,0.149501,0.15093,0.152327,0.153692,0.155027,0.156332,1.138686,3.3e-05,-0.085154,0


In [342]:
# Normalize the data
# Prepare the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), non_category_features)   # StandardScaler()
    ],
    remainder='passthrough'  # Leave categorical features untouched
)

In [343]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        feature_tensor = torch.tensor(feature, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        return feature_tensor, label_tensor
    
def custom_collate(batch):
    divisor = len(DEVICE_LIST)  # Number of GPUs
    batch_size = len(batch)
    # print("batch shape ", batch)

    # Calculate padding to make batch size divisible by the number of GPUs
    pad_size = divisor - (batch_size % divisor) if batch_size % divisor != 0 else 0

    # Separate features and labels
    features = torch.stack([item[0] for item in batch])
    labels = torch.stack([item[1] for item in batch])

    # Pad features and labels along the batch dimension
    if pad_size > 0:
        pad_features = features[:pad_size]  # Reuse existing features for padding
        pad_labels = labels[:pad_size]  # Reuse existing labels for padding
        features = torch.cat([features, pad_features], dim=0)
        labels = torch.cat([labels, pad_labels], dim=0)

    return features, labels

TUNING PARAMETERS

In [345]:
class Tokenizer(nn.Module):
    category_offsets: Optional[Tensor]

    def __init__(
        self,
        d_numerical: int,
        categories: Optional[List[int]],
        d_token: int,
        bias: bool,
    ) -> None:
        super(Tokenizer, self).__init__()
        
        # Handling categorical features
        if categories is None:
            d_bias = d_numerical
            self.category_offsets = None
            self.category_embeddings = None
        else:
            d_bias = d_numerical + len(categories)
            
            # Ensure proper indexing for embeddings
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            
            # Create category embeddings
            self.category_embeddings = nn.Embedding(sum(categories), d_token)
        
        # Adding [CLS] token into account
        self.weight = nn.Parameter(torch.empty(d_numerical + 1, d_token))
        self.bias = nn.Parameter(torch.empty(d_bias, d_token)) if bias else None
        
        # The initialization is inspired by nn.Linear
        nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5))

    @property
    def n_tokens(self) -> int:
        return len(self.weight) + (0 if self.category_offsets is None else len(self.category_offsets))

    def forward(self, x_num: Tensor, x_cat: Optional[Tensor]) -> Tensor:
        """
        Forward pass for the tokenizer.
        
        Args:
            x_num (Tensor): Numerical input tensor of shape (batch_size, num_features).
            x_cat (Optional[Tensor]): Categorical input tensor of shape (batch_size, num_categories).
        
        Returns:
            Tensor: Tokenized output of shape (batch_size, n_tokens, d_token).
        """
        batch_size = x_num.size(0)
        
        # Add [CLS] token to numerical features
        x_num = torch.cat([
            torch.ones(batch_size, 1, device=x_num.device),  # [CLS] token
            x_num
        ], dim=1)
        
        # Weight numerical features
        x = self.weight.unsqueeze(0) * x_num.unsqueeze(-1)   # (1, d_num + 1, d_token) x (batch_size, d_num + 1, 1) => (batch_size, d_num + 1, d_token)
        
        # Handle categorical features if present
        if x_cat is not None:
            x_cat_embeddings = self.category_embeddings(x_cat + self.category_offsets.unsqueeze(0)) # (batch_size, num_categories, d_token)
            x = torch.cat([x, x_cat_embeddings], dim=1) # batch_size, 1 + d_num + num_Categories, d_token
        
        # Add bias if applicable
        if self.bias is not None:
            bias = torch.cat([
                torch.zeros(1, self.bias.size(1), device=x.device),
                self.bias
            ])
            x = x + bias.unsqueeze(0)
        
        return x


In [346]:
class MultiheadAttention(nn.Module):
    def __init__(
        self, d: int, n_heads: int, dropout: float = 0.0, kv_compression: Optional[float] = None
    ) -> None:
        """
        Multi-head attention mechanism.

        Args:
            d (int): Dimensionality of the input features.
            n_heads (int): Number of attention heads.
            dropout (float): Dropout probability. Defaults to 0.0.
            kv_compression: compression rate for key and value
        """
        super(MultiheadAttention, self).__init__()

        # Ensure d is divisible by n_heads
        assert n_heads > 0 and d % n_heads == 0, "d must be divisible by n_heads."

        self.n_heads = n_heads
        self.d_head = d // n_heads

        # Define linear transformations
        self.W_q = nn.Linear(d, d)
        self.W_k = nn.Linear(d, d)
        self.W_v = nn.Linear(d, d)
        self.W_out = nn.Linear(d, d) if n_heads > 1 else None
        
        self.kv_compression = kv_compression
        if kv_compression:
            self.key_compression = nn.Linear(d, int(d * kv_compression), bias=False)
            self.value_compression = nn.Linear(d, int(d * kv_compression), bias=False)

        # Dropout layer
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

    def _reshape(self, x: Tensor) -> Tensor:
        """Reshape tensor for multi-head attention.

        Input:
            x: Tensor of shape (batch_size, n_tokens, d)

        Output:
            Tensor of shape (batch_size * n_heads, n_tokens, d_head)
        """
        batch_size, n_tokens, d = x.shape
        return (
            x.reshape(batch_size, n_tokens, self.n_heads, self.d_head) 
            .transpose(1, 2)  # (batch_size, n_heads, n_tokens, d_head)
            .reshape(batch_size * self.n_heads, n_tokens, self.d_head)  # (batch_size * n_heads, n_tokens, d_head)
        )

    def forward(self, x_q: Tensor, x_kv: Tensor) -> Tensor:
        """
        Forward pass of multi-head attention.

        Args:
            x_q (Tensor): Query tensor of shape (batch_size, num_tokens, d).
            x_kv (Tensor): Key and value tensor of shape (batch_size, num_tokens, d).

        Returns:
            Tensor: Output tensor of shape (batch_size, num_tokens, d).
        """
        # Linear projections
        q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
        # q: (batch_size, n_q_tokens, d). k, v: (batch_size, num_tokens, d)
        
        # reduce dimension of key and value if it's too long
        if self.kv_compression:
            k = self.key_compression(k)
            v = self.value_compression(v)

        # Ensure dimensions are divisible by number of heads
        for tensor in [q, k, v]:
            assert tensor.shape[-1] % self.n_heads == 0, "Input dimensions must be divisible by n_heads."

        batch_size = q.size(0)
        n_q_tokens = q.size(1)

        # Reshape for multi-head attention
        q = self._reshape(q)  # (batch_size * n_heads, n_q_tokens, d_head)
        k = self._reshape(k)  # (batch_size * n_heads, num_tokens, d_head)
        v = self._reshape(v)  # (batch_size * n_heads, num_tokens, d_head)

        # Scaled dot-product attention
        attention_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_head)  # (batch_size * n_heads, n_q_tokens, num_tokens)
        attention_weights = F.softmax(attention_scores, dim=-1)  # (batch_size * n_heads, n_q_tokens, num_tokens)

        # Apply dropout to attention weights if specified
        if self.dropout is not None:
            attention_weights = self.dropout(attention_weights)

        # Compute attention output
        attention_output = attention_weights @ v  # (batch_size * n_heads, n_q_tokens, d_head)
        
        # Reshape back to original dimensions
        attention_output = (
            attention_output
            .reshape(batch_size, self.n_heads, n_q_tokens, self.d_head)  # (batch_size, n_heads, n_q_tokens, d_head)
            .transpose(1, 2)  # (batch_size, n_q_tokens, n_heads, d_head)
            .reshape(batch_size, n_q_tokens, -1)  # (batch_size, n_q_tokens, d)
        )

        # Apply final linear transformation if applicable
        if self.W_out is not None:
            attention_output = self.W_out(attention_output)  # (batch_size, n_q_tokens, d)

        return attention_output


In [347]:
def reglu(x: torch.Tensor) -> torch.Tensor:
    """ReGLU activation function."""
    a, b = x.chunk(2, dim=-1)
    return a * F.relu(b)

def geglu(x: torch.Tensor) -> torch.Tensor:
    """GeGLU activation function."""
    a, b = x.chunk(2, dim=-1)
    return a * F.gelu(b)

def get_activation_fn(name: str) -> Callable[[torch.Tensor], torch.Tensor]:
    """Retrieve the activation function by name."""
    activation_functions = {
        'reglu': reglu,
        'geglu': geglu,
        'sigmoid': torch.sigmoid,
    }

    if name in activation_functions:
        return activation_functions[name]

    if hasattr(F, name):
        return getattr(F, name)

    raise ValueError(f"Unsupported activation function: {name}")

def get_nonglu_activation_fn(name: str) -> Callable[[torch.Tensor], torch.Tensor]:
    """Retrieve the non-GLU activation function by name."""
    nonglu_mappings = {
        'reglu': F.relu,
        'geglu': F.gelu,
    }

    return nonglu_mappings.get(name, get_activation_fn(name))

In [348]:
class TransformerBlock(nn.Module):
    def __init__(self, d_token: int,
        d_hidden: int,
        n_heads: int,
        attention_dropout: float,
        activation: str,
        prenormalization: bool,
        kv_compression: Optional[float],
        ffn_dropout: Optional[float],
        residual_dropout: Optional[float]) -> None:
        
        super(TransformerBlock, self).__init__()
        
        self.norm0 = nn.LayerNorm(d_token) if not prenormalization else nn.Identity()
        self.attention = MultiheadAttention(d_token, n_heads, attention_dropout, kv_compression)
        self.linear0 = nn.Linear(d_token, d_hidden * (2 if activation.endswith('glu') else 1))
    
        self.dropout0 = nn.Dropout(residual_dropout) if residual_dropout else nn.Identity()
        self.linear1 = nn.Linear(d_hidden, d_token)
        self.norm1 = nn.LayerNorm(d_token) if not prenormalization else nn.Identity()
        self.dropout1 = nn.Dropout(ffn_dropout) if ffn_dropout else nn.Identity()
        self.activation = get_activation_fn(activation)
        
    def forward(self, x, is_last_layer=False):
        # attention block
        residual = self.norm0(x)
        residual = self.attention(residual[:, :1]if is_last_layer else residual, #[CLS] - target variable,
                                  residual) 
        residual = self.dropout0(residual) + x
        residual = self.norm1(residual)
        x = residual
        
        # feedforward block
        residual = self.norm0(residual)
        residual = self.linear0(residual)
        residual = self.activation(residual)
        residual = self.dropout1(residual)
        residual = self.linear1(residual)
        residual = self.dropout0(residual) + x
        residual = self.norm1(residual)
        return residual
    
class Transformer(nn.Module):
    """Transformer implementation with support for optional Linformer compression."""

    def __init__(
        self,
        *,
        d_numerical: int,
        categories: Optional[List[int]],
        token_bias: bool,
        n_layers: int,
        d_token: int,
        n_heads: int,
        d_ffn_factor: float,
        attention_dropout: float,
        ffn_dropout: Optional[float],
        residual_dropout: Optional[float],
        activation: str,
        prenormalization: bool,
        kv_compression: Optional[float],
        d_out: int,
    ) -> None:
        super(Transformer, self).__init__()

        self.tokenizer = Tokenizer(d_numerical, categories, d_token, token_bias)
        d_hidden = int(d_token * d_ffn_factor)
        self.layers = nn.ModuleList([TransformerBlock(d_token, d_hidden, n_heads, attention_dropout, activation, prenormalization, kv_compression, ffn_dropout, residual_dropout) for _ in range(n_layers)])

        
        # use non-glu activation function in the last layer for simplicity
        self.last_activation = get_nonglu_activation_fn(activation)
        self.prenormalization = prenormalization
        self.last_normalization = nn.LayerNorm(d_token) if prenormalization else None
        self.head = nn.Linear(d_token, d_out)

    def forward(self, x_num: torch.Tensor, x_cat: Optional[torch.Tensor]) -> torch.Tensor:
        """Forward pass of the Transformer model."""
        x = self.tokenizer(x_num, x_cat)
        x_residual = x

        for layer_idx, layer in enumerate(self.layers):
            is_last_layer = layer_idx + 1 == len(self.layers)
            x_residual = layer(x_residual, is_last_layer)

        # Final normalization and activation
        x_residual = x_residual[:, 0]
        if self.last_normalization is not None:
            x_residual = self.last_normalization(x_residual)
        x_residual = self.last_activation(x_residual)
        output = self.head(x_residual)
        # print("dimension of ori output: ", output.shape)
        return output.squeeze()

In [349]:
EPOCHS = 50

In [350]:
# empty cache first
torch.cuda.empty_cache()

In [351]:
def objective(trial):
    # Generate the model.
    # model = define_model(trial, train_features.shape[1], 1).to(DEVICE)
    # Define out_features_list
    n_heads = trial.suggest_int("n_heads", 1, 10)
    
    n_layers = trial.suggest_int("n_layers", 1, 5)
    
    token_multiplier = trial.suggest_int("token_multiplier", 5, 30)  # Adjust the range as necessary
    
    # Suggest an integer for a that is divisible by b
    d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, step=n_heads)
    
    attention_dropout = trial.suggest_float("attention_dropout", 0, 0.5)
    d_ffn_factor = trial.suggest_float("d_ffn_factor", 1, 3)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0, 0.5)
    
    # activation = trial.suggest_categorical("activation", choices=['relu', 'reglu', 'geglu'])
    # batch_size = trial.suggest_int('batch_size', 16, 128, step=16)

    args = {'activation': 'relu', #activation, #
    'attention_dropout': attention_dropout,
    'd_ffn_factor': d_ffn_factor,
    'd_token': d_token,
    'ffn_dropout': ffn_dropout,
    'n_heads': n_heads,
    'n_layers': n_layers,
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    'token_bias': True,
    'd_out': 1
    }

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-9, 1e-1, log=True)

    # training with 5-fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    val_losses = []

    for train_idx, val_idx in kf.split(train_features):
        # Create training and validation datasets for the current fold
        X_train_fold, X_val_fold = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train_fold, y_val_fold = train_labels.iloc[train_idx], train_labels.iloc[val_idx]
        
        # scaling features
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
            
        # Initialize the model for this fold
        model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)
        model = nn.DataParallel(model, device_ids = DEVICE_LIST)
        model.to(DEVICE)
        
        # define optimizer
        if optimizer_name == "Adam":
         optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            momentum = trial.suggest_float("momentum", 1e-9, 0.95, log=True)
            optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
        
        # Define the loss function and optimizer
        criterion = nn.MSELoss()
        
        # Prepare DataLoader for training
        train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
        val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
            
        # Training of the model.
        model.train()
        for epoch in range(EPOCHS):
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)

                optimizer.zero_grad()
                
                # print("shape", X_num.shape, X_cat.shape)
                output = model(X_num, X_cat)

                # print("shape of target var", output.shape, target.shape)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

        # Validation of the model.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
                output = model(X_num, X_cat)
                
                # print("shape of val var: ", output.shape, target.shape)
                val_loss = criterion(output, target).item()
                val_losses.append(val_loss**0.5) #rmse

        trial.report(val_loss, epoch)

    # Return the average validation loss across all folds
    return np.mean(val_losses)

In [352]:
# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-12-11 15:00:50,008] A new study created in memory with name: no-name-243854ea-9794-4964-b34a-e8d386968e3b
[I 2024-12-11 15:04:03,619] Trial 0 finished with value: 0.26834642141950416 and parameters: {'n_heads': 6, 'n_layers': 4, 'token_multiplier': 14, 'd_token': 54, 'attention_dropout': 0.44210847022425526, 'd_ffn_factor': 1.8304837732415857, 'ffn_dropout': 0.4694615865409912, 'optimizer': 'Adam', 'lr': 0.0010108795198581204, 'weight_decay': 0.01618203056711717}. Best is trial 0 with value: 0.26834642141950416.
[I 2024-12-11 15:07:50,172] Trial 1 finished with value: 0.3189764472797721 and parameters: {'n_heads': 6, 'n_layers': 5, 'token_multiplier': 17, 'd_token': 6, 'attention_dropout': 0.016380208557858655, 'd_ffn_factor': 2.821030393983747, 'ffn_dropout': 0.35069134519487793, 'optimizer': 'Adam', 'lr': 5.440245251013737e-05, 'weight_decay': 0.005349542900693531}. Best is trial 0 with value: 0.26834642141950416.
[I 2024-12-11 15:09:06,011] Trial 2 finished with value: 0.323

Study statistics: 
  Number of finished trials:  4
  Number of pruned trials:  0
  Number of complete trials:  4
Best trial:
  Value:  0.2579254291284334
  Params: 
    n_heads: 8
    n_layers: 4
    token_multiplier: 25
    d_token: 88
    attention_dropout: 0.07472463283355035
    d_ffn_factor: 1.3200781582994598
    ffn_dropout: 0.057767167109364026
    optimizer: Adam
    lr: 0.0012173762540881236
    weight_decay: 0.001057582767063431


In [353]:
trial.params

{'n_heads': 8,
 'n_layers': 4,
 'token_multiplier': 25,
 'd_token': 88,
 'attention_dropout': 0.07472463283355035,
 'd_ffn_factor': 1.3200781582994598,
 'ffn_dropout': 0.057767167109364026,
 'optimizer': 'Adam',
 'lr': 0.0012173762540881236,
 'weight_decay': 0.001057582767063431}

In [354]:
MODEL_CONFIG = {"model": {}, "optimizer": {}}

for key, value in trial.params.items():
    if key in ['lr', 'weight_decay', 'momentum', 'weight_decay', 'optimizer']:
        MODEL_CONFIG["optimizer"][key] = value
    elif key == 'token_multiplier':
        continue
    elif key == 'batch_size':
        BATCH_SIZE = value
    else:
        # adj_key = key.rpartition('_')[0]
        MODEL_CONFIG["model"][key] = value

In [355]:
MODEL_CONFIG

{'model': {'n_heads': 8,
  'n_layers': 4,
  'd_token': 88,
  'attention_dropout': 0.07472463283355035,
  'd_ffn_factor': 1.3200781582994598,
  'ffn_dropout': 0.057767167109364026},
 'optimizer': {'optimizer': 'Adam',
  'lr': 0.0012173762540881236,
  'weight_decay': 0.001057582767063431}}

RUNNING THE MODEL

In [356]:
# empty cache first
torch.cuda.empty_cache()

In [357]:
# Transform training data
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

new_feature_list = non_category_features + category_features

In [358]:
# Create dataset instances
train_dataset = CustomDataset(train_features, train_labels.to_numpy())
test_dataset = CustomDataset(test_features, test_labels.to_numpy())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=test_features.shape[0], shuffle=False, collate_fn=custom_collate)

In [359]:
categories = [len(le.classes_) for le in label_encoders.values()]
d_numerical = len(non_category_features)
# d_token = 32  # Example token dimension
# token_bias = True

args = {
  # 'initialization': 'kaiming',
  'activation': 'relu',
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    'token_bias': True,
    'd_out': 1
}

args.update(MODEL_CONFIG["model"])
model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)

In [360]:
model

Transformer(
  (tokenizer): Tokenizer(
    (category_embeddings): Embedding(166, 88)
  )
  (layers): ModuleList(
    (0-3): 4 x TransformerBlock(
      (norm0): LayerNorm((88,), eps=1e-05, elementwise_affine=True)
      (attention): MultiheadAttention(
        (W_q): Linear(in_features=88, out_features=88, bias=True)
        (W_k): Linear(in_features=88, out_features=88, bias=True)
        (W_v): Linear(in_features=88, out_features=88, bias=True)
        (W_out): Linear(in_features=88, out_features=88, bias=True)
        (dropout): Dropout(p=0.07472463283355035, inplace=False)
      )
      (linear0): Linear(in_features=88, out_features=116, bias=True)
      (dropout0): Identity()
      (linear1): Linear(in_features=116, out_features=88, bias=True)
      (norm1): LayerNorm((88,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.057767167109364026, inplace=False)
    )
  )
  (head): Linear(in_features=88, out_features=1, bias=True)
)

In [361]:
if DEVICE != "cpu":
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
model.to(DEVICE)

DataParallel(
  (module): Transformer(
    (tokenizer): Tokenizer(
      (category_embeddings): Embedding(166, 88)
    )
    (layers): ModuleList(
      (0-3): 4 x TransformerBlock(
        (norm0): LayerNorm((88,), eps=1e-05, elementwise_affine=True)
        (attention): MultiheadAttention(
          (W_q): Linear(in_features=88, out_features=88, bias=True)
          (W_k): Linear(in_features=88, out_features=88, bias=True)
          (W_v): Linear(in_features=88, out_features=88, bias=True)
          (W_out): Linear(in_features=88, out_features=88, bias=True)
          (dropout): Dropout(p=0.07472463283355035, inplace=False)
        )
        (linear0): Linear(in_features=88, out_features=116, bias=True)
        (dropout0): Identity()
        (linear1): Linear(in_features=116, out_features=88, bias=True)
        (norm1): LayerNorm((88,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.057767167109364026, inplace=False)
      )
    )
    (head): Linear(in_features=8

In [362]:
# define optimizer
optim_config = deepcopy(MODEL_CONFIG["optimizer"])
del optim_config["optimizer"]

optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0012173762540881236
    maximize: False
    weight_decay: 0.001057582767063431
)

In [363]:
train_loader.dataset

<__main__.CustomDataset at 0x7efebc3e4e60>

In [364]:
EPOCH = 500
criterion = nn.MSELoss()
start_time = time.time()
train_errors = []

for ep in tqdm(range(EPOCH)):

    model.train()
    running_loss = 0.0
    for i, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        X_num = data[:, :len(non_category_features)].to(DEVICE)
        X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X_num, X_cat)

        loss = criterion(outputs, target)
        
        loss.backward()

        optimizer.step()

        # print statistics
        running_loss += loss.item()


    train_loss = running_loss  / len(train_loader)
    train_errors.append(train_loss**0.5)
    if ep % 100 == 50:
        print(f'Epoch [{ep+1}], Train Loss: {train_loss**0.5:.4f}')
    
# train_loss = running_loss  / len(train_loader)
# print(f'Epoch [{ep+1}], Train Loss: {train_loss**0.5:.4f}')

# print out training time
elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

 10%|█         | 51/500 [00:46<06:57,  1.08it/s]

Epoch [51], Train Loss: 0.2369


 30%|███       | 151/500 [02:18<05:43,  1.01it/s]

Epoch [151], Train Loss: 0.1806


 50%|█████     | 251/500 [03:51<03:49,  1.09it/s]

Epoch [251], Train Loss: 0.1695


 70%|███████   | 351/500 [05:22<02:16,  1.09it/s]

Epoch [351], Train Loss: 0.1669


 90%|█████████ | 451/500 [06:53<00:45,  1.08it/s]

Epoch [451], Train Loss: 0.1565


100%|██████████| 500/500 [07:38<00:00,  1.09it/s]

Training time: 458.319 seconds





In [365]:
# empty cache first
torch.cuda.empty_cache()

In [None]:
# Testing phase
model.eval()
test_loss = 0.0

with torch.no_grad():
    i = 0
    for data, target in test_loader:
        # inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        X_num = data[:, :len(non_category_features)]
        X_cat = data[:, -len(category_features):].detach().long()
        
        if isinstance(model, nn.DataParallel):
            model = model.module  # Unwrap from DataParallel
        model = model.to('cpu')
            
               
        outputs = model(X_num, X_cat)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
        print(f"Test average mean absolute error: {mae}")
        print(f"Test average mean absolute percentage error: {mape}")
        print(f"Test average root mean squared error: {rmse}")
        print(f"Test average R2: {rsqr}")

Training average mean absolute error: 0.1805223524570465
Training average mean absolute percentage error: 223.90522956848145
Training average root mean squared error: 0.25800502697627115
Training average R2: 0.4188556671142578


In [367]:
# let’s load back in our saved model
# model = MLP()
# model.load_state_dict(torch.load(MODEL_PATH))

5 fold CV

In [382]:
# Define cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
EPOCHS = 500
val_mae = []
val_mape = []
val_rmse = []
val_rsqr = []

i = 0


for train_idx, val_idx in kf.split(features):
    running_loss = 0
    train_errors = []

    print(f"Start {i} batch")
    i += 1
    
    # Create training and validation datasets for the current fold
    X_train_fold, X_val_fold = features.iloc[train_idx], features.iloc[val_idx]
    y_train_fold, y_val_fold = labels.iloc[train_idx], labels.iloc[val_idx]
    
    # scaling features
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
        
    # Initialize the model for this fold
    model = Transformer(d_numerical=d_numerical, categories=categories, **args)
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
    model.to(DEVICE)
    
    # define optimizer
    optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
    
    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    
    # Prepare DataLoader for training
    train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
    val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=val_dataset.features.shape[0], shuffle=True)
        
    # Training of the model.
    model.train()
    for epoch in range(EPOCHS):
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(DEVICE), target.to(DEVICE)
  
            X_num = data[:, :len(non_category_features)].to(DEVICE)
            X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
            
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(X_num, X_cat)

            
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()
            
        train_loss = running_loss  / len(train_loader)
        train_errors.append(train_loss**0.5)
        if epoch % 100 == 20:
            print(f'Epoch [{ep+1}], Train Loss: {train_loss**0.5:.4f}')
            
    # save model state_dict
    torch.save(model.state_dict(),  f"{MODEL_PATH}/transformer_weights_fold_{i}.pth")

    # Validation of the model.
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_loader):
            # data, target = data.to(DEVICE), target.to(DEVICE)
            X_num = data[:, :len(non_category_features)]
            X_cat = data[:, -len(category_features):].detach().long()
            
            if isinstance(model, nn.DataParallel):
                model = model.module  # Unwrap from DataParallel
            model = model.to('cpu')
            outputs = model(X_num, X_cat)
            
            # save metrics
            mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
            val_mae.append(mae)
            val_mape.append(mape)
            val_rmse.append(rmse)
            val_rsqr.append(rsqr)

Start 0 batch




Epoch [500], Train Loss: 0.2570
Epoch [500], Train Loss: 0.1957
Epoch [500], Train Loss: 0.1786
Epoch [500], Train Loss: 0.1760
Epoch [500], Train Loss: 0.1694
Start 1 batch




Epoch [500], Train Loss: 0.2653
Epoch [500], Train Loss: 0.1995
Epoch [500], Train Loss: 0.1822
Epoch [500], Train Loss: 0.1752
Epoch [500], Train Loss: 0.1739
Start 2 batch




Epoch [500], Train Loss: 0.2657
Epoch [500], Train Loss: 0.2078
Epoch [500], Train Loss: 0.2011
Epoch [500], Train Loss: 0.1982
Epoch [500], Train Loss: 0.2151
Start 3 batch




Epoch [500], Train Loss: 0.2584
Epoch [500], Train Loss: 0.2099
Epoch [500], Train Loss: 0.1923
Epoch [500], Train Loss: 0.1891
Epoch [500], Train Loss: 0.1821
Start 4 batch




Epoch [500], Train Loss: 0.2615
Epoch [500], Train Loss: 0.1791
Epoch [500], Train Loss: 0.1794
Epoch [500], Train Loss: 0.1663
Epoch [500], Train Loss: 0.1691


In [383]:
val_rmse

[0.25521199392431193,
 0.28332516901557664,
 0.23299730218043802,
 0.2331523955102855,
 0.2667226411639532]

In [384]:
print(f"Test average mean absolute error: {statistics.mean(val_mae)}")
print(f"Test average mean absolute percentage error: {statistics.mean(val_mape)}")
print(f"Test average root mean squared error: {statistics.mean(val_rmse)}")
print(f"Test average R2: {statistics.mean(val_rsqr)}")

Test average mean absolute error: 0.17922967672348022
Test average mean absolute percentage error: 6145.480759143829
Test average root mean squared error: 0.25428190035891307
Test average R2: 0.403901481628418
