In [2]:
pip install torch

Collecting torch
  Downloading torch-2.5.1-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp312-none-macosx_11_0_arm64.whl (63.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading filelock-3.16.1-py3-none-any.whl (16 kB)
Using cached fsspec-2024.10.0-py3-non

In [34]:
import math
from typing import List, Optional, Callable, cast, Dict
import sys
import os
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from copy import deepcopy
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import optuna
from optuna.trial import TrialState
# import typing as ty
from torch import Tensor
import torch.nn.init as nn_init
import statistics
from sklearn.model_selection import KFold
import time
from utils import calculate_metric

In [2]:
# sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

In [None]:
# set random seeds
np.random.seed(0)
torch.manual_seed(0)

In [35]:
INPUT_FILE = "../data/all_data_cleaned.xlsx"
MODEL_PATH = '../output/transformer/best_transformer_model.pth'
BATCH_SIZE = 32

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'
DEVICE_LIST = [0, 1, 3]

In [36]:
DEVICE

device(type='cpu')

In [37]:
df = pd.read_excel(INPUT_FILE)

In [38]:
df.head()

Unnamed: 0,rr1_7,rr2_7,rr1_30,rr2_30,currency,seniorioty,seniorioty_adj,coupon rate,coupon frequency,maturity_type,...,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months,default_duration
0,0.247786,0.241969,0.259908,0.252843,USD,Senior Subordinated Unsecured,Senior Subordinated Unsecured,9.0,2.0,CALL/SINK,...,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,False,6.073973
1,0.030148,0.029552,0.032729,0.031998,USD,Senior Subordinated Unsecured,Senior Subordinated Unsecured,5.75,2.0,CONV/CALL,...,0.957467,0.95748,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,False,5.109589
2,0.969841,0.96039,0.9724,0.96049,USD,Unsecured,Unsecured,5.675,2.0,PUTABLE,...,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False,3.145205
3,1.047361,1.046199,1.047416,1.046196,CHF,Unsecured,Unsecured,0.125,1.0,CONVERTIBLE,...,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False,3.00274
4,0.848102,0.840452,0.848872,0.840574,JPY,Unsecured,Unsecured,1.75,1.0,CONV/CALL,...,0.130688,0.131081,0.131465,0.13184,0.132206,-0.768857,-0.028058,-1.946507,False,7.153425


In [39]:
df.shape

(1725, 179)

In [49]:
labels = df['rr1_30']
features = df.drop(columns=['rr1_30', 'rr1_7', 'rr2_7', 'rr2_30'], errors="ignore") #drop labels

In [50]:
features.dtypes

currency                       object
seniorioty                     object
seniorioty_adj                 object
coupon rate                   float64
coupon frequency              float64
                               ...   
DTD                           float64
NI_Over_TA                    float64
Size                          float64
defaulted_in_last_6_months       bool
default_duration              float64
Length: 175, dtype: object

In [51]:
feature_list = features.columns
category_features = list(features.select_dtypes(include=['object', 'bool']).columns)
non_category_features = [i for i in feature_list if i not in category_features]

In [52]:
print(len(non_category_features))
print(len(category_features))

155
20


In [54]:
# enforce all categories to have type string
features[category_features] = features[category_features].astype(str)

In [55]:
# transform categorical features in features
label_encoders = {}
mappings = {}

for column in category_features:
    le = LabelEncoder()
    features[column] = le.fit_transform(features[column])
    label_encoders[column] = le
    mappings[column] = {index: label for index, label in enumerate(le.classes_)}

In [56]:
mappings

{'currency': {0: 'CAD',
  1: 'CHF',
  2: 'CNY',
  3: 'EUR',
  4: 'GBP',
  5: 'HKD',
  6: 'INR',
  7: 'ISK',
  8: 'JPY',
  9: 'MYR',
  10: 'NOK',
  11: 'SEK',
  12: 'SGD',
  13: 'THB',
  14: 'TWD',
  15: 'USD'},
 'seniorioty': {0: 'Junior Unsecured or Junior Subordinated Unsecured',
  1: 'Secured',
  2: 'Senior Secured',
  3: 'Senior Secured - First Lien',
  4: 'Senior Secured - First Mortgage',
  5: 'Senior Secured - Mortgage',
  6: 'Senior Secured - Second Lien',
  7: 'Senior Subordinated Unsecured',
  8: 'Senior Unsecured',
  9: 'Subordinated Unsecured',
  10: 'Unsecured'},
 'seniorioty_adj': {0: 'Junior Unsecured or Junior Subordinated Unsecured',
  1: 'Secured',
  2: 'Senior Secured',
  3: 'Senior Subordinated Unsecured',
  4: 'Senior Unsecured',
  5: 'Subordinated Unsecured',
  6: 'Unsecured'},
 'maturity_type': {0: 'AT MATURITY',
  1: 'CALL/PUT',
  2: 'CALL/REF/SINK',
  3: 'CALL/SINK',
  4: 'CALL/SINK/PUT',
  5: 'CALLABLE',
  6: 'CONV/CALL',
  7: 'CONV/CALL/SINK',
  8: 'CONV/PUT'

In [57]:
features

Unnamed: 0,currency,seniorioty,seniorioty_adj,coupon rate,coupon frequency,maturity_type,call type,put type,Convertible,coupon type -code,...,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months,default_duration
0,15,7,3,9.000,2.0,3,3,2,1,7,...,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,0,6.073973
1,15,7,3,5.750,2.0,6,3,2,0,7,...,0.957467,0.957480,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,0,5.109589
2,15,10,6,5.675,2.0,11,3,2,1,3,...,0.568693,0.569197,0.569682,0.570150,0.570600,-1.853366,0.000191,1.053677,0,3.145205
3,1,10,6,0.125,1.0,10,3,2,0,7,...,0.568693,0.569197,0.569682,0.570150,0.570600,-1.853366,0.000191,1.053677,0,3.002740
4,8,10,6,1.750,1.0,6,3,2,1,7,...,0.130688,0.131081,0.131465,0.131840,0.132206,-0.768857,-0.028058,-1.946507,0,7.153425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,9,2,2,5.000,2.0,0,3,2,1,7,...,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0,6.298630
1721,9,2,2,4.950,2.0,0,3,2,1,7,...,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0,6.298630
1722,9,2,2,5.150,2.0,0,3,2,1,7,...,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0,6.298630
1723,9,2,2,5.050,2.0,0,3,2,1,7,...,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0,6.298630


In [58]:
# split data into training and test set
test_size = 0.25
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=test_size, random_state=42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1293, 175)
Training Labels Shape: (1293,)
Testing Features Shape: (432, 175)
Testing Labels Shape: (432,)


In [59]:
# Normalize the data
# Prepare the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), non_category_features)   # StandardScaler()
    ],
    remainder='passthrough'  # Leave categorical features untouched
)

In [60]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

TUNING PARAMETERS

In [63]:
import torch
import torch.nn as nn
import torch.nn.init as nn_init
from torch import Tensor
from typing import List, Optional
import math

class Tokenizer(nn.Module):
    category_offsets: Optional[Tensor]

    def __init__(
        self,
        d_numerical: int,
        categories: Optional[List[int]],
        d_token: int,
        bias: bool,
    ) -> None:
        super().__init__()
        
        # Handling categorical features
        if categories is None:
            d_bias = d_numerical
            self.category_offsets = None
            self.category_embeddings = None
        else:
            d_bias = d_numerical + len(categories)
            
            # Ensure proper indexing for embeddings
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            
            # Create category embeddings
            self.category_embeddings = nn.Embedding(sum(categories), d_token)
        
        # Adding [CLS] token into account
        self.weight = nn.Parameter(torch.empty(d_numerical + 1, d_token))
        self.bias = nn.Parameter(torch.empty(d_bias, d_token)) if bias else None
        
        # Initialization inspired by nn.Linear
        self._init_parameters()

    def _init_parameters(self) -> None:
        """Initialize the weights and bias parameters."""
        nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5))

    @property
    def n_tokens(self) -> int:
        return len(self.weight) + (0 if self.category_offsets is None else len(self.category_offsets))

    def forward(self, x_num: Tensor, x_cat: Optional[Tensor]) -> Tensor:
        """
        Forward pass for the tokenizer.
        
        Args:
            x_num (Tensor): Numerical input tensor of shape (batch_size, num_features).
            x_cat (Optional[Tensor]): Categorical input tensor of shape (batch_size, num_categories).
        
        Returns:
            Tensor: Tokenized output of shape (batch_size, n_tokens, d_token).
        """
        batch_size = x_num.size(0)
        
        # Add [CLS] token to numerical features
        x_num = torch.cat([
            torch.ones(batch_size, 1, device=x_num.device),  # [CLS] token
            x_num
        ], dim=1)
        
        # Weight numerical features
        x = self.weight.unsqueeze(0) * x_num.unsqueeze(-1)   # (1, d_num + 1, d_token) x (batch_size, d_num + 1, 1) => (batch_size, d_num + 1, d_token)
        
        # Handle categorical features if present
        if x_cat is not None:
            x_cat_embeddings = self.category_embeddings(x_cat + self.category_offsets.unsqueeze(0)) # (batch_size, num_categories, d_token)
            x = torch.cat([x, x_cat_embeddings], dim=1) # batch_size, 1 + d_num + num_Categories, d_token
        
        # Add bias if applicable
        if self.bias is not None:
            bias = torch.cat([
                torch.zeros(1, self.bias.size(1), device=x.device),
                self.bias
            ])
            x = x + bias.unsqueeze(0)
        
        return x


In [64]:
# class MultiheadAttention(nn.Module):
#     def __init__(
#         self, d: int, n_heads: int, dropout: float, initialization: str
#     ) -> None:
#         if n_heads > 1:
#             assert d % n_heads == 0

#         super().__init__()
#         self.W_q = nn.Linear(d, d)
#         self.W_k = nn.Linear(d, d)
#         self.W_v = nn.Linear(d, d)
#         self.W_out = nn.Linear(d, d) if n_heads > 1 else None
#         self.n_heads = n_heads
#         self.dropout = nn.Dropout(dropout) if dropout else None

#     def _reshape(self, x: Tensor) -> Tensor:
#         batch_size, n_tokens, d = x.shape
#         d_head = d // self.n_heads
#         return (
#             x.reshape(batch_size, n_tokens, self.n_heads, d_head)
#             .transpose(1, 2)
#             .reshape(batch_size * self.n_heads, n_tokens, d_head)
#         )

#     def forward(
#         self,
#         x_q: Tensor, #(batch_size, num_tokens, d)
#         x_kv: Tensor
#     ) -> Tensor:
#         # linear projection
#         q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
#         for tensor in [q, k, v]:
#             assert tensor.shape[-1] % self.n_heads == 0

#         batch_size = len(q)
#         d_head_key = k.shape[-1] // self.n_heads
#         d_head_value = v.shape[-1] // self.n_heads
#         n_q_tokens = q.shape[1]

#         q = self._reshape(q)
#         k = self._reshape(k)
#         attention = F.softmax(q @ k.transpose(1, 2) / math.sqrt(d_head_key), dim=-1)
#         if self.dropout is not None:
#             attention = self.dropout(attention)
#         x = attention @ self._reshape(v)
#         x = (
#             x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value)
#             .transpose(1, 2)
#             .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value)
#         )
#         if self.W_out is not None:
#             x = self.W_out(x)
#         return x


class MultiheadAttention(nn.Module):
    def __init__(
        self, d: int, n_heads: int, dropout: float = 0.0, initialization: str = "default", kv_compression: Optional[float] = None
    ) -> None:
        """
        Multi-head attention mechanism.

        Args:
            d (int): Dimensionality of the input features.
            n_heads (int): Number of attention heads.
            dropout (float): Dropout probability. Defaults to 0.0.
            initialization (str): Initialization method for the linear layers. Defaults to "default".
        """
        super().__init__()

        # Ensure d is divisible by n_heads
        assert n_heads > 0 and d % n_heads == 0, "d must be divisible by n_heads."

        self.n_heads = n_heads
        self.d_head = d // n_heads

        # Define linear transformations
        self.W_q = nn.Linear(d, d)
        self.W_k = nn.Linear(d, d)
        self.W_v = nn.Linear(d, d)
        self.W_out = nn.Linear(d, d) if n_heads > 1 else None
        
        if kv_compression:
            self.key_compression = nn.Linear(d, int(d * kv_compression), bias=False)
            self.value_compression = nn.Linear(d, int(d * kv_compression), bias=False)

        # Dropout layer
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

        # Initialize weights based on the initialization method
        self._initialize_weights(initialization)

    def _initialize_weights(self, initialization: str) -> None:
        """Initialize weights based on the specified method."""
        layers = [self.W_q, self.W_k, self.W_v, self.W_out]
        if self.kv_compression:
            layers += [self.key_compression, self.value_compression]
            
        if initialization == "xavier":
            for layer in layers:
                if layer is not None:
                    nn.init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        nn.init.zeros_(layer.bias)
        elif initialization == "kaiming":
            for layer in [self.W_q, self.W_k, self.W_v, self.W_out]:
                if layer is not None:
                    nn.init.kaiming_uniform_(layer.weight, a=math.sqrt(5))
                    if layer.bias is not None:
                        nn.init.zeros_(layer.bias)
        else:  # Default initialization
            for layer in [self.W_q, self.W_k, self.W_v, self.W_out]:
                if layer is not None:
                    nn.init.kaiming_uniform_(layer.weight, a=math.sqrt(5))

    def _reshape(self, x: Tensor) -> Tensor:
        """Reshape tensor for multi-head attention.

        Input:
            x: Tensor of shape (batch_size, n_tokens, d)

        Output:
            Tensor of shape (batch_size * n_heads, n_tokens, d_head)
        """
        batch_size, n_tokens, d = x.shape
        return (
            x.reshape(batch_size, n_tokens, self.n_heads, self.d_head) 
            .transpose(1, 2)  # (batch_size, n_heads, n_tokens, d_head)
            .reshape(batch_size * self.n_heads, n_tokens, self.d_head)  # (batch_size * n_heads, n_tokens, d_head)
        )

    def forward(self, x_q: Tensor, x_kv: Tensor) -> Tensor:
        """
        Forward pass of multi-head attention.

        Args:
            x_q (Tensor): Query tensor of shape (batch_size, num_tokens, d).
            x_kv (Tensor): Key and value tensor of shape (batch_size, num_tokens, d).

        Returns:
            Tensor: Output tensor of shape (batch_size, num_tokens, d).
        """
        # Linear projections
        q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
        # q: (batch_size, n_q_tokens, d). k, v: (batch_size, num_tokens, d)
        
        # reduce dimension of key and value if it's too long
        if self.kv_compression:
            k = self.key_compression(k)
            v = self.value_compression(v)

        # Ensure dimensions are divisible by number of heads
        for tensor in [q, k, v]:
            assert tensor.shape[-1] % self.n_heads == 0, "Input dimensions must be divisible by n_heads."

        batch_size = q.size(0)
        n_q_tokens = q.size(1)

        # Reshape for multi-head attention
        q = self._reshape(q)  # (batch_size * n_heads, n_q_tokens, d_head)
        k = self._reshape(k)  # (batch_size * n_heads, num_tokens, d_head)
        v = self._reshape(v)  # (batch_size * n_heads, num_tokens, d_head)

        # Scaled dot-product attention
        attention_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_head)  # (batch_size * n_heads, n_q_tokens, num_tokens)
        attention_weights = F.softmax(attention_scores, dim=-1)  # (batch_size * n_heads, n_q_tokens, num_tokens)

        # Apply dropout to attention weights if specified
        if self.dropout is not None:
            attention_weights = self.dropout(attention_weights)

        # Compute attention output
        attention_output = attention_weights @ v  # (batch_size * n_heads, n_q_tokens, d_head)
        
        # Reshape back to original dimensions
        attention_output = (
            attention_output
            .reshape(batch_size, self.n_heads, n_q_tokens, self.d_head)  # (batch_size, n_heads, n_q_tokens, d_head)
            .transpose(1, 2)  # (batch_size, n_q_tokens, n_heads, d_head)
            .reshape(batch_size, n_q_tokens, -1)  # (batch_size, n_q_tokens, d)
        )

        # Apply final linear transformation if applicable
        if self.W_out is not None:
            attention_output = self.W_out(attention_output)  # (batch_size, n_q_tokens, d)

        return attention_output


In [65]:
def reglu(x: torch.Tensor) -> torch.Tensor:
    """ReGLU activation function."""
    a, b = x.chunk(2, dim=-1)
    return a * F.relu(b)

def geglu(x: torch.Tensor) -> torch.Tensor:
    """GeGLU activation function."""
    a, b = x.chunk(2, dim=-1)
    return a * F.gelu(b)

def get_activation_fn(name: str) -> Callable[[torch.Tensor], torch.Tensor]:
    """Retrieve the activation function by name."""
    activation_functions = {
        'reglu': reglu,
        'geglu': geglu,
        'sigmoid': torch.sigmoid,
    }

    if name in activation_functions:
        return activation_functions[name]

    if hasattr(F, name):
        return getattr(F, name)

    raise ValueError(f"Unsupported activation function: {name}")

def get_nonglu_activation_fn(name: str) -> Callable[[torch.Tensor], torch.Tensor]:
    """Retrieve the non-GLU activation function by name."""
    nonglu_mappings = {
        'reglu': F.relu,
        'geglu': F.gelu,
    }

    return nonglu_mappings.get(name, get_activation_fn(name))

In [29]:
# class Transformer(nn.Module):
#     """Transformer.

#     References:
#     - https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
#     - https://github.com/facebookresearch/pytext/tree/master/pytext/models/representations/transformer
#     - https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L19
#     """

#     def __init__(
#         self,
#         *,
#         # tokenizer
#         d_numerical: int,
#         categories: Optional[List[int]],
#         token_bias: bool,
#         # transformer
#         n_layers: int,
#         d_token: int,
#         n_heads: int,
#         d_ffn_factor: float,
#         attention_dropout: float,
#         ffn_dropout: float,
#         residual_dropout: float,
#         activation: str,
#         prenormalization: bool,
#         initialization: str,
#         # linformer
#         kv_compression: Optional[float],
#         kv_compression_sharing: Optional[str],
#         #
#         d_out: int,
#     ) -> None:
#         assert (kv_compression is None) ^ (kv_compression_sharing is not None)

#         super().__init__()
#         self.tokenizer = Tokenizer(d_numerical, categories, d_token, token_bias)
#         n_tokens = self.tokenizer.n_tokens

#         def make_kv_compression():
#             assert kv_compression
#             compression = nn.Linear(
#                 n_tokens, int(n_tokens * kv_compression), bias=False
#             )
#             if initialization == 'xavier':
#                 nn_init.xavier_uniform_(compression.weight)
#             return compression

#         self.shared_kv_compression = (
#             make_kv_compression()
#             if kv_compression and kv_compression_sharing == 'layerwise'
#             else None
#         )

#         def make_normalization():
#             return nn.LayerNorm(d_token)

#         d_hidden = int(d_token * d_ffn_factor)
#         self.layers = nn.ModuleList([])
#         for layer_idx in range(n_layers):
#             layer = nn.ModuleDict(
#                 {
#                     'attention': MultiheadAttention(
#                         d_token, n_heads, attention_dropout, initialization
#                     ),
#                     'linear0': nn.Linear(
#                         d_token, d_hidden * (2 if activation.endswith('glu') else 1)
#                     ),
#                     'linear1': nn.Linear(d_hidden, d_token),
#                     'norm1': make_normalization(),
#                 }
#             )
#             if not prenormalization or layer_idx:
#                 layer['norm0'] = make_normalization()
#             if kv_compression and self.shared_kv_compression is None:
#                 layer['key_compression'] = make_kv_compression()
#                 if kv_compression_sharing == 'headwise':
#                     layer['value_compression'] = make_kv_compression()
#                 else:
#                     assert kv_compression_sharing == 'key-value'
#             self.layers.append(layer)

#         self.activation = get_activation_fn(activation)
#         self.last_activation = get_nonglu_activation_fn(activation)
#         self.prenormalization = prenormalization
#         self.last_normalization = make_normalization() if prenormalization else None
#         self.ffn_dropout = ffn_dropout
#         self.residual_dropout = residual_dropout
#         self.head = nn.Linear(d_token, d_out)

#     def _get_kv_compressions(self, layer):
#         return (
#             (self.shared_kv_compression, self.shared_kv_compression)
#             if self.shared_kv_compression is not None
#             else (layer['key_compression'], layer['value_compression'])
#             if 'key_compression' in layer and 'value_compression' in layer
#             else (layer['key_compression'], layer['key_compression'])
#             if 'key_compression' in layer
#             else (None, None)
#         )

#     def _start_residual(self, x, layer, norm_idx):
#         x_residual = x
#         if self.prenormalization:
#             norm_key = f'norm{norm_idx}'
#             if norm_key in layer:
#                 x_residual = layer[norm_key](x_residual)
#         return x_residual

#     def _end_residual(self, x, x_residual, layer, norm_idx):
#         if self.residual_dropout:
#             x_residual = F.dropout(x_residual, self.residual_dropout, self.training)
#         x = x + x_residual
#         if not self.prenormalization:
#             x = layer[f'norm{norm_idx}'](x)
#         return x

#     def forward(self, x_num: Tensor, x_cat: Optional[Tensor]) -> Tensor:
#         x = self.tokenizer(x_num, x_cat)

#         for layer_idx, layer in enumerate(self.layers):
#             is_last_layer = layer_idx + 1 == len(self.layers)
#             layer = cast(Dict[str, nn.Module], layer)

#             # start residual connection
#             x_residual = self._start_residual(x, layer, 0)
            
#             # attention layer
#             x_residual = layer['attention'](
#                 # for the last attention, it is enough to process only [CLS]
#                 (x_residual[:, :1] if is_last_layer else x_residual),
#                 x_residual,
#                 *self._get_kv_compressions(layer),
#             )
            
#             # end residual connection
#             if is_last_layer:
#                 x = x[:, : x_residual.shape[1]]
#             x = self._end_residual(x, x_residual, layer, 0)

#             # feedforward network
#             x_residual = self._start_residual(x, layer, 1)
#             x_residual = layer['linear0'](x_residual)
#             x_residual = self.activation(x_residual)
#             if self.ffn_dropout:
#                 x_residual = F.dropout(x_residual, self.ffn_dropout, self.training)
#             x_residual = layer['linear1'](x_residual)
#             x = self._end_residual(x, x_residual, layer, 1)

#         # final layer processing
#         assert x.shape[1] == 1
#         x = x[:, 0]
#         if self.last_normalization is not None:
#             x = self.last_normalization(x)
#         x = self.last_activation(x)
#         x = self.head(x)
#         x = x.squeeze(-1)
#         return x

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, List, Dict

class Transformer(nn.Module):
    """Transformer implementation with support for optional Linformer compression."""

    def __init__(
        self,
        *,
        d_numerical: int,
        categories: Optional[List[int]],
        token_bias: bool,
        n_layers: int,
        d_token: int,
        n_heads: int,
        d_ffn_factor: float,
        attention_dropout: float,
        ffn_dropout: float,
        residual_dropout: float,
        activation: str,
        prenormalization: bool,
        initialization: str,
        kv_compression: Optional[float],
        kv_compression_sharing: Optional[str],
        d_out: int,
    ) -> None:
        super().__init__()

        # Validate kv_compression settings
        if (kv_compression is None) != (kv_compression_sharing is None):
            raise ValueError("Both or neither of kv_compression and kv_compression_sharing must be set.")

        self.tokenizer = Tokenizer(d_numerical, categories, d_token, token_bias)
        n_tokens = self.tokenizer.n_tokens

        # kv_compression parameter introduces a linear transformation that compresses the key and value matrices to a smaller dimension
        # Layerwise: A single shared compression matrix is used across all layers.
        self.shared_kv_compression = (
            self._make_kv_compression(n_tokens, kv_compression, initialization)
            if kv_compression and kv_compression_sharing == 'layerwise'
            else None
        )

        d_hidden = int(d_token * d_ffn_factor)
        self.layers = nn.ModuleList([
            self._make_layer(
                d_token,
                d_hidden,
                n_heads,
                attention_dropout,
                activation,
                prenormalization,
                initialization,
                n_tokens,
                kv_compression,
                kv_compression_sharing
            )
            for _ in range(n_layers)
        ])

        self.activation = get_activation_fn(activation)
        
        # use non-glu activation function in the last layer for simplicity
        self.last_activation = get_nonglu_activation_fn(activation)
        self.prenormalization = prenormalization
        self.last_normalization = nn.LayerNorm(d_token) if prenormalization else None
        self.ffn_dropout = ffn_dropout
        self.residual_dropout = residual_dropout
        self.head = nn.Linear(d_token, d_out)

    def _make_kv_compression(self, n_tokens, kv_compression, initialization):
        """Create a key-value compression layer."""
        compression = nn.Linear(n_tokens, int(n_tokens * kv_compression), bias=False)
        if initialization == 'xavier':
            nn.init.xavier_uniform_(compression.weight)
        return compression

    def _make_layer(
        self,
        d_token,
        d_hidden,
        n_heads,
        attention_dropout,
        activation,
        prenormalization,
        initialization,
        n_tokens,
        kv_compression,
        kv_compression_sharing
    ) -> nn.ModuleDict:
        """Construct a Transformer layer."""
        layer = nn.ModuleDict({
            'attention': MultiheadAttention(
                d_token, n_heads, attention_dropout, initialization, kv_compression
            ),
            'linear0': nn.Linear(
                d_token, d_hidden * (2 if activation.endswith('glu') else 1)
            ),
            'linear1': nn.Linear(d_hidden, d_token),
            'norm1': nn.LayerNorm(d_token),
        })

        if not prenormalization:
            layer['norm0'] = nn.LayerNorm(d_token)

        if kv_compression and kv_compression_sharing != 'layerwise':
            layer['key_compression'] = self._make_kv_compression(n_tokens, kv_compression, initialization)
            if kv_compression_sharing == 'headwise':
                layer['value_compression'] = self._make_kv_compression(n_tokens, kv_compression, initialization)

        return layer

    def forward(self, x_num: torch.Tensor, x_cat: Optional[torch.Tensor]) -> torch.Tensor:
        """Forward pass of the Transformer model."""
        x = self.tokenizer(x_num, x_cat)

        for layer_idx, layer in enumerate(self.layers):
            is_last_layer = layer_idx + 1 == len(self.layers)

            # Start residual connection
            x_residual = self._start_residual(x, layer, 0)

            # Attention
            x_residual = layer['attention'](
                x_residual[:, :1] if is_last_layer else x_residual,
                x_residual
            )

            # End residual connection
            x = self._end_residual(x, x_residual, layer, 0)

            # Feedforward
            x_residual = self._start_residual(x, layer, 1)
            x_residual = self.activation(layer['linear0'](x_residual))

            if self.ffn_dropout:
                x_residual = F.dropout(x_residual, self.ffn_dropout, self.training)

            x_residual = layer['linear1'](x_residual)
            x = self._end_residual(x, x_residual, layer, 1)

        # Final normalization and activation
        x = x[:, 0]
        if self.last_normalization is not None:
            x = self.last_normalization(x)
        x = self.last_activation(x)

        return self.head(x)

    def _get_kv_compressions(self, layer):
        """Retrieve key-value compression modules for a layer."""
        return (
            (self.shared_kv_compression, self.shared_kv_compression)
            if self.shared_kv_compression
            else (
                layer.get('key_compression', None),
                layer.get('value_compression', layer.get('key_compression', None))
            )
        )

    def _start_residual(self, x, layer, norm_idx):
        """Start a residual connection."""
        if self.prenormalization:
            norm_key = f'norm{norm_idx}'
            return layer[norm_key](x) if norm_key in layer else x
        return x

    def _end_residual(self, x, x_residual, layer, norm_idx):
        """End a residual connection."""
        if self.residual_dropout:
            x_residual = F.dropout(x_residual, self.residual_dropout, self.training)
        x = x + x_residual
        if not self.prenormalization:
            x = layer[f'norm{norm_idx}'](x)
        return x


In [30]:
EPOCHS = 50

In [31]:
[len(le.classes_) for le in label_encoders.values()]

[16, 7, 27, 13, 11, 19, 51, 3, 15, 2, 2]

In [32]:
# empty cache first
torch.cuda.empty_cache()

In [33]:
def objective(trial):
    # Generate the model.
    # model = define_model(trial, train_features.shape[1], 1).to(DEVICE)
    # Define out_features_list
    n_heads = trial.suggest_int("n_heads", 1, 10)
    
    n_layers = trial.suggest_int("n_layers", 1, 5)
    
    token_multiplier = trial.suggest_int("token_multiplier", 5, 30)  # Adjust the range as necessary
    
    # Suggest an integer for a that is divisible by b
    d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)
    
    attention_dropout = trial.suggest_float("attention_dropout", 0, 0.5)
    d_ffn_factor = trial.suggest_float("d_ffn_factor", 1, 3)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0, 0.5)
    
    # activation = trial.suggest_categorical("activation", choices=['relu', 'reglu', 'geglu'])
    # batch_size = trial.suggest_int('batch_size', 16, 128, step=16)
    
    categories = [len(le.classes_) for le in label_encoders.values()]
    d_numerical = len(non_category_features)

    args = {'activation': 'relu', #activation, #
    'attention_dropout': attention_dropout,
    'd_ffn_factor': d_ffn_factor,
    'd_token': d_token,
    'ffn_dropout': ffn_dropout,
    'initialization': 'kaiming',
    'n_heads': n_heads,
    'n_layers': n_layers,
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    "kv_compression_sharing": None,
    'token_bias': True,
    'd_out': 1
    }

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-9, 1e-1, log=True)

    # training with 5-fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    val_losses = []

    for train_idx, val_idx in kf.split(train_features):
        # Create training and validation datasets for the current fold
        X_train_fold, X_val_fold = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train_fold, y_val_fold = train_labels.iloc[train_idx], train_labels.iloc[val_idx]
        
        # scaling features
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
            
        # Initialize the model for this fold
        model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)
        model = nn.DataParallel(model, device_ids = DEVICE_LIST)
        model.to(DEVICE)
        
        # define optimizer
        if optimizer_name == "Adam":
         optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            momentum = trial.suggest_float("momentum", 1e-9, 0.95, log=True)
            optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
        
        # Define the loss function and optimizer
        criterion = nn.MSELoss()
        
        # Prepare DataLoader for training
        train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
        val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
            
        # Training of the model.
        model.train()
        for epoch in range(EPOCHS):
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)

                optimizer.zero_grad()
                output = model(X_num, X_cat)

                # print("shape", output.shape, target.shape)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

        # Validation of the model.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
                output = model(X_num, X_cat)
                val_loss = criterion(output, target).item()
                val_losses.append(val_loss**0.5) #rmse

        trial.report(val_loss, epoch)

    # Return the average validation loss across all folds
    return np.mean(val_losses)

In [34]:
# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-08-27 03:59:38,680] A new study created in memory with name: no-name-e5f08483-18f9-496d-961d-f3f73e36a323


self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu


  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:03:49,651] Trial 0 finished with value: 0.2778907221754452 and parameters: {'n_heads': 5, 'n_layers': 3, 'token_multiplier': 13, 'd_token': 5, 'attention_dropout': 0.47357853391284443, 'd_ffn_factor': 1.5567463702685527, 'ffn_dropout': 0.09703116655822941, 'optimizer': 'RMSprop', 'lr': 0.00045523806322604355, 'weight_decay': 5.01534556567381e-07, 'momentum': 8.456152924489299e-07}. Best is trial 0 with value: 0.2778907221754452.
  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:07:01,453] Trial 1 finished with value: 0.3261027943937555 and parameters: {'n_heads': 10, 'n_layers': 3, 'token_multiplier': 13, 'd_token': 40, 'attention_dropout': 0.11634236810833715, 'd_ffn_factor': 1.4151651199217234, 'ffn_dropout': 0.20928743701569774, 'optimizer': 'Adam', 'lr': 0.009410155719751772, 'weight_decay': 0.09667794268657083}. Best is trial 0 with value: 0.2778907221754452.
  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:10:04,882] Trial 2 finished with value: 0.2516675406673338 and parameters: {'n_heads': 5, 'n_layers': 2, 'token_multiplier': 20, 'd_token': 60, 'attention_dropout': 0.09192629672301816, 'd_ffn_factor': 2.565011542335736, 'ffn_dropout': 0.07564072635765423, 'optimizer': 'RMSprop', 'lr': 0.00022923558072776028, 'weight_decay': 7.433796546124456e-05, 'momentum': 2.161429700687457e-05}. Best is trial 2 with value: 0.2516675406673338.


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2516675406673338
  Params: 
    n_heads: 5
    n_layers: 2
    token_multiplier: 20
    d_token: 60
    attention_dropout: 0.09192629672301816
    d_ffn_factor: 2.565011542335736
    ffn_dropout: 0.07564072635765423
    optimizer: RMSprop
    lr: 0.00022923558072776028
    weight_decay: 7.433796546124456e-05
    momentum: 2.161429700687457e-05


In [35]:
trial.params

{'n_heads': 5,
 'n_layers': 2,
 'token_multiplier': 20,
 'd_token': 60,
 'attention_dropout': 0.09192629672301816,
 'd_ffn_factor': 2.565011542335736,
 'ffn_dropout': 0.07564072635765423,
 'optimizer': 'RMSprop',
 'lr': 0.00022923558072776028,
 'weight_decay': 7.433796546124456e-05,
 'momentum': 2.161429700687457e-05}

In [36]:
MODEL_CONFIG = {"model": {}, "optimizer": {}}

for key, value in trial.params.items():
    if key in ['lr', 'weight_decay', 'momentum', 'weight_decay', 'optimizer']:
        MODEL_CONFIG["optimizer"][key] = value
    elif key == 'token_multiplier':
        continue
    elif key == 'batch_size':
        BATCH_SIZE = value
    else:
        # adj_key = key.rpartition('_')[0]
        MODEL_CONFIG["model"][key] = value

In [37]:
MODEL_CONFIG

{'model': {'n_heads': 5,
  'n_layers': 2,
  'd_token': 60,
  'attention_dropout': 0.09192629672301816,
  'd_ffn_factor': 2.565011542335736,
  'ffn_dropout': 0.07564072635765423},
 'optimizer': {'optimizer': 'RMSprop',
  'lr': 0.00022923558072776028,
  'weight_decay': 7.433796546124456e-05,
  'momentum': 2.161429700687457e-05}}

RUNNING THE MODEL

In [38]:
# empty cache first
torch.cuda.empty_cache()

In [39]:
# Transform training data
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

new_feature_list = non_category_features + category_features

In [40]:
# Create dataset instances
train_dataset = CustomDataset(train_features, train_labels.to_numpy())
test_dataset = CustomDataset(test_features, test_labels.to_numpy())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_features.shape[0], shuffle=False)

In [41]:
categories = [len(le.classes_) for le in label_encoders.values()]
d_numerical = len(non_category_features)
# d_token = 32  # Example token dimension
# token_bias = True

args = {
  'initialization': 'kaiming',
  'activation': 'relu',
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    "kv_compression_sharing": None,
    'token_bias': True,
    'd_out': 1
}

args.update(MODEL_CONFIG["model"])
model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)

self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


In [42]:
model

Transformer(
  (tokenizer): Tokenizer(
    (category_embeddings): Embedding(166, 60)
  )
  (layers): ModuleList(
    (0-1): 2 x ModuleDict(
      (attention): MultiheadAttention(
        (W_q): Linear(in_features=60, out_features=60, bias=True)
        (W_k): Linear(in_features=60, out_features=60, bias=True)
        (W_v): Linear(in_features=60, out_features=60, bias=True)
        (W_out): Linear(in_features=60, out_features=60, bias=True)
        (dropout): Dropout(p=0.09192629672301816, inplace=False)
      )
      (linear0): Linear(in_features=60, out_features=153, bias=True)
      (linear1): Linear(in_features=153, out_features=60, bias=True)
      (norm1): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
      (norm0): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head): Linear(in_features=60, out_features=1, bias=True)
)

In [43]:
if DEVICE != "cpu":
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
model.to(DEVICE)

DataParallel(
  (module): Transformer(
    (tokenizer): Tokenizer(
      (category_embeddings): Embedding(166, 60)
    )
    (layers): ModuleList(
      (0-1): 2 x ModuleDict(
        (attention): MultiheadAttention(
          (W_q): Linear(in_features=60, out_features=60, bias=True)
          (W_k): Linear(in_features=60, out_features=60, bias=True)
          (W_v): Linear(in_features=60, out_features=60, bias=True)
          (W_out): Linear(in_features=60, out_features=60, bias=True)
          (dropout): Dropout(p=0.09192629672301816, inplace=False)
        )
        (linear0): Linear(in_features=60, out_features=153, bias=True)
        (linear1): Linear(in_features=153, out_features=60, bias=True)
        (norm1): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
        (norm0): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
      )
    )
    (head): Linear(in_features=60, out_features=1, bias=True)
  )
)

In [44]:
# define optimizer
optim_config = deepcopy(MODEL_CONFIG["optimizer"])
del optim_config["optimizer"]

optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
optimizer

RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    differentiable: False
    eps: 1e-08
    foreach: None
    lr: 0.00022923558072776028
    maximize: False
    momentum: 2.161429700687457e-05
    weight_decay: 7.433796546124456e-05
)

In [45]:
EPOCH = 500
criterion = nn.MSELoss()
start_time = time.time()

for ep in tqdm(range(EPOCH)):

    model.train()
    running_loss = 0.0
    for i, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        X_num = data[:, :len(non_category_features)].to(DEVICE)
        X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X_num, X_cat)

        loss = criterion(outputs, target)
        
        loss.backward()

        optimizer.step()

        # print statistics
        running_loss += loss.item() * target.size(0)
        if i % 100 == 99:    # print every 100 mini-batches
            print(f'[{ep + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')


    train_loss = running_loss  / len(train_loader.dataset)
    
train_loss = running_loss  / len(train_loader.dataset)
print(f'Epoch [{ep+1}], Train Loss: {train_loss**0.5:.4f}')

# print out training time
elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

100%|██████████| 500/500 [08:35<00:00,  1.03s/it]

Epoch [500], Train Loss: 0.1137
Training time: 515.221 seconds





In [46]:
# empty cache first
torch.cuda.empty_cache()

In [47]:
# Testing phase
model.eval()
test_loss = 0.0

with torch.no_grad():
    i = 0
    for data, target in test_loader:
        # inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        X_num = data[:, :len(non_category_features)]
        X_cat = data[:, -len(category_features):].detach().long()
        
        if isinstance(model, nn.DataParallel):
            model = model.module  # Unwrap from DataParallel
        model = model.to('cpu')
            
               
        outputs = model(X_num, X_cat)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
        print(f"Training average mean absolute error: {mae}")
        print(f"Training average mean absolute percentage error: {mape}")
        print(f"Training average root mean squared error: {rmse}")
        print(f"Training average R2: {rsqr}")

Training average mean absolute error: 0.15283623337745667
Training average mean absolute percentage error: 312.3091459274292
Training average root mean squared error: 0.23530429349573925
Training average R2: 0.516621470451355


In [48]:
# let’s load back in our saved model
# model = MLP()
# model.load_state_dict(torch.load(MODEL_PATH))

5 fold CV

In [49]:
features.shape

(1725, 164)

In [50]:
labels.shape

(1725,)

In [51]:
features.dtypes

currency                        int64
seniorioty_adj                  int64
coupon rate                   float64
domicile_country                int64
exchange_country                int64
                               ...   
PD_60_pd                      float64
DTD                           float64
NI_Over_TA                    float64
Size                          float64
defaulted_in_last_6_months      int64
Length: 164, dtype: object

In [52]:
# Define cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
EPOCHS = 500
val_mae = []
val_mape = []
val_rmse = []
val_rsqr = []


for train_idx, val_idx in kf.split(features):
    # Create training and validation datasets for the current fold
    X_train_fold, X_val_fold = features.iloc[train_idx], features.iloc[val_idx]
    y_train_fold, y_val_fold = labels.iloc[train_idx], labels.iloc[val_idx]
    
    # scaling features
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
        
    # Initialize the model for this fold
    model = Transformer(d_numerical=d_numerical, categories=categories, **args)
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
    model.to(DEVICE)
    
    # define optimizer
    optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
    
    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    
    # Prepare DataLoader for training
    train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
    val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=val_dataset.features.shape[0], shuffle=True)
        
    # Training of the model.
    model.train()
    for epoch in range(EPOCHS):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(DEVICE), target.to(DEVICE)
  
            X_num = data[:, :len(non_category_features)].to(DEVICE)
            X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
            
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(X_num, X_cat)

            
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            # print(f'Epoch [{ep+1}], Train Loss: {train_loss:.4f}')

    # Validation of the model.
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_loader):
            # data, target = data.to(DEVICE), target.to(DEVICE)
            X_num = data[:, :len(non_category_features)]
            X_cat = data[:, -len(category_features):].detach().long()
            
            if isinstance(model, nn.DataParallel):
                model = model.module  # Unwrap from DataParallel
            model = model.to('cpu')
            outputs = model(X_num, X_cat)
            
            # save metrics
            mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
            val_mae.append(mae)
            val_mape.append(mape)
            val_rmse.append(rmse)
            val_rsqr.append(rsqr)

self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


In [53]:
print(f"Test average mean absolute error: {statistics.mean(val_mae)}")
print(f"Test average mean absolute percentage error: {statistics.mean(val_mape)}")
print(f"Test average root mean squared error: {statistics.mean(val_rmse)}")
print(f"Test average R2: {statistics.mean(val_rsqr)}")

Test average mean absolute error: 0.16323910653591156
Test average mean absolute percentage error: 5443.055862188339
Test average root mean squared error: 0.24957806415090034
Test average R2: 0.42442349195480344
