In [1]:
import math
from typing import List, Optional, Callable, cast, Dict
import sys
import os
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from copy import deepcopy
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
import optuna
from optuna.trial import TrialState
# import typing as ty
from torch import Tensor
import torch.nn.init as nn_init
import statistics
from sklearn.model_selection import KFold
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

In [3]:
from utils import calculate_metric

In [4]:
INPUT_FILE = "../data/data_removing_na.xlsx"

MODEL_PATH = 'best_transformer_model.pth'
BATCH_SIZE = 32

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'
DEVICE_LIST = [0, 1, 3]

In [5]:
DEVICE

device(type='cuda', index=0)

In [6]:
df = pd.read_excel(INPUT_FILE)

In [7]:
df.head()

Unnamed: 0,rr1_30,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
0,0.259908,USD,Senior Subordinated Unsecured,9.0,United States,United States,Consumer Discretionary,Retail & Whsle - Discretionary,E-Commerce Discretionary,Bankruptcy Filing,...,0.396731,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,False
1,0.032729,USD,Senior Subordinated Unsecured,5.75,United States,United States,Health Care,Health Care,Health Care Facilities & Svcs,Default Corp Action,...,0.957454,0.957467,0.95748,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,False
2,0.9724,USD,Unsecured,5.675,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
3,1.047416,CHF,Unsecured,0.125,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
4,0.848872,JPY,Unsecured,1.75,Japan,Japan,Industrials,Industrial Products,Electrical Equipment,Bankruptcy Filing,...,0.130285,0.130688,0.131081,0.131465,0.13184,0.132206,-0.768857,-0.028058,-1.946507,False


In [8]:
df.shape

(1725, 165)

In [9]:
labels = df['rr1_30']
features = df.drop(columns='rr1_30')

In [10]:
features.dtypes

currency                       object
seniorioty_adj                 object
coupon rate                   float64
domicile_country               object
exchange_country               object
                               ...   
PD_60_pd                      float64
DTD                           float64
NI_Over_TA                    float64
Size                          float64
defaulted_in_last_6_months       bool
Length: 164, dtype: object

In [11]:
feature_list = features.columns
category_features = list(features.select_dtypes(include=['object', 'bool']).columns)
non_category_features = [i for i in feature_list if i not in category_features]

In [12]:
print(len(non_category_features))
print(len(category_features))

153
11


In [13]:
category_features

['currency',
 'seniorioty_adj',
 'domicile_country',
 'exchange_country',
 'Industry_sector',
 'Industry_group',
 'Industry_subgroup',
 'event_type',
 'event_type_subcategory_sum',
 'defaulted_in_last_5_years',
 'defaulted_in_last_6_months']

In [14]:
# transform categorical features in features
label_encoders = {}
mappings = {}

for column in category_features:
    le = LabelEncoder()
    features[column] = le.fit_transform(features[column])
    label_encoders[column] = le
    mappings[column] = {index: label for index, label in enumerate(le.classes_)}

In [15]:
mappings

{'currency': {0: 'CAD',
  1: 'CHF',
  2: 'CNY',
  3: 'EUR',
  4: 'GBP',
  5: 'HKD',
  6: 'INR',
  7: 'ISK',
  8: 'JPY',
  9: 'MYR',
  10: 'NOK',
  11: 'SEK',
  12: 'SGD',
  13: 'THB',
  14: 'TWD',
  15: 'USD'},
 'seniorioty_adj': {0: 'Junior Unsecured or Junior Subordinated Unsecured',
  1: 'Secured',
  2: 'Senior Secured',
  3: 'Senior Subordinated Unsecured',
  4: 'Senior Unsecured',
  5: 'Subordinated Unsecured',
  6: 'Unsecured'},
 'domicile_country': {0: 'Argentina',
  1: 'Australia',
  2: 'Bahamas',
  3: 'Belgium',
  4: 'Bermuda',
  5: 'Canada',
  6: 'Cayman Islands',
  7: 'China',
  8: 'Czech Republic',
  9: 'Greece',
  10: 'Hong Kong',
  11: 'Iceland',
  12: 'India',
  13: 'Indonesia',
  14: 'Japan',
  15: 'Luxembourg',
  16: 'Malaysia',
  17: 'Mongolia',
  18: 'Philippines',
  19: 'Poland',
  20: 'Singapore',
  21: 'South Africa',
  22: 'South Korea',
  23: 'Taiwan',
  24: 'Thailand',
  25: 'United Kingdom',
  26: 'United States'},
 'exchange_country': {0: 'Australia',
  1: 'C

In [16]:
features

Unnamed: 0,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,event_type_subcategory_sum,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
0,15,3,9.000,26,12,1,14,12,0,1,...,0.396731,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,0
1,15,3,5.750,26,12,5,4,20,1,5,...,0.957454,0.957467,0.957480,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,0
2,15,6,5.675,22,9,1,14,50,1,6,...,0.568169,0.568693,0.569197,0.569682,0.570150,0.570600,-1.853366,0.000191,1.053677,0
3,1,6,0.125,22,9,1,14,50,1,6,...,0.568169,0.568693,0.569197,0.569682,0.570150,0.570600,-1.853366,0.000191,1.053677,0
4,8,6,1.750,14,5,6,5,14,0,13,...,0.130285,0.130688,0.131081,0.131465,0.131840,0.132206,-0.768857,-0.028058,-1.946507,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,9,2,5.000,16,6,1,1,2,0,9,...,0.081004,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0
1721,9,2,4.950,16,6,1,1,2,0,9,...,0.081004,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0
1722,9,2,5.150,16,6,1,1,2,0,9,...,0.081004,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0
1723,9,2,5.050,16,6,1,1,2,0,9,...,0.081004,0.081844,0.082676,0.083500,0.084315,0.085123,0.954865,0.000425,2.497169,0


In [17]:
# split data into training and test set
test_size = 0.25
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=test_size, random_state=42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1293, 164)
Training Labels Shape: (1293,)
Testing Features Shape: (432, 164)
Testing Labels Shape: (432,)


In [18]:
# Normalize the data
# Prepare the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), non_category_features)   # StandardScaler()
    ],
    remainder='passthrough'  # Leave categorical features untouched
)

In [22]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

TUNING PARAMETERS

In [26]:
class Tokenizer(nn.Module):
    category_offsets: Optional[Tensor]

    def __init__(
        self,
        d_numerical: int,
        categories: Optional[List[int]],
        d_token: int,
        bias: bool,
    ) -> None:
        super().__init__()
        if categories is None:
            d_bias = d_numerical
            self.category_offsets = None
            self.category_embeddings = None
        else:
            d_bias = d_numerical + len(categories)
            
            # to ensure proper indexing for embeddings
            category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
            self.register_buffer('category_offsets', category_offsets)
            
            # sum(categories) tensors of size d_token
            self.category_embeddings = nn.Embedding(sum(categories), d_token)
            
            # initialize the embeddings
            nn_init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5))
            print(f'{self.category_embeddings.weight.shape=}')

        # take [CLS] token into account
        self.weight = nn.Parameter(Tensor(d_numerical + 1, d_token))
        self.bias = nn.Parameter(Tensor(d_bias, d_token)) if bias else None
        # The initialization is inspired by nn.Linear
        nn_init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            nn_init.kaiming_uniform_(self.bias, a=math.sqrt(5))

    @property
    def n_tokens(self) -> int:
        return len(self.weight) + (
            0 if self.category_offsets is None else len(self.category_offsets)
        )

    def forward(self, x_num: Tensor, x_cat: Optional[Tensor]) -> Tensor:
        x_some = x_num if x_cat is None else x_cat
        assert x_some is not None
        
        # if x_cat is not None, x_num = x_cat + x_num
        x_num = torch.cat(
            [torch.ones(len(x_some), 1, device=x_some.device)]  # [CLS]
            + ([] if x_num is None else [x_num]),
            dim=1,
        )
        
        # numerical features are weighted by weights
        x = self.weight[None] * x_num[:, :, None]
        if x_cat is not None:
            x = torch.cat(
                [x, self.category_embeddings(x_cat + self.category_offsets[None])],
                dim=1,
            )
        if self.bias is not None:
            bias = torch.cat(
                [
                    torch.zeros(1, self.bias.shape[1], device=x.device),
                    self.bias,
                ]
            )
            x = x + bias[None]
        return x


In [27]:
class MultiheadAttention(nn.Module):
    def __init__(
        self, d: int, n_heads: int, dropout: float, initialization: str
    ) -> None:
        if n_heads > 1:
            assert d % n_heads == 0
        assert initialization in ['xavier', 'kaiming']

        super().__init__()
        self.W_q = nn.Linear(d, d)
        self.W_k = nn.Linear(d, d)
        self.W_v = nn.Linear(d, d)
        self.W_out = nn.Linear(d, d) if n_heads > 1 else None
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout) if dropout else None

        for m in [self.W_q, self.W_k, self.W_v]:
            if initialization == 'xavier' and (n_heads > 1 or m is not self.W_v):
                # gain is needed since W_qkv is represented with 3 separate layers
                nn_init.xavier_uniform_(m.weight, gain=1 / math.sqrt(2))
            nn_init.zeros_(m.bias)
        if self.W_out is not None:
            nn_init.zeros_(self.W_out.bias)

    def _reshape(self, x: Tensor) -> Tensor:
        batch_size, n_tokens, d = x.shape
        d_head = d // self.n_heads
        return (
            x.reshape(batch_size, n_tokens, self.n_heads, d_head)
            .transpose(1, 2)
            .reshape(batch_size * self.n_heads, n_tokens, d_head)
        )

    def forward(
        self,
        x_q: Tensor,
        x_kv: Tensor,
        key_compression: Optional[nn.Linear],
        value_compression: Optional[nn.Linear],
    ) -> Tensor:
        q, k, v = self.W_q(x_q), self.W_k(x_kv), self.W_v(x_kv)
        for tensor in [q, k, v]:
            assert tensor.shape[-1] % self.n_heads == 0
        if key_compression is not None:
            assert value_compression is not None
            k = key_compression(k.transpose(1, 2)).transpose(1, 2)
            v = value_compression(v.transpose(1, 2)).transpose(1, 2)
        else:
            assert value_compression is None

        batch_size = len(q)
        d_head_key = k.shape[-1] // self.n_heads
        d_head_value = v.shape[-1] // self.n_heads
        n_q_tokens = q.shape[1]

        q = self._reshape(q)
        k = self._reshape(k)
        attention = F.softmax(q @ k.transpose(1, 2) / math.sqrt(d_head_key), dim=-1)
        if self.dropout is not None:
            attention = self.dropout(attention)
        x = attention @ self._reshape(v)
        x = (
            x.reshape(batch_size, self.n_heads, n_q_tokens, d_head_value)
            .transpose(1, 2)
            .reshape(batch_size, n_q_tokens, self.n_heads * d_head_value)
        )
        if self.W_out is not None:
            x = self.W_out(x)
        return x

In [28]:
def get_activation_fn(name: str) -> Callable[[Tensor], Tensor]:
    return (
        reglu
        if name == 'reglu'
        else geglu
        if name == 'geglu'
        else torch.sigmoid
        if name == 'sigmoid'
        else getattr(F, name)
    )


def get_nonglu_activation_fn(name: str) -> Callable[[Tensor], Tensor]:
    return (
        F.relu
        if name == 'reglu'
        else F.gelu
        if name == 'geglu'
        else get_activation_fn(name)
    )
    
def reglu(x: Tensor) -> Tensor:
    a, b = x.chunk(2, dim=-1)
    return a * F.relu(b)


def geglu(x: Tensor) -> Tensor:
    a, b = x.chunk(2, dim=-1)
    return a * F.gelu(b)

In [29]:
class Transformer(nn.Module):
    """Transformer.

    References:
    - https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
    - https://github.com/facebookresearch/pytext/tree/master/pytext/models/representations/transformer
    - https://github.com/pytorch/fairseq/blob/1bba712622b8ae4efb3eb793a8a40da386fe11d0/examples/linformer/linformer_src/modules/multihead_linear_attention.py#L19
    """

    def __init__(
        self,
        *,
        # tokenizer
        d_numerical: int,
        categories: Optional[List[int]],
        token_bias: bool,
        # transformer
        n_layers: int,
        d_token: int,
        n_heads: int,
        d_ffn_factor: float,
        attention_dropout: float,
        ffn_dropout: float,
        residual_dropout: float,
        activation: str,
        prenormalization: bool,
        initialization: str,
        # linformer
        kv_compression: Optional[float],
        kv_compression_sharing: Optional[str],
        #
        d_out: int,
    ) -> None:
        assert (kv_compression is None) ^ (kv_compression_sharing is not None)

        super().__init__()
        self.tokenizer = Tokenizer(d_numerical, categories, d_token, token_bias)
        n_tokens = self.tokenizer.n_tokens

        def make_kv_compression():
            assert kv_compression
            compression = nn.Linear(
                n_tokens, int(n_tokens * kv_compression), bias=False
            )
            if initialization == 'xavier':
                nn_init.xavier_uniform_(compression.weight)
            return compression

        self.shared_kv_compression = (
            make_kv_compression()
            if kv_compression and kv_compression_sharing == 'layerwise'
            else None
        )

        def make_normalization():
            return nn.LayerNorm(d_token)

        d_hidden = int(d_token * d_ffn_factor)
        self.layers = nn.ModuleList([])
        for layer_idx in range(n_layers):
            layer = nn.ModuleDict(
                {
                    'attention': MultiheadAttention(
                        d_token, n_heads, attention_dropout, initialization
                    ),
                    'linear0': nn.Linear(
                        d_token, d_hidden * (2 if activation.endswith('glu') else 1)
                    ),
                    'linear1': nn.Linear(d_hidden, d_token),
                    'norm1': make_normalization(),
                }
            )
            if not prenormalization or layer_idx:
                layer['norm0'] = make_normalization()
            if kv_compression and self.shared_kv_compression is None:
                layer['key_compression'] = make_kv_compression()
                if kv_compression_sharing == 'headwise':
                    layer['value_compression'] = make_kv_compression()
                else:
                    assert kv_compression_sharing == 'key-value'
            self.layers.append(layer)

        self.activation = get_activation_fn(activation)
        self.last_activation = get_nonglu_activation_fn(activation)
        self.prenormalization = prenormalization
        self.last_normalization = make_normalization() if prenormalization else None
        self.ffn_dropout = ffn_dropout
        self.residual_dropout = residual_dropout
        self.head = nn.Linear(d_token, d_out)

    def _get_kv_compressions(self, layer):
        return (
            (self.shared_kv_compression, self.shared_kv_compression)
            if self.shared_kv_compression is not None
            else (layer['key_compression'], layer['value_compression'])
            if 'key_compression' in layer and 'value_compression' in layer
            else (layer['key_compression'], layer['key_compression'])
            if 'key_compression' in layer
            else (None, None)
        )

    def _start_residual(self, x, layer, norm_idx):
        x_residual = x
        if self.prenormalization:
            norm_key = f'norm{norm_idx}'
            if norm_key in layer:
                x_residual = layer[norm_key](x_residual)
        return x_residual

    def _end_residual(self, x, x_residual, layer, norm_idx):
        if self.residual_dropout:
            x_residual = F.dropout(x_residual, self.residual_dropout, self.training)
        x = x + x_residual
        if not self.prenormalization:
            x = layer[f'norm{norm_idx}'](x)
        return x

    def forward(self, x_num: Tensor, x_cat: Optional[Tensor]) -> Tensor:
        x = self.tokenizer(x_num, x_cat)

        for layer_idx, layer in enumerate(self.layers):
            is_last_layer = layer_idx + 1 == len(self.layers)
            layer = cast(Dict[str, nn.Module], layer)

            # start residual connection
            x_residual = self._start_residual(x, layer, 0)
            
            # attention layer
            x_residual = layer['attention'](
                # for the last attention, it is enough to process only [CLS]
                (x_residual[:, :1] if is_last_layer else x_residual),
                x_residual,
                *self._get_kv_compressions(layer),
            )
            
            # end residual connection
            if is_last_layer:
                x = x[:, : x_residual.shape[1]]
            x = self._end_residual(x, x_residual, layer, 0)

            # feedforward network
            x_residual = self._start_residual(x, layer, 1)
            x_residual = layer['linear0'](x_residual)
            x_residual = self.activation(x_residual)
            if self.ffn_dropout:
                x_residual = F.dropout(x_residual, self.ffn_dropout, self.training)
            x_residual = layer['linear1'](x_residual)
            x = self._end_residual(x, x_residual, layer, 1)

        # final layer processing
        assert x.shape[1] == 1
        x = x[:, 0]
        if self.last_normalization is not None:
            x = self.last_normalization(x)
        x = self.last_activation(x)
        x = self.head(x)
        x = x.squeeze(-1)
        return x

In [30]:
EPOCHS = 50

In [31]:
[len(le.classes_) for le in label_encoders.values()]

[16, 7, 27, 13, 11, 19, 51, 3, 15, 2, 2]

In [32]:
# empty cache first
torch.cuda.empty_cache()

In [33]:
def objective(trial):
    # Generate the model.
    # model = define_model(trial, train_features.shape[1], 1).to(DEVICE)
    # Define out_features_list
    n_heads = trial.suggest_int("n_heads", 1, 10)
    
    n_layers = trial.suggest_int("n_layers", 1, 5)
    
    token_multiplier = trial.suggest_int("token_multiplier", 5, 30)  # Adjust the range as necessary
    
    # Suggest an integer for a that is divisible by b
    d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)
    
    attention_dropout = trial.suggest_float("attention_dropout", 0, 0.5)
    d_ffn_factor = trial.suggest_float("d_ffn_factor", 1, 3)
    ffn_dropout = trial.suggest_float("ffn_dropout", 0, 0.5)
    
    # activation = trial.suggest_categorical("activation", choices=['relu', 'reglu', 'geglu'])
    # batch_size = trial.suggest_int('batch_size', 16, 128, step=16)
    
    categories = [len(le.classes_) for le in label_encoders.values()]
    d_numerical = len(non_category_features)

    args = {'activation': 'relu', #activation, #
    'attention_dropout': attention_dropout,
    'd_ffn_factor': d_ffn_factor,
    'd_token': d_token,
    'ffn_dropout': ffn_dropout,
    'initialization': 'kaiming',
    'n_heads': n_heads,
    'n_layers': n_layers,
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    "kv_compression_sharing": None,
    'token_bias': True,
    'd_out': 1
    }

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-9, 1e-1, log=True)

    # training with 5-fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    val_losses = []

    for train_idx, val_idx in kf.split(train_features):
        # Create training and validation datasets for the current fold
        X_train_fold, X_val_fold = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train_fold, y_val_fold = train_labels.iloc[train_idx], train_labels.iloc[val_idx]
        
        # scaling features
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
            
        # Initialize the model for this fold
        model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)
        model = nn.DataParallel(model, device_ids = DEVICE_LIST)
        model.to(DEVICE)
        
        # define optimizer
        if optimizer_name == "Adam":
         optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            momentum = trial.suggest_float("momentum", 1e-9, 0.95, log=True)
            optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
        
        # Define the loss function and optimizer
        criterion = nn.MSELoss()
        
        # Prepare DataLoader for training
        train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
        val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
            
        # Training of the model.
        model.train()
        for epoch in range(EPOCHS):
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)

                optimizer.zero_grad()
                output = model(X_num, X_cat)

                # print("shape", output.shape, target.shape)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

        # Validation of the model.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_loader):
                data, target = data.to(DEVICE), target.to(DEVICE)
                X_num = data[:, :len(non_category_features)].to(DEVICE)
                X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
                output = model(X_num, X_cat)
                val_loss = criterion(output, target).item()
                val_losses.append(val_loss**0.5) #rmse

        trial.report(val_loss, epoch)

    # Return the average validation loss across all folds
    return np.mean(val_losses)

In [34]:
# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-08-27 03:59:38,680] A new study created in memory with name: no-name-e5f08483-18f9-496d-961d-f3f73e36a323


self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu


  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 5])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:03:49,651] Trial 0 finished with value: 0.2778907221754452 and parameters: {'n_heads': 5, 'n_layers': 3, 'token_multiplier': 13, 'd_token': 5, 'attention_dropout': 0.47357853391284443, 'd_ffn_factor': 1.5567463702685527, 'ffn_dropout': 0.09703116655822941, 'optimizer': 'RMSprop', 'lr': 0.00045523806322604355, 'weight_decay': 5.01534556567381e-07, 'momentum': 8.456152924489299e-07}. Best is trial 0 with value: 0.2778907221754452.
  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 40])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:07:01,453] Trial 1 finished with value: 0.3261027943937555 and parameters: {'n_heads': 10, 'n_layers': 3, 'token_multiplier': 13, 'd_token': 40, 'attention_dropout': 0.11634236810833715, 'd_ffn_factor': 1.4151651199217234, 'ffn_dropout': 0.20928743701569774, 'optimizer': 'Adam', 'lr': 0.009410155719751772, 'weight_decay': 0.09667794268657083}. Best is trial 0 with value: 0.2778907221754452.
  d_token = trial.suggest_int("d_token", n_heads, n_heads * token_multiplier, n_heads)


self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu




self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


[I 2024-08-27 04:10:04,882] Trial 2 finished with value: 0.2516675406673338 and parameters: {'n_heads': 5, 'n_layers': 2, 'token_multiplier': 20, 'd_token': 60, 'attention_dropout': 0.09192629672301816, 'd_ffn_factor': 2.565011542335736, 'ffn_dropout': 0.07564072635765423, 'optimizer': 'RMSprop', 'lr': 0.00022923558072776028, 'weight_decay': 7.433796546124456e-05, 'momentum': 2.161429700687457e-05}. Best is trial 2 with value: 0.2516675406673338.


Study statistics: 
  Number of finished trials:  3
  Number of pruned trials:  0
  Number of complete trials:  3
Best trial:
  Value:  0.2516675406673338
  Params: 
    n_heads: 5
    n_layers: 2
    token_multiplier: 20
    d_token: 60
    attention_dropout: 0.09192629672301816
    d_ffn_factor: 2.565011542335736
    ffn_dropout: 0.07564072635765423
    optimizer: RMSprop
    lr: 0.00022923558072776028
    weight_decay: 7.433796546124456e-05
    momentum: 2.161429700687457e-05


In [35]:
trial.params

{'n_heads': 5,
 'n_layers': 2,
 'token_multiplier': 20,
 'd_token': 60,
 'attention_dropout': 0.09192629672301816,
 'd_ffn_factor': 2.565011542335736,
 'ffn_dropout': 0.07564072635765423,
 'optimizer': 'RMSprop',
 'lr': 0.00022923558072776028,
 'weight_decay': 7.433796546124456e-05,
 'momentum': 2.161429700687457e-05}

In [36]:
MODEL_CONFIG = {"model": {}, "optimizer": {}}

for key, value in trial.params.items():
    if key in ['lr', 'weight_decay', 'momentum', 'weight_decay', 'optimizer']:
        MODEL_CONFIG["optimizer"][key] = value
    elif key == 'token_multiplier':
        continue
    elif key == 'batch_size':
        BATCH_SIZE = value
    else:
        # adj_key = key.rpartition('_')[0]
        MODEL_CONFIG["model"][key] = value

In [37]:
MODEL_CONFIG

{'model': {'n_heads': 5,
  'n_layers': 2,
  'd_token': 60,
  'attention_dropout': 0.09192629672301816,
  'd_ffn_factor': 2.565011542335736,
  'ffn_dropout': 0.07564072635765423},
 'optimizer': {'optimizer': 'RMSprop',
  'lr': 0.00022923558072776028,
  'weight_decay': 7.433796546124456e-05,
  'momentum': 2.161429700687457e-05}}

RUNNING THE MODEL

In [38]:
# empty cache first
torch.cuda.empty_cache()

In [39]:
# Transform training data
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)

new_feature_list = non_category_features + category_features

In [40]:
# Create dataset instances
train_dataset = CustomDataset(train_features, train_labels.to_numpy())
test_dataset = CustomDataset(test_features, test_labels.to_numpy())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_features.shape[0], shuffle=False)

In [41]:
categories = [len(le.classes_) for le in label_encoders.values()]
d_numerical = len(non_category_features)
# d_token = 32  # Example token dimension
# token_bias = True

args = {
  'initialization': 'kaiming',
  'activation': 'relu',
    'prenormalization': False,
    'residual_dropout': 0.0,
    'kv_compression': None,
    "kv_compression_sharing": None,
    'token_bias': True,
    'd_out': 1
}

args.update(MODEL_CONFIG["model"])
model = Transformer(d_numerical=d_numerical, categories=categories, **args).to(DEVICE)

self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


In [42]:
model

Transformer(
  (tokenizer): Tokenizer(
    (category_embeddings): Embedding(166, 60)
  )
  (layers): ModuleList(
    (0-1): 2 x ModuleDict(
      (attention): MultiheadAttention(
        (W_q): Linear(in_features=60, out_features=60, bias=True)
        (W_k): Linear(in_features=60, out_features=60, bias=True)
        (W_v): Linear(in_features=60, out_features=60, bias=True)
        (W_out): Linear(in_features=60, out_features=60, bias=True)
        (dropout): Dropout(p=0.09192629672301816, inplace=False)
      )
      (linear0): Linear(in_features=60, out_features=153, bias=True)
      (linear1): Linear(in_features=153, out_features=60, bias=True)
      (norm1): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
      (norm0): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
    )
  )
  (head): Linear(in_features=60, out_features=1, bias=True)
)

In [43]:
if DEVICE != "cpu":
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
model.to(DEVICE)

DataParallel(
  (module): Transformer(
    (tokenizer): Tokenizer(
      (category_embeddings): Embedding(166, 60)
    )
    (layers): ModuleList(
      (0-1): 2 x ModuleDict(
        (attention): MultiheadAttention(
          (W_q): Linear(in_features=60, out_features=60, bias=True)
          (W_k): Linear(in_features=60, out_features=60, bias=True)
          (W_v): Linear(in_features=60, out_features=60, bias=True)
          (W_out): Linear(in_features=60, out_features=60, bias=True)
          (dropout): Dropout(p=0.09192629672301816, inplace=False)
        )
        (linear0): Linear(in_features=60, out_features=153, bias=True)
        (linear1): Linear(in_features=153, out_features=60, bias=True)
        (norm1): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
        (norm0): LayerNorm((60,), eps=1e-05, elementwise_affine=True)
      )
    )
    (head): Linear(in_features=60, out_features=1, bias=True)
  )
)

In [44]:
# define optimizer
optim_config = deepcopy(MODEL_CONFIG["optimizer"])
del optim_config["optimizer"]

optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
optimizer

RMSprop (
Parameter Group 0
    alpha: 0.99
    centered: False
    differentiable: False
    eps: 1e-08
    foreach: None
    lr: 0.00022923558072776028
    maximize: False
    momentum: 2.161429700687457e-05
    weight_decay: 7.433796546124456e-05
)

In [45]:
EPOCH = 500
criterion = nn.MSELoss()
start_time = time.time()

for ep in tqdm(range(EPOCH)):

    model.train()
    running_loss = 0.0
    for i, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        X_num = data[:, :len(non_category_features)].to(DEVICE)
        X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(X_num, X_cat)

        loss = criterion(outputs, target)
        
        loss.backward()

        optimizer.step()

        # print statistics
        running_loss += loss.item() * target.size(0)
        if i % 100 == 99:    # print every 100 mini-batches
            print(f'[{ep + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')


    train_loss = running_loss  / len(train_loader.dataset)
    
train_loss = running_loss  / len(train_loader.dataset)
print(f'Epoch [{ep+1}], Train Loss: {train_loss**0.5:.4f}')

# print out training time
elapsed_time = time.time() - start_time
print(f"Training time: {elapsed_time:.3f} seconds")

100%|██████████| 500/500 [08:35<00:00,  1.03s/it]

Epoch [500], Train Loss: 0.1137
Training time: 515.221 seconds





In [46]:
# empty cache first
torch.cuda.empty_cache()

In [47]:
# Testing phase
model.eval()
test_loss = 0.0

with torch.no_grad():
    i = 0
    for data, target in test_loader:
        # inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        X_num = data[:, :len(non_category_features)]
        X_cat = data[:, -len(category_features):].detach().long()
        
        if isinstance(model, nn.DataParallel):
            model = model.module  # Unwrap from DataParallel
        model = model.to('cpu')
            
               
        outputs = model(X_num, X_cat)

        # save metrics
        mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
        print(f"Training average mean absolute error: {mae}")
        print(f"Training average mean absolute percentage error: {mape}")
        print(f"Training average root mean squared error: {rmse}")
        print(f"Training average R2: {rsqr}")

Training average mean absolute error: 0.15283623337745667
Training average mean absolute percentage error: 312.3091459274292
Training average root mean squared error: 0.23530429349573925
Training average R2: 0.516621470451355


In [48]:
# let’s load back in our saved model
# model = MLP()
# model.load_state_dict(torch.load(MODEL_PATH))

5 fold CV

In [49]:
features.shape

(1725, 164)

In [50]:
labels.shape

(1725,)

In [51]:
features.dtypes

currency                        int64
seniorioty_adj                  int64
coupon rate                   float64
domicile_country                int64
exchange_country                int64
                               ...   
PD_60_pd                      float64
DTD                           float64
NI_Over_TA                    float64
Size                          float64
defaulted_in_last_6_months      int64
Length: 164, dtype: object

In [52]:
# Define cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
EPOCHS = 500
val_mae = []
val_mape = []
val_rmse = []
val_rsqr = []


for train_idx, val_idx in kf.split(features):
    # Create training and validation datasets for the current fold
    X_train_fold, X_val_fold = features.iloc[train_idx], features.iloc[val_idx]
    y_train_fold, y_val_fold = labels.iloc[train_idx], labels.iloc[val_idx]
    
    # scaling features
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)
        
    # Initialize the model for this fold
    model = Transformer(d_numerical=d_numerical, categories=categories, **args)
    model = nn.DataParallel(model, device_ids = DEVICE_LIST)
    model.to(DEVICE)
    
    # define optimizer
    optimizer = getattr(optim, MODEL_CONFIG["optimizer"]["optimizer"])(model.parameters(), **optim_config)
    
    # Define the loss function and optimizer
    criterion = nn.MSELoss()
    
    # Prepare DataLoader for training
    train_dataset = CustomDataset(X_train_fold, y_train_fold.to_numpy())
    val_dataset = CustomDataset(X_val_fold, y_val_fold.to_numpy())
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=val_dataset.features.shape[0], shuffle=True)
        
    # Training of the model.
    model.train()
    for epoch in range(EPOCHS):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(DEVICE), target.to(DEVICE)
  
            X_num = data[:, :len(non_category_features)].to(DEVICE)
            X_cat = data[:, -len(category_features):].detach().long().to(DEVICE)
            
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(X_num, X_cat)

            
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            # print(f'Epoch [{ep+1}], Train Loss: {train_loss:.4f}')

    # Validation of the model.
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_loader):
            # data, target = data.to(DEVICE), target.to(DEVICE)
            X_num = data[:, :len(non_category_features)]
            X_cat = data[:, -len(category_features):].detach().long()
            
            if isinstance(model, nn.DataParallel):
                model = model.module  # Unwrap from DataParallel
            model = model.to('cpu')
            outputs = model(X_num, X_cat)
            
            # save metrics
            mae, mape, rmse, rsqr = calculate_metric(outputs.numpy(), target.numpy())
            val_mae.append(mae)
            val_mape.append(mape)
            val_rmse.append(rmse)
            val_rsqr.append(rsqr)

self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu
self.category_embeddings.weight.shape=torch.Size([166, 60])
activation <function relu at 0x7f35d2d734c0> relu


In [53]:
print(f"Test average mean absolute error: {statistics.mean(val_mae)}")
print(f"Test average mean absolute percentage error: {statistics.mean(val_mape)}")
print(f"Test average root mean squared error: {statistics.mean(val_rmse)}")
print(f"Test average R2: {statistics.mean(val_rsqr)}")

Test average mean absolute error: 0.16323910653591156
Test average mean absolute percentage error: 5443.055862188339
Test average root mean squared error: 0.24957806415090034
Test average R2: 0.42442349195480344
