In [1]:
pip install pytorch_tabular

Collecting pytorch_tabular
  Downloading pytorch_tabular-1.1.1-py2.py3-none-any.whl.metadata (24 kB)
Collecting pytorch-lightning<2.5.0,>=2.0.0 (from pytorch_tabular)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting omegaconf>=2.3.0 (from pytorch_tabular)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting torchmetrics<1.6.0,>=0.10.0 (from pytorch_tabular)
  Downloading torchmetrics-1.5.2-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard!=2.5.0,>2.2.0 (from pytorch_tabular)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting pytorch-tabnet==4.1 (from pytorch_tabular)
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting ipywidgets (from pytorch_tabular)
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting einops<0.8.0,>=0.6.0 (from pytorch_tabular)
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting rich>=11.0.0 (from pytorch_

In [6]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from pytorch_tabular import model_sweep
from pytorch_tabular.config import (
    DataConfig,
    ExperimentConfig,
    OptimizerConfig,
    TrainerConfig,
)

from utils import calculate_metric

In [7]:
# Seed NumPy's and PyTorch's global random number generators.
# The train/valid/test split further down passes its own `random_state`.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f4b4efb0330>

In [9]:
# Data location and run identifiers.
INPUT_FILE = "../data/data_removing_na.xlsx"
PROJECT_NAME = "run/all"
BATCH_SIZE = 32

# Hardware selection.
# NOTE(review): 'cuda:1' and the four indices in DEVICE_LIST presumably match
# the GPUs on the original machine -- confirm before running elsewhere.
DEVICE_LIST = [3, 2, 1, 0]
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:1')
else:
    DEVICE = torch.device('cpu')

In [10]:
# Display the device chosen in the configuration cell.
DEVICE

device(type='cuda', index=1)

In [11]:
# Load the full dataset from the Excel workbook at INPUT_FILE.
df = pd.read_excel(INPUT_FILE)

In [12]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,rr1_30,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
0,0.259908,USD,Senior Subordinated Unsecured,9.0,United States,United States,Consumer Discretionary,Retail & Whsle - Discretionary,E-Commerce Discretionary,Bankruptcy Filing,...,0.396731,0.397453,0.398148,0.398819,0.399467,0.400092,-0.732815,-0.007137,-0.852484,False
1,0.032729,USD,Senior Subordinated Unsecured,5.75,United States,United States,Health Care,Health Care,Health Care Facilities & Svcs,Default Corp Action,...,0.957454,0.957467,0.95748,0.957492,0.957503,0.957514,-1.666262,-0.000286,-1.186347,False
2,0.9724,USD,Unsecured,5.675,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
3,1.047416,CHF,Unsecured,0.125,South Korea,South Korea,Consumer Discretionary,Retail & Whsle - Discretionary,Wholesale - Discretionary,Default Corp Action,...,0.568169,0.568693,0.569197,0.569682,0.57015,0.5706,-1.853366,0.000191,1.053677,False
4,0.848872,JPY,Unsecured,1.75,Japan,Japan,Industrials,Industrial Products,Electrical Equipment,Bankruptcy Filing,...,0.130285,0.130688,0.131081,0.131465,0.13184,0.132206,-0.768857,-0.028058,-1.946507,False


In [13]:
# (rows, columns) of the loaded frame, target column included.
df.shape

(1725, 165)

In [14]:
# Every column except the regression target is treated as a feature.
feature_list = df.columns.drop('rr1_30')

In [15]:
# Inspect the feature column names (target already removed).
feature_list

Index(['currency', 'seniorioty_adj', 'coupon rate', 'domicile_country',
       'exchange_country', 'Industry_sector', 'Industry_group',
       'Industry_subgroup', 'event_type', 'event_type_subcategory_sum',
       ...
       'PD_55_pd', 'PD_56_pd', 'PD_57_pd', 'PD_58_pd', 'PD_59_pd', 'PD_60_pd',
       'DTD', 'NI_Over_TA', 'Size', 'defaulted_in_last_6_months'],
      dtype='object', length=164)

In [16]:
# Columns stored as strings (object) or booleans are handled as categorical;
# every remaining feature is treated as continuous. Original column order is kept.
category_features = df.select_dtypes(include=['object', 'bool']).columns.tolist()
categorical_lookup = set(category_features)
non_category_features = [col for col in feature_list if col not in categorical_lookup]

In [17]:
# Feature counts, one per line: continuous first, then categorical.
print(len(non_category_features), len(category_features), sep="\n")

153
11


In [18]:
# List the columns that will be passed to the model as categorical.
category_features

['currency',
 'seniorioty_adj',
 'domicile_country',
 'exchange_country',
 'Industry_sector',
 'Industry_group',
 'Industry_subgroup',
 'event_type',
 'event_type_subcategory_sum',
 'defaulted_in_last_5_years',
 'defaulted_in_last_6_months']

In [19]:
# Hold out 25% of the rows as the final test set, then take 25% of the
# remainder as the validation set that the model sweep scores against.
# Both splits use a fixed random_state so they are reproducible.
test_size = 0.25
train_valid, test = train_test_split(df, test_size=test_size, random_state=42)
train, valid = train_test_split(train_valid, test_size=test_size, random_state=42)

In [20]:
# Preview the training split; the index shows the rows were shuffled.
train.head()

Unnamed: 0,rr1_30,currency,seniorioty_adj,coupon rate,domicile_country,exchange_country,Industry_sector,Industry_group,Industry_subgroup,event_type,...,PD_55_pd,PD_56_pd,PD_57_pd,PD_58_pd,PD_59_pd,PD_60_pd,DTD,NI_Over_TA,Size,defaulted_in_last_6_months
536,0.182132,USD,Senior Subordinated Unsecured,2.75,United States,United States,Technology,Tech Hardware & Semiconductors,Technology Hardware,Bankruptcy Filing,...,0.56342,0.56389,0.564342,0.564777,0.565194,0.565597,-1.004436,-0.082341,-3.87064,False
765,0.05674,USD,Senior Unsecured,8.125,United States,United States,Energy,Oil & Gas,Oil & Gas Producers,Default Corp Action,...,0.691011,0.691382,0.691738,0.69208,0.692408,0.692725,-1.084433,-0.052027,-2.074964,False
222,0.603846,USD,Senior Unsecured,8.625,United States,United States,Industrials,Industrial Services,Transportation & Logistics,Bankruptcy Filing,...,0.585977,0.586288,0.586585,0.586867,0.587137,0.587395,-1.115711,-0.007636,-4.029905,False
555,0.836365,USD,Senior Unsecured,9.5,India,India,Utilities,Utilities,Electric Utilities,Default Corp Action,...,0.462222,0.467771,0.47325,0.478659,0.484,0.489275,-0.751984,-0.000207,3.582827,False
1527,0.459547,CNY,Senior Unsecured,6.6,China,China,Real Estate,Real Estate,Real Estate Owners & Developers,Default Corp Action,...,0.253968,0.256798,0.259601,0.262379,0.265132,0.267862,-0.427533,0.002135,0.722957,False


RUNNING THE MODEL

In [21]:
# Release GPU memory cached by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [23]:
# Run-level settings shared by the config objects below.
EPOCHS = 100
target_col = 'rr1_30'

# Column roles: a single regression target plus the continuous and
# categorical feature lists built earlier in the notebook.
data_config = DataConfig(
    target=[target_col],  # must be a list, even for a single target
    continuous_cols=non_category_features,
    categorical_cols=category_features,
)

# NOTE(review): the next cell rebuilds `trainer_config` without `accelerator`,
# `load_best` or `devices_list` before `model_sweep` is called, so this
# instance never reaches the sweep -- confirm which of the two is intended.
trainer_config = TrainerConfig(
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    early_stopping=None,
    load_best=True,
    accelerator="gpu",
    devices_list=DEVICE_LIST,
)

# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# NOTE(review): `experiment_config` is not passed to `model_sweep` anywhere in
# this notebook, so these logging settings currently have no effect on the sweep.
experiment_config = ExperimentConfig(
    project_name=PROJECT_NAME,
    exp_watch="all",
    log_target="tensorboard",
)

In [24]:
EPOCHS = 100

# NOTE(review): this replaces the `trainer_config` built in the previous cell.
# Only batch size, epoch count and early stopping are set here, so accelerator
# and device selection fall back to TrainerConfig's defaults.
trainer_config = TrainerConfig(
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    early_stopping=None,
)

# Fit every model in the "high_memory" preset on `train` and score each one on
# `valid`. Returns the per-model results table and the best fitted model.
sweep_settings = dict(
    task="regression",
    train=train,
    test=valid,
    data_config=data_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    model_list="high_memory",
    verbose=True,  # log metrics and params for each trial
)
sweep_df, best_model = model_sweep(**sweep_settings)


Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [25]:
# Per-model results of the sweep, as scored on the validation split.
sweep_df

Unnamed: 0,model,# Params,epochs,test_loss,test_mean_squared_error,time_taken,time_taken_per_epoch,params
4,GANDALFModel,2 M,100,0.06294,0.06294,99.181822,0.991818,"{'task': 'regression', 'head': 'LinearHead', '..."
5,GatedAdditiveTreeEnsembleModel,2 M,100,0.065177,0.065177,1219.995764,12.199958,"{'task': 'regression', 'head': 'LinearHead', '..."
8,TabTransformerModel,277 T,100,0.066654,0.066654,105.431943,1.054319,"{'task': 'regression', 'head': 'LinearHead', '..."
1,CategoryEmbeddingModel,43 T,100,0.067292,0.067292,25.864567,0.258646,"{'task': 'regression', 'head': 'LinearHead', '..."
6,NODEModel,3 M,100,0.068326,0.068326,81.989906,0.819899,"{'task': 'regression', 'head': 'LinearHead', '..."
3,FTTransformerModel,287 T,100,0.068833,0.068833,137.728758,1.377288,"{'task': 'regression', 'head': 'LinearHead', '..."
0,AutoIntModel,27 T,100,0.08126,0.08126,51.428808,0.514288,"{'task': 'regression', 'head': 'LinearHead', '..."
2,DANetModel,1 M,100,0.127868,0.127868,366.805661,3.668057,"{'task': 'regression', 'head': 'LinearHead', '..."
7,TabNetModel,21 T,100,0.127891,0.127891,156.950606,1.569506,"{'task': 'regression', 'head': 'LinearHead', '..."


In [30]:
# Add an RMSE column: the square root of each model's mean squared error.
mse_per_model = sweep_df["test_mean_squared_error"]
sweep_df["rmse"] = mse_per_model.pow(0.5)

In [31]:
# Results table again, now including the derived `rmse` column.
sweep_df

Unnamed: 0,model,# Params,epochs,test_loss,test_mean_squared_error,time_taken,time_taken_per_epoch,params,rmse
4,GANDALFModel,2 M,100,0.06294,0.06294,99.181822,0.991818,"{'task': 'regression', 'head': 'LinearHead', '...",0.250879
5,GatedAdditiveTreeEnsembleModel,2 M,100,0.065177,0.065177,1219.995764,12.199958,"{'task': 'regression', 'head': 'LinearHead', '...",0.255297
8,TabTransformerModel,277 T,100,0.066654,0.066654,105.431943,1.054319,"{'task': 'regression', 'head': 'LinearHead', '...",0.258174
1,CategoryEmbeddingModel,43 T,100,0.067292,0.067292,25.864567,0.258646,"{'task': 'regression', 'head': 'LinearHead', '...",0.259407
6,NODEModel,3 M,100,0.068326,0.068326,81.989906,0.819899,"{'task': 'regression', 'head': 'LinearHead', '...",0.261393
3,FTTransformerModel,287 T,100,0.068833,0.068833,137.728758,1.377288,"{'task': 'regression', 'head': 'LinearHead', '...",0.262361
0,AutoIntModel,27 T,100,0.08126,0.08126,51.428808,0.514288,"{'task': 'regression', 'head': 'LinearHead', '...",0.285061
2,DANetModel,1 M,100,0.127868,0.127868,366.805661,3.668057,"{'task': 'regression', 'head': 'LinearHead', '...",0.357586
7,TabNetModel,21 T,100,0.127891,0.127891,156.950606,1.569506,"{'task': 'regression', 'head': 'LinearHead', '...",0.357619


In [26]:
# Label-based lookup: this returns the params of the row whose index label is 0
# (AutoIntModel in the table above), not the top-ranked row.
# NOTE(review): use `sweep_df["params"].iloc[0]` if the best model was intended.
sweep_df.loc[0, "params"]

{'task': 'regression',
 'head': 'LinearHead',
 'head_config': {'layers': ''},
 'embedding_dims': None,
 'embedding_dropout': 0.0,
 'batch_norm_continuous_input': True,
 'learning_rate': 0.001,
 'loss': 'MSELoss',
 'metrics': ['mean_squared_error'],
 'metrics_prob_input': [False],
 'metrics_params': [{}],
 'target_range': None,
 'virtual_batch_size': None,
 'seed': 42,
 '_module_src': 'models.autoint',
 '_model_name': 'AutoIntModel',
 '_backbone_name': 'AutoIntBackbone',
 '_config_name': 'AutoIntConfig',
 'attn_embed_dim': 32,
 'num_heads': 2,
 'num_attn_blocks': 3,
 'attn_dropouts': 0.0,
 'has_residuals': True,
 'embedding_dim': 16,
 'embedding_initialization': 'kaiming_uniform',
 'embedding_bias': True,
 'share_embedding': False,
 'share_embedding_strategy': 'fraction',
 'shared_embedding_fraction': 0.25,
 'deep_layers': False,
 'layers': '128-64-32',
 'activation': 'ReLU',
 'use_batch_norm': False,
 'initialization': 'kaiming',
 'dropout': 0.0,
 'attention_pooling': False}

In [32]:
# Score the sweep's best model on the held-out test split, which was passed
# neither as `train` nor as `test` to `model_sweep`.
test_result = best_model.evaluate(test)
test_result

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

[{'test_loss': 0.06673388183116913,
  'test_mean_squared_error': 0.06673388183116913}]

In [34]:
# RMSE on the test split: square root of the reported mean squared error.
test_mse = test_result[0]["test_mean_squared_error"]
test_mse ** 0.5

0.25832901856192836

In [28]:
# Display the layer summary of the best model returned by the sweep.
best_model

Layer,Type,Params,In sizes,Out sizes
_backbone,GANDALFBackbone,2027778,?,?
_embedding_layer,Embedding1dLayer,2385,?,?
_head,Sequential,239,?,?
loss,MSELoss,0,?,?


In [29]:
# Persist the best model from the sweep.
# The target folder is created first: joblib.dump does not create missing
# parent directories, so a fresh checkout would otherwise fail on this cell.
# NOTE(review): pytorch_tabular also provides `TabularModel.save_model`;
# consider it if the pickled object proves fragile across library versions.
model_path = f"{PROJECT_NAME}/final_models/final.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

['run/all/final_models/final.joblib']