In [1]:
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, ModelConfig, TrainerConfig
from pytorch_tabular.models import TabTransformerConfig
import torch
import typing
from collections import defaultdict

In [2]:
# from omegaconf.dictconfig import DictConfig
# from omegaconf.base import ContainerMetadata

# torch.serialization.add_safe_globals([DictConfig, ContainerMetadata, typing.Any, dict, defaultdict])

In [3]:
df = pd.read_csv("../data/Airlines_updated.csv")
train_df = df.sample(frac=0.8, random_state=0)
test_df  = df.drop(train_df.index)

In [4]:
cat_cols = ["Airline","AirportFrom","AirportTo","Route","DayOfWeek"]
num_cols = [c for c in df.columns if c not in cat_cols + ["id","Delay"]]
target = "Delay"

data_config = DataConfig(
    target=[target],
    continuous_cols=num_cols,
    categorical_cols=cat_cols,
    num_workers=19
)

In [5]:
# model_config = ModelConfig(
#     task="classification",
#     model_name="tabtransformer",   # or "ft_transformer"
#     metrics=["accuracy", "f1"],
#     learning_rate=1e-3,
#     loss="cross_entropy",
# )
model_config = TabTransformerConfig(
    task="classification",         # binary classification
    input_embed_dim=32,            # embedding size for each categorical feature
    embedding_dropout=0.1,         # dropout on embeddings
    num_heads=8,                   # attention heads
    num_attn_blocks=6,             # transformer layers
    transformer_head_dim=None,     # defaults to input_embed_dim
    # attn_dropout=0.1,              # dropout after attention
    # add_norm_dropout=0.1,          # dropout in AddNorm
    # ff_dropout=0.1,                # dropout in feed-forward
    # ff_hidden_multiplier=4,        # FF hidden size = multiplier × embed_dim
    # transformer_activation="GEGLU",# activation in FF layers
    # learning_rate=1e-3,            # override default LR if you like
    metrics=["accuracy"],    # what to track
)


In [6]:
import torch

trainer_config = TrainerConfig(
    auto_lr_find=False,
    load_best=False,
    batch_size=1024,
    max_epochs=30,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else None,
)

In [7]:
from pytorch_tabular.config import OptimizerConfig
optimizer_config = OptimizerConfig()

tab_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [8]:
tab_model.fit(train=train_df, validation=test_df)

Seed set to 42


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ishan/sjsu/dm/flightdelay/.venv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/ishan/sjsu/dm/flightdelay/nbs/saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x720b67009f00>

In [9]:
res = tab_model.evaluate(test=test_df)
print(res)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Output()

[{'test_loss': 0.6138545870780945, 'test_accuracy': 0.6610398888587952}]
