In [1]:
import sys 
import os 

# Add the project root to the Python path 
project_root = os.path.abspath(os.path.join(os.getcwd(), '..')) 
if project_root not in sys.path: 
    sys.path.insert(0, project_root) 

In [13]:
import torch
import torch.nn as nn
from omegaconf import DictConfig
from models import GraphTransformer
from torch.nn import TransformerEncoderLayer

In [5]:
cfg = DictConfig({
        'model': {
            'self_attention_layer': 2,
            'readout': 'mean'  # Options: 'concat', 'sum', 'mean', 'max'
        },
        'dataset': {
            'node_sz': 10,
            'node_feature_sz': 32
        }
    })

In [29]:
model = GraphTransformer(cfg)

In [8]:
model

GraphTransformer(
  (attention_list): ModuleList(
    (0-1): 2 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
      )
      (linear1): Linear(in_features=32, out_features=1024, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=1024, out_features=32, bias=True)
      (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (fc): Sequential(
    (0): Linear(in_features=32, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=256, out_features=32, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=32, out_features=2, bias=True)
  )
)

In [10]:
batch_size = 4
num_timesteps = 50  # Placeholder, as it's not used in the forward pass
time_series_data = torch.randn(batch_size, num_timesteps, cfg.dataset.node_sz)
node_feature_data = torch.randn(batch_size, cfg.dataset.node_sz, cfg.dataset.node_feature_sz)

In [11]:
node_feature_data.shape

torch.Size([4, 10, 32])

In [30]:
model.attention_list[0].self_attn

MultiheadAttention(
  (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
)

In [25]:
current_features = node_feature_data
for i, atten_layer in enumerate(model.attention_list):
    # Pass the features through the current attention layer
    current_features = atten_layer(current_features)
    print(f"Output shape after Attention Layer {i}: {current_features.shape}")
     # The shape should remain (batch_size, node_sz, node_feature_sz)

Output shape after Attention Layer 0: torch.Size([4, 10, 32])
Output shape after Attention Layer 1: torch.Size([4, 10, 32])


In [27]:
features_after_attention = current_features
if cfg.model.readout == "mean":
    readout_output = torch.mean(features_after_attention, dim=1)
    print(f"Features shape after '{cfg.model.readout}' readout: {readout_output.shape}")

Features shape after 'mean' readout: torch.Size([4, 32])


In [31]:
model.get_attention_weights()

AttributeError: 'TransformerEncoderLayer' object has no attribute 'get_attention_weights'

In [23]:
encoder_layer = TransformerEncoderLayer(d_model=512, nhead=2)
src = torch.rand(10, 32, 512)
out = encoder_layer(src)

In [21]:
out.shape

torch.Size([10, 32, 512])