In [4]:
# ============ DeepConvContext (RWHAR · Bi-Attention, paper-aligned) – structure & size ============

import json
from pathlib import Path

import torch
import torch.nn as nn

print("\n[DeepConvContext (RWHAR · Bi-Attention, paper-aligned) – structure & size]")

# ---------------------------
# 1) Resolve NUM_CLASSES (config file first, hard-coded fallback otherwise)
# ---------------------------
BASE = Path("/content")
CFG_DIR = BASE / "configs"
classes_path = CFG_DIR / "classes.json"

if not classes_path.exists():
    # No config on disk: fall back to a default. Edit this value if your
    # experiment uses a different number of classes.
    NUM_CLASSES = 8
    print("Warning: /content/configs/classes.json not found. Using default NUM_CLASSES = 8.")
    print("Please update NUM_CLASSES manually if this does not match your setup.")
else:
    # The config is expected to carry an integer-like "num_classes" entry.
    with classes_path.open("r") as fh:
        classes_cfg = json.load(fh)
    NUM_CLASSES = int(classes_cfg["num_classes"])
    print(f"Detected NUM_CLASSES from configs: {NUM_CLASSES}")

# ---------------------------
# 2) Hyperparameters (must match your Bi-Attention script)
# ---------------------------
# Input / windowing
NUM_CHANNELS     = 6
SAMPLES_PER_WIN  = 150
STRIDE_SAMPLES   = 75

# Optimisation settings (kept here so this cell mirrors the training script)
EPOCHS        = 30
LEARNING_RATE = 1e-4
WEIGHT_DECAY  = 1e-6
STEP_SIZE     = 10
GAMMA         = 0.9

# Architecture
DROPOUT_P      = 0.5
HIDDEN_UNITS   = 128
CONV_CHANNELS  = 64
KERNEL_SIZE    = 9
PROJECTION_DIM = 128
ATTN_HEADS     = 4
ATTN_LAYERS    = 3
MAX_CONTEXT_WINS = 200   # used for positional embedding length

# Echo the settings that determine the parameter count, one aligned row each.
print("\nConfig for size check:")
print("\n".join(
    f"  {label:<16} = {value}"
    for label, value in (
        ("NUM_CLASSES", NUM_CLASSES),
        ("NUM_CHANNELS", NUM_CHANNELS),
        ("CONV_CHANNELS", CONV_CHANNELS),
        ("HIDDEN_UNITS", HIDDEN_UNITS),
        ("PROJECTION_DIM", PROJECTION_DIM),
        ("ATTN_HEADS", ATTN_HEADS),
        ("ATTN_LAYERS", ATTN_LAYERS),
        ("MAX_CONTEXT_WINS", MAX_CONTEXT_WINS),
    )
))

# ---------------------------
# 3) Model definition (identical to your training script)
# ---------------------------
class DeepConvLSTM_Intra(nn.Module):
    """
    Intra-window feature extractor.

    Four Conv1d layers, each followed by ReLU, feed a single-layer LSTM.
    The LSTM's final hidden state is returned as the per-window feature.
    Every convolution pads by ``kernel_size // 2``, which preserves the
    time length when ``kernel_size`` is odd.
    """

    def __init__(self, in_ch: int = 6, conv_ch: int = 64, kernel_size: int = 9, hidden: int = 128):
        super().__init__()
        half_kernel = kernel_size // 2

        def make_conv(n_in: int) -> nn.Conv1d:
            # All four layers share output width, kernel size and padding.
            return nn.Conv1d(n_in, conv_ch, kernel_size, padding=half_kernel)

        # Attribute names and creation order are kept stable so that
        # state_dict keys and the printed module structure do not change.
        self.conv1 = make_conv(in_ch)
        self.conv2 = make_conv(conv_ch)
        self.conv3 = make_conv(conv_ch)
        self.conv4 = make_conv(conv_ch)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(input_size=conv_ch, hidden_size=hidden, num_layers=1, batch_first=True)

    def forward(self, x_win: torch.Tensor) -> torch.Tensor:
        """Map a batch of windows (B, C, T) to features of shape (B, hidden)."""
        feats = x_win
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(conv(feats))              # (B, conv_ch, T)
        time_major = feats.permute(0, 2, 1)             # (B, T, conv_ch) for batch_first LSTM
        _, (last_hidden, _) = self.lstm(time_major)     # last_hidden: (1, B, hidden)
        return last_hidden[-1]                          # (B, hidden)


class PositionalEncoding1D(nn.Module):
    """
    Learnable 1D positional encoding for window index within the batch-context.

    A trainable embedding table with ``max_len`` rows of width ``d_model`` is
    created; ``forward`` adds rows ``0 .. S-1`` to an input of shape
    (B_seq, S, D), broadcasting over the leading dimension.

    Raises:
        IndexError: if the sequence length S exceeds ``max_len``.
    """
    def __init__(self, d_model: int, max_len: int = 200):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B_seq, S, D)
        seq_len = x.size(1)
        max_len = self.pos_embedding.num_embeddings
        # Fail early with an actionable message instead of letting the
        # embedding lookup fail with an opaque "index out of range" error.
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds positional-encoding max_len {max_len}; "
                "use a shorter sequence or construct the module with a larger max_len"
            )
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)  # (1, S)
        pos = self.pos_embedding(positions)                              # (1, S, D)
        return x + pos


class MultiHeadSelfAttentionBlock(nn.Module):
    """
    Pure self-attention block: MHA + residual + LayerNorm
    (dropout=0.0 inside MHA, as in your script)
    """
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.mha  = nn.MultiheadAttention(
            embed_dim=dim,
            num_heads=num_heads,
            dropout=0.0,
            batch_first=True
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor | None = None) -> torch.Tensor:
        attn_out, _ = self.mha(x, x, x, attn_mask=attn_mask, need_weights=False)
        return self.norm(x + attn_out)


class DeepConvContext_BiAttention(nn.Module):
    """
    DeepConvContext attention variant (described by the author as paper-aligned):
      Intra: Conv×4 + LSTM(hidden_intra) applied to each window
      Projection: Linear(hidden_intra → projection_dim)
      Inter: stack of ``num_attn_layers`` self-attention blocks, batch-as-context
      Learnable positional encoding over the window index
      Dropout → FC(num_classes)

    The batch dimension is treated as ONE sequence of consecutive windows, so
    the batch size is the attention sequence length and must not exceed
    ``max_context_len``.

    Args:
        num_channels: input channels per window.
        num_classes: size of the output logit vector.
        conv_channels: width of the intra-window convolutions.
        hidden_intra: LSTM hidden size of the intra-window branch.
        projection_dim: embedding width used by the attention stack.
        attn_heads: attention heads per block.
        num_attn_layers: number of stacked attention blocks.
        max_context_len: number of rows in the positional embedding table.
        dropout: dropout probability applied before the classifier.
        bidirectional: if True (default) every window attends to every other
            window; if False a causal mask blocks attention to later windows.
        kernel_size: convolution kernel size of the intra-window branch.
            ``None`` (default) keeps the historical behaviour of reading the
            module-level ``KERNEL_SIZE`` constant.
    """
    def __init__(self,
                 num_channels: int = 6,
                 num_classes: int = 8,
                 conv_channels: int = 64,
                 hidden_intra: int = 128,
                 projection_dim: int = 128,
                 attn_heads: int = 4,
                 num_attn_layers: int = 3,
                 max_context_len: int = 200,
                 dropout: float = 0.5,
                 bidirectional: bool = True,
                 kernel_size: int | None = None):
        super().__init__()

        # Previously the kernel size was always taken from the global
        # KERNEL_SIZE; the explicit argument removes that hidden coupling
        # while the None default keeps existing call sites unchanged.
        if kernel_size is None:
            kernel_size = KERNEL_SIZE

        self.intra = DeepConvLSTM_Intra(
            in_ch=num_channels,
            conv_ch=conv_channels,
            kernel_size=kernel_size,
            hidden=hidden_intra
        )

        self.proj = nn.Linear(hidden_intra, projection_dim)
        self.pos_enc = PositionalEncoding1D(projection_dim, max_len=max_context_len)

        # Plain Python attributes: not part of the state_dict.
        self.max_context_len = max_context_len
        self.bidirectional = bidirectional
        self.attn_layers = nn.ModuleList([
            MultiHeadSelfAttentionBlock(projection_dim, attn_heads)
            for _ in range(num_attn_layers)
        ])

        self.dropout = nn.Dropout(dropout)
        self.fc      = nn.Linear(projection_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (B, C, T) — windows sorted by time; batch dimension is the context sequence.
        Returns: (B, num_classes)

        Raises:
            ValueError: if ``x`` is not 3-dimensional (from the shape unpack).
            IndexError: if B exceeds ``max_context_len``.
        """
        B, _, _ = x.shape                    # also enforces a rank-3 input

        # The batch is the attention sequence, so it is bounded by the size of
        # the positional embedding table. Check before doing any heavy work.
        if B > self.max_context_len:
            raise IndexError(
                f"batch of {B} windows exceeds max_context_len={self.max_context_len}; "
                "use a smaller batch or a larger max_context_len"
            )

        feats = self.intra(x)                # (B, hidden_intra)
        proj  = self.proj(feats)             # (B, D)

        seq = proj.unsqueeze(0)              # (1, B, D): one sequence of B windows
        seq = self.pos_enc(seq)              # (1, B, D)

        attn_mask = None
        if not self.bidirectional:
            # Boolean mask, True = "may not attend": strictly upper triangle
            # hides every later window from each position.
            attn_mask = torch.ones(
                B, B,
                device=x.device,
                dtype=torch.bool
            ).triu(diagonal=1)

        for layer in self.attn_layers:
            seq = layer(seq, attn_mask=attn_mask)

        seq = self.dropout(seq)
        logits = self.fc(seq)                # (1, B, num_classes)
        return logits.squeeze(0)             # (B, num_classes)

# ---------------------------
# 4) Instantiate model and compute size
# ---------------------------
# Build the network from the constants above (bidirectional = Bi-Attention variant).
model_kwargs = dict(
    num_channels=NUM_CHANNELS,
    num_classes=NUM_CLASSES,
    conv_channels=CONV_CHANNELS,
    hidden_intra=HIDDEN_UNITS,
    projection_dim=PROJECTION_DIM,
    attn_heads=ATTN_HEADS,
    num_attn_layers=ATTN_LAYERS,
    max_context_len=MAX_CONTEXT_WINS,
    dropout=DROPOUT_P,
    bidirectional=True,
)
model = DeepConvContext_BiAttention(**model_kwargs)

print("\n====== nn.Module structure ======\n")
print(model)

# Walk the parameters once and reuse the list for every statistic below.
named_params = list(model.named_parameters())
total_params = sum(param.numel() for _, param in named_params)
trainable_params = sum(param.numel() for _, param in named_params if param.requires_grad)

print("\n====== Parameter statistics ======")
print(f"Total params:      {total_params:,}")
print(f"Trainable params:  {trainable_params:,}")

print("\n====== Per-layer parameter counts ======")
for param_name, param in named_params:
    print(f"{param_name:50s} shape={tuple(param.shape)}  params={param.numel():,}")

# Size estimation (parameters only)
def fmt_mb(n_bytes: int) -> str:
    """Render a byte count in units of 1024*1024 bytes, two decimals, suffixed " MB"."""
    mebibytes = n_bytes / 1024 / 1024
    return "{:.2f} MB".format(mebibytes)

# Analytical size: parameter count times bytes per element.
bytes_fp32 = total_params * 4   # float32: 4 bytes per parameter
bytes_fp16 = total_params * 2   # float16: 2 bytes per parameter

print("\n====== Model size estimate (parameters only) ======")
print(f"FP32 (float32, 4B/param): {fmt_mb(bytes_fp32)}")
print(f"FP16 (float16, 2B/param): {fmt_mb(bytes_fp16)}")

# Save a randomly initialised state_dict to check actual .pth size
models_dir = BASE / "models"
models_dir.mkdir(parents=True, exist_ok=True)
tmp_path = models_dir / "deepconvcontext_biattn_rwhar_dummy.pth"
try:
    torch.save(model.state_dict(), tmp_path)
    file_bytes = tmp_path.stat().st_size
    print(f"\nRandom-initialised state_dict saved to {tmp_path.name}")
    print(f"Actual .pth file size: {fmt_mb(file_bytes)}")
finally:
    # The checkpoint is only a measuring aid: remove it even when saving or
    # stat-ing fails, so no partial dummy file is left in the models folder.
    tmp_path.unlink(missing_ok=True)

print("\n[DeepConvContext (RWHAR · Bi-Attention, paper-aligned) – structure & size done]\n")


[DeepConvContext (RWHAR · Bi-Attention, paper-aligned) – structure & size]
Please update NUM_CLASSES manually if this does not match your setup.

Config for size check:
  NUM_CLASSES      = 8
  NUM_CHANNELS     = 6
  CONV_CHANNELS    = 64
  HIDDEN_UNITS     = 128
  PROJECTION_DIM   = 128
  ATTN_HEADS       = 4
  ATTN_LAYERS      = 3
  MAX_CONTEXT_WINS = 200


DeepConvContext_BiAttention(
  (intra): DeepConvLSTM_Intra(
    (conv1): Conv1d(6, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (conv2): Conv1d(64, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (conv3): Conv1d(64, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (conv4): Conv1d(64, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (relu): ReLU(inplace=True)
    (lstm): LSTM(64, 128, batch_first=True)
  )
  (proj): Linear(in_features=128, out_features=128, bias=True)
  (pos_enc): PositionalEncoding1D(
    (pos_embedding): Embedding(200, 128)
  )
  (attn_layers): ModuleList(
    (0-2): 3 x MultiHeadSelfAttenti