Test sequence:
1. Lightweight RCCMix
2. RCCMix-HAR++
3. KNN
4. Random Forest
5. InceptionTime
6. MiniRocket
7. MiniRocket (Deployment-Friendly)
8. TST
9. Lightweight rTsfNet
10. rTsfNet
11. DeepConvContext-LSTM (bidirectional, 1-layer)
12. DeepConvContext-LSTM (unidirectional, 1-layer)
13. DeepConvContext Bi-Attention
13. DeepConvContext Bi‑Attention

1. Lightweight RCCMix

In [15]:
# ================ Step 10: RCCMix-HAR CPU Inference Latency Benchmark (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

print("\n[Step 10: RCCMix-HAR CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed every RNG this cell touches (Python, NumPy, PyTorch) so that the random
# model weights and the random benchmark input are the same on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# The benchmark is pinned to the CPU and to one PyTorch intra-op thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# Hyperparameters of the benchmarked model.
# NOTE(review): presumably these must equal the values used when the Step 10
# model was trained - confirm against the training cell.
SEQ_LEN        = 8        # number of consecutive windows per sequence
WINDOW_SAMPLES = 150      # samples per window (e.g., 3 s @ 50 Hz)
IN_CHANNELS    = 6        # 3D acc + 3D gyro
D_MODEL        = 128
N_HEADS        = 4
N_LAYERS       = 2
D_FF           = 4 * D_MODEL
DROPOUT        = 0.2
NUM_CLASSES    = 8

print(f"SEQ_LEN={SEQ_LEN}, WINDOW_SAMPLES={WINDOW_SAMPLES}, IN_CHANNELS={IN_CHANNELS}")
print(f"D_MODEL={D_MODEL}, N_LAYERS={N_LAYERS}, N_HEADS={N_HEADS}, NUM_CLASSES={NUM_CLASSES}")


# ---------------------------
# 1) Model definition (exactly as in Step 10)
# ---------------------------
class DepthwiseSeparableConv1d(nn.Module):
    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm -> GELU -> Dropout.

    The depthwise stage filters every input channel independently
    (``groups == in_ch``); the pointwise stage then mixes channels and maps
    them to ``out_ch``. Padding is ``(k // 2) * dilation``, so an odd kernel
    size keeps the temporal length unchanged.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        k: depthwise kernel size.
        dilation: dilation of the depthwise convolution.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, in_ch, out_ch, k, dilation=1, dropout=0.0):
        super().__init__()
        # Depthwise: one temporal filter per channel, no bias (BatchNorm follows).
        self.dw = nn.Conv1d(
            in_channels=in_ch,
            out_channels=in_ch,
            kernel_size=k,
            padding=dilation * (k // 2),
            dilation=dilation,
            groups=in_ch,
            bias=False,
        )
        # Pointwise: 1x1 channel mixing.
        self.pw = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the five stages in order to ``x`` of shape [N, in_ch, T]."""
        for stage in (self.dw, self.pw, self.bn, self.act, self.drop):
            x = stage(x)
        return x


class WindowEncoder(nn.Module):
    """
    Per-window encoder (rTsfNet-style) producing a token and a condition vector.

    Pipeline:
      - append the L2-norm of channels 0-2 and of channels 3-5 as two extra channels
      - run two depthwise-separable conv branches (k=9/dil=1 and k=19/dil=2)
      - fuse with a 1x1 conv + BatchNorm + GELU + Dropout
      - average over time -> window token [BL, d_model]
      - project four RMS/energy statistics -> conditioning vector g [BL, d_model]
    """
    def __init__(self, in_ch=6, d_model=128, dropout=0.2):
        super().__init__()
        self.in_ch = in_ch
        self.aug_ch = in_ch + 2   # raw channels + acc_norm + gyro_norm

        # Two temporal scales, each contributing half of the model width.
        branch_width = d_model // 2
        self.b1 = DepthwiseSeparableConv1d(self.aug_ch, branch_width, k=9,  dilation=1, dropout=dropout)
        self.b2 = DepthwiseSeparableConv1d(self.aug_ch, branch_width, k=19, dilation=2, dropout=dropout)

        # Fusion of the concatenated branches.
        self.mix = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(d_model)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

        # 4 scalar statistics -> d_model conditioning vector.
        self.g_proj = nn.Sequential(
            nn.Linear(4, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model),
        )

    @staticmethod
    def _triaxial_norm(x, first):
        """Per-sample L2 norm of channels first..first+2, shape [BL, 1, T]."""
        a = x[:, first, :]
        b = x[:, first + 1, :]
        c = x[:, first + 2, :]
        # The 1e-8 keeps the sqrt away from an exact zero argument.
        return torch.sqrt(a ** 2 + b ** 2 + c ** 2 + 1e-8).unsqueeze(1)

    def forward(self, x):
        """Encode windows ``x`` of shape [B*L, 6, T] into ``(token, g)``."""
        _bl, _c, _t = x.shape  # unpacking also enforces a rank-3 input

        acc_norm = self._triaxial_norm(x, 0)
        gyr_norm = self._triaxial_norm(x, 3)
        x_aug = torch.cat([x, acc_norm, gyr_norm], dim=1)  # [BL, 8, T]

        # Multi-scale features, fused back to d_model channels.
        multi_scale = torch.cat([branch(x_aug) for branch in (self.b1, self.b2)], dim=1)
        z = self.drop(self.act(self.bn(self.mix(multi_scale))))

        # Window token: mean over the time axis.
        token = z.mean(dim=-1)  # [BL, d_model]

        # Conditioning statistics, in the order the projection expects.
        window_stats = [
            acc_norm.squeeze(1).pow(2).mean(dim=-1).sqrt(),   # acc RMS of the norm channel
            gyr_norm.squeeze(1).pow(2).mean(dim=-1).sqrt(),   # gyro RMS of the norm channel
            x[:, 0:3, :].pow(2).mean(dim=(1, 2)).sqrt(),      # acc RMS over axes and time
            x[:, 3:6, :].pow(2).mean(dim=(1, 2)).sqrt(),      # gyro RMS over axes and time
        ]
        g = self.g_proj(torch.stack(window_stats, dim=-1))    # [BL, d_model]

        return token, g


class CondLayerNorm(nn.Module):
    """ Conditional LayerNorm (FiLM-style): LN(x) * (1 + gamma(g)) + beta(g) """
    def __init__(self, d_model):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)
        # Both modulation terms are linear functions of the condition g.
        self.gamma = nn.Linear(d_model, d_model)
        self.beta  = nn.Linear(d_model, d_model)

    def forward(self, x, g):
        """Normalise ``x`` and modulate it with ``g``; both are [B, L, d]."""
        normed = self.ln(x)
        scale = 1.0 + self.gamma(g)   # the "+1" makes gamma(g) == 0 an identity scale
        shift = self.beta(g)
        return normed * scale + shift


class RCCBlock(nn.Module):
    """ Rotation-conditioned Transformer encoder block (pre-norm, residual).

    Both sub-layers (self-attention and feed-forward) are preceded by a
    CondLayerNorm driven by the conditioning tensor ``g``.
    """
    def __init__(self, d_model=128, n_heads=4, d_ff=512, dropout=0.2):
        super().__init__()
        # --- attention sub-layer ---
        self.condln1 = CondLayerNorm(d_model)
        self.mha = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.drop1 = nn.Dropout(dropout)

        # --- feed-forward sub-layer ---
        self.condln2 = CondLayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, g):
        """Apply the block to tokens ``x`` with condition ``g``; both [B, L+1, d]."""
        # Self-attention over the conditionally normalised tokens.
        attn_in = self.condln1(x, g)
        attn_out = self.mha(attn_in, attn_in, attn_in, need_weights=False)[0]
        x = x + self.drop1(attn_out)

        # Position-wise feed-forward, again conditionally normalised.
        ff_out = self.ff(self.condln2(x, g))
        return x + self.drop2(ff_out)


class GeoContextHAR(nn.Module):
    """ Window encoder + rotation-conditioned context + CLS classification head

    Each of the L windows in a sequence is encoded to a token and a
    conditioning vector. A learnable CLS token (conditioned on the mean of the
    window conditions) is prepended, learnable positional embeddings are
    added, the sequence runs through ``n_layers`` RCCBlocks, and the
    normalised CLS output is classified.

    Args:
        in_ch: raw sensor channels per window.
        d_model: token width.
        n_layers: number of RCCBlocks.
        n_heads: attention heads per block.
        d_ff: hidden width of each block's feed-forward layer.
        dropout: dropout probability used throughout.
        seq_len: maximum number of windows per sequence (sizes ``self.pos``).
        num_classes: number of output logits.
    """
    def __init__(
        self,
        in_ch=6,
        d_model=128,
        n_layers=2,
        n_heads=4,
        d_ff=512,
        dropout=0.2,
        seq_len=8,
        num_classes=8,
    ):
        super().__init__()
        self.seq_len = seq_len
        self.encoder = WindowEncoder(in_ch=in_ch, d_model=d_model, dropout=dropout)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        self.pos = nn.Parameter(torch.zeros(1, seq_len + 1, d_model))
        self.blocks = nn.ModuleList(
            [RCCBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        )
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, num_classes)

        nn.init.trunc_normal_(self.pos, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)

    def forward(self, x):
        """Classify a batch of window sequences.

        Args:
            x: tensor of shape [B, L, C, T] with L <= seq_len.

        Returns:
            Logits of shape [B, num_classes].
        """
        # x: [B, L, C, T]
        b, l, c, t = x.shape
        # reshape (not view) so non-contiguous batches, e.g. the result of a
        # permute/transpose, are accepted; for contiguous input it is still a view.
        x = x.reshape(b * l, c, t)
        token, g = self.encoder(x)              # [B*L, d], [B*L, d]
        token = token.view(b, l, -1)            # [B, L, d]
        g     = g.view(b, l, -1)                # [B, L, d]

        # Append CLS token and global condition g_cls
        cls = self.cls_token.expand(b, 1, -1)   # [B, 1, d]
        z = torch.cat([cls, token], dim=1)      # [B, L+1, d]
        g_cls = g.mean(dim=1, keepdim=True)     # [B, 1, d]
        g_all = torch.cat([g_cls, g], dim=1)    # [B, L+1, d]

        # Positional encoding. Slicing lets sequences shorter than seq_len
        # reuse the leading embeddings; for L == seq_len it is the full table.
        z = z + self.pos[:, : l + 1, :]

        # Conditional Transformer blocks
        for blk in self.blocks:
            z = blk(z, g_all)

        z = self.norm(z)
        cls_rep = z[:, 0, :]                   # [B, d]
        logits = self.head(cls_rep)            # [B, num_classes]
        return logits


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total


# ---------------------------
# 3) Utility: latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency on CPU.

    The model is moved to ``device`` and switched to eval mode, then fed one
    fixed random float32 tensor of shape ``input_shape``. ``n_warmup`` untimed
    passes are followed by ``n_runs`` passes timed with ``time.perf_counter``.

    Autograd is disabled only inside this function (``torch.no_grad`` context),
    so the caller's global grad mode is left untouched.

    Args:
        model: module to benchmark (left in eval mode on ``device``).
        input_shape: shape of the random input, e.g. (B, L, C, T).
        device: device to run on.
        n_warmup: number of untimed warm-up passes.
        n_runs: number of timed passes; must be at least 1.

    Returns:
        Dict with p50/p90/mean/std latency in milliseconds per forward pass
        (i.e. per input of shape ``input_shape``) and ``n_runs``.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = np.empty(n_runs, dtype=float)
    # Scoped no_grad instead of torch.set_grad_enabled(False): the latter is a
    # process-wide switch that would silently break any later training code.
    with torch.no_grad():
        # Warm-up
        for _ in range(n_warmup):
            _ = model(x)

        # Timed runs
        for i in range(n_runs):
            t0 = time.perf_counter()
            _ = model(x)
            t1 = time.perf_counter()
            times_ms[i] = (t1 - t0) * 1000.0

    stats = {
        "sequence_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "sequence_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "sequence_latency_mean_ms": float(times_ms.mean()),
        "sequence_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark
# ---------------------------
# Build the model with randomly initialised weights; latency does not depend
# on the weight values, so no checkpoint is loaded here.
model = GeoContextHAR(
    in_ch=IN_CHANNELS,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    seq_len=SEQ_LEN,
    num_classes=NUM_CLASSES,
)

n_params = count_parameters(model)
print(f"Number of trainable parameters: {n_params:,}")

# Batch size for latency measurement
BATCH_SIZE = 1  # sequence-level model: one sequence = 8 windows

input_shape = (BATCH_SIZE, SEQ_LEN, IN_CHANNELS, WINDOW_SAMPLES)
print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, L, C, T)")

stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

print("\nCPU latency stats per sequence (8 windows):")
# Millisecond entries are printed with 4 decimals; counters are printed as-is.
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert sequence latency → per-window latency (divide by SEQ_LEN=8).
# This is an amortised figure: the model always processes SEQ_LEN windows at
# once, so it is not the latency of running a single window in isolation.
per_window_p50  = stats["sequence_latency_p50_ms"]  / SEQ_LEN
per_window_p90  = stats["sequence_latency_p90_ms"]  / SEQ_LEN
per_window_mean = stats["sequence_latency_mean_ms"] / SEQ_LEN

print("\nApproximate CPU latency per window (HAR window, 3 s @ 50 Hz):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[RCCMix-HAR Step 10 CPU latency benchmark completed]")


[Step 10: RCCMix-HAR CPU Inference Latency Benchmark]
Using device: cpu
torch.get_num_threads() = 1
SEQ_LEN=8, WINDOW_SAMPLES=150, IN_CHANNELS=6
D_MODEL=128, N_LAYERS=2, N_HEADS=4, NUM_CLASSES=8
Number of trainable parameters: 566,504

Measuring CPU latency with input shape: (1, 8, 6, 150) (B, L, C, T)

CPU latency stats per sequence (8 windows):
  sequence_latency_p50_ms: 5.5079
  sequence_latency_p90_ms: 5.8094
  sequence_latency_mean_ms: 5.5643
  sequence_latency_std_ms: 0.2565
  n_runs: 100

Approximate CPU latency per window (HAR window, 3 s @ 50 Hz):
  window_latency_p50_ms  ≈ 0.6885
  window_latency_p90_ms  ≈ 0.7262
  window_latency_mean_ms ≈ 0.6955

[RCCMix-HAR Step 10 CPU latency benchmark completed]


2. RCCMix-HAR++

In [1]:
# ================ Step 11: RCCMix-HAR++ CPU Inference Latency Benchmark (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import math
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

print("\n[Step 11: RCCMix-HAR++ CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed every RNG this cell touches (Python, NumPy, PyTorch) so that the random
# model weights and the random benchmark input are the same on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Use CPU explicitly, with one PyTorch intra-op thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# Hyperparameters of the benchmarked model.
# NOTE(review): presumably these must equal the values used when the Step 11
# model was trained - confirm against the training cell.
SEQ_LEN        = 8        # number of consecutive windows per sequence
WINDOW_SAMPLES = 150      # samples per window (e.g., 3s @ 50Hz)
IN_CHANNELS    = 6        # 3D acc + 3D gyro
D_MODEL        = 192      # divisible by 3
N_HEADS        = 6
N_LAYERS       = 3
D_FF           = 4 * D_MODEL
DROPOUT        = 0.2
NUM_CLASSES    = 8        # number of activity classes

print(f"SEQ_LEN={SEQ_LEN}, WINDOW_SAMPLES={WINDOW_SAMPLES}, IN_CHANNELS={IN_CHANNELS}")
print(f"D_MODEL={D_MODEL}, N_LAYERS={N_LAYERS}, N_HEADS={N_HEADS}, NUM_CLASSES={NUM_CLASSES}")


# ---------------------------
# 1) Model definition (exactly as in Step 11)
# ---------------------------
class DepthwiseSeparableConv1d(nn.Module):
    """Depthwise conv -> 1x1 pointwise conv -> BatchNorm -> GELU -> Dropout.

    The depthwise stage filters every input channel independently
    (``groups == in_ch``); the pointwise stage then mixes channels and maps
    them to ``out_ch``. Padding is ``(k // 2) * dilation``, so an odd kernel
    size keeps the temporal length unchanged.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        k: depthwise kernel size.
        dilation: dilation of the depthwise convolution.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, in_ch, out_ch, k, dilation=1, dropout=0.0):
        super().__init__()
        same_pad = dilation * (k // 2)
        # Depthwise: one temporal filter per channel, no bias (BatchNorm follows).
        self.dw = nn.Conv1d(
            in_channels=in_ch,
            out_channels=in_ch,
            kernel_size=k,
            padding=same_pad,
            dilation=dilation,
            groups=in_ch,
            bias=False,
        )
        # Pointwise: 1x1 channel mixing.
        self.pw = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the five stages in order to ``x`` of shape [N, in_ch, T]."""
        features = self.pw(self.dw(x))
        return self.drop(self.act(self.bn(features)))


class SEBlock(nn.Module):
    """Squeeze-and-excitation channel attention.

    Each channel is averaged over time, passed through a bias-free
    bottleneck MLP ending in a sigmoid, and the resulting per-channel gate in
    (0, 1) rescales the input.
    """
    def __init__(self, ch, reduction=4):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool1d(1)
        # Never let the bottleneck collapse to zero units.
        bottleneck = max(1, ch // reduction)
        self.fc = nn.Sequential(
            nn.Linear(ch, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, ch, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Rescale the channels of ``x`` ([B, C, T]) by their learned gates."""
        n, ch, _ = x.shape
        squeezed = self.pool(x).view(n, ch)      # [B, C] time-averaged descriptor
        gate = self.fc(squeezed)                 # [B, C] values in (0, 1)
        return x * gate.view(n, ch, 1)


class WindowEncoderV2(nn.Module):
    """
    Per-window encoder producing a token and a conditioning vector.

    Pipeline:
      - append the L2-norm of channels 0-2 and of channels 3-5 as extra channels
      - SE channel attention over the 8 augmented channels
      - three depthwise-separable conv branches (k=7/15/31, dilation=1/2/3)
      - 1x1 fusion, then [avg || max] pooling over time projected to d_model
      - 8 scalar statistics projected to the conditioning vector g
    """
    def __init__(self, in_ch=6, d_model=192, dropout=0.2):
        super().__init__()
        self.in_ch = in_ch
        self.aug_ch = in_ch + 2   # raw channels + acc_norm + gyro_norm
        self.se = SEBlock(self.aug_ch, reduction=4)

        # Each of the three branches contributes one third of the width.
        b_dim = d_model // 3
        assert b_dim * 3 == d_model, "D_MODEL must be divisible by 3 for WindowEncoderV2"

        self.b1 = DepthwiseSeparableConv1d(self.aug_ch, b_dim, k=7,  dilation=1, dropout=dropout)
        self.b2 = DepthwiseSeparableConv1d(self.aug_ch, b_dim, k=15, dilation=2, dropout=dropout)
        self.b3 = DepthwiseSeparableConv1d(self.aug_ch, b_dim, k=31, dilation=3, dropout=dropout)

        # Fusion of the concatenated branches.
        self.mix = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(d_model)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

        # [avg, max] pooled features (2*d_model) -> d_model token.
        self.token_proj = nn.Sequential(
            nn.Linear(2 * d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # 8 scalar statistics -> d_model conditioning vector.
        self.g_proj = nn.Sequential(
            nn.Linear(8, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model),
        )

    @staticmethod
    def _triaxial_norm(x, first):
        """Per-sample L2 norm of channels first..first+2, shape [BL, 1, T]."""
        a = x[:, first, :]
        b = x[:, first + 1, :]
        c = x[:, first + 2, :]
        # The 1e-8 keeps the sqrt away from an exact zero argument.
        return torch.sqrt(a ** 2 + b ** 2 + c ** 2 + 1e-8).unsqueeze(1)

    @staticmethod
    def _sensor_stats(axes, norm):
        """Return (rms, energy, mean-vector norm, mean variance) for one sensor.

        ``axes`` is the [BL, 3, T] slice of raw channels and ``norm`` the
        matching [BL, 1, T] norm channel; every result has shape [BL].
        """
        rms = norm.squeeze(1).pow(2).mean(dim=-1).sqrt()
        energy = axes.pow(2).mean(dim=(1, 2)).sqrt()
        mean_norm = axes.mean(dim=-1).pow(2).sum(dim=-1).sqrt()
        variance = axes.var(dim=-1).mean(dim=-1)
        return rms, energy, mean_norm, variance

    def forward(self, x):
        """Encode windows ``x`` of shape [B*L, 6, T] into ``(token, g)``."""
        _bl, _c, _t = x.shape  # unpacking also enforces a rank-3 input

        acc_norm = self._triaxial_norm(x, 0)
        gyr_norm = self._triaxial_norm(x, 3)

        # Augment with the norm channels, then re-weight channels with SE.
        x_aug = self.se(torch.cat([x, acc_norm, gyr_norm], dim=1))  # [BL, 8, T]

        # Multi-scale features, fused back to d_model channels.
        multi_scale = torch.cat(
            [branch(x_aug) for branch in (self.b1, self.b2, self.b3)], dim=1
        )  # [BL, d_model, T]
        z = self.drop(self.act(self.bn(self.mix(multi_scale))))

        # Dual pooling over time -> token.
        pooled = torch.cat([z.mean(dim=-1), z.max(dim=-1)[0]], dim=-1)
        token = self.token_proj(pooled)  # [BL, d_model]

        # Statistics are taken from the raw input (before SE), interleaved as
        # acc/gyro pairs: rms, energy, mean-norm, variance.
        acc_stats = self._sensor_stats(x[:, 0:3, :], acc_norm)
        gyr_stats = self._sensor_stats(x[:, 3:6, :], gyr_norm)
        interleaved = [stat for pair in zip(acc_stats, gyr_stats) for stat in pair]
        g_raw = torch.stack(interleaved, dim=-1)  # [BL, 8]
        g = self.g_proj(g_raw)  # [BL, d_model]

        return token, g


class CondLayerNorm(nn.Module):
    """Conditional LayerNorm (FiLM-style): LN(x) * (1 + gamma(g)) + beta(g)"""
    def __init__(self, d_model):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)
        # Both modulation terms are linear functions of the condition g.
        self.gamma = nn.Linear(d_model, d_model)
        self.beta  = nn.Linear(d_model, d_model)

    def forward(self, x, g):
        """Normalise ``x`` and modulate it with the same-shaped condition ``g``."""
        normed = self.ln(x)
        scale = 1.0 + self.gamma(g)   # the "+1" makes gamma(g) == 0 an identity scale
        shift = self.beta(g)
        return normed * scale + shift


class RCCBlock(nn.Module):
    """Rotation-conditioned Transformer encoder block (pre-norm, residual).

    Both sub-layers (self-attention and feed-forward) are preceded by a
    CondLayerNorm driven by the conditioning tensor ``g``.
    """
    def __init__(self, d_model=192, n_heads=6, d_ff=768, dropout=0.2):
        super().__init__()
        # --- attention sub-layer ---
        self.condln1 = CondLayerNorm(d_model)
        self.mha = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.drop1 = nn.Dropout(dropout)

        # --- feed-forward sub-layer ---
        self.condln2 = CondLayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, g):
        """Apply the block to tokens ``x`` with condition ``g``; both [B, L+1, d]."""
        # Self-attention over the conditionally normalised tokens.
        attn_in = self.condln1(x, g)
        attn_out = self.mha(attn_in, attn_in, attn_in, need_weights=False)[0]
        x = x + self.drop1(attn_out)

        # Position-wise feed-forward, again conditionally normalised.
        ff_out = self.ff(self.condln2(x, g))
        return x + self.drop2(ff_out)


class GeoContextHARV2(nn.Module):
    """RCCMix-HAR++: WindowEncoderV2 + rotation-conditioned Transformer + [CLS||mean] head

    Each of the L windows in a sequence is encoded to a token and a
    conditioning vector. A learnable CLS token (conditioned on the mean of the
    window conditions) is prepended, learnable positional embeddings are
    added, and the sequence runs through ``n_layers`` RCCBlocks. The
    classifier sees the normalised CLS output concatenated with the mean of
    the normalised window outputs.

    Args:
        in_ch: raw sensor channels per window.
        d_model: token width (WindowEncoderV2 requires it to be divisible by 3).
        n_layers: number of RCCBlocks.
        n_heads: attention heads per block.
        d_ff: hidden width of each block's feed-forward layer.
        dropout: dropout probability used throughout.
        seq_len: maximum number of windows per sequence (sizes ``self.pos``).
        num_classes: number of output logits.
    """
    def __init__(
        self,
        in_ch=6,
        d_model=192,
        n_layers=3,
        n_heads=6,
        d_ff=768,
        dropout=0.2,
        seq_len=8,
        num_classes=8,
    ):
        super().__init__()
        self.seq_len = seq_len
        self.encoder = WindowEncoderV2(in_ch=in_ch, d_model=d_model, dropout=dropout)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        self.pos = nn.Parameter(torch.zeros(1, seq_len + 1, d_model))
        self.blocks = nn.ModuleList(
            [RCCBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        )
        self.norm = nn.LayerNorm(d_model)
        self.head_drop = nn.Dropout(dropout)
        # use [CLS || mean] representation
        self.head = nn.Linear(2 * d_model, num_classes)

        nn.init.trunc_normal_(self.pos, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)

    def forward(self, x):
        """Classify a batch of window sequences.

        Args:
            x: tensor of shape [B, L, C, T] with L <= seq_len.

        Returns:
            Logits of shape [B, num_classes].
        """
        # x: [B, L, C, T]
        b, l, c, t = x.shape
        # reshape (not view) so non-contiguous batches, e.g. the result of a
        # permute/transpose, are accepted; for contiguous input it is still a view.
        x = x.reshape(b * l, c, t)
        token, g = self.encoder(x)          # [B*L, d], [B*L, d]
        token = token.view(b, l, -1)        # [B, L, d]
        g     = g.view(b, l, -1)            # [B, L, d]

        # CLS token + global g_cls
        cls = self.cls_token.expand(b, 1, -1)   # [B, 1, d]
        z = torch.cat([cls, token], dim=1)      # [B, L+1, d]
        g_cls = g.mean(dim=1, keepdim=True)     # [B, 1, d]
        g_all = torch.cat([g_cls, g], dim=1)    # [B, L+1, d]

        # Positional encoding. Slicing lets sequences shorter than seq_len
        # reuse the leading embeddings; for L == seq_len it is the full table.
        z = z + self.pos[:, : l + 1, :]

        # conditional Transformer
        for blk in self.blocks:
            z = blk(z, g_all)

        z = self.norm(z)
        cls_rep = z[:, 0, :]              # [B, d]
        mean_rep = z[:, 1:, :].mean(dim=1) # [B, d]
        feat = torch.cat([cls_rep, mean_rep], dim=-1)
        feat = self.head_drop(feat)
        logits = self.head(feat)          # [B, num_classes]
        return logits


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return the total number of elements in the model's trainable parameters."""
    # Frozen parameters (requires_grad == False) are filtered out first.
    trainable = [param for param in model.parameters() if param.requires_grad]
    return sum(param.numel() for param in trainable)


# ---------------------------
# 3) Utility: latency measurement (CPU)
# ---------------------------
def _sync_device(dev: torch.device):
    # CPU execution is synchronous; nothing to do here.
    if dev.type == "cuda":
        torch.cuda.synchronize()


def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency of the model (no data loading, no preprocessing).
    Returns latency stats in milliseconds per sequence.
    """
    model = model.to(device)
    model.eval()
    torch.set_grad_enabled(False)

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    # warm-up
    _sync_device(device)
    for _ in range(n_warmup):
        _ = model(x)
        _sync_device(device)

    # timed runs
    times_ms = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        _ = model(x)
        _sync_device(device)
        t1 = time.perf_counter()
        times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "sequence_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "sequence_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "sequence_latency_mean_ms": float(times_ms.mean()),
        "sequence_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark on CPU
# ---------------------------
# Build the model with randomly initialised weights; latency does not depend
# on the weight values, so no checkpoint is loaded here.
model = GeoContextHARV2(
    in_ch=IN_CHANNELS,
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    dropout=DROPOUT,
    seq_len=SEQ_LEN,
    num_classes=NUM_CLASSES,
)

n_params = count_parameters(model)
print(f"Number of trainable parameters: {n_params:,}")

# batch size for latency measurement
BATCH_SIZE = 1  # you can also try 2, 4, 8 for throughput analysis

input_shape = (BATCH_SIZE, SEQ_LEN, IN_CHANNELS, WINDOW_SAMPLES)
print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, L, C, T)")

stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

print("\nCPU latency stats (ms) per sequence (8 windows):")
# Millisecond entries are printed with 4 decimals; counters are printed as-is.
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Amortised per-window figure: the model always processes SEQ_LEN windows at
# once, so this is not the latency of running a single window in isolation.
per_window_p50 = stats["sequence_latency_p50_ms"] / SEQ_LEN
per_window_p90 = stats["sequence_latency_p90_ms"] / SEQ_LEN
per_window_mean = stats["sequence_latency_mean_ms"] / SEQ_LEN

print("\nApproximate CPU latency per window (divide by SEQ_LEN=8):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")
print("\n[RCCMix-HAR++ Step 11 CPU latency benchmark completed]")


[Step 11: RCCMix-HAR++ CPU Inference Latency Benchmark]
Using device: cpu
torch.get_num_threads() = 1
SEQ_LEN=8, WINDOW_SAMPLES=150, IN_CHANNELS=6
D_MODEL=192, N_LAYERS=3, N_HEADS=6, NUM_CLASSES=8
Number of trainable parameters: 1,936,976

Measuring CPU latency with input shape: (1, 8, 6, 150) (B, L, C, T)

CPU latency stats (ms) per sequence (8 windows):
  sequence_latency_p50_ms: 12.2629
  sequence_latency_p90_ms: 15.5251
  sequence_latency_mean_ms: 12.8934
  sequence_latency_std_ms: 1.9676
  n_runs: 100

Approximate CPU latency per window (divide by SEQ_LEN=8):
  window_latency_p50_ms  ≈ 1.5329
  window_latency_p90_ms  ≈ 1.9406
  window_latency_mean_ms ≈ 1.6117

[RCCMix-HAR++ Step 11 CPU latency benchmark completed]


3. KNN

In [2]:
# ================ KNN CPU Inference Latency Benchmark for HAR Windows (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

print("\n[KNN CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed Python and NumPy so the synthetic training set and query are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# HAR-like window settings: 6 channels, 150 time steps (e.g., 3 s @ 50 Hz)
N_CHANNELS   = 6
WINDOW_SAMPLES = 150
N_FEATURES   = N_CHANNELS * WINDOW_SAMPLES

# Dataset scale for KNN (can be adjusted to match your real setting)
N_TRAIN   = 20000      # number of stored windows in the KNN "training set"
N_CLASSES = 8          # number of activity classes

# Latency measurement settings
N_WARMUP  = 20
N_RUNS    = 200        # number of timed runs
BATCH_SIZE = 1         # number of query windows per call to predict()

print(f"N_TRAIN={N_TRAIN}, N_FEATURES={N_FEATURES}, N_CLASSES={N_CLASSES}")
print(f"N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")

# ---------------------------
# 1) Create synthetic HAR-like feature dataset
# ---------------------------
# Each sample represents one window, flattened: (6 channels × 150 samples) -> 900-dim vector.
# The data are standard-normal noise with random labels: only the array sizes
# matter for this timing, not the classification accuracy.
X_train = np.random.randn(N_TRAIN, N_FEATURES).astype(np.float32)
y_train = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# ---------------------------
# 2) Define KNN model (typical HAR baseline configuration)
# ---------------------------
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights="uniform",   # or "distance" if you prefer
    metric="euclidean",
    n_jobs=1,            # single-threaded for reproducible CPU latency
)

# "Training" for KNN = storing the training set
t0_fit = time.perf_counter()
knn.fit(X_train, y_train)
t1_fit = time.perf_counter()
fit_time_ms = (t1_fit - t0_fit) * 1000.0
print(f"\nKNN fit (store training set) time: {fit_time_ms:.3f} ms")

# Approximate memory footprint of stored data (X + y).
# This counts only the raw array bytes, not any index structure the
# estimator may build internally.
memory_bytes = X_train.nbytes + y_train.nbytes
memory_mb = memory_bytes / (1024 ** 2)
print(f"Approximate memory for stored data: {memory_mb:.3f} MB")

# ---------------------------
# 3) Utility: latency measurement on CPU
# ---------------------------
def measure_knn_latency(
    model: "KNeighborsClassifier",
    feature_dim: int,
    batch_size: int = 1,
    n_warmup: int = 20,
    n_runs: int = 200,
):
    """
    Measure pure KNN.predict() latency on CPU.

    One fixed standard-normal query batch of shape ``(batch_size, feature_dim)``
    is drawn from NumPy's global RNG and reused for every call, so each query
    row corresponds to one HAR window flattened to a feature vector.
    ``n_warmup`` untimed calls are followed by ``n_runs`` timed calls.

    Args:
        model: fitted estimator; only its ``predict`` method is used.
        feature_dim: number of features per query row.
        batch_size: number of query rows per ``predict`` call; must be >= 1.
        n_warmup: number of untimed warm-up calls.
        n_runs: number of timed calls; must be >= 1.

    Returns:
        Dict with p50/p90/mean/std latency in milliseconds per batch
        (i.e., per call to predict()), plus ``n_runs`` and ``batch_size``.

    Raises:
        ValueError: if ``n_runs`` or ``batch_size`` is smaller than 1.
    """
    # Fail early with a clear message instead of computing percentiles of an
    # empty array or querying with a zero-row batch.
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Synthetic query batch (fixed to avoid cache effects from data allocation)
    X_query = np.random.randn(batch_size, feature_dim).astype(np.float32)

    # Warm-up
    for _ in range(n_warmup):
        _ = model.predict(X_query)

    # Timed runs
    times_ms = np.empty(n_runs, dtype=float)
    for i in range(n_runs):
        t0 = time.perf_counter()
        _ = model.predict(X_query)
        t1 = time.perf_counter()
        times_ms[i] = (t1 - t0) * 1000.0

    stats = {
        "batch_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "batch_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "batch_latency_mean_ms": float(times_ms.mean()),
        "batch_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }
    return stats


# ---------------------------
# 4) Run KNN CPU latency benchmark
# ---------------------------
# Time predict() on the fitted KNN with the settings chosen above.
stats = measure_knn_latency(
    knn,
    feature_dim=N_FEATURES,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

print("\nCPU latency stats for KNN.predict() (per batch):")
# Millisecond entries are printed with 4 decimals; counters are printed as-is.
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert to "per-window" latency (since each sample = one HAR window).
# With BATCH_SIZE = 1 these equal the per-batch numbers.
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[KNN CPU inference latency benchmark completed]")



[KNN CPU Inference Latency Benchmark for HAR Windows]
N_TRAIN=20000, N_FEATURES=900, N_CLASSES=8
N_WARMUP=20, N_RUNS=200, BATCH_SIZE=1
X_train shape: (20000, 900), y_train shape: (20000,)

KNN fit (store training set) time: 15.751 ms
Approximate memory for stored data: 68.741 MB

CPU latency stats for KNN.predict() (per batch):
  batch_latency_p50_ms: 46.9475
  batch_latency_p90_ms: 48.5252
  batch_latency_mean_ms: 47.1534
  batch_latency_std_ms: 0.9841
  n_runs: 200
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 46.9475
  window_latency_p90_ms  ≈ 48.5252
  window_latency_mean_ms ≈ 47.1534

[KNN CPU inference latency benchmark completed]


4. Random Forest

In [4]:
# ================ Random Forest CPU Inference Latency Benchmark for HAR Windows (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

print("\n[Random Forest CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed both the stdlib and NumPy global RNGs; the synthetic data below is
# drawn from NumPy's global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# HAR-like window settings: 6 channels, 150 time steps (e.g., 3 s @ 50 Hz)
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150
N_FEATURES      = N_CHANNELS * WINDOW_SAMPLES   # length of one flattened window

# Dataset scale for Random Forest (can be adjusted to match your real setting)
N_TRAIN   = 20000      # number of training windows
N_CLASSES = 8          # number of activity classes

# Latency measurement settings
N_WARMUP   = 20         # number of untimed predict() calls before measurement
N_RUNS     = 200        # number of timed runs
BATCH_SIZE = 1          # number of query windows per call to predict()

print(f"N_TRAIN={N_TRAIN}, N_FEATURES={N_FEATURES}, N_CLASSES={N_CLASSES}")
print(f"N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")

# ---------------------------
# 1) Create synthetic HAR-like feature dataset
# ---------------------------
# Each sample represents one window, flattened: (6 channels × 150 samples) -> 900-dim vector.
# Features are standard-normal noise and labels are drawn independently of the
# features, so this data is only suitable for timing, not for accuracy.
X_train = np.random.randn(N_TRAIN, N_FEATURES).astype(np.float32)
y_train = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# ---------------------------
# 2) Define Random Forest model (typical HAR baseline configuration)
# ---------------------------
rf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    n_jobs=1,             # single-threaded for reproducible CPU latency
    random_state=SEED,
)

# Training Random Forest (wall-clock time of fit() only, reported in ms)
t0_fit = time.perf_counter()
rf.fit(X_train, y_train)
t1_fit = time.perf_counter()
fit_time_ms = (t1_fit - t0_fit) * 1000.0
print(f"\nRandom Forest fit time: {fit_time_ms:.3f} ms")

# Approximate model size via pickle: length of the serialized bytes, in MiB
model_bytes = len(pickle.dumps(rf))
model_mb = model_bytes / (1024 ** 2)
print(f"Approximate Random Forest model size: {model_mb:.3f} MB")

# ---------------------------
# 3) Utility: latency measurement on CPU
# ---------------------------
def measure_rf_latency(
    model: RandomForestClassifier,
    feature_dim: int,
    batch_size: int = 1,
    n_warmup: int = 20,
    n_runs: int = 200,
):
    """
    Benchmark ``model.predict()`` on a fixed random query batch.

    One float32 batch of shape (batch_size, feature_dim) is drawn from NumPy's
    global RNG and reused for every call. ``n_warmup`` untimed calls are made
    first, then ``n_runs`` calls are timed with ``time.perf_counter()``.

    Returns a dict with the p50 / p90 / mean / std latency in milliseconds per
    predict() call, together with ``n_runs`` and ``batch_size``.
    """
    # Built once so that allocation is not part of the timed region.
    X_query = np.random.randn(batch_size, feature_dim).astype(np.float32)

    # Untimed calls before measuring.
    for _ in range(n_warmup):
        model.predict(X_query)

    def _timed_predict_ms() -> float:
        # Only the predict() call sits between the two clock reads.
        started = time.perf_counter()
        model.predict(X_query)
        finished = time.perf_counter()
        return (finished - started) * 1000.0

    latencies = np.array([_timed_predict_ms() for _ in range(n_runs)], dtype=float)

    return {
        "batch_latency_p50_ms": float(np.percentile(latencies, 50)),
        "batch_latency_p90_ms": float(np.percentile(latencies, 90)),
        "batch_latency_mean_ms": float(latencies.mean()),
        "batch_latency_std_ms": float(latencies.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }


# ---------------------------
# 4) Run Random Forest CPU latency benchmark
# ---------------------------
# Time rf.predict() using the module-level benchmark settings.
stats = measure_rf_latency(
    rf,
    feature_dim=N_FEATURES,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

print("\nCPU latency stats for RandomForest.predict() (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        # Millisecond-valued entries are shown with four decimals.
        print(f"  {k}: {v:.4f}")
    else:
        # n_runs and batch_size are integers and printed unformatted.
        print(f"  {k}: {v}")

# Convert to "per-window" latency (since each sample = one HAR window):
# the per-call latency is divided by the number of windows in the batch.
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[Random Forest CPU inference latency benchmark completed]")



[Random Forest CPU Inference Latency Benchmark for HAR Windows]
N_TRAIN=20000, N_FEATURES=900, N_CLASSES=8
N_WARMUP=20, N_RUNS=200, BATCH_SIZE=1
X_train shape: (20000, 900), y_train shape: (20000,)

Random Forest fit time: 200355.768 ms
Approximate Random Forest model size: 133.679 MB

CPU latency stats for RandomForest.predict() (per batch):
  batch_latency_p50_ms: 6.8383
  batch_latency_p90_ms: 7.2557
  batch_latency_mean_ms: 6.9059
  batch_latency_std_ms: 0.2590
  n_runs: 200
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 6.8383
  window_latency_p90_ms  ≈ 7.2557
  window_latency_mean_ms ≈ 6.9059

[Random Forest CPU inference latency benchmark completed]


5. InceptionTime

In [17]:
# ================ InceptionTime CPU Inference Latency Benchmark for HAR Windows (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

print("\n[InceptionTime CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed every RNG in use so model initialisation and the random input repeat.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# The benchmark runs on the CPU with PyTorch limited to a single thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# HAR-like window settings: 6 channels, 150 time steps (e.g., 3 s @ 50 Hz)
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150
N_CLASSES       = 8

print(f"N_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# InceptionTime hyperparameters (typical multivariate configuration)
NB_FILTERS        = 32             # output channels of each branch in a module
KERNEL_SIZES      = (10, 20, 40)   # following common InceptionTime practice
USE_BOTTLENECK   = True            # apply a 1x1 convolution before the wide convolutions
BOTTLENECK_SIZE  = 32              # output channels of that 1x1 convolution
NB_BLOCKS         = 3              # number of residual Inception blocks
DROPOUT           = 0.0            # InceptionTime usually does not use dropout in the original paper


# ---------------------------
# 1) InceptionTime model definition
#    (multivariate 1D time-series, following common implementations)
# ---------------------------
class InceptionModule(nn.Module):
    """
    One Inception module for 1D multivariate time series.

    The input ``[B, C, T]`` feeds two paths:
      * an optional 1x1 bottleneck followed by one convolution per entry of
        ``kernel_sizes`` (each producing ``n_filters`` channels);
      * a max-pool (kernel 3, stride 1) followed by a 1x1 convolution that
        also produces ``n_filters`` channels.

    All branch outputs are cropped to the shortest time length, concatenated
    along the channel axis, batch-normalised and passed through ReLU, giving
    ``n_filters * (len(kernel_sizes) + 1)`` output channels.
    """

    def __init__(
        self,
        in_channels: int,
        n_filters: int = 32,
        kernel_sizes=(10, 20, 40),
        bottleneck_channels: int = 32,
        use_bottleneck: bool = True,
    ):
        super().__init__()
        self.use_bottleneck = use_bottleneck

        # The bottleneck is only built for multi-channel inputs.
        has_bottleneck = use_bottleneck and in_channels > 1
        self.bottleneck = (
            nn.Conv1d(
                in_channels,
                bottleneck_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
            if has_bottleneck
            else None
        )
        branch_in_channels = bottleneck_channels if has_bottleneck else in_channels

        # padding = k // 2 approximates "same"; for even k the output is one
        # step longer than the input, which forward() crops away.
        self.conv_list = nn.ModuleList(
            [
                nn.Conv1d(
                    branch_in_channels,
                    n_filters,
                    kernel_size=width,
                    stride=1,
                    padding=width // 2,
                    bias=False,
                )
                for width in kernel_sizes
            ]
        )

        # Pooling path operates on the raw input, not on the bottleneck output.
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=1, padding=1)
        self.conv_pool = nn.Conv1d(
            in_channels, n_filters, kernel_size=1, stride=1, padding=0, bias=False
        )

        n_branches = len(kernel_sizes) + 1
        self.bn = nn.BatchNorm1d(n_filters * n_branches)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        # x: [B, C, T]
        skip_bottleneck = (not self.use_bottleneck) or self.bottleneck is None
        reduced = x if skip_bottleneck else self.bottleneck(x)

        branches = [conv(reduced) for conv in self.conv_list]
        branches.append(self.conv_pool(self.maxpool(x)))

        # Crop every branch to the shortest time length so they can be stacked.
        shortest = min(branch.shape[-1] for branch in branches)
        stacked = torch.cat([branch[..., :shortest] for branch in branches], dim=1)

        return self.act(self.bn(stacked))


class InceptionBlock(nn.Module):
    """
    Three InceptionModules in sequence with an optional projected shortcut.

    The first module maps ``in_channels`` to
    ``n_filters * (len(kernel_sizes) + 1)`` channels; the next two keep that
    width. When ``use_residual`` is true, the block input is projected with a
    1x1 convolution + batch norm and added to the output of the third module
    before the final ReLU.
    """

    def __init__(
        self,
        in_channels: int,
        n_filters: int = 32,
        kernel_sizes=(10, 20, 40),
        bottleneck_channels: int = 32,
        use_bottleneck: bool = True,
        use_residual: bool = True,
    ):
        super().__init__()
        self.use_residual = use_residual

        out_channels = n_filters * (len(kernel_sizes) + 1)

        # Settings shared by all three modules; only the input width differs.
        module_kwargs = dict(
            n_filters=n_filters,
            kernel_sizes=kernel_sizes,
            bottleneck_channels=bottleneck_channels,
            use_bottleneck=use_bottleneck,
        )
        self.inception1 = InceptionModule(in_channels=in_channels, **module_kwargs)
        self.inception2 = InceptionModule(in_channels=out_channels, **module_kwargs)
        self.inception3 = InceptionModule(in_channels=out_channels, **module_kwargs)

        # Shortcut projection so the input matches the module output width.
        self.residual = (
            nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm1d(out_channels),
            )
            if use_residual
            else None
        )

        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        # x: [B, C, T]
        out = self.inception3(self.inception2(self.inception1(x)))

        if self.use_residual and self.residual is not None:
            shortcut = self.residual(x)
            # Crop both tensors to the common time length if they differ.
            if shortcut.shape[-1] != out.shape[-1]:
                common = min(shortcut.shape[-1], out.shape[-1])
                shortcut = shortcut[..., :common]
                out = out[..., :common]
            out = out + shortcut

        return self.act(out)


class InceptionTime(nn.Module):
    """
    InceptionTime classifier: ``n_blocks`` InceptionBlocks, global average
    pooling over time, dropout, and a linear layer producing ``n_classes``
    logits.

    Input is ``[B, C, T]``; output is ``[B, n_classes]``.
    """

    def __init__(
        self,
        in_channels: int,
        n_classes: int,
        n_filters: int = 32,
        kernel_sizes=(10, 20, 40),
        bottleneck_channels: int = 32,
        use_bottleneck: bool = True,
        n_blocks: int = 3,
        use_residual: bool = True,
        dropout: float = 0.0,
    ):
        super().__init__()

        block_out_channels = n_filters * (len(kernel_sizes) + 1)
        # widths[i] is the input width of block i; the last entry is the width
        # seen by the classifier (equal to in_channels when there are no blocks).
        widths = [in_channels] + [block_out_channels] * n_blocks

        self.blocks = nn.ModuleList(
            [
                InceptionBlock(
                    in_channels=width,
                    n_filters=n_filters,
                    kernel_sizes=kernel_sizes,
                    bottleneck_channels=bottleneck_channels,
                    use_bottleneck=use_bottleneck,
                    use_residual=use_residual,
                )
                for width in widths[:-1]
            ]
        )
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(widths[-1], n_classes)

    def forward(self, x):
        # x: [B, C, T]
        features = x
        for block in self.blocks:
            features = block(features)
        pooled = self.gap(features).squeeze(-1)   # [B, C_out]
        return self.fc(self.dropout(pooled))      # [B, n_classes]


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return the total number of elements over all parameters with requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# ---------------------------
# 3) Utility: latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency on CPU.

    The model is moved to ``device`` and switched to eval mode, then called on
    one fixed random float32 tensor of shape ``input_shape``: ``n_warmup``
    untimed calls followed by ``n_runs`` calls timed with
    ``time.perf_counter()``.

    Gradient tracking is disabled only for the duration of the benchmark; the
    caller's autograd mode is restored on return (also if the model raises).

    Returns latency stats in milliseconds per batch (per call to model()):
    p50, p90, mean and std, plus ``n_runs`` and ``batch_size``
    (= ``input_shape[0]``).
    For InceptionTime here, each batch element corresponds to one HAR window.
    """
    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = []
    # Scoped no_grad(): a global torch.set_grad_enabled(False) would stay in
    # effect after this function returns and silently break any later code in
    # the same process that relies on autograd.
    with torch.no_grad():
        # Warm-up
        for _ in range(n_warmup):
            _ = model(x)

        # Timed runs
        for _ in range(n_runs):
            t0 = time.perf_counter()
            _ = model(x)
            t1 = time.perf_counter()
            times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "batch_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "batch_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "batch_latency_mean_ms": float(times_ms.mean()),
        "batch_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
        "batch_size": int(input_shape[0]),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark
# ---------------------------
# Build the network from the module-level hyperparameters; weights are left at
# their random initial values (no training happens in this cell).
model = InceptionTime(
    in_channels=N_CHANNELS,
    n_classes=N_CLASSES,
    n_filters=NB_FILTERS,
    kernel_sizes=KERNEL_SIZES,
    bottleneck_channels=BOTTLENECK_SIZE,
    use_bottleneck=USE_BOTTLENECK,
    n_blocks=NB_BLOCKS,
    use_residual=True,
    dropout=DROPOUT,
)

n_params = count_parameters(model)
print(f"Number of trainable parameters: {n_params:,}")

# Batch size for latency measurement (per-window model)
BATCH_SIZE = 1  # you can also test larger batches for throughput analysis
input_shape = (BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES)  # [B, C, T]

print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, C, T)")

# Warm-up and run counts are given literally here rather than via constants.
stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

print("\nCPU latency stats for InceptionTime.forward() (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        # Millisecond-valued entries are shown with four decimals.
        print(f"  {k}: {v:.4f}")
    else:
        # n_runs and batch_size are integers and printed unformatted.
        print(f"  {k}: {v}")

# Since each sample = one HAR window, per-window latency = per-batch latency / batch_size
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[InceptionTime CPU inference latency benchmark completed]")


[InceptionTime CPU Inference Latency Benchmark for HAR Windows]
Using device: cpu
torch.get_num_threads() = 1
N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
Number of trainable parameters: 748,680

Measuring CPU latency with input shape: (1, 6, 150) (B, C, T)

CPU latency stats for InceptionTime.forward() (per batch):
  batch_latency_p50_ms: 10.5010
  batch_latency_p90_ms: 11.5529
  batch_latency_mean_ms: 10.7735
  batch_latency_std_ms: 1.0131
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 10.5010
  window_latency_p90_ms  ≈ 11.5529
  window_latency_mean_ms ≈ 10.7735

[InceptionTime CPU inference latency benchmark completed]


In [5]:
!pip -q install sktime

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

6. minirocket

In [7]:
# ================ MiniRocket CPU Inference Latency Benchmark for HAR Windows (Standalone) ================
!pip -q install sktime

import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_DEFAULT_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import numba
from sklearn.linear_model import RidgeClassifier
from sktime.transformations.panel.rocket import MiniRocketMultivariate

print("\n[MiniRocket CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed the stdlib, NumPy and PyTorch RNGs; the synthetic data below is drawn
# from NumPy's global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single-core CPU: cap both PyTorch and Numba at one thread.
device = "cpu"
torch.set_num_threads(1)
numba.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")
print(f"NUMBA threads = {numba.get_num_threads()}")

# HAR-like window settings: 6 channels, 150 time steps (e.g., 3 s @ 50 Hz)
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150
N_CLASSES       = 8

print(f"N_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# Training set size for MiniRocket + linear head (aligned with KNN/RF)
N_TRAIN   = 20000   # number of training windows

# Latency measurement settings
N_WARMUP   = 20     # number of untimed transform+predict calls before measurement
N_RUNS     = 100    # number of timed transform+predict calls
BATCH_SIZE = 1      # number of query windows per call

print(f"N_TRAIN={N_TRAIN}, N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")


# ---------------------------
# 1) Helper: convert 3D numpy to sktime nested DataFrame
#    X_np: (n_instances, n_channels, n_timepoints)
# ---------------------------
def to_nested_dataframe(X_np: np.ndarray) -> pd.DataFrame:
    """
    Convert a 3D array (n_instances, n_channels, n_timepoints) into a nested
    DataFrame with one row per instance and one column ``dim_<c>`` per channel,
    where every cell holds a ``pd.Series`` with that channel's time series.
    """
    # Unpacking three values also rejects inputs that are not 3-dimensional.
    n_instances, n_channels, _n_timepoints = X_np.shape
    return pd.DataFrame(
        {
            f"dim_{ch}": [pd.Series(X_np[row, ch, :]) for row in range(n_instances)]
            for ch in range(n_channels)
        }
    )


# ---------------------------
# 2) Create synthetic HAR-like training dataset
# ---------------------------
# Each sample: one window [channels, time] with Gaussian noise
# Features are standard-normal noise and labels are drawn independently of the
# features, so this data is only suitable for timing, not for accuracy.
X_train_np = np.random.randn(N_TRAIN, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
y_train    = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"X_train_np shape: {X_train_np.shape}, y_train shape: {y_train.shape}")

# Convert to the nested DataFrame layout (one row per window, one column per channel).
X_train = to_nested_dataframe(X_train_np)
print(f"X_train nested DataFrame shape: {X_train.shape}")


# ---------------------------
# 3) Define MiniRocket + linear classifier
# ---------------------------
minirocket = MiniRocketMultivariate(
    num_kernels=10000,             # standard MiniRocket setting
    max_dilations_per_kernel=32,
    n_jobs=1,                      # single-core for fair comparison
    random_state=SEED,
)

clf = RidgeClassifier(
    alpha=1.0,
    fit_intercept=True
)

# ---------------------------
# 4) Fit MiniRocket feature extractor + linear classifier
# ---------------------------
# The reported fit time covers three steps: fitting the transformer,
# transforming the training set, and fitting the ridge classifier.
t0_fit = time.perf_counter()
minirocket.fit(X_train, y_train)
X_train_trans = minirocket.transform(X_train)
clf.fit(X_train_trans, y_train)
t1_fit = time.perf_counter()
fit_time_ms = (t1_fit - t0_fit) * 1000.0

# Width of the transformed feature matrix and size of the linear head
# (coefficients plus intercepts).
n_features_rocket = X_train_trans.shape[1]
n_linear_params = clf.coef_.size + clf.intercept_.size

print(f"\nMiniRocket fit + linear head fit time: {fit_time_ms:.3f} ms")
print(f"MiniRocket transformed feature dimension: {n_features_rocket}")
print(f"Approximate number of linear head parameters: {n_linear_params:,}")


# ---------------------------
# 5) Utility: latency measurement for MiniRocket + linear head
# ---------------------------
def measure_minirocket_latency(
    transformer: MiniRocketMultivariate,
    classifier,
    X_query_nested: pd.DataFrame,
    batch_size: int,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Benchmark the inference pipeline ``transformer.transform()`` followed by
    ``classifier.predict()`` on a fixed query batch.

    X_query_nested: nested DataFrame with shape (batch_size, n_channels).
    ``batch_size`` is only recorded in the result; it is not checked against
    the DataFrame. ``n_warmup`` untimed passes precede ``n_runs`` timed ones.

    Returns a dict with p50 / p90 / mean / std latency in milliseconds per
    batch, plus ``n_runs`` and ``batch_size``.
    """
    # Untimed passes before measuring.
    for _ in range(n_warmup):
        warm_features = transformer.transform(X_query_nested)
        classifier.predict(warm_features)

    def _timed_pass_ms() -> float:
        # Feature extraction and classification are timed together.
        started = time.perf_counter()
        features = transformer.transform(X_query_nested)
        classifier.predict(features)
        finished = time.perf_counter()
        return (finished - started) * 1000.0

    latencies = np.array([_timed_pass_ms() for _ in range(n_runs)], dtype=float)

    return {
        "batch_latency_p50_ms": float(np.percentile(latencies, 50)),
        "batch_latency_p90_ms": float(np.percentile(latencies, 90)),
        "batch_latency_mean_ms": float(latencies.mean()),
        "batch_latency_std_ms": float(latencies.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }


# ---------------------------
# 6) Build a synthetic query batch and run the benchmark
# ---------------------------
# One fixed random query batch, converted to the nested layout once so the
# conversion is not part of the timed region.
X_query_np = np.random.randn(BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
X_query = to_nested_dataframe(X_query_np)

print(f"\nQuery batch nested DataFrame shape: {X_query.shape}")

# Time transform() + predict() using the module-level benchmark settings.
stats = measure_minirocket_latency(
    minirocket,
    clf,
    X_query_nested=X_query,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

print("\nCPU latency stats for MiniRocket + linear head (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        # Millisecond-valued entries are shown with four decimals.
        print(f"  {k}: {v:.4f}")
    else:
        # n_runs and batch_size are integers and printed unformatted.
        print(f"  {k}: {v}")

# Each sample corresponds to one HAR window, so per-window latency = per-batch / batch_size
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[MiniRocket CPU inference latency benchmark completed]")


[MiniRocket CPU Inference Latency Benchmark for HAR Windows]
Using device: cpu
torch.get_num_threads() = 1
NUMBA threads = 1
N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
N_TRAIN=20000, N_WARMUP=20, N_RUNS=100, BATCH_SIZE=1
X_train_np shape: (20000, 6, 150), y_train shape: (20000,)
X_train nested DataFrame shape: (20000, 6)

MiniRocket fit + linear head fit time: 99168.711 ms
MiniRocket transformed feature dimension: 9996
Approximate number of linear head parameters: 79,976

Query batch nested DataFrame shape: (1, 6)

CPU latency stats for MiniRocket + linear head (per batch):
  batch_latency_p50_ms: 49.1377
  batch_latency_p90_ms: 51.1574
  batch_latency_mean_ms: 50.8583
  batch_latency_std_ms: 6.1007
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 49.1377
  window_latency_p90_ms  ≈ 51.1574
  window_latency_mean_ms ≈ 50.8583

[MiniRocket CPU inference latency benchmark completed]


7. MiniRocket (Deployment-Friendly)

In [13]:
# ================ MiniRocket-Lite CPU Inference Latency Benchmark for HAR Windows (Deployment-Friendly) ================
# This script is intended to be self-contained and comparable to the "standard" MiniRocket
# benchmark you ran before:
#   - 6 channels, 150 time steps (~3 s @ 50 Hz)
#   - N_TRAIN = 20,000 synthetic windows
#   - Latency metric: MiniRocket.transform() + RidgeClassifier.predict()
#   - Deployment-friendly configuration:
#       * NUM_KERNELS = 2000 (lighter than the standard 10k)
#       * Attempts to use up to 4 threads, but adapts to the effective Numba limit
#         in the current Python process (e.g., if Numba is locked to 1 thread, we
#         automatically set n_jobs=1 to avoid errors).

!pip -q install sktime psutil

# Only modules that do not load a BLAS/OpenMP runtime are imported up front, so
# the thread budget can be exported before any numerical library is loaded.
import os
import time
import random
import warnings
import platform

import psutil

warnings.filterwarnings("ignore")

print("\n[MiniRocket-Lite CPU Inference Latency Benchmark for HAR Windows (Deployment-Friendly)]")

# ---------------------------
# 0) Thread budget (decide before importing numpy/pandas/torch/numba)
# ---------------------------
logical_cores = psutil.cpu_count(logical=True) or 1
physical_cores = psutil.cpu_count(logical=False) or logical_cores
TARGET_THREADS = min(4, logical_cores)

# Configure BLAS-related environment variables for a deployment-friendly setting.
# These are typically read when the native library is first loaded, so they are
# exported BEFORE numpy/pandas/torch are imported; setting them afterwards has
# no effect on a library that is already initialised. If this runs in a process
# where those libraries were imported earlier (e.g. a previous notebook cell),
# the values below may still be ignored, and Numba's effective maximum may
# already be fixed as well.
os.environ["OMP_NUM_THREADS"] = str(TARGET_THREADS)
os.environ["MKL_NUM_THREADS"] = str(TARGET_THREADS)
os.environ["OPENBLAS_NUM_THREADS"] = str(TARGET_THREADS)

# Import numpy / pandas / torch / numba / MiniRocket after setting env vars
import numpy as np
import pandas as pd
import torch
import numba
from sklearn.linear_model import RidgeClassifier
from sktime.transformations.panel.rocket import MiniRocketMultivariate

# ---------------------------
# Reproducibility
# ---------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Try to request TARGET_THREADS for PyTorch
torch.set_num_threads(TARGET_THREADS)

# Try to request TARGET_THREADS for Numba, but Numba may already be constrained
try:
    numba.set_num_threads(TARGET_THREADS)
except ValueError as e:
    # Best effort: keep whatever thread count Numba currently allows.
    print(f"[Warning] numba.set_num_threads({TARGET_THREADS}) failed: {e}")
    print("          Falling back to Numba's existing thread configuration.")

# Effective Numba threads in this process
effective_numba_threads = numba.get_num_threads()
# MiniRocket will not request more threads than Numba actually allows
MINIROCKET_THREADS = min(TARGET_THREADS, effective_numba_threads)

print("Platform                   :", platform.system(), platform.release())
print("CPU                        :", platform.processor())
print("Physical cores             :", physical_cores)
print("Logical cores              :", logical_cores)
print("Target threads (requested) :", TARGET_THREADS)
print("Numba threads (effective)  :", effective_numba_threads)
print("MiniRocket n_jobs (used)   :", MINIROCKET_THREADS)
print("torch.get_num_threads()    :", torch.get_num_threads())

# Only used for the log line below; everything in this cell runs on the CPU.
device = "cpu"
print(f"\nUsing device: {device}")

# ---------------------------
# 1) HAR-like window settings (aligned with previous MiniRocket benchmark)
# ---------------------------
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150   # e.g., 3 s @ 50 Hz
N_CLASSES       = 8

# Training set size (same as previous benchmark for comparability)
N_TRAIN   = 20000

# Latency measurement settings
N_WARMUP   = 20   # number of untimed transform+predict calls before measurement
N_RUNS     = 100  # number of timed transform+predict calls
BATCH_SIZE = 1  # per-window latency

# Deployment-friendly MiniRocket configuration (lighter than the standard 10k-kernel version)
NUM_KERNELS = 2000

# The sampling rate is an assumption used only to report the window length in seconds.
SAMPLE_RATE_HZ   = 50.0
WINDOW_SECONDS   = WINDOW_SAMPLES / SAMPLE_RATE_HZ

print(f"\nN_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")
print(f"N_TRAIN={N_TRAIN}, N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")
print(f"NUM_KERNELS (MiniRocket-Lite) = {NUM_KERNELS}")
print(f"Assumed sampling rate = {SAMPLE_RATE_HZ} Hz → window length ≈ {WINDOW_SECONDS:.3f} s")

# ---------------------------
# 2) Helper: convert 3D numpy to sktime nested DataFrame
#    X_np: (n_instances, n_channels, n_timepoints)
# ---------------------------
def to_nested_dataframe(X_np: np.ndarray) -> pd.DataFrame:
    """
    Build a nested DataFrame from a 3D array (n_instances, n_channels,
    n_timepoints): one row per instance, one column ``dim_<c>`` per channel,
    each cell a ``pd.Series`` holding that instance's series for the channel.
    """
    # Unpacking three values also rejects inputs that are not 3-dimensional.
    n_instances, n_channels, _ = X_np.shape
    instance_ids = range(n_instances)

    def _channel_column(channel: int) -> list:
        # one pandas Series per instance for the given channel
        return [pd.Series(X_np[idx, channel, :]) for idx in instance_ids]

    columns = {}
    for channel in range(n_channels):
        columns[f"dim_{channel}"] = _channel_column(channel)
    return pd.DataFrame(columns)

# ---------------------------
# 3) Create synthetic HAR-like training dataset
# ---------------------------
# Features are standard-normal noise and labels are drawn independently of the
# features, so this data is only suitable for timing, not for accuracy.
X_train_np = np.random.randn(N_TRAIN, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
y_train    = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"\nX_train_np shape: {X_train_np.shape}, y_train shape: {y_train.shape}")

# Convert to the nested DataFrame layout (one row per window, one column per channel).
X_train = to_nested_dataframe(X_train_np)
print(f"X_train nested DataFrame shape: {X_train.shape}")

# ---------------------------
# 4) Define MiniRocket-Lite + linear classifier
# ---------------------------
minirocket_lite = MiniRocketMultivariate(
    num_kernels=NUM_KERNELS,
    max_dilations_per_kernel=32,
    n_jobs=MINIROCKET_THREADS,  # do not exceed Numba's effective thread limit
    random_state=SEED,
)

clf = RidgeClassifier(
    alpha=1.0,
    fit_intercept=True
)

# ---------------------------
# 5) Fit MiniRocket-Lite feature extractor + linear classifier
# ---------------------------
# The reported fit time covers three steps: fitting the transformer,
# transforming the training set, and fitting the ridge classifier.
t0_fit = time.perf_counter()
minirocket_lite.fit(X_train, y_train)
X_train_trans = minirocket_lite.transform(X_train)
clf.fit(X_train_trans, y_train)
t1_fit = time.perf_counter()
fit_time_ms = (t1_fit - t0_fit) * 1000.0

# Width of the transformed feature matrix and size of the linear head
# (coefficients plus intercepts).
n_features_rocket = X_train_trans.shape[1]
n_linear_params = clf.coef_.size + clf.intercept_.size

print(f"\nMiniRocket-Lite fit + linear head fit time: {fit_time_ms:.3f} ms")
print(f"MiniRocket-Lite transformed feature dimension: {n_features_rocket}")
print(f"Approximate number of linear head parameters: {n_linear_params:,}")

# ---------------------------
# 6) Utility: latency measurement for MiniRocket-Lite + linear head
# ---------------------------
def measure_minirocket_latency(
    transformer: MiniRocketMultivariate,
    classifier,
    X_query_nested: pd.DataFrame,
    batch_size: int,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Time the end-to-end inference path: transformer.transform() followed by
    classifier.predict() on the same query batch.

    Parameters:
      transformer    : fitted feature extractor exposing transform().
      classifier     : fitted estimator exposing predict().
      X_query_nested : nested DataFrame holding the query batch.
      batch_size     : value reported back under "batch_size"; it is not
                       checked against X_query_nested.
      n_warmup       : number of untimed passes executed first.
      n_runs         : number of timed passes.

    Returns a dict with p50 / p90 / mean / std of the per-call latency in
    milliseconds, plus "n_runs" and "batch_size".
    """
    # Untimed passes so one-off start-up costs do not enter the statistics.
    for _ in range(n_warmup):
        classifier.predict(transformer.transform(X_query_nested))

    # Timed passes: each sample spans one transform + predict round trip.
    elapsed_ms = []
    for _ in range(n_runs):
        start = time.perf_counter()
        features = transformer.transform(X_query_nested)
        classifier.predict(features)
        stop = time.perf_counter()
        elapsed_ms.append((stop - start) * 1000.0)

    elapsed_ms = np.asarray(elapsed_ms, dtype=float)
    median_ms = float(np.percentile(elapsed_ms, 50))
    p90_ms = float(np.percentile(elapsed_ms, 90))
    mean_ms = float(np.mean(elapsed_ms))
    std_ms = float(np.std(elapsed_ms))

    return {
        "batch_latency_p50_ms": median_ms,
        "batch_latency_p90_ms": p90_ms,
        "batch_latency_mean_ms": mean_ms,
        "batch_latency_std_ms": std_ms,
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }

# ---------------------------
# 7) Build a synthetic query batch and run the benchmark
# ---------------------------
# The query batch is fresh random noise with the same per-window layout as the
# training data; it is converted to the nested format expected by transform().
X_query_np = np.random.randn(BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
X_query = to_nested_dataframe(X_query_np)

print(f"\nQuery batch nested DataFrame shape: {X_query.shape}")

# The same query batch is reused for every warm-up and timed run.
stats = measure_minirocket_latency(
    minirocket_lite,
    clf,
    X_query_nested=X_query,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

# Latency entries (keys ending in "_ms") are printed with four decimals;
# the remaining entries (counts) are printed as-is.
print("\nCPU latency stats for MiniRocket-Lite + linear head (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Each sample corresponds to one HAR window, so per-window latency = per-batch / batch_size
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

# Real-time factor relative to window length
# (per-window latency in ms divided by the window duration in ms; values
# below 1 mean a window is processed faster than it takes to record it).
# NOTE(review): the latencies come from time.perf_counter(), i.e. elapsed
# wall-clock time, although the message below labels the numerator "CPU time".
rt_factor_p50  = per_window_p50  / (WINDOW_SECONDS * 1000.0)
rt_factor_mean = per_window_mean / (WINDOW_SECONDS * 1000.0)

print("\nReal-time factor (MiniRocket-Lite, CPU):")
print(f"  p50  RTF ≈ {rt_factor_p50:.4f} (CPU time / wall-clock window length)")
print(f"  mean RTF ≈ {rt_factor_mean:.4f}")

print("\n[MiniRocket-Lite CPU inference latency benchmark (deployment-friendly) completed]")


[MiniRocket-Lite CPU Inference Latency Benchmark for HAR Windows (Deployment-Friendly)]
          Falling back to Numba's existing thread configuration.
Platform                   : Linux 6.6.105+
CPU                        : x86_64
Physical cores             : 4
Logical cores              : 8
Target threads (requested) : 4
Numba threads (effective)  : 1
MiniRocket n_jobs (used)   : 1
torch.get_num_threads()    : 4

Using device: cpu

N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
N_TRAIN=20000, N_WARMUP=20, N_RUNS=100, BATCH_SIZE=1
NUM_KERNELS (MiniRocket-Lite) = 2000
Assumed sampling rate = 50.0 Hz → window length ≈ 3.000 s

X_train_np shape: (20000, 6, 150), y_train shape: (20000,)
X_train nested DataFrame shape: (20000, 6)

MiniRocket-Lite fit + linear head fit time: 49031.609 ms
MiniRocket-Lite transformed feature dimension: 1932
Approximate number of linear head parameters: 15,464

Query batch nested DataFrame shape: (1, 6)

CPU latency stats for MiniRocket-Lite + linear head (pe

8. TST

In [25]:
# ================ TST (Time Series Transformer) CPU Inference Latency Benchmark (Standalone) ================
import os
# setdefault() only fills these in when they are not already present in the
# environment, so an externally exported thread count takes precedence.
# NOTE(review): presumably these must be set before the numerical libraries are
# first loaded in the process to have any effect — confirm for notebook reruns.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import random
import warnings
# Silences every warning for the rest of the session, not only this cell.
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

print("\n[TST CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed Python, NumPy and PyTorch so weight initialisation and the random
# benchmark input are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Benchmark runs on CPU with PyTorch restricted to a single thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# HAR-like window settings: 6 channels, 150 time steps (e.g., 3 s @ 50 Hz)
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150
N_CLASSES       = 8

print(f"\nN_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# TST hyperparameters (typical configuration)
# D_MODEL is the token embedding width, D_FF the hidden width of each
# feed-forward sub-block, N_LAYERS the number of stacked encoder layers.
D_MODEL    = 64
N_HEADS    = 8
D_FF       = 128
N_LAYERS   = 4
DROPOUT    = 0.1

print(f"TST config: d_model={D_MODEL}, n_heads={N_HEADS}, d_ff={D_FF}, n_layers={N_LAYERS}, dropout={DROPOUT}")


# ---------------------------
# 1) Positional encoding
# ---------------------------
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a [B, T, d_model] tensor.

    Even feature columns hold sin(position * div_term) and odd columns hold
    cos(position * div_term). The table is precomputed for ``max_len``
    positions and stored as a non-persistent buffer, so it follows the module
    across devices but is not written to ``state_dict``.

    Args:
        d_model: Feature width of the tokens (odd widths are supported).
        max_len: Largest sequence length the table can serve.
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32) * (-np.log(10000.0) / d_model)
        )
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns;
        # trimming the angle table keeps the assignment shape-compatible.
        # For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``T`` positions.

        Args:
            x: Tensor of shape [B, T, d_model].

        Raises:
            ValueError: If ``T`` exceeds the ``max_len`` the table was built for.
        """
        # x: [B, T, d_model]
        T = x.size(1)
        if T > self.pe.size(1):
            # Without this guard the slice silently truncates and the addition
            # fails (or broadcasts when max_len == 1) with an unclear error.
            raise ValueError(
                f"sequence length {T} exceeds positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :T, :]


# ---------------------------
# 2) Transformer encoder layer (batch_first)
# ---------------------------
class TransformerEncoderLayer(nn.Module):
    """Post-norm encoder layer operating on batch-first [B, T, d_model] tensors.

    Two residual sub-blocks are applied in sequence, each followed by its own
    LayerNorm: multi-head self-attention, then a two-layer feed-forward
    network with GELU. ``self.dropout`` is shared by both residual branches;
    ``self.dropout_ff`` sits between the two feed-forward linears.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # Attention sub-block.
        self.self_attn = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=n_heads,
            dropout=dropout,
            batch_first=True,
        )
        # Feed-forward sub-block: d_model -> d_ff -> d_model.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        # One normalisation per residual connection.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.dropout_ff = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        """Map x of shape [B, T, d_model] to a tensor of the same shape."""
        # Self-attention with query = key = value = x; weights are not needed.
        attended, _ = self.self_attn(x, x, x, need_weights=False)
        x = self.norm1(x + self.dropout(attended))

        # Position-wise feed-forward network on the normalised tokens.
        hidden = self.activation(self.linear1(x))
        hidden = self.linear2(self.dropout_ff(hidden))
        return self.norm2(x + self.dropout(hidden))


# ---------------------------
# 3) TST classifier (window-level)
# ---------------------------
class TSTClassifier(nn.Module):
    """
    Window-level Time Series Transformer classifier.

    Pipeline for an input of shape [B, C, T]:
      1. move channels last and project them linearly -> [B, T, d_model]
      2. add the sinusoidal positional encoding
      3. run n_layers TransformerEncoderLayer blocks
      4. apply dropout, then average over the time axis -> [B, d_model]
      5. linear head -> [B, n_classes] logits
    """
    def __init__(
        self,
        in_channels: int,
        seq_len: int,
        n_classes: int,
        d_model: int = 64,
        n_heads: int = 8,
        d_ff: int = 128,
        n_layers: int = 4,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.seq_len = seq_len
        self.d_model = d_model

        # Per-time-step embedding of the C channels, plus a position table
        # sized to exactly seq_len positions.
        self.input_proj = nn.Linear(in_channels, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=seq_len)

        # Identically configured encoder layers, each with its own weights.
        encoder_stack = [
            TransformerEncoderLayer(
                d_model=d_model, n_heads=n_heads, d_ff=d_ff, dropout=dropout
            )
            for _ in range(n_layers)
        ]
        self.layers = nn.ModuleList(encoder_stack)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(d_model, n_classes)

    def forward(self, x):
        """Return class logits [B, n_classes] for x of shape [B, C, T]."""
        tokens = self.input_proj(x.permute(0, 2, 1))  # [B, T, d_model]
        tokens = self.pos_enc(tokens)                 # [B, T, d_model]

        for encoder in self.layers:
            tokens = encoder(tokens)

        # Dropout is applied before pooling; mean over dim=1 averages over time.
        pooled = self.dropout(tokens).mean(dim=1)     # [B, d_model]
        return self.classifier(pooled)                # [B, n_classes]


# ---------------------------
# 4) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are excluded from the count.
        if param.requires_grad:
            total += param.numel()
    return total


# ---------------------------
# 5) Latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency on CPU.
    Returns latency stats in milliseconds per batch (per call to model()).
    Each batch element corresponds to one HAR window.

    Args:
        model: Module to benchmark; it is moved to ``device`` and left in
            eval mode.
        input_shape: Shape of the random float32 input; its first entry is
            reported as "batch_size".
        device: Device on which the model and input are placed.
        n_warmup: Number of untimed forward passes executed first.
        n_runs: Number of timed forward passes.

    Returns:
        Dict with p50 / p90 / mean / std of the per-call latency in
        milliseconds, plus "n_runs" and "batch_size".
    """
    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = []
    # Gradient tracking is disabled only for the duration of the benchmark.
    # The context manager restores the caller's previous grad mode on exit,
    # so this function no longer switches autograd off for the whole process.
    with torch.no_grad():
        # Warm-up
        for _ in range(n_warmup):
            _ = model(x)

        # Timed runs
        for _ in range(n_runs):
            t0 = time.perf_counter()
            _ = model(x)
            t1 = time.perf_counter()
            times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "batch_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "batch_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "batch_latency_mean_ms": float(times_ms.mean()),
        "batch_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
        "batch_size": int(input_shape[0]),
    }
    return stats


# ---------------------------
# 6) Instantiate model and run benchmark
# ---------------------------
# The model keeps its random initial weights; no training is performed, so
# only the forward-pass cost is of interest here.
model = TSTClassifier(
    in_channels=N_CHANNELS,
    seq_len=WINDOW_SAMPLES,
    n_classes=N_CLASSES,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    d_ff=D_FF,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)

n_params = count_parameters(model)
print(f"\nNumber of trainable parameters: {n_params:,}")

# Batch size for latency measurement (per-window model)
BATCH_SIZE = 1
input_shape = (BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES)  # [B, C, T]

print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, C, T)")

# 20 untimed warm-up passes followed by 100 timed passes on one random input.
stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

# Latency entries (keys ending in "_ms") are printed with four decimals;
# the remaining entries (counts) are printed as-is.
print("\nCPU latency stats for TST.forward() (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Per-window latency (each batch element = one HAR window)
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[TST CPU inference latency benchmark completed]")


[TST CPU Inference Latency Benchmark for HAR Windows]
Using device: cpu
torch.get_num_threads() = 1

N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
TST config: d_model=64, n_heads=8, d_ff=128, n_layers=4, dropout=0.1

Number of trainable parameters: 134,856

Measuring CPU latency with input shape: (1, 6, 150) (B, C, T)

CPU latency stats for TST.forward() (per batch):
  batch_latency_p50_ms: 5.6155
  batch_latency_p90_ms: 6.7708
  batch_latency_mean_ms: 5.8072
  batch_latency_std_ms: 0.4870
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 5.6155
  window_latency_p90_ms  ≈ 6.7708
  window_latency_mean_ms ≈ 5.8072

[TST CPU inference latency benchmark completed]


9. lightweight rTsfNet

In [22]:
# ================ rTsfNet (lightweight) CPU Inference Latency Benchmark (Standalone) ================
# 0) Ensure a compatible TensorFlow + NumPy combination for Python 3.12
!pip -q install "numpy<2.0.0" "tensorflow==2.16.1"

import os
# setdefault() only fills these in when they are not already present in the
# environment, so an externally exported value takes precedence.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
# NOTE(review): "-1" is presumably meant to hide all CUDA devices so that
# TensorFlow runs on CPU only — confirm via the device list printed below.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")

import time
import random
import math
import warnings
# Silences every warning for the rest of the session, not only this cell.
warnings.filterwarnings("ignore")

import numpy as np

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import (
    Dense, Dropout, LayerNormalization, LeakyReLU,
    Layer, Lambda, Flatten, GlobalAveragePooling1D, Activation
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

print("\n[rTsfNet (lightweight) CPU Inference Latency Benchmark]")

# ---------------------------
# 1) Basic configuration
# ---------------------------
# Seed TensorFlow, NumPy and Python so weight initialisation and the random
# benchmark input are reproducible.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Force single-threaded CPU inside TensorFlow
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

print("Physical devices:", tf.config.list_physical_devices())
print(f"Intra-op threads: {tf.config.threading.get_intra_op_parallelism_threads()}")
print(f"Inter-op threads: {tf.config.threading.get_inter_op_parallelism_threads()}")

# HAR-like window settings: 6 channels, 150 time steps (3 s @ 50 Hz)
FS             = 50.0
N_CHANNELS     = 6
WINDOW_SAMPLES = 150
N_CLASSES      = 8

print(f"\nN_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# rTsfNet hyperparameters (must match your training script)
# IMU_ROT_HEADS: number of learned rotation heads; MLP_BASE / MLP_DEPTH: width
# base and number of stages of the classifier MLP. LR only configures the
# optimizer attached at compile time. WEIGHT_DECAY is the L2 factor on the
# dense kernels, which r_tsf_net() reads from this module-level name.
IMU_ROT_HEADS = 2
MLP_BASE      = 128
MLP_DEPTH     = 3
DROPOUT       = 0.5
LR            = 1e-3
WEIGHT_DECAY  = 1e-6
USE_ORIG_INPUT = True

print(f"FS={FS}, IMU_ROT_HEADS={IMU_ROT_HEADS}, MLP_BASE={MLP_BASE}, MLP_DEPTH={MLP_DEPTH}")
print(f"DROPOUT={DROPOUT}, LR={LR}, WEIGHT_DECAY={WEIGHT_DECAY}, USE_ORIG_INPUT={USE_ORIG_INPUT}")


# ---------------------------
# 2) TSF feature layer (same as in your script)
# ---------------------------
class TSFFeatureLayer(Layer):
    """
    Per-channel time-series feature extractor; the layer creates no weights.

    Input: [B, T, C]  Output: [B, C, F]
    Time domain: mean/std/max/min/ptp/rms/energy/skew/kurt/zcr/ar1/ar2
    Frequency domain: centroid/entropy/flatness/soft-peak + bandpower (0.5–3 / 3–8 / 8–15 Hz)

    call() concatenates 19 features per channel (12 time-domain followed by
    7 frequency-domain), in the order listed above.
    """
    def __init__(self, fs=50.0, **kwargs):
        super().__init__(**kwargs)
        self.fs = float(fs)   # sampling rate in Hz, used to place FFT bins on a frequency axis
        self.eps = 1e-8       # small constant guarding divisions and logarithms

    def get_config(self):
        # Expose fs so the layer can be rebuilt from its config.
        cfg = super().get_config()
        cfg.update({"fs": self.fs})
        return cfg

    def call(self, x):  # x: [B, T, C]
        # All reductions run over axis=1 (time) with keepdims, so every
        # feature below has shape [B, 1, C].
        mean = tf.reduce_mean(x, axis=1, keepdims=True)
        # eps is added to std itself, so the reported std feature includes it
        # and the standardisation in skew/kurt never divides by zero.
        std  = tf.math.reduce_std(x, axis=1, keepdims=True) + self.eps

        maxv = tf.reduce_max(x, axis=1, keepdims=True)
        minv = tf.reduce_min(x, axis=1, keepdims=True)
        ptp  = maxv - minv
        rms  = tf.sqrt(tf.reduce_mean(tf.square(x), axis=1, keepdims=True))
        energy = tf.reduce_sum(tf.square(x), axis=1, keepdims=True)

        # Third and fourth standardised moments (kurt is not excess kurtosis:
        # no 3 is subtracted).
        skew = tf.reduce_mean(tf.pow((x - mean) / std, 3), axis=1, keepdims=True)
        kurt = tf.reduce_mean(tf.pow((x - mean) / std, 4), axis=1, keepdims=True)

        # Zero-crossing rate: a flip between +1 and -1 gives |diff| = 2, hence
        # the division by 2. Signs are taken on the raw signal, not on x - mean.
        signs = tf.sign(x)
        sign_changes = tf.abs(signs[:, 1:, :] - signs[:, :-1, :])
        zcr = tf.reduce_mean(sign_changes, axis=1, keepdims=True) / 2.0

        # Lag-1 coefficient: sum of products of neighbouring samples divided
        # by the energy of the leading slice.
        x_t1 = x[:, :-1, :]
        x_tn1 = x[:, 1:, :]
        ar1 = tf.reduce_sum(x_t1 * x_tn1, axis=1, keepdims=True) / (
            tf.reduce_sum(tf.square(x_t1), axis=1, keepdims=True) + self.eps
        )

        # Same construction with a lag of two samples.
        x_t2 = x[:, :-2, :]
        x_tn2 = x[:, 2:, :]
        ar2 = tf.reduce_sum(x_t2 * x_tn2, axis=1, keepdims=True) / (
            tf.reduce_sum(tf.square(x_t2), axis=1, keepdims=True) + self.eps
        )

        # Frequency domain
        # The mean is removed first; rfft runs over the last axis, so time is
        # moved there and moved back afterwards.
        xc = x - mean
        x_bc_t = tf.transpose(xc, [0, 2, 1])               # [B, C, T]
        fft = tf.signal.rfft(x_bc_t)                       # [B, C, F]
        power = tf.square(tf.abs(fft)) + self.eps          # [B, C, F]
        power = tf.transpose(power, [0, 2, 1])             # [B, F, C]

        # Frequency axis: F evenly spaced points from 0 to fs / 2.
        # NOTE(review): this matches the rfft bin frequencies exactly only for
        # an even window length T — confirm if odd T is ever used.
        F = tf.shape(power)[1]
        freqs = tf.linspace(0.0, tf.cast(self.fs, tf.float32) / 2.0, F)  # [F]
        freqs = tf.reshape(freqs, [1, F, 1])                             # [1, F, 1]

        # p: power normalised to sum to ~1 over frequency, per channel.
        p = power / (tf.reduce_sum(power, axis=1, keepdims=True) + self.eps)
        centroid = tf.reduce_sum(p * freqs, axis=1, keepdims=True)       # [B, 1, C]
        # Spectral entropy of p, divided by log(F) to normalise by bin count.
        entropy  = -tf.reduce_sum(p * tf.math.log(p + self.eps), axis=1, keepdims=True) / (
            tf.math.log(tf.cast(F, tf.float32) + self.eps)
        )

        # Spectral flatness: geometric mean over arithmetic mean of the power.
        geo = tf.exp(tf.reduce_mean(tf.math.log(power), axis=1, keepdims=True))
        ari = tf.reduce_mean(power, axis=1, keepdims=True)
        flatness = geo / (ari + self.eps)

        # Soft peak frequency: softmax over the frequency axis of power * temp,
        # used as weights for an average of the frequency axis.
        temp = 10.0
        w = tf.nn.softmax(power * temp, axis=1)                        # [B, F, C]
        soft_peak = tf.reduce_sum(w * freqs, axis=1, keepdims=True)    # [B, 1, C]

        def band(low, high):
            # Fraction of total power in bins with low <= freq < high (Hz).
            mask = tf.cast((freqs >= low) & (freqs < high), tf.float32)
            bp = tf.reduce_sum(power * mask, axis=1, keepdims=True) / (
                tf.reduce_sum(power, axis=1, keepdims=True) + self.eps
            )
            return bp

        bp1 = band(0.5, 3.0)
        bp2 = band(3.0, 8.0)
        bp3 = band(8.0, 15.0)

        # Stack along axis 1 to get [B, Fnum, C], then swap to channel-major.
        feats = [
            mean, std, maxv, minv, ptp, rms, energy, skew, kurt, zcr, ar1, ar2,
            centroid, entropy, flatness, soft_peak, bp1, bp2, bp3
        ]  # each [B,1,C]
        res = tf.concat(feats, axis=1)                       # [B, Fnum, C]
        return tf.transpose(res, [0, 2, 1])                  # [B, C, Fnum]


# ---------------------------
# 3) Multi-head 3D rotation (same as in your script)
# ---------------------------
class Multihead3DRotation(Layer):
    """
    Input [B, T, 6] (ACC + GYR), output: list of length head_nums, each element [B, T, 6].

    The window is averaged over time, passed through a small ReLU MLP, and each
    head then predicts four values: a rotation axis (first three, normalised in
    _axis_angle_to_R) and an angle (fourth). The resulting 3x3 matrix is applied
    to the accelerometer triplet and to the gyroscope triplet of every sample.
    """
    def __init__(self, head_nums=2, base_kn=64, param_depth=2, **kwargs):
        super().__init__(**kwargs)
        self.head_nums = head_nums      # number of independent rotation heads
        self.base_kn = base_kn          # width of each hidden Dense layer
        self.param_depth = param_depth  # number of hidden Dense layers
        self.eps = 1e-8                 # guards the axis normalisation

        self.gap = GlobalAveragePooling1D()
        self.mlp = [Dense(self.base_kn, activation="relu") for _ in range(self.param_depth)]
        # tanh bounds every head output to (-1, 1).
        self.out_heads = [Dense(4, activation="tanh") for _ in range(self.head_nums)]

    def get_config(self):
        # Expose the constructor arguments so the layer can be rebuilt from config.
        cfg = super().get_config()
        cfg.update(
            {"head_nums": self.head_nums, "base_kn": self.base_kn, "param_depth": self.param_depth}
        )
        return cfg

    def compute_output_shape(self, input_shape):
        # Every head returns a tensor with the same shape as the input.
        return [tf.TensorShape(input_shape) for _ in range(self.head_nums)]

    def _axis_angle_to_R(self, axis_raw, angle_raw):
        # Build per-sample rotation matrices [B, 3, 3] from an unnormalised
        # axis [B, 3] and a raw angle [B, 1] using the axis-angle (Rodrigues)
        # form R = cos*I + (1 - cos)*u u^T + sin*K.
        axis = axis_raw / (tf.norm(axis_raw, axis=-1, keepdims=True) + self.eps)
        # angle_raw comes from a tanh head, so theta lies in (-pi, pi).
        theta = angle_raw * math.pi                                       # [B,1]
        B = tf.shape(axis)[0]

        # K is the skew-symmetric cross-product matrix of the unit axis,
        # assembled row by row and reshaped to [B, 3, 3].
        ux, uy, uz = axis[:, 0], axis[:, 1], axis[:, 2]
        z = tf.zeros_like(ux)
        K = tf.stack(
            [
                z, -uz, uy,
                uz, z, -ux,
                -uy, ux, z
            ],
            axis=-1,
        )
        K = tf.reshape(K, [B, 3, 3])

        I3 = tf.eye(3, dtype=axis.dtype)
        I  = tf.tile(I3[None, ...], [B, 1, 1])

        u = tf.expand_dims(axis, -1)            # [B,3,1]
        uuT = tf.matmul(u, u, transpose_b=True) # [B,3,3]

        # Reshape to [B, 1, 1] so the scalars broadcast over the 3x3 matrices.
        cos = tf.reshape(tf.cos(theta), [-1, 1, 1])
        sin = tf.reshape(tf.sin(theta), [-1, 1, 1])

        R = cos * I + (1.0 - cos) * uuT + sin * K
        return R

    def call(self, x):   # x: [B, T, 6]
        # Channels 0-2 are treated as accelerometer, 3-5 as gyroscope.
        acc, gyr = x[:, :, :3], x[:, :, 3:6]
        pooled = self.gap(x)                                  # [B, 6]

        # Shared MLP trunk feeding all heads.
        h = pooled
        for layer in self.mlp:
            h = layer(h)

        out_list = []
        for oh in self.out_heads:
            p = oh(h)                                         # [B, 4]
            axis = p[:, :3]
            angle = tf.expand_dims(p[:, 3], -1)               # [B,1]
            R = self._axis_angle_to_R(axis, angle)            # [B,3,3]

            # The same R rotates both sensors; one rotation per window, applied
            # to every time step via a batched matmul on the [B,3,T] layout.
            # NOTE(review): acc_t and gyr_t do not depend on the head and are
            # recomputed on every loop iteration.
            acc_t = tf.transpose(acc, [0, 2, 1])              # [B,3,T]
            acc_rot_t = tf.matmul(R, acc_t)                   # [B,3,T]
            acc_rot = tf.transpose(acc_rot_t, [0, 2, 1])      # [B,T,3]

            gyr_t = tf.transpose(gyr, [0, 2, 1])              # [B,3,T]
            gyr_rot_t = tf.matmul(R, gyr_t)                   # [B,3,T]
            gyr_rot = tf.transpose(gyr_rot_t, [0, 2, 1])      # [B,T,3]

            out_list.append(tf.concat([acc_rot, gyr_rot], axis=-1))  # [B,T,6]
        return out_list


# ---------------------------
# 4) L2 norm channels (same as in your script)
# ---------------------------
def add_l2_channels(x):     # x: [B, T, 6]
    """Append the per-sample L2 norms of channels 0-2 and 3-5 as two extra channels.

    The input is kept unchanged in the first six channels; the accelerometer
    norm (channels 0-2) and gyroscope norm (channels 3-5) follow, in that order.
    """
    def _magnitude(triplet):
        # Euclidean norm over the last axis, kept as a singleton channel.
        return tf.sqrt(tf.reduce_sum(tf.square(triplet), axis=-1, keepdims=True))

    acc_norm = _magnitude(x[:, :, :3])
    gyr_norm = _magnitude(x[:, :, 3:6])
    return tf.concat([x, acc_norm, gyr_norm], axis=-1)  # [B, T, 8]


# ---------------------------
# 5) rTsfNet model (same structure & hyperparameters)
# ---------------------------
def r_tsf_net(
    x_shape,
    n_classes,
    learning_rate=1e-3,
    base_kn=128,
    depth=3,
    dropout_rate=0.5,
    imu_rot_heads=2,
    fs=50.0,
    use_orig_input=True,
):
    """
    Build and compile the lightweight rTsfNet Keras model.

    Args:
        x_shape: Full input shape including the batch axis; only x_shape[1:]
            is used for the Keras Input.
        n_classes: Number of output classes (width of the softmax).
        learning_rate: Learning rate of the Adam optimizer used in compile().
        base_kn: Width of the last MLP stage; earlier stages are wider by
            powers of two.
        depth: Number of Dense/LayerNorm/LeakyReLU/Dropout stages.
        dropout_rate: Dropout rate after every MLP stage.
        imu_rot_heads: Number of rotation heads in Multihead3DRotation.
        fs: Sampling rate forwarded to TSFFeatureLayer.
        use_orig_input: If True, the unrotated input is kept as an extra stream.

    Returns:
        A compiled keras Model mapping an input window to class probabilities.

    Note:
        The L2 factor of the Dense kernels is read from the module-level
        WEIGHT_DECAY, not from an argument.
    """
    inputs = Input(shape=x_shape[1:])     # [T, 6]
    x = inputs

    # The rotation layer's own MLP size is fixed here (64 units, 2 layers).
    rot_layer = Multihead3DRotation(
        head_nums=imu_rot_heads, base_kn=64, param_depth=2, name="multihead_rot"
    )
    rotated_list = rot_layer(x)   # list of [B, T, 6]

    # Each stream (optional original + one per rotation head) gets two extra
    # L2-norm channels, giving 8 channels per stream.
    streams = []
    if use_orig_input:
        streams.append(Lambda(add_l2_channels, name="orig_plus_l2")(x))
    for i, xr in enumerate(rotated_list):
        streams.append(Lambda(add_l2_channels, name=f"rot{i}_plus_l2")(xr))

    concat_streams = Lambda(
        lambda lst: tf.concat(lst, axis=-1), name="concat_streams"
    )(streams)  # [B,T,8*(1+heads)]

    # Handcrafted features are computed once over the whole window.
    tsf = TSFFeatureLayer(fs=fs, name="tsf")(concat_streams)  # [B, C_total, F]

    z = Flatten(name="flatten")(tsf)
    # k counts down, so the stage widths shrink: base_kn * 2**(depth-1), ...,
    # base_kn (512 -> 256 -> 128 with the defaults).
    for k in range(depth - 1, -1, -1):
        z = Dense(base_kn * (2 ** k), kernel_regularizer=l2(WEIGHT_DECAY), name=f"fc_{k}")(z)
        z = LayerNormalization(epsilon=1e-7, name=f"ln_{k}")(z)
        z = LeakyReLU(name=f"lrelu_{k}")(z)
        z = Dropout(dropout_rate, name=f"drop_{k}")(z)

    logits = Dense(n_classes, kernel_regularizer=l2(WEIGHT_DECAY), name="logits")(z)
    # The softmax output is pinned to float32.
    probs  = Activation("softmax", dtype="float32", name="softmax")(logits)

    model = Model(inputs, probs, name="rTsfNet_officially_aligned_fixed")

    opt = Adam(learning_rate=learning_rate, amsgrad=True)
    # compile is not strictly needed for inference latency, but harmless
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=opt,
        metrics=["accuracy"],
    )
    return model


# ---------------------------
# 6) Instantiate model
# ---------------------------
# One window per call; the layout here is [B, T, C] (time before channels).
BATCH_SIZE = 1
x_shape = (BATCH_SIZE, WINDOW_SAMPLES, N_CHANNELS)

# The model keeps its random initial weights; no training is performed.
model = r_tsf_net(
    x_shape=x_shape,
    n_classes=N_CLASSES,
    learning_rate=LR,
    base_kn=MLP_BASE,
    depth=MLP_DEPTH,
    dropout_rate=DROPOUT,
    imu_rot_heads=IMU_ROT_HEADS,
    fs=FS,
    use_orig_input=USE_ORIG_INPUT,
)

# count_params() is reported as the total parameter count of the model.
n_params = model.count_params()
print(f"\nTotal number of model parameters: {n_params:,}")


# ---------------------------
# 7) Latency measurement (CPU, single batch = one window)
# ---------------------------
def measure_latency_tf(
    model,
    input_shape,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Time direct calls to a Keras model (training=False) on one fixed random input.

    Parameters:
      model       : callable Keras model.
      input_shape : shape of the float32 standard-normal input; its first
                    entry is reported as "batch_size".
      n_warmup    : number of untimed calls executed first.
      n_runs      : number of timed calls.

    Returns a dict with p50 / p90 / mean / std of the per-call latency in
    milliseconds, plus "n_runs" and "batch_size". Each batch element
    corresponds to one HAR window.
    """
    # The input is converted to a tensor once, outside the timed region.
    sample = tf.constant(np.random.randn(*input_shape).astype(np.float32))

    # Untimed calls so one-off start-up costs do not enter the statistics.
    for _ in range(n_warmup):
        model(sample, training=False)

    durations_ms = []
    for _ in range(n_runs):
        tic = time.perf_counter()
        model(sample, training=False)
        toc = time.perf_counter()
        durations_ms.append((toc - tic) * 1000.0)

    durations_ms = np.asarray(durations_ms, dtype=float)
    median_ms = float(np.percentile(durations_ms, 50))
    p90_ms = float(np.percentile(durations_ms, 90))
    mean_ms = float(np.mean(durations_ms))
    std_ms = float(np.std(durations_ms))

    return {
        "batch_latency_p50_ms": median_ms,
        "batch_latency_p90_ms": p90_ms,
        "batch_latency_mean_ms": mean_ms,
        "batch_latency_std_ms": std_ms,
        "n_runs": int(n_runs),
        "batch_size": int(input_shape[0]),
    }


# Benchmark input layout is [B, T, C], matching the model's Input.
input_shape = (BATCH_SIZE, WINDOW_SAMPLES, N_CHANNELS)
print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, T, C)")

# 20 untimed warm-up calls followed by 100 timed calls on one random input.
stats = measure_latency_tf(
    model,
    input_shape=input_shape,
    n_warmup=20,
    n_runs=100,
)

# Latency entries (keys ending in "_ms") are printed with four decimals;
# the remaining entries (counts) are printed as-is.
print("\nCPU latency stats for rTsfNet.forward() (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Per-window latency: each batch element is one HAR window, so the batch
# latency is divided by the batch size.
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[rTsfNet (lightweight) CPU inference latency benchmark completed]")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.9/589.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m124.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.12.0 requires tensorflow==2.19.0, but you have tensorflow 2.16.1 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.
jax 0.7.2 requires ml_dtypes>=0.5.0, but you have ml-dtypes 0.3.2 which is incompatible.
jax 0.7.2 requires nump

10. rTsfNet

In [23]:
# ================ rTsfNet (IMWUT 2024 official) CPU Inference Latency Benchmark (Standalone) ================
# Ensure a compatible TensorFlow + NumPy combination (Python 3.12 safe)
!pip -q install "numpy<2.0.0" "tensorflow==2.16.1"

import os
# setdefault() only fills these in when they are not already present in the
# environment, so an externally exported value takes precedence.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
# NOTE(review): "-1" is presumably meant to hide all CUDA devices so that
# TensorFlow runs on CPU only — confirm via the device list printed below.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")

import time
import random
import math
import warnings
# Silences every warning for the rest of the session, not only this cell.
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import (
    Dense, Dropout, LayerNormalization, LeakyReLU,
    Layer, Activation, TimeDistributed, Flatten, Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

print("\n[rTsfNet (IMWUT 2024 official) CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed TensorFlow, NumPy and Python so weight initialisation and the random
# benchmark input are reproducible.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Force single-threaded CPU inside TensorFlow
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

print("Physical devices:", tf.config.list_physical_devices())
print(f"Intra-op threads: {tf.config.threading.get_intra_op_parallelism_threads()}")
print(f"Inter-op threads: {tf.config.threading.get_inter_op_parallelism_threads()}")

# HAR-like window settings: 6 channels, 150 time steps (3 s @ 50 Hz)
FS             = 50.0
N_CHANNELS     = 6
WINDOW_SAMPLES = 150
N_CLASSES      = 8

print(f"\nN_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# Hyperparameters (must match the official Step 10 script)
IMU_ROT_HEADS       = 2
MLP_BASE            = 128
MLP_DEPTH           = 3
DROPOUT             = 0.5
LR                  = 1e-3
WEIGHT_DECAY        = 1e-6
USE_ORIG_INPUT      = True
USE_BINARY_SELECTION= True
LN_EPS              = 1e-7
PAD_MODE            = "SYMMETRIC"

# Block specs as in the official architecture
# Each entry names a block family, how many blocks it contains, and which
# feature groups (time-domain and/or frequency-domain) are extracted for it.
BLOCK_SPECS = [
    dict(name="short", num_blocks=4, use_time=True,  use_freq=False),
    dict(name="long",  num_blocks=1, use_time=False, use_freq=True),
]

print(f"\nFS={FS}, IMU_ROT_HEADS={IMU_ROT_HEADS}, MLP_BASE={MLP_BASE}, MLP_DEPTH={MLP_DEPTH}")
print(f"DROPOUT={DROPOUT}, LR={LR}, WEIGHT_DECAY={WEIGHT_DECAY}")
print(f"USE_ORIG_INPUT={USE_ORIG_INPUT}, USE_BINARY_SELECTION={USE_BINARY_SELECTION}")
print(f"LN_EPS={LN_EPS}, PAD_MODE={PAD_MODE}, BLOCK_SPECS={BLOCK_SPECS}")

# ---------------------------
# 1) Shared TSF config (same as training script)
# ---------------------------
# Feature counts per channel; TSFFeatureLayer adds up the enabled groups to
# obtain its output feature dimension.
TIME_FEATS = 12  # mean/std/max/min/ptp/rms/energy/skew/kurt/zcr/ar1/ar2
FREQ_FEATS = 7   # centroid/entropy/flatness/soft-peak/bandpowers(3)


# ---------------------------
# 2) MLPStack (Keras 3–safe MLP stack)
# ---------------------------
class MLPStack(Layer):
    """
    Stack of `depth` stages, each Dense -> LayerNorm -> LeakyReLU -> Dropout.

    Stage widths are base_kn * (2**k) for k = depth-1 down to 0, so the stack
    narrows by powers of two and its final stage has base_kn units.
    """
    def __init__(self, base_kn=128, depth=3, drop=0.5, wd=0.0, ln_eps=1e-7, name=None):
        super().__init__(name=name)
        self.base_kn = int(base_kn)
        self.depth = int(depth)
        self.drop = float(drop)
        self.wd = float(wd)
        self.ln_eps = float(ln_eps)

        # Sub-layers are kept in one flat list in execution order.
        self.seq = []
        for k in reversed(range(self.depth)):
            width = self.base_kn * (2**k)
            self.seq.append(Dense(width, kernel_regularizer=l2(self.wd)))
            self.seq.append(LayerNormalization(epsilon=self.ln_eps))
            self.seq.append(LeakyReLU())
            self.seq.append(Dropout(self.drop))

    @property
    def out_dim(self):
        """Width of the final stage."""
        return self.base_kn

    def call(self, x, training=None):
        """Run x through every sub-layer in order and return the result."""
        out = x
        for sublayer in self.seq:
            # Only Dropout is handed the training flag explicitly.
            if not isinstance(sublayer, Dropout):
                out = sublayer(out)
            else:
                out = sublayer(out, training=training)
        return out

    def compute_output_shape(self, input_shape):
        # Reported as (batch, out_dim), taking the batch entry from the input.
        return tf.TensorShape([input_shape[0], self.out_dim])


# ---------------------------
# 3) TSF extraction (axis-wise)
# ---------------------------
class TSFFeatureLayer(Layer):
    """Per-axis time-series feature (TSF) extractor for a single block.

    Input  x: [B, L, C]  (L samples, C axes)
    Output  : [B, C, F]  where F = TIME_FEATS (if use_time) + FREQ_FEATS (if use_freq)

    All statistics are reduced over the sample axis (axis 1) independently
    for every axis/channel.
    """
    def __init__(self, fs=50.0, use_time=True, use_freq=True, **kwargs):
        super().__init__(**kwargs)
        self.fs = float(fs)
        self.use_time = bool(use_time)
        self.use_freq = bool(use_freq)
        self.eps = 1e-8  # guards divisions and logs against zero
        n_feats = 0
        if self.use_time:
            n_feats += TIME_FEATS
        if self.use_freq:
            n_feats += FREQ_FEATS
        self._feat_dim = n_feats

    def get_config(self):
        base_cfg = super().get_config()
        base_cfg.update(fs=self.fs, use_time=self.use_time, use_freq=self.use_freq)
        return base_cfg

    def _lagged_ratio(self, x, lag):
        """sum(x[t] * x[t+lag]) / (sum(x[t]^2) + eps) over the sample axis -> [B, 1, C]."""
        lead = x[:, :-lag, :]
        follow = x[:, lag:, :]
        numer = tf.reduce_sum(lead * follow, axis=1, keepdims=True)
        denom = tf.reduce_sum(tf.square(lead), axis=1, keepdims=True) + self.eps
        return numer / denom

    def _time_features(self, x):
        """Return the 12 time-domain statistics, each shaped [B, 1, C]."""
        red = dict(axis=1, keepdims=True)
        mu = tf.reduce_mean(x, **red)
        sigma = tf.math.reduce_std(x, **red) + self.eps  # eps is part of the emitted std feature
        peak_hi = tf.reduce_max(x, **red)
        peak_lo = tf.reduce_min(x, **red)
        spread = peak_hi - peak_lo
        rms = tf.sqrt(tf.reduce_mean(tf.square(x), **red))
        energy = tf.reduce_sum(tf.square(x), **red)
        skew = tf.reduce_mean(tf.pow((x - mu) / sigma, 3), **red)
        kurt = tf.reduce_mean(tf.pow((x - mu) / sigma, 4), **red)
        # |sign[t+1] - sign[t]| is 2 on a full sign flip, hence the division by 2.
        polarity = tf.sign(x)
        flips = tf.abs(polarity[:, 1:, :] - polarity[:, :-1, :])
        zcr = tf.reduce_mean(flips, **red) / 2.0
        ar1 = self._lagged_ratio(x, 1)
        ar2 = self._lagged_ratio(x, 2)
        return [mu, sigma, peak_hi, peak_lo, spread, rms, energy, skew, kurt, zcr, ar1, ar2]

    def _freq_features(self, x):
        """Return the 7 frequency-domain statistics, each shaped [B, 1, C]."""
        red = dict(axis=1, keepdims=True)
        centered = x - tf.reduce_mean(x, **red)
        spectrum = tf.signal.rfft(tf.transpose(centered, [0, 2, 1]))   # [B, C, F]
        power = tf.square(tf.abs(spectrum)) + self.eps                 # [B, C, F]
        power = tf.transpose(power, [0, 2, 1])                         # [B, F, C]

        n_bins = tf.shape(power)[1]
        nyquist = tf.cast(self.fs, tf.float32) / 2.0
        # Bin frequencies are modelled as n_bins evenly spaced points on [0, fs/2].
        freqs = tf.reshape(tf.linspace(0.0, nyquist, n_bins), [1, n_bins, 1])

        p = power / (tf.reduce_sum(power, **red) + self.eps)           # normalised spectrum
        centroid = tf.reduce_sum(p * freqs, **red)
        log_bins = tf.math.log(tf.cast(n_bins, tf.float32) + self.eps)
        entropy = -tf.reduce_sum(p * tf.math.log(p + self.eps), **red) / (log_bins)
        geo_mean = tf.exp(tf.reduce_mean(tf.math.log(power), **red))
        arith_mean = tf.reduce_mean(power, **red)
        flatness = geo_mean / (arith_mean + self.eps)
        # Softmax-weighted mean frequency: a differentiable stand-in for argmax.
        temperature = 10.0
        weights = tf.nn.softmax(power * temperature, axis=1)           # [B, F, C]
        soft_peak = tf.reduce_sum(weights * freqs, **red)

        band_powers = []
        for low, high in ((0.5, 3.0), (3.0, 8.0), (8.0, 15.0)):
            in_band = tf.cast((freqs >= low) & (freqs < high), tf.float32)
            band_powers.append(
                tf.reduce_sum(power * in_band, **red) / (tf.reduce_sum(power, **red) + self.eps)
            )
        return [centroid, entropy, flatness, soft_peak] + band_powers

    def call(self, x):  # x: [B, L, C]
        collected = []
        if self.use_time:
            collected.extend(self._time_features(x))
        if self.use_freq:
            collected.extend(self._freq_features(x))
        stacked = tf.concat(collected, axis=1)               # [B, Fnum, C]
        return tf.transpose(stacked, [0, 2, 1])              # [B, C, Fnum]

    def compute_output_shape(self, input_shape):
        batch_dim, n_axes = input_shape[0], input_shape[2]
        return tf.TensorShape([batch_dim, n_axes, self._feat_dim])


# ---------------------------
# 4) L2-norm channels (two variants as in original code)
# ---------------------------
class AddL2Channels(Layer):
    """Append two Euclidean-norm channels to a [B, T, C] signal.

    The first norm is taken over channels 0-2 (accelerometer triple) and the
    second over channels 3-5 (gyroscope triple); both are concatenated after
    the original channels.
    """
    def call(self, x, training=None):
        def _norm(triple):
            return tf.sqrt(tf.reduce_sum(tf.square(triple), axis=-1, keepdims=True))

        acc_norm = _norm(x[:, :, :3])
        gyr_norm = _norm(x[:, :, 3:6])
        return tf.concat([x, acc_norm, gyr_norm], axis=-1)  # [B, T, 8]

    def compute_output_shape(self, input_shape):
        # The channel count is hard-coded to 8, i.e. a 6-channel input is assumed.
        batch_dim, n_steps = input_shape[0], input_shape[1]
        return tf.TensorShape([batch_dim, n_steps, 8])


class AddL2ChannelsPublic(Layer):
    """Variant of the L2-channel layer used on the main classification path.

    Takes [B, T, C], computes the Euclidean norm of channels 0-2 and of
    channels 3-5, and appends both norms as extra trailing channels.
    """
    def call(self, x, training=None):
        sq = tf.square(x)
        # Sum of squares per sensor triple, then square root -> vector magnitude.
        acc_mag = tf.sqrt(tf.reduce_sum(sq[:, :, :3], axis=-1, keepdims=True))
        gyr_mag = tf.sqrt(tf.reduce_sum(sq[:, :, 3:6], axis=-1, keepdims=True))
        return tf.concat([x, acc_mag, gyr_mag], axis=-1)  # [B, T, 8]

    def compute_output_shape(self, input_shape):
        # Reports 8 output channels regardless of the input channel count.
        return tf.TensorShape([input_shape[0], input_shape[1], 8])


# ---------------------------
# 5) Block framing utilities
# ---------------------------
def _int_ceil_div(a, b):
    """Return ceil(a / b) computed in int32 via floor((a + b - 1) / b)."""
    numer = tf.cast(a, tf.int32)
    denom = tf.cast(b, tf.int32)
    bumped = numer + denom - 1
    return tf.math.floordiv(bumped, denom)

def frame_signal_with_padding(x, num_blocks, pad_mode="SYMMETRIC"):
    """
    Split a [B, T, C] signal into `num_blocks` equal-length blocks.

    The block length is L = ceil(T / num_blocks). The time axis is padded
    (mode `pad_mode`) up to L * num_blocks, with the extra samples split
    between the front and the back (the back gets the odd one), and the
    result is reshaped to [B, num_blocks, L, C].
    """
    dyn_shape = tf.shape(x)
    batch, n_steps, n_chans = dyn_shape[0], dyn_shape[1], dyn_shape[2]

    n_blocks = tf.cast(num_blocks, tf.int32)
    block_len = _int_ceil_div(n_steps, n_blocks)

    deficit = block_len * n_blocks - n_steps
    front = tf.math.floordiv(deficit, 2)
    back = deficit - front

    # Only the time axis is padded; batch and channel axes get [0, 0].
    untouched = tf.constant([0, 0], dtype=tf.int32)
    paddings = tf.stack([untouched, tf.stack([front, back]), untouched], axis=0)  # [3, 2]

    padded = tf.pad(x, paddings, mode=pad_mode)
    return tf.reshape(padded, [batch, n_blocks, block_len, n_chans])


class BlockTSFExtractor(Layer):
    """
    Frame a signal into blocks, extract per-axis TSF features for each block,
    and optionally append constant per-axis tag columns.

    Input:  x with shape [B, T, C_total]
    Output: [B, num_blocks, A, F_total] with A = C_total and
            F_total = TSF feature count + tag_dim.

    `tag_spec`, when given with an "axis_tags" entry of shape [A, tag_dim],
    supplies the tag columns. It is not included in get_config().
    """
    def __init__(self, num_blocks, fs, use_time, use_freq,
                 tag_spec=None, pad_mode="SYMMETRIC", name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.num_blocks = int(num_blocks)
        self.tsf = TSFFeatureLayer(fs=fs, use_time=use_time, use_freq=use_freq)
        self.tag_spec = tag_spec
        self.pad_mode = pad_mode

        has_tags = tag_spec is not None and "axis_tags" in tag_spec
        self.tag_dim = int(tag_spec["axis_tags"].shape[1]) if has_tags else 0

        n_base = 0
        if use_time:
            n_base += TIME_FEATS
        if use_freq:
            n_base += FREQ_FEATS
        self.base_feat_dim = n_base
        self.out_feat_dim = self.base_feat_dim + self.tag_dim

    def get_config(self):
        cfg = super().get_config()
        cfg.update(
            num_blocks=self.num_blocks,
            fs=self.tsf.fs,
            use_time=self.tsf.use_time,
            use_freq=self.tsf.use_freq,
            pad_mode=self.pad_mode,
        )
        return cfg

    def call(self, x, training=None):  # x: [B, T, C]
        blocks = frame_signal_with_padding(x, self.num_blocks, pad_mode=self.pad_mode)  # [B, K, L, C]
        dims = tf.shape(blocks)
        n_batch, n_blocks, blk_len, n_axes = dims[0], dims[1], dims[2], dims[3]

        # Fold the block axis into the batch so the TSF layer sees plain [N, L, C].
        folded = tf.reshape(blocks, [n_batch * n_blocks, blk_len, n_axes])
        feats = self.tsf(folded)                                              # [B*K, C, F]
        feats = tf.reshape(feats, [n_batch, n_blocks, n_axes, self.base_feat_dim])  # [B, K, A, F_base]

        if self.tag_dim <= 0:
            return feats

        # Broadcast the constant [A, tag_dim] table to every batch item and block.
        tags = tf.convert_to_tensor(self.tag_spec["axis_tags"], dtype=feats.dtype)
        tags = tf.reshape(tags, [1, 1, tf.shape(feats)[2], -1])               # [1, 1, A, tag_dim]
        tags = tf.tile(tags, [n_batch, n_blocks, 1, 1])                       # [B, K, A, tag_dim]
        return tf.concat([feats, tags], axis=-1)                              # [B, K, A, F_total]

    def compute_output_shape(self, input_shape):
        batch_dim, n_axes = input_shape[0], input_shape[2]
        return tf.TensorShape([batch_dim, self.num_blocks, n_axes, self.out_feat_dim])


# ---------------------------
# 6) Binary gate (straight-through estimator)
# ---------------------------
class BinaryGate(Layer):
    """Hard 0/1 gate with a straight-through gradient estimator.

    Forward pass: the input probability `p` is clipped to [0, 1] and rounded,
    so the emitted gate value is exactly 0 or 1.
    Backward pass: the rounding step is hidden from autodiff with
    `tf.stop_gradient`, so the gradient w.r.t. the clipped `p` is the identity.

    The previous expression, `hard + stop_gradient(p - hard)`, had the two
    terms swapped: it evaluated to the soft value `p` in the forward pass and
    took its gradient from `round`, which is zero, so the gate was neither
    binary nor trainable.
    """
    def call(self, p, training=None):
        p = tf.clip_by_value(p, 0.0, 1.0)
        hard = tf.round(p)
        # Value: p + (hard - p) == hard.  Gradient: only the bare `p` term is
        # differentiated, because (hard - p) sits behind stop_gradient.
        return p + tf.stop_gradient(hard - p)

    def compute_output_shape(self, input_shape):
        # Element-wise op: output shape equals input shape.
        return tf.TensorShape(input_shape)


# ---------------------------
# 7) TSF-Mixer sub-block and block
# ---------------------------
class TSFMixerSubBlock(Layer):
    """
    Mix per-axis TSF features of one block into a single block-level vector.

    Input:  [B', A, F] axis-level features
    Steps:  the same MLP is applied to every axis (axes folded into the batch),
            the per-axis outputs are concatenated, and an MLPStack maps the
            concatenation to the output width.
    Output: [B', out_hidden]
    """
    def __init__(self, axis_hidden=128, out_hidden=128, base_depth=2,
                 drop=0.5, wd=0.0, ln_eps=1e-7, name=None):
        super().__init__(name=name)
        self.axis_hidden = int(axis_hidden)
        self.out_hidden = int(out_hidden)
        self.base_depth = int(base_depth)
        self.drop = float(drop)
        self.wd = float(wd)
        self.ln_eps = float(ln_eps)

        # Axis-shared MLP: widths axis_hidden * 2**k for k = base_depth-1 .. 0.
        self.axis_mlp_layers = []
        for exp in reversed(range(self.base_depth)):
            width = self.axis_hidden * (2 ** exp)
            self.axis_mlp_layers.append(Dense(width, kernel_regularizer=l2(self.wd)))
            self.axis_mlp_layers.append(LayerNormalization(epsilon=self.ln_eps))
            self.axis_mlp_layers.append(LeakyReLU())
            self.axis_mlp_layers.append(Dropout(self.drop))

        self.out_stack = MLPStack(
            base_kn=self.out_hidden,
            depth=self.base_depth,
            drop=self.drop,
            wd=self.wd,
            ln_eps=self.ln_eps,
            name=f"{self.name}_out",
        )

    def call(self, x, training=None, **kwargs):  # x: [B', A, F]
        dims = tf.shape(x)
        n_items, n_axes, n_feats = dims[0], dims[1], dims[2]

        # Fold axes into the batch so every axis goes through the same weights.
        h = tf.reshape(x, [n_items * n_axes, n_feats])
        for layer in self.axis_mlp_layers:
            h = layer(h, training=training) if isinstance(layer, Dropout) else layer(h)

        h = tf.reshape(h, [n_items, n_axes, self.axis_hidden])   # [B', A, H_axis]
        h = tf.reshape(h, [n_items, n_axes * self.axis_hidden])  # [B', A*H_axis]
        return self.out_stack(h, training=training)              # [B', H_out]

    def compute_output_shape(self, input_shape):
        return tf.TensorShape([input_shape[0], self.out_stack.out_dim])


class TSFMixerBlock(Layer):
    """
    TSF mixer with two learned selection gates.

      - feature ("channel") gate: one gate per feature column F, computed from
        the features averaged over axes and broadcast to every axis;
      - axis gate: one gate per axis A, computed from that axis's hidden vector.

    When `use_binary` is true both gates pass through BinaryGate, otherwise
    the raw sigmoid outputs are used. The axis MLP layers are borrowed from
    the embedded TSFMixerSubBlock (`self.sub`); the final projection uses this
    block's own `out_stack`, not the sub-block's.

    Input:  [B', A, F]
    Output: [B', out_hidden]
    """
    def __init__(self, feat_dim, axis_hidden=128, out_hidden=128, base_depth=2,
                 drop=0.5, wd=0.0, ln_eps=1e-7, use_binary=True, name=None):
        super().__init__(name=name)
        self.use_binary = bool(use_binary)
        self.sub = TSFMixerSubBlock(
            axis_hidden, out_hidden, base_depth, drop, wd, ln_eps, name=f"{name}_sub"
        )
        self.axis_gate_dense = Dense(1, activation="sigmoid", name=f"{name}_axis_gate")
        self.chan_gate_dense = Dense(int(feat_dim), activation="sigmoid", name=f"{name}_chan_gate")
        self.bin_gate = BinaryGate(name=f"{name}_bin")
        self.out_stack = MLPStack(
            base_kn=out_hidden, depth=base_depth,
            drop=drop, wd=wd, ln_eps=ln_eps, name=f"{name}_out"
        )

    def _select(self, probs, training):
        """Binarise gate probabilities when binary selection is enabled."""
        if self.use_binary:
            return self.bin_gate(probs, training=training)
        return probs

    def call(self, x, training=None, **kwargs):  # x: [B', A, F]
        dims = tf.shape(x)
        n_items, n_axes, n_feats = dims[0], dims[1], dims[2]

        # Feature gate: driven by the axis-averaged features, shared across axes.
        feat_probs = self.chan_gate_dense(tf.reduce_mean(x, axis=1))       # [B', F]
        feat_probs = tf.reshape(feat_probs, [n_items, 1, n_feats])
        x = x * self._select(feat_probs, training)

        # Axis-shared MLP (weights owned by the sub-block).
        h = tf.reshape(x, [n_items * n_axes, n_feats])
        for layer in self.sub.axis_mlp_layers:
            h = layer(h, training=training) if isinstance(layer, Dropout) else layer(h)
        h = tf.reshape(h, [n_items, n_axes, self.sub.axis_hidden])         # [B', A, H_axis]

        # Axis gate: one scalar per axis, broadcast over the hidden width.
        axis_probs = self.axis_gate_dense(h)                               # [B', A, 1]
        h = h * self._select(axis_probs, training)

        h = tf.reshape(h, [n_items, n_axes * self.sub.axis_hidden])        # [B', A*H_axis]
        return self.out_stack(h, training=training)                        # [B', H_out]

    def compute_output_shape(self, input_shape):
        return tf.TensorShape([input_shape[0], self.out_stack.out_dim])


# ---------------------------
# 8) Rotation parameter estimation
# ---------------------------
def _feat_dim_for_spec(use_time, use_freq, tag_dim):
    """Return the per-axis feature width for a block spec: enabled TSF branches plus tag columns."""
    base = 0
    if use_time:
        base += TIME_FEATS
    if use_freq:
        base += FREQ_FEATS
    return base + tag_dim


class RotationParamEstimator(Layer):
    """
    Estimate four rotation parameters from a raw IMU window.

    Input:  [B, T, 6] (ACC + GYR).
    Pipeline: append the two L2-norm channels -> for every block spec,
    extract tagged TSF features and run them through a time-distributed
    TSFMixerBlock -> flatten and concatenate all block sets -> MLPStack ->
    Dense(4, tanh).
    Output: [B, 4]; each value lies in [-1, 1] because of the tanh.
    """
    def __init__(self, block_specs, fs, mlp_base=128, mlp_depth=2,
                 drop=0.5, wd=0.0, ln_eps=1e-7,
                 use_binary=True, pad_mode="SYMMETRIC", name=None):
        super().__init__(name=name)
        self.block_specs = block_specs
        self.fs = fs
        self.mlp_base = int(mlp_base)
        self.mlp_depth = int(mlp_depth)
        self.drop = float(drop)
        self.wd = float(wd)
        self.ln_eps = float(ln_eps)
        self.use_binary = bool(use_binary)
        self.pad_mode = pad_mode

        # One tag row per axis: [axis index (1-based), sensor id].
        # Sensor id is 1 for axes 0-2 and 6, and 2 for axes 3-5 and 7.
        tag_rows = [
            [idx + 1, 1 if (idx <= 2 or idx == 6) else 2]
            for idx in range(8)
        ]
        axis_tags = np.array(tag_rows, dtype=np.float32)
        self.tag_spec = {"axis_tags": axis_tags}
        tag_dim = axis_tags.shape[1]

        mixer_depth = max(1, self.mlp_depth - 1)
        self.extractors = []
        self.td_mixers  = []
        self.flatteners = []
        for spec in block_specs:
            label = spec["name"]
            extractor = BlockTSFExtractor(
                num_blocks=spec["num_blocks"], fs=fs,
                use_time=spec["use_time"], use_freq=spec["use_freq"],
                tag_spec=self.tag_spec, pad_mode=self.pad_mode,
                name=f"rot_ext_{label}",
            )
            self.extractors.append(extractor)

            mixer = TSFMixerBlock(
                feat_dim=_feat_dim_for_spec(spec["use_time"], spec["use_freq"], tag_dim),
                axis_hidden=self.mlp_base,
                out_hidden=self.mlp_base,
                base_depth=mixer_depth,
                drop=self.drop, wd=self.wd,
                ln_eps=self.ln_eps, use_binary=self.use_binary,
                name=f"rot_mix_{label}",
            )
            self.td_mixers.append(TimeDistributed(mixer, name=f"rot_td_{label}"))
            self.flatteners.append(Flatten(name=f"rot_flat_{label}"))

        self.concat_sets = Concatenate(name="rot_concat_sets")
        self.post_stack = MLPStack(base_kn=self.mlp_base, depth=self.mlp_depth,
                                   drop=self.drop, wd=self.wd, ln_eps=self.ln_eps, name="rot_post")
        self.out_head = Dense(4, activation="tanh", name="rot4_tanh")
        self.add_l2 = AddL2Channels()

    def call(self, x, training=None, **kwargs):  # x: [B, T, 6]
        augmented = self.add_l2(x)  # [B, T, 8]

        per_set = []
        branches = zip(self.extractors, self.td_mixers, self.flatteners)
        for extractor, td_mixer, flatten in branches:
            block_tsf = extractor(augmented, training=training)   # [B, K, A, F]
            mixed = td_mixer(block_tsf, training=training)        # [B, K, H]
            per_set.append(flatten(mixed))                        # [B, K*H]

        merged = self.concat_sets(per_set)                        # [B, sum(K*H)]
        hidden = self.post_stack(merged, training=training)
        return self.out_head(hidden)                              # [B, 4]

    def compute_output_shape(self, input_shape):
        return tf.TensorShape([input_shape[0], 4])


# ---------------------------
# 9) Multi-head 3D rotation (official)
# ---------------------------
class Multihead3DRotationOfficial(Layer):
    """
    Produce `head_nums` rotated copies of an IMU window.

    Input:  [B, T, 6] (ACC + GYR).
    Output: list of `head_nums` tensors, each [B, T, 6].

    For every head the shared RotationParamEstimator yields 4 parameters
    (3 for the axis, 1 for the angle); from the second head on, the
    parameters are summed with those of the previous heads. The resulting
    rotation matrix is applied to the ACC triple and the GYR triple alike.

    NOTE(review): all heads call the same estimator on the same input `x`,
    so outside of training-time dropout each call returns the same values
    and head k effectively uses k times the base parameters. Confirm this
    matches the training script before relying on per-head diversity.
    """
    def __init__(self, head_nums=2, fs=50.0, mlp_base=128, mlp_depth=2,
                 drop=0.5, wd=0.0, ln_eps=1e-7,
                 block_specs=None, use_binary=True, pad_mode="SYMMETRIC", name=None):
        super().__init__(name=name)
        if block_specs is None:
            block_specs = BLOCK_SPECS
        self.head_nums = int(head_nums)
        self.estimator = RotationParamEstimator(
            block_specs=block_specs, fs=fs,
            mlp_base=mlp_base, mlp_depth=mlp_depth,
            drop=drop, wd=wd,
            ln_eps=ln_eps, use_binary=use_binary,
            pad_mode=pad_mode,
            name="rot_estimator",
        )
        self.eps = 1e-8  # keeps the axis normalisation finite for a zero vector

    def compute_output_shape(self, input_shape):
        # One output per head, each with the input's shape.
        return [tf.TensorShape(input_shape) for _ in range(self.head_nums)]

    @staticmethod
    def _rotate(R, signal):
        """Apply R [B, 3, 3] to every time step of signal [B, T, 3]."""
        as_columns = tf.transpose(signal, [0, 2, 1])            # [B, 3, T]
        return tf.transpose(tf.matmul(R, as_columns), [0, 2, 1])

    def call(self, x, training=None, **kwargs):  # x: [B, T, 6]
        acc = x[:, :, :3]
        gyr = x[:, :, 3:6]

        rotated_streams = []
        running = None  # cumulative rotation parameters over the heads so far
        for _ in range(self.head_nums):
            rot4 = self.estimator(x, training=training)         # [B, 4]
            running = rot4 if running is None else rot4 + running

            axis_part = running[:, :3]
            angle_part = tf.expand_dims(running[:, 3], -1)
            R = self._axis_angle_to_R(axis_part, angle_part)    # [B, 3, 3]

            acc_rot = self._rotate(R, acc)
            gyr_rot = self._rotate(R, gyr)
            rotated_streams.append(tf.concat([acc_rot, gyr_rot], axis=-1))  # [B, T, 6]
        return rotated_streams

    def _axis_angle_to_R(self, axis_raw, angle_raw):
        """Rodrigues formula: R = cos(t) I + (1 - cos(t)) u u^T + sin(t) [u]_x.

        `axis_raw` [B, 3] is normalised to the unit vector u; `angle_raw`
        [B, 1] is scaled by pi to give the angle t.
        """
        unit = axis_raw / (tf.norm(axis_raw, axis=-1, keepdims=True) + self.eps)  # [B, 3]
        theta = angle_raw * math.pi                                               # [B, 1]
        n_batch = tf.shape(unit)[0]

        ux, uy, uz = unit[:, 0], unit[:, 1], unit[:, 2]
        zero = tf.zeros_like(ux)
        # Cross-product (skew-symmetric) matrix of u, written row-major.
        skew_flat = tf.stack([zero, -uz,  uy,
                              uz,  zero, -ux,
                              -uy,  ux,  zero], axis=-1)
        skew = tf.reshape(skew_flat, [n_batch, 3, 3])

        identity = tf.tile(tf.eye(3, dtype=unit.dtype)[None, ...], [n_batch, 1, 1])
        col = tf.expand_dims(unit, -1)                                            # [B, 3, 1]
        outer = tf.matmul(col, col, transpose_b=True)                             # [B, 3, 3]

        cos_t = tf.reshape(tf.cos(theta), [-1, 1, 1])
        sin_t = tf.reshape(tf.sin(theta), [-1, 1, 1])
        return cos_t * identity + (1.0 - cos_t) * outer + sin_t * skew            # [B, 3, 3]


# ---------------------------
# 10) Official rTsfNet body
# ---------------------------
def r_tsf_net_official(
    x_shape,
    n_classes,
    learning_rate=1e-3,
    base_kn=128,
    depth=3,
    dropout_rate=0.5,
    imu_rot_heads=2,
    fs=50.0,
    use_orig_input=True,
    use_binary_selection=True,
    ln_eps=1e-7,
    pad_mode="SYMMETRIC",
):
    """Build and compile the rTsfNet classifier as a Keras functional model.

    Graph: input -> multi-head 3D rotation -> L2-norm channels appended to
    each stream (optionally including the unrotated input) -> streams
    concatenated on the channel axis -> per block spec: tagged TSF
    extraction, time-distributed TSF mixer, flatten -> concatenate ->
    MLPStack -> Dense logits -> float32 softmax.

    `x_shape` is the full input shape including the batch dimension; only
    `x_shape[1:]` is used. Block specs and weight decay are read from the
    module-level BLOCK_SPECS and WEIGHT_DECAY.

    Returns the model compiled with Adam (amsgrad), sparse categorical
    cross-entropy loss and an accuracy metric.
    """
    inputs = Input(shape=x_shape[1:])
    mixer_depth = max(1, depth - 1)

    # --- rotation heads -------------------------------------------------
    rotator = Multihead3DRotationOfficial(
        head_nums=imu_rot_heads, fs=fs,
        mlp_base=base_kn, mlp_depth=mixer_depth, drop=dropout_rate, wd=WEIGHT_DECAY,
        ln_eps=ln_eps, block_specs=BLOCK_SPECS, use_binary=use_binary_selection, pad_mode=pad_mode,
        name="multihead_rot_official"
    )
    rotated_list = rotator(inputs)

    # --- assemble the 8-channel streams ---------------------------------
    add_l2 = AddL2ChannelsPublic()
    streams = []
    if use_orig_input:
        streams.append(add_l2(inputs))
    streams.extend(add_l2(rotated) for rotated in rotated_list)
    concat_streams = Concatenate(axis=-1, name="concat_streams")(streams)

    # --- axis tags: the 8-row per-stream table repeated once per stream --
    num_streams = (1 if use_orig_input else 0) + imu_rot_heads
    tags_one_stream = np.array(
        [[idx + 1, 1 if (idx <= 2 or idx == 6) else 2] for idx in range(8)],
        dtype=np.float32,
    )
    axis_tags_all = np.concatenate([tags_one_stream] * num_streams, axis=0)
    tag_spec_main = {"axis_tags": axis_tags_all}
    tag_dim_main = axis_tags_all.shape[1]

    # --- one TSF branch per block spec -----------------------------------
    branch_outputs = []
    for spec in BLOCK_SPECS:
        label = spec["name"]
        extractor = BlockTSFExtractor(
            num_blocks=spec["num_blocks"], fs=fs,
            use_time=spec["use_time"], use_freq=spec["use_freq"],
            tag_spec=tag_spec_main, pad_mode=pad_mode,
            name=f"main_ext_{label}",
        )
        mixer = TSFMixerBlock(
            feat_dim=_feat_dim_for_spec(spec["use_time"], spec["use_freq"], tag_dim_main),
            axis_hidden=base_kn, out_hidden=base_kn,
            base_depth=mixer_depth, drop=dropout_rate, wd=WEIGHT_DECAY,
            ln_eps=ln_eps, use_binary=use_binary_selection,
            name=f"main_mix_{label}",
        )
        td_mixer = TimeDistributed(mixer, name=f"main_td_{label}")
        flatten = Flatten(name=f"main_flat_{label}")

        block_tsf = extractor(concat_streams)   # [B, K, A_all, F]
        mixed = td_mixer(block_tsf)             # [B, K, H]
        branch_outputs.append(flatten(mixed))   # [B, K*H]

    # --- classification head ----------------------------------------------
    merged = Concatenate(name="main_concat_sets")(branch_outputs)
    cls_stack = MLPStack(base_kn=base_kn, depth=depth, drop=dropout_rate,
                         wd=WEIGHT_DECAY, ln_eps=ln_eps, name="cls")
    hidden = cls_stack(merged)
    logits = Dense(n_classes, kernel_regularizer=l2(WEIGHT_DECAY), name="logits")(hidden)
    probs = Activation("softmax", dtype="float32", name="softmax")(logits)

    model = Model(inputs, probs, name="rTsfNet_official_aligned")
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(learning_rate=learning_rate, amsgrad=True),
        metrics=["accuracy"],
    )
    return model


# ---------------------------
# 11) Instantiate model
# ---------------------------
# Benchmark one window at a time (batch of 1).
BATCH_SIZE = 1
# Full input shape including the batch axis; the builder only uses x_shape[1:].
x_shape = (BATCH_SIZE, WINDOW_SAMPLES, N_CHANNELS)

# Build the model with the hyperparameters printed earlier in this cell.
# The weights are freshly initialised: nothing is loaded from disk here.
model = r_tsf_net_official(
    x_shape=x_shape,
    n_classes=N_CLASSES,
    learning_rate=LR,
    base_kn=MLP_BASE,
    depth=MLP_DEPTH,
    dropout_rate=DROPOUT,
    imu_rot_heads=IMU_ROT_HEADS,
    fs=FS,
    use_orig_input=USE_ORIG_INPUT,
    use_binary_selection=USE_BINARY_SELECTION,
    ln_eps=LN_EPS,
    pad_mode=PAD_MODE,
)

# Report the model size alongside the latency figures.
n_params = model.count_params()
print(f"\nTotal number of model parameters: {n_params:,}")


# ---------------------------
# 12) Latency measurement (CPU, forward only)
# ---------------------------
def measure_latency_tf(
    model,
    input_shape,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Time the forward pass of a Keras model on a fixed random input.

    A float32 tensor of shape `input_shape` is drawn once from a standard
    normal distribution. The model is called `n_warmup` times untimed, then
    `n_runs` times with each call wrapped in `time.perf_counter()`.

    Returns a dict with the median (p50), 90th percentile, mean and standard
    deviation of the per-call latency in milliseconds, plus `n_runs` and the
    batch size (`input_shape[0]`). Latencies are per call, i.e. per batch.
    """
    sample = tf.constant(np.random.randn(*input_shape).astype(np.float32))

    def _timed_forward():
        """Run one forward pass and return its wall-clock duration in ms."""
        start = time.perf_counter()
        _ = model(sample, training=False)
        stop = time.perf_counter()
        return (stop - start) * 1000.0

    # Untimed passes absorb one-off start-up costs before measuring.
    for _ in range(n_warmup):
        _ = model(sample, training=False)

    durations = np.array([_timed_forward() for _ in range(n_runs)], dtype=float)

    return {
        "batch_latency_p50_ms": float(np.percentile(durations, 50)),
        "batch_latency_p90_ms": float(np.percentile(durations, 90)),
        "batch_latency_mean_ms": float(durations.mean()),
        "batch_latency_std_ms":  float(durations.std()),
        "n_runs": int(n_runs),
        "batch_size": int(input_shape[0]),
    }


# Shape of the synthetic input fed to the model: one window of raw IMU samples.
input_shape = (BATCH_SIZE, WINDOW_SAMPLES, N_CHANNELS)
print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, T, C)")

# 20 untimed warm-up calls followed by 100 timed forward passes.
stats = measure_latency_tf(
    model,
    input_shape=input_shape,
    n_warmup=20,
    n_runs=100,
)

# Millisecond entries are printed with 4 decimals; counters are printed as-is.
print("\nCPU latency stats for rTsfNet (IMWUT 2024 official) forward() (per batch):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert per-batch latency to per-window latency by dividing by the batch size
# (identical to the per-batch numbers when BATCH_SIZE is 1).
per_window_p50  = stats["batch_latency_p50_ms"]  / stats["batch_size"]
per_window_p90  = stats["batch_latency_p90_ms"]  / stats["batch_size"]
per_window_mean = stats["batch_latency_mean_ms"] / stats["batch_size"]

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[rTsfNet (IMWUT 2024 official) CPU inference latency benchmark completed]")


[rTsfNet (IMWUT 2024 official) CPU Inference Latency Benchmark]
Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Intra-op threads: 1
Inter-op threads: 1

N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8

FS=50.0, IMU_ROT_HEADS=2, MLP_BASE=128, MLP_DEPTH=3
DROPOUT=0.5, LR=0.001, WEIGHT_DECAY=1e-06
USE_ORIG_INPUT=True, USE_BINARY_SELECTION=True
LN_EPS=1e-07, PAD_MODE=SYMMETRIC, BLOCK_SPECS=[{'name': 'short', 'num_blocks': 4, 'use_time': True, 'use_freq': False}, {'name': 'long', 'num_blocks': 1, 'use_time': False, 'use_freq': True}]

Total number of model parameters: 960,698

Measuring CPU latency with input shape: (1, 150, 6) (B, T, C)

CPU latency stats for rTsfNet (IMWUT 2024 official) forward() (per batch):
  batch_latency_p50_ms: 439.6153
  batch_latency_p90_ms: 454.8026
  batch_latency_mean_ms: 441.3187
  batch_latency_std_ms: 9.5725
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 439.6153


11. DeepConvContext – LSTM variant – 1-layer – bidirectional

In [24]:
# ============ DeepConvContext CPU Inference Latency Benchmark (Standalone) ============

import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

print("\n[DeepConvContext CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Fix the NumPy and PyTorch seeds so weight initialisation and the random
# benchmark input are reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pin the benchmark to the CPU and to a single intra-op thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# Windowing: 3 s @ 50 Hz = 150 samples; 50% overlap → stride = 75
# sequence length S = 100 windows, 6 channels
NUM_CHANNELS      = 6     # input channels per window
SAMPLES_PER_WIN   = 150   # samples per window
CONTEXT_LEN_WINS  = 100   # windows per input sequence
NUM_CLASSES       = 8   # RealWorld-HAR uses 8 classes

# Echo the configuration into the benchmark log.
print(f"\nConfig: num_classes={NUM_CLASSES}, channels={NUM_CHANNELS}, "
      f"samples_per_window={SAMPLES_PER_WIN}, sequence_length={CONTEXT_LEN_WINS}")


# ---------------------------
# 1) DeepConvContext model (same architecture as Step 10)
# ---------------------------
class DeepConvLSTM_Intra(nn.Module):
    """Per-window encoder: four Conv1d+ReLU layers followed by a 1-layer LSTM.

    Input  : (N, C, T) windows.
    Output : (N, lstm_units), the LSTM's final hidden state.

    Convolutions use padding kernel_size // 2, so the time length is
    preserved for odd kernel sizes.
    """
    def __init__(self, in_ch=6, conv_ch=64, kernel_size=9, lstm_units=128):
        super().__init__()
        same_pad = kernel_size // 2
        conv_kwargs = dict(kernel_size=kernel_size, padding=same_pad)
        self.conv1 = nn.Conv1d(in_ch, conv_ch, **conv_kwargs)
        self.conv2 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.conv3 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.conv4 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(
            input_size=conv_ch,
            hidden_size=lstm_units,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, x_win):           # x_win: (N, C, T)
        feats = x_win
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(conv(feats))
        # The LSTM is batch_first, so move time ahead of channels: (N, T, C).
        seq = feats.permute(0, 2, 1)
        _, (last_hidden, _) = self.lstm(seq)
        # last_hidden is (num_layers, N, H); take the top (only) layer.
        return last_hidden[-1]          # (N, lstm_units)


class DeepConvContext(nn.Module):
    """Sequence-of-windows classifier.

    Each window is encoded by DeepConvLSTM_Intra, linearly projected, and the
    resulting sequence of window embeddings is processed by an inter-window
    LSTM (bidirectional if requested). Dropout and a linear layer then give
    per-window class logits.

    Input  : (B, S, C, T)  - B sequences of S windows, C channels, T samples.
    Output : (B, S, num_classes) logits.
    """
    def __init__(self, num_channels=6, num_classes=8,
                 conv_channels=64, intra_lstm_units=128,
                 inter_lstm_units=128, projection_dim=128,
                 dropout=0.5, bidirectional=True):
        super().__init__()
        self.intra = DeepConvLSTM_Intra(num_channels, conv_channels, 9, intra_lstm_units)
        self.proj = nn.Linear(intra_lstm_units, projection_dim)
        self.inter = nn.LSTM(
            input_size=projection_dim,
            hidden_size=inter_lstm_units,
            num_layers=1,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM concatenates both directions, doubling the width.
        n_directions = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(inter_lstm_units * n_directions, num_classes)

    def forward(self, x):               # x: (B, S, C, T) → logits: (B, S, K)
        n_seq, n_win, n_ch, n_samp = x.shape
        # Fold windows into the batch so the intra encoder sees (B*S, C, T).
        flat_windows = x.reshape(n_seq * n_win, n_ch, n_samp)
        window_emb = self.intra(flat_windows).view(n_seq, n_win, -1)   # (B, S, H_intra)
        context, _ = self.inter(self.proj(window_emb))                 # (B, S, H_inter*dir)
        return self.fc(self.dropout(context))                          # (B, S, num_classes)


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return the total number of scalar elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are excluded from the count
        total += param.numel()
    return total


# ---------------------------
# 3) Latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency of ``model`` on ``device``.

    The model is moved to ``device`` and put in eval mode (and left that way).
    One random float32 input of ``input_shape`` is reused for all calls.
    Gradient tracking is disabled only for the duration of this function; the
    caller's grad mode is restored on exit.

    Args:
        model: module to benchmark; called as ``model(x)``.
        input_shape: iterable of ints giving the input tensor shape,
            e.g. (B, S, C, T).
        device: device on which the model and input are placed.
        n_warmup: number of untimed forward passes run first.
        n_runs: number of timed forward passes (must be >= 1).

    Returns:
        dict with the p50, p90, mean and std of the per-call wall-clock time
        in milliseconds (keys ``sequence_latency_*_ms``) and ``n_runs``.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1, since no statistics can
            be computed from an empty sample.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1 to compute latency statistics")

    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = []
    # Scoped no_grad instead of torch.set_grad_enabled(False): the latter is a
    # process-wide switch that would stay off for all code run afterwards.
    with torch.no_grad():
        # Warm-up (untimed)
        for _ in range(n_warmup):
            model(x)

        # Timed runs
        for _ in range(n_runs):
            t0 = time.perf_counter()
            model(x)
            t1 = time.perf_counter()
            times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "sequence_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "sequence_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "sequence_latency_mean_ms": float(times_ms.mean()),
        "sequence_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark
# ---------------------------
# Build the bidirectional, 1-layer-LSTM DeepConvContext variant benchmarked in this cell.
model = DeepConvContext(
    num_channels=NUM_CHANNELS,
    num_classes=NUM_CLASSES,
    conv_channels=64,
    intra_lstm_units=128,
    inter_lstm_units=128,
    projection_dim=128,
    dropout=0.5,
    bidirectional=True,
)

n_params = count_parameters(model)
print(f"\nNumber of trainable parameters: {n_params:,}")

BATCH_SIZE = 1   # one sequence per batch
input_shape = (BATCH_SIZE, CONTEXT_LEN_WINS, NUM_CHANNELS, SAMPLES_PER_WIN)  # (B, S, C, T)

print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, S, C, T)")

stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

# The label reports the configured sequence length rather than a hard-coded
# number, so it stays correct if CONTEXT_LEN_WINS is changed.
print(f"\nCPU latency stats for DeepConvContext.forward() (per sequence of {CONTEXT_LEN_WINS} windows):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert sequence latency → per-window latency (divide by sequence length CONTEXT_LEN_WINS).
# This is an amortised figure: one forward pass processes the whole sequence.
per_window_p50  = stats["sequence_latency_p50_ms"]  / CONTEXT_LEN_WINS
per_window_p90  = stats["sequence_latency_p90_ms"]  / CONTEXT_LEN_WINS
per_window_mean = stats["sequence_latency_mean_ms"] / CONTEXT_LEN_WINS

print("\nApproximate CPU latency per window (3 s HAR window):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[DeepConvContext CPU inference latency benchmark completed]")


[DeepConvContext CPU Inference Latency Benchmark]
Using device: cpu
torch.get_num_threads() = 1

Config: num_classes=8, channels=6, samples_per_window=150, sequence_length=100

Number of trainable parameters: 496,392

Measuring CPU latency with input shape: (1, 100, 6, 150) (B, S, C, T)

CPU latency stats for DeepConvContext.forward() (per sequence of 100 windows):
  sequence_latency_p50_ms: 202.3898
  sequence_latency_p90_ms: 206.5136
  sequence_latency_mean_ms: 203.3992
  sequence_latency_std_ms: 3.6146
  n_runs: 100

Approximate CPU latency per window (3 s HAR window):
  window_latency_p50_ms  ≈ 2.0239
  window_latency_p90_ms  ≈ 2.0651
  window_latency_mean_ms ≈ 2.0340

[DeepConvContext CPU inference latency benchmark completed]


12. DeepConvContext-LSTM (unidirectional, 1-layer)

In [26]:
# ================ DeepConvContext (from-scratch, official architecture) CPU Inference Latency Benchmark ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

# Banner for this benchmark cell's output.
print("\n[DeepConvContext (from-scratch) CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration (CPU, single-thread)
# ---------------------------
# Seed NumPy and PyTorch before the model is built and the random benchmark
# input is drawn further down in this cell.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# The benchmark is pinned to the CPU with PyTorch limited to a single thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# Window configuration: 3s @ 50 Hz → 150 samples; 50% overlap → stride 75; context length S = 100
# NOTE: WIN_OVERLAP and STRIDE_SAMPLES are not referenced again in this cell;
# only NUM_CHANNELS, SAMPLES_PER_WIN and CONTEXT_LEN_WINS shape the benchmark input.
NUM_CHANNELS      = 6
SAMPLES_PER_WIN   = 150
WIN_OVERLAP       = 0.5
STRIDE_SAMPLES    = int(SAMPLES_PER_WIN * (1 - WIN_OVERLAP))  # 75
CONTEXT_LEN_WINS  = 100
NUM_CLASSES       = 8

# Model hyperparameters (must match training script)
# HIDDEN_UNITS is passed as both the intra- and inter-window LSTM size.
# KERNEL_SIZE is read as a module-level global inside DeepConvContext.__init__,
# so it must be defined before the model is instantiated.
HIDDEN_UNITS  = 128
CONV_CHANNELS = 64
KERNEL_SIZE   = 9
DROPOUT_P     = 0.5
BIDIRECTIONAL = False  # unidirectional inter-window LSTM for this variant

# Echo the effective configuration so the recorded output is self-describing.
print(f"\nConfig: num_classes={NUM_CLASSES}, num_channels={NUM_CHANNELS}, "
      f"samples_per_window={SAMPLES_PER_WIN}, sequence_length={CONTEXT_LEN_WINS}")
print(f"DeepConvContext: conv_channels={CONV_CHANNELS}, hidden_intra={HIDDEN_UNITS}, "
      f"hidden_inter={HIDDEN_UNITS}, bidirectional={BIDIRECTIONAL}, dropout={DROPOUT_P}")


# ---------------------------
# 1) Model definition (exactly as in the Step 10 from-scratch script)
# ---------------------------
class DeepConvLSTM_Intra(nn.Module):
    """Per-window encoder: four Conv1d + ReLU stages followed by a 1-layer LSTM.

    The convolutions pad by ``kernel_size // 2`` on each side. The encoder
    output is the LSTM's final hidden state, one vector of size ``hidden``
    per input window.
    """

    def __init__(self, in_ch=6, conv_ch=64, kernel_size=9, hidden=128):
        super().__init__()
        conv_kwargs = dict(kernel_size=kernel_size, padding=kernel_size // 2)
        # Only the first stage changes the channel count (in_ch -> conv_ch).
        self.conv1 = nn.Conv1d(in_ch, conv_ch, **conv_kwargs)
        self.conv2 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.conv3 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.conv4 = nn.Conv1d(conv_ch, conv_ch, **conv_kwargs)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(
            input_size=conv_ch,
            hidden_size=hidden,
            num_layers=1,
            batch_first=True,
        )

    def forward(self, x_win):
        """Encode windows of shape (N, C, T) into features of shape (N, hidden)."""
        feats = x_win
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(conv(feats))
        # batch_first LSTM expects (N, T, C).
        steps = feats.permute(0, 2, 1)
        _, (last_hidden, _) = self.lstm(steps)
        return last_hidden[-1]


class DeepConvContext(nn.Module):
    """Window-sequence classifier: per-window encoder followed by an inter-window LSTM.

    Each window is encoded by ``DeepConvLSTM_Intra``; the sequence of window
    features is fed to a single-layer LSTM (optionally bidirectional), then
    dropout and a linear head produce one logit vector per window.

    Note: the convolution kernel size is taken from the module-level
    ``KERNEL_SIZE`` constant, not from a constructor argument.
    """

    def __init__(self,
                 num_channels=6,
                 num_classes=8,
                 conv_channels=64,
                 hidden_intra=128,
                 hidden_inter=128,
                 dropout=0.5,
                 bidirectional=False):
        super().__init__()
        self.intra = DeepConvLSTM_Intra(num_channels, conv_channels, KERNEL_SIZE, hidden_intra)
        self.inter = nn.LSTM(
            input_size=hidden_intra,
            hidden_size=hidden_inter,
            num_layers=1,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM concatenates both directions, doubling the width.
        n_directions = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_inter * n_directions, num_classes)

    def forward(self, x):
        """Map x of shape (B, S, C, T) to per-window logits of shape (B, S, K)."""
        batch, n_windows, channels, n_samples = x.shape
        # Fold the sequence axis into the batch axis so all windows are encoded at once.
        windows = x.reshape(batch * n_windows, channels, n_samples)
        window_feats = self.intra(windows).view(batch, n_windows, -1)
        context, _ = self.inter(window_feats)
        return self.fc(self.dropout(context))


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Count the scalar elements of every parameter that has ``requires_grad`` set."""
    trainable_sizes = [
        param.numel()
        for param in model.parameters()
        if param.requires_grad
    ]
    return sum(trainable_sizes)


# ---------------------------
# 3) Latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency of ``model`` on ``device``.

    The model is moved to ``device`` and put in eval mode (and left that way).
    One random float32 input of ``input_shape`` is reused for all calls.
    Gradient tracking is disabled only for the duration of this function; the
    caller's grad mode is restored on exit.

    Args:
        model: module to benchmark; called as ``model(x)``.
        input_shape: iterable of ints giving the input tensor shape,
            e.g. (B, S, C, T).
        device: device on which the model and input are placed.
        n_warmup: number of untimed forward passes run first.
        n_runs: number of timed forward passes (must be >= 1).

    Returns:
        dict with the p50, p90, mean and std of the per-call wall-clock time
        in milliseconds (keys ``sequence_latency_*_ms``) and ``n_runs``.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1, since no statistics can
            be computed from an empty sample.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1 to compute latency statistics")

    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = []
    # Scoped no_grad instead of torch.set_grad_enabled(False): the latter is a
    # process-wide switch that would stay off for all code run afterwards.
    with torch.no_grad():
        # Warm-up (untimed)
        for _ in range(n_warmup):
            model(x)

        # Timed runs
        for _ in range(n_runs):
            t0 = time.perf_counter()
            model(x)
            t1 = time.perf_counter()
            times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "sequence_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "sequence_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "sequence_latency_mean_ms": float(times_ms.mean()),
        "sequence_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark
# ---------------------------
# Build the unidirectional DeepConvContext variant from the configuration constants above.
model = DeepConvContext(
    num_channels=NUM_CHANNELS,
    num_classes=NUM_CLASSES,
    conv_channels=CONV_CHANNELS,
    hidden_intra=HIDDEN_UNITS,
    hidden_inter=HIDDEN_UNITS,
    dropout=DROPOUT_P,
    bidirectional=BIDIRECTIONAL,
)

n_params = count_parameters(model)
print(f"\nNumber of trainable parameters: {n_params:,}")

BATCH_SIZE = 1   # one sequence per batch
input_shape = (BATCH_SIZE, CONTEXT_LEN_WINS, NUM_CHANNELS, SAMPLES_PER_WIN)  # (B, S, C, T)

print(f"\nMeasuring CPU latency with input shape: {input_shape} (B, S, C, T)")

stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

# The label reports the configured sequence length rather than a hard-coded
# number, so it stays correct if CONTEXT_LEN_WINS is changed.
print("\nCPU latency stats for DeepConvContext (from-scratch) forward() "
      f"(per sequence of {CONTEXT_LEN_WINS} windows):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert sequence latency → per-window latency (divide by S = CONTEXT_LEN_WINS).
# This is an amortised figure: one forward pass processes the whole sequence.
per_window_p50  = stats["sequence_latency_p50_ms"]  / CONTEXT_LEN_WINS
per_window_p90  = stats["sequence_latency_p90_ms"]  / CONTEXT_LEN_WINS
per_window_mean = stats["sequence_latency_mean_ms"] / CONTEXT_LEN_WINS

print("\nApproximate CPU latency per window (3 s HAR window):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[DeepConvContext (from-scratch) CPU inference latency benchmark completed]")


[DeepConvContext (from-scratch) CPU Inference Latency Benchmark]
Using device: cpu
torch.get_num_threads() = 1

Config: num_classes=8, num_channels=6, samples_per_window=150, sequence_length=100
DeepConvContext: conv_channels=64, hidden_intra=128, hidden_inter=128, bidirectional=False, dropout=0.5

Number of trainable parameters: 346,760

Measuring CPU latency with input shape: (1, 100, 6, 150) (B, S, C, T)

CPU latency stats for DeepConvContext (from-scratch) forward() (per sequence of 100 windows):
  sequence_latency_p50_ms: 201.0896
  sequence_latency_p90_ms: 205.5973
  sequence_latency_mean_ms: 201.9558
  sequence_latency_std_ms: 3.1175
  n_runs: 100

Approximate CPU latency per window (3 s HAR window):
  window_latency_p50_ms  ≈ 2.0109
  window_latency_p90_ms  ≈ 2.0560
  window_latency_mean_ms ≈ 2.0196

[DeepConvContext (from-scratch) CPU inference latency benchmark completed]


13. DeepConvContext Bi‑Attention

In [28]:
# ================ DeepConvContext Bi-Attention CPU Inference Latency Benchmark (Standalone) ================
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")

import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import torch.nn as nn

# Banner for this benchmark cell's output.
print("\n[DeepConvContext Bi-Attention CPU Inference Latency Benchmark]")

# ---------------------------
# 0) Basic configuration (CPU, single-thread)
# ---------------------------
# Seed NumPy and PyTorch before the model is built and the random benchmark
# input is drawn further down in this cell.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# The benchmark is pinned to the CPU with PyTorch limited to a single thread.
device = torch.device("cpu")
torch.set_num_threads(1)

print(f"Using device: {device}")
print(f"torch.get_num_threads() = {torch.get_num_threads()}")

# Window settings: 3 s @ 50 Hz = 150 samples; 50% overlap → stride = 75
# NOTE: WIN_OVERLAP and STRIDE_SAMPLES are not referenced again in this cell.
NUM_CHANNELS     = 6
SAMPLES_PER_WIN  = 150
WIN_OVERLAP      = 0.5
STRIDE_SAMPLES   = int(SAMPLES_PER_WIN * (1 - WIN_OVERLAP))  # 75

# Context length per batch (paper uses batch as context, batch_size=100)
# This value becomes the leading (batch) dimension of the benchmark input.
CONTEXT_LEN_WINS = 100   # number of windows per context (batch size during training/testing)

# Model hyperparameters (must match training script)
# KERNEL_SIZE is read as a module-level global inside
# DeepConvContext_BiAttention.__init__, so it must be defined before the
# model is instantiated.
NUM_CLASSES     = 8
DROPOUT_P       = 0.5
HIDDEN_UNITS    = 128
CONV_CHANNELS   = 64
KERNEL_SIZE     = 9
PROJECTION_DIM  = 128
ATTN_HEADS      = 4
ATTN_LAYERS     = 3
MAX_CONTEXT_WINS= 200    # size of the learnable positional-embedding table (max windows per context)
BIDIRECTIONAL   = True   # Bi-Attention variant: no attention mask is built in forward()

# Echo the effective configuration so the recorded output is self-describing.
print(f"\nConfig: num_classes={NUM_CLASSES}, num_channels={NUM_CHANNELS}, "
      f"samples_per_window={SAMPLES_PER_WIN}, context_len_windows={CONTEXT_LEN_WINS}")
print(f"DeepConvContext Bi-Attention: conv_channels={CONV_CHANNELS}, hidden={HIDDEN_UNITS}, "
      f"proj_dim={PROJECTION_DIM}, attn_heads={ATTN_HEADS}, attn_layers={ATTN_LAYERS}, "
      f"dropout={DROPOUT_P}, bidirectional={BIDIRECTIONAL}")


# ---------------------------
# 1) Model definition (exactly as in the paper-aligned script)
# ---------------------------
class DeepConvLSTM_Intra(nn.Module):
    """
    Intra-window encoder.

    Four Conv1d + ReLU stages (by default 64 channels, kernel size 9) feed a
    single-layer LSTM (by default 128 units); the LSTM's final hidden state is
    the per-window feature vector.
    """
    def __init__(self, in_ch: int = 6, conv_ch: int = 64, kernel_size: int = 9, hidden: int = 128):
        super().__init__()

        def make_conv(n_in: int) -> nn.Conv1d:
            # Every stage shares the output width, kernel size and padding.
            return nn.Conv1d(n_in, conv_ch, kernel_size, padding=kernel_size // 2)

        self.conv1 = make_conv(in_ch)
        self.conv2 = make_conv(conv_ch)
        self.conv3 = make_conv(conv_ch)
        self.conv4 = make_conv(conv_ch)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(input_size=conv_ch, hidden_size=hidden,
                            num_layers=1, batch_first=True)

    def forward(self, x_win: torch.Tensor) -> torch.Tensor:
        """Encode windows of shape (B, C, T) into features of shape (B, hidden)."""
        feats = x_win
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(stage(feats))        # (B, conv_ch, T)
        steps = feats.permute(0, 2, 1)             # (B, T, conv_ch) for the batch_first LSTM
        _, (last_hidden, _) = self.lstm(steps)     # last_hidden: (1, B, hidden)
        return last_hidden[-1]


class PositionalEncoding1D(nn.Module):
    """
    Learnable positional encoding indexed by position along dimension 1.

    Holds an embedding table with ``max_len`` rows of width ``d_model`` and
    adds row ``i`` to every element at sequence position ``i``.
    """
    def __init__(self, d_model: int, max_len: int = 200):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` of shape (B_seq, S, D) with the first S position vectors added."""
        index = torch.arange(x.size(1), device=x.device)   # (S,)
        offsets = self.pos_embedding(index).unsqueeze(0)   # (1, S, D), broadcast over B_seq
        return x + offsets


class MultiHeadSelfAttentionBlock(nn.Module):
    """
    Self-attention block: MHA + residual + LayerNorm (no dropout inside attention to match paper).
    """
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.mha  = nn.MultiheadAttention(
            embed_dim=dim,
            num_heads=num_heads,
            dropout=0.0,
            batch_first=True
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor | None = None) -> torch.Tensor:
        # x: (B_seq, S, D)
        attn_out, _ = self.mha(x, x, x, attn_mask=attn_mask, need_weights=False)
        return self.norm(x + attn_out)


class DeepConvContext_BiAttention(nn.Module):
    """
    DeepConvContext with self-attention as the inter-window model (paper-aligned RWHAR).

    Pipeline:
      1. ``DeepConvLSTM_Intra`` encodes every window (4 conv stages + 1-layer LSTM).
      2. A linear layer projects each window feature to ``projection_dim``.
      3. The batch of windows is treated as ONE sequence: a learnable
         positional encoding is added, then ``num_attn_layers`` self-attention
         blocks are applied.
      4. Dropout and a linear head give one logit vector per window.

    Note: the convolution kernel size is taken from the module-level
    ``KERNEL_SIZE`` constant, not from a constructor argument.
    """
    def __init__(self,
                 num_channels: int = 6,
                 num_classes: int = 8,
                 conv_channels: int = 64,
                 hidden_intra: int = 128,
                 projection_dim: int = 128,
                 attn_heads: int = 4,
                 num_attn_layers: int = 3,
                 max_context_len: int = 200,
                 dropout: float = 0.5,
                 bidirectional: bool = True):
        super().__init__()

        self.intra = DeepConvLSTM_Intra(in_ch=num_channels, conv_ch=conv_channels,
                                        kernel_size=KERNEL_SIZE, hidden=hidden_intra)
        self.proj = nn.Linear(hidden_intra, projection_dim)
        self.pos_enc = PositionalEncoding1D(projection_dim, max_len=max_context_len)

        # When False, forward() builds an upper-triangular attention mask.
        self.bidirectional = bidirectional

        self.attn_layers = nn.ModuleList()
        for _ in range(num_attn_layers):
            self.attn_layers.append(MultiHeadSelfAttentionBlock(projection_dim, attn_heads))

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(projection_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Classify B time-ordered windows, using the batch itself as the context.

        x: (B, C, T)
        return: (B, num_classes)
        """
        n_windows, _, _ = x.shape

        # Encode and project every window, then view the batch as one
        # sequence of length B: (B, D) -> (1, B, D).
        tokens = self.proj(self.intra(x)).unsqueeze(0)
        tokens = self.pos_enc(tokens)

        if self.bidirectional:
            mask = None
        else:
            # Boolean mask that is True strictly above the diagonal.
            mask = torch.ones(
                n_windows, n_windows, device=x.device, dtype=torch.bool
            ).triu(diagonal=1)

        for block in self.attn_layers:
            tokens = block(tokens, attn_mask=mask)   # (1, B, D)

        logits = self.fc(self.dropout(tokens))       # (1, B, num_classes)
        return logits.squeeze(0)


# ---------------------------
# 2) Utility: parameter count
# ---------------------------
def count_parameters(model: nn.Module) -> int:
    """Return how many scalar values the model's trainable parameters hold."""
    n_trainable = 0
    for tensor in model.parameters():
        if not tensor.requires_grad:
            # Frozen parameters are excluded from the count.
            continue
        n_trainable += tensor.numel()
    return n_trainable


# ---------------------------
# 3) Latency measurement on CPU
# ---------------------------
def measure_latency(
    model: nn.Module,
    input_shape,
    device: torch.device,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure pure forward-pass latency of ``model`` on ``device``.

    The model is moved to ``device`` and put in eval mode (and left that way).
    One random float32 input of ``input_shape`` is reused for all calls.
    Gradient tracking is disabled only for the duration of this function; the
    caller's grad mode is restored on exit.

    Args:
        model: module to benchmark; called as ``model(x)``.
        input_shape: iterable of ints giving the input tensor shape; one
            call processes one whole batch, i.e. one context of windows.
        device: device on which the model and input are placed.
        n_warmup: number of untimed forward passes run first.
        n_runs: number of timed forward passes (must be >= 1).

    Returns:
        dict with the p50, p90, mean and std of the per-call wall-clock time
        in milliseconds (keys ``context_latency_*_ms``) and ``n_runs``.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1, since no statistics can
            be computed from an empty sample.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1 to compute latency statistics")

    model = model.to(device)
    model.eval()

    x = torch.randn(*input_shape, device=device, dtype=torch.float32)

    times_ms = []
    # Scoped no_grad instead of torch.set_grad_enabled(False): the latter is a
    # process-wide switch that would stay off for all code run afterwards.
    with torch.no_grad():
        # Warm-up (untimed)
        for _ in range(n_warmup):
            model(x)

        # Timed runs
        for _ in range(n_runs):
            t0 = time.perf_counter()
            model(x)
            t1 = time.perf_counter()
            times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "context_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "context_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "context_latency_mean_ms": float(times_ms.mean()),
        "context_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
    }
    return stats


# ---------------------------
# 4) Instantiate model and run benchmark
# ---------------------------
# Build the Bi-Attention DeepConvContext variant from the configuration constants above.
model = DeepConvContext_BiAttention(
    num_channels=NUM_CHANNELS,
    num_classes=NUM_CLASSES,
    conv_channels=CONV_CHANNELS,
    hidden_intra=HIDDEN_UNITS,
    projection_dim=PROJECTION_DIM,
    attn_heads=ATTN_HEADS,
    num_attn_layers=ATTN_LAYERS,
    max_context_len=MAX_CONTEXT_WINS,
    dropout=DROPOUT_P,
    bidirectional=BIDIRECTIONAL,   # Bi-Attention variant
)

n_params = count_parameters(model)
print(f"\nNumber of trainable parameters: {n_params:,}")

# Here, each batch = one context of CONTEXT_LEN_WINS windows (B=context length, C, T)
BATCH_CONTEXT = CONTEXT_LEN_WINS
input_shape = (BATCH_CONTEXT, NUM_CHANNELS, SAMPLES_PER_WIN)  # (B, C, T)

print(f"\nMeasuring CPU latency with input shape: {input_shape} (B=context_len, C, T)")

stats = measure_latency(
    model,
    input_shape=input_shape,
    device=device,
    n_warmup=20,
    n_runs=100,
)

# The label reports the configured context length rather than a hard-coded
# number, so it stays correct if CONTEXT_LEN_WINS is changed.
print("\nCPU latency stats for DeepConvContext Bi-Attention forward() "
      f"(per context of {BATCH_CONTEXT} windows):")
for k, v in stats.items():
    if k.endswith("_ms"):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

# Convert context latency → per-window latency (divide by context length BATCH_CONTEXT).
# This is an amortised figure: one forward pass processes the whole context.
per_window_p50  = stats["context_latency_p50_ms"]  / BATCH_CONTEXT
per_window_p90  = stats["context_latency_p90_ms"]  / BATCH_CONTEXT
per_window_mean = stats["context_latency_mean_ms"] / BATCH_CONTEXT

print("\nApproximate CPU latency per window (3 s HAR window):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[DeepConvContext Bi-Attention CPU inference latency benchmark completed]")


[DeepConvContext Bi-Attention CPU Inference Latency Benchmark]
Using device: cpu
torch.get_num_threads() = 1

Config: num_classes=8, num_channels=6, samples_per_window=150, context_len_windows=100
DeepConvContext Bi-Attention: conv_channels=64, hidden=128, proj_dim=128, attn_heads=4, attn_layers=3, dropout=0.5, bidirectional=True

Number of trainable parameters: 455,688

Measuring CPU latency with input shape: (100, 6, 150) (B=context_len, C, T)

CPU latency stats for DeepConvContext Bi-Attention forward() (per context of 100 windows):
  context_latency_p50_ms: 202.3500
  context_latency_p90_ms: 208.1371
  context_latency_mean_ms: 203.6163
  context_latency_std_ms: 4.0729
  n_runs: 100

Approximate CPU latency per window (3 s HAR window):
  window_latency_p50_ms  ≈ 2.0235
  window_latency_p90_ms  ≈ 2.0814
  window_latency_mean_ms ≈ 2.0362

[DeepConvContext Bi-Attention CPU inference latency benchmark completed]
