Test sequence:
1. minirocket (sktime)
2. multirocket (sktime)
3. multirocket (aeon)

1. minirocket (sktime)

In [4]:

# ================ MiniRocket CPU Inference Latency Benchmark for HAR Windows (Standalone) ================
!pip -q install sktime

import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_DEFAULT_NUM_THREADS", "1")

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import numba
from sklearn.linear_model import RidgeClassifier
from sktime.transformations.panel.rocket import MiniRocketMultivariate

print("\n[MiniRocket CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 0) Basic configuration
# ---------------------------
# Seed every RNG the benchmark touches (stdlib, NumPy, PyTorch) with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Single-core CPU: cap the runtime thread pools of both torch and numba.
device = "cpu"
for _threaded_lib in (torch, numba):
    _threaded_lib.set_num_threads(1)

print(
    f"Using device: {device}\n"
    f"torch.get_num_threads() = {torch.get_num_threads()}\n"
    f"NUMBA threads = {numba.get_num_threads()}"
)

# HAR-like window geometry: 6 channels x 150 time steps (e.g., 3 s @ 50 Hz), 8 classes.
N_CHANNELS, WINDOW_SAMPLES, N_CLASSES = 6, 150, 8

print(f"N_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# Number of synthetic training windows for MiniRocket + linear head
# (aligned with KNN/RF).
N_TRAIN = 20_000

# Latency protocol: untimed warm-up calls, timed calls, and query windows per call.
N_WARMUP, N_RUNS, BATCH_SIZE = 20, 100, 1

print(f"N_TRAIN={N_TRAIN}, N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")


# ---------------------------
# 1) Helper: convert 3D numpy to sktime nested DataFrame
#    X_np: (n_instances, n_channels, n_timepoints)
# ---------------------------
def to_nested_dataframe(X_np: np.ndarray) -> pd.DataFrame:
    """
    Wrap a 3D array in a nested DataFrame with one column per channel.

    X_np: array of shape (n_instances, n_channels, n_timepoints).
    Returns a DataFrame with n_instances rows and columns "dim_0", "dim_1", ...;
    each cell holds a pd.Series with that instance's time series for that channel.
    """
    n_cases, n_dims, _ = X_np.shape
    return pd.DataFrame(
        {
            f"dim_{dim}": [pd.Series(X_np[case, dim, :]) for case in range(n_cases)]
            for dim in range(n_dims)
        }
    )


# ---------------------------
# 2) Create synthetic HAR-like training dataset
# ---------------------------
# Each sample: one window [channels, time] with Gaussian noise
# Synthetic training set: pure Gaussian noise windows with uniformly random labels.
# Features are drawn before labels so the seeded RNG stream is consumed in a fixed order.
X_train_np = np.random.randn(N_TRAIN, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
y_train = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)
print(f"X_train_np shape: {X_train_np.shape}, y_train shape: {y_train.shape}")

# The transformer is fed the nested (one Series per cell) representation.
X_train = to_nested_dataframe(X_train_np)
print(f"X_train nested DataFrame shape: {X_train.shape}")


# --- 3) MiniRocket feature extractor + ridge linear head ---
minirocket_params = dict(
    num_kernels=10000,            # standard MiniRocket setting
    max_dilations_per_kernel=32,
    n_jobs=1,                     # single-core for fair comparison
    random_state=SEED,
)
minirocket = MiniRocketMultivariate(**minirocket_params)
clf = RidgeClassifier(alpha=1.0, fit_intercept=True)

# --- 4) Fit both stages ---
# The timed region covers transformer fit, the transform of the full training
# set, and the classifier fit.
t0_fit = time.perf_counter()
minirocket.fit(X_train, y_train)
X_train_trans = minirocket.transform(X_train)
clf.fit(X_train_trans, y_train)
t1_fit = time.perf_counter()
fit_time_ms = 1000.0 * (t1_fit - t0_fit)

n_features_rocket = X_train_trans.shape[1]
# Learned weights plus intercepts of the linear head.
n_linear_params = clf.coef_.size + clf.intercept_.size

print(
    f"\nMiniRocket fit + linear head fit time: {fit_time_ms:.3f} ms\n"
    f"MiniRocket transformed feature dimension: {n_features_rocket}\n"
    f"Approximate number of linear head parameters: {n_linear_params:,}"
)


# ---------------------------
# 5) Utility: latency measurement for MiniRocket + linear head
# ---------------------------
def measure_minirocket_latency(
    transformer: "MiniRocketMultivariate",
    classifier,
    X_query_nested: pd.DataFrame,
    batch_size: int,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure transformer.transform() + classifier.predict() latency on CPU.

    Parameters
    ----------
    transformer : fitted object exposing ``transform(X)``.
    classifier : fitted object exposing ``predict(features)``.
    X_query_nested : nested DataFrame with one row per query window,
        i.e. shape (batch_size, n_channels).
    batch_size : number of windows in ``X_query_nested``; callers divide the
        returned per-batch latencies by this value to get per-window latency.
    n_warmup : number of untimed calls executed before measuring.
    n_runs : number of timed calls; must be at least 1.

    Returns
    -------
    dict with per-batch latency statistics in milliseconds (p50, p90, mean,
    population std) plus ``n_runs`` and ``batch_size``.

    Raises
    ------
    ValueError
        If ``batch_size`` does not equal the number of rows in
        ``X_query_nested`` (per-window latency would otherwise be silently
        wrong), or if ``n_runs`` is smaller than 1 (no samples to summarise).
    """
    n_query = len(X_query_nested)
    if n_query != batch_size:
        raise ValueError(
            f"batch_size={batch_size} does not match the number of query windows ({n_query})"
        )
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    # Warm-up (excluded from statistics)
    for _ in range(n_warmup):
        X_feat = transformer.transform(X_query_nested)
        classifier.predict(X_feat)

    # Timed runs: each sample covers exactly one transform + predict round trip.
    times_ms = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        X_feat = transformer.transform(X_query_nested)
        classifier.predict(X_feat)
        t1 = time.perf_counter()
        times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "batch_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "batch_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "batch_latency_mean_ms": float(times_ms.mean()),
        "batch_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }
    return stats


# ---------------------------
# 6) Build a synthetic query batch and run the benchmark
# ---------------------------
# One synthetic query batch, converted to the same nested layout used for training.
# NOTE(review): the MultiRocket cells in this notebook pass plain NumPy arrays to
# transform(); the nested-DataFrame input used here may add conversion overhead
# inside the timed region — confirm before comparing latencies across cells.
X_query_np = np.random.randn(BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
X_query = to_nested_dataframe(X_query_np)

print(f"\nQuery batch nested DataFrame shape: {X_query.shape}")

stats = measure_minirocket_latency(
    minirocket,
    clf,
    X_query_nested=X_query,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

# Millisecond entries get four decimals; the remaining entries are printed as-is.
print("\nCPU latency stats for MiniRocket + linear head (per batch):")
for stat_name, stat_value in stats.items():
    shown = f"{stat_value:.4f}" if stat_name.endswith("_ms") else f"{stat_value}"
    print(f"  {stat_name}: {shown}")

# One batch element is one HAR window, hence per-window = per-batch / batch_size.
per_window_p50, per_window_p90, per_window_mean = (
    stats[f"batch_latency_{which}_ms"] / stats["batch_size"]
    for which in ("p50", "p90", "mean")
)

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[MiniRocket CPU inference latency benchmark completed]")


[MiniRocket CPU Inference Latency Benchmark for HAR Windows]
Using device: cpu
torch.get_num_threads() = 1
NUMBA threads = 1
N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
N_TRAIN=20000, N_WARMUP=20, N_RUNS=100, BATCH_SIZE=1
X_train_np shape: (20000, 6, 150), y_train shape: (20000,)
X_train nested DataFrame shape: (20000, 6)

MiniRocket fit + linear head fit time: 43968.084 ms
MiniRocket transformed feature dimension: 9996
Approximate number of linear head parameters: 79,976

Query batch nested DataFrame shape: (1, 6)

CPU latency stats for MiniRocket + linear head (per batch):
  batch_latency_p50_ms: 31.7074
  batch_latency_p90_ms: 92.0187
  batch_latency_mean_ms: 40.9640
  batch_latency_std_ms: 22.8331
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 31.7074
  window_latency_p90_ms  ≈ 92.0187
  window_latency_mean_ms ≈ 40.9640

[MiniRocket CPU inference latency benchmark completed]


2. multirocket (sktime)

In [1]:
# ================ MultiRocket CPU Inference Latency Benchmark for HAR Windows (Single-Core Baseline) ================
import os

# Pin every threading backend to a single thread; done ahead of the numeric imports below.
for _env_name in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "NUMBA_NUM_THREADS",
    "NUMBA_DEFAULT_NUM_THREADS",
):
    os.environ[_env_name] = "1"

import time
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import numba
from sklearn.linear_model import RidgeClassifier
from sktime.transformations.panel.rocket import MultiRocketMultivariate

print("\n[MultiRocket CPU Inference Latency Benchmark for HAR Windows]")

# ---------------------------
# 1) Basic configuration (single-core CPU)
# ---------------------------
# One seed for the stdlib, NumPy and PyTorch generators.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Runtime thread caps for torch and numba, in addition to the env vars above.
device = "cpu"
for _threaded_lib in (torch, numba):
    _threaded_lib.set_num_threads(1)

print(
    f"Using device: {device}\n"
    f"torch.get_num_threads() = {torch.get_num_threads()}\n"
    f"NUMBA threads = {numba.get_num_threads()}"
)

# HAR-like window geometry: 6 channels x 150 time steps (3 s @ 50 Hz), 8 classes.
N_CHANNELS, WINDOW_SAMPLES, N_CLASSES = 6, 150, 8

print(f"\nN_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# Number of training windows (aligned with other baselines).
N_TRAIN = 20_000

# Latency protocol; BATCH_SIZE = 1 means per-window latency is measured directly.
N_WARMUP, N_RUNS, BATCH_SIZE = 20, 100, 1

print(f"N_TRAIN={N_TRAIN}, N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")

# MultiRocket configuration (canonical setting: ~6,250 kernels).
# The requested kernel count is expected to be rounded down internally to a multiple of 84.
NUM_KERNELS = 6250
MAX_DILATIONS_PER_KERNEL = 32
N_FEATURES_PER_KERNEL = 4

print(
    f"\nMultiRocket config: num_kernels={NUM_KERNELS}, "
    f"max_dilations_per_kernel={MAX_DILATIONS_PER_KERNEL}, "
    f"n_features_per_kernel={N_FEATURES_PER_KERNEL}"
)

# ---------------------------
# 2) Create synthetic HAR-like training dataset (3D numpy: [N, C, T])
# ---------------------------
# Synthetic training set as a 3D array [N, C, T]: Gaussian noise windows, random labels.
# Features are drawn before labels so the seeded RNG stream is consumed in a fixed order.
X_train = np.random.randn(N_TRAIN, N_CHANNELS, WINDOW_SAMPLES).astype(np.float64)
y_train = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"\nX_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# --- 3) MultiRocket feature extractor + ridge linear head ---
multirocket_params = dict(
    num_kernels=NUM_KERNELS,
    max_dilations_per_kernel=MAX_DILATIONS_PER_KERNEL,
    n_features_per_kernel=N_FEATURES_PER_KERNEL,
    n_jobs=1,                 # single-core for fair comparison
    random_state=SEED,
)
multirocket = MultiRocketMultivariate(**multirocket_params)
clf = RidgeClassifier(alpha=1.0, fit_intercept=True, solver="lsqr")

# --- 4) Fit both stages ---
# The timed region covers transformer fit, the transform of the full training
# set, and the classifier fit.
t0_fit = time.perf_counter()
multirocket.fit(X_train, y_train)
X_train_trans = multirocket.transform(X_train)
# Unwrap to the underlying array when the transformer hands back a DataFrame-like object.
X_train_trans = getattr(X_train_trans, "values", X_train_trans)
clf.fit(X_train_trans, y_train)
t1_fit = time.perf_counter()
fit_time_ms = 1000.0 * (t1_fit - t0_fit)

n_features_rocket = X_train_trans.shape[1]
# Learned weights plus intercepts of the linear head.
n_linear_params = clf.coef_.size + clf.intercept_.size

print(
    f"\nMultiRocket fit + linear head fit time: {fit_time_ms:.3f} ms\n"
    f"MultiRocket transformed feature dimension: {n_features_rocket}\n"
    f"Approximate number of linear head parameters: {n_linear_params:,}"
)

# ---------------------------
# 5) Utility: latency measurement for MultiRocket + linear head
# ---------------------------
def measure_multirocket_latency(
    transformer,
    classifier,
    X_query,
    batch_size: int,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Time transformer.transform() followed by classifier.predict() on CPU.

    X_query: numpy array with shape (batch_size, n_channels, n_timepoints);
    its first dimension must equal ``batch_size`` (checked with an assert).
    Runs ``n_warmup`` untimed calls, then ``n_runs`` timed calls.
    Returns a dict of per-batch latency statistics in milliseconds
    (p50, p90, mean, std) plus ``n_runs`` and ``batch_size``.
    """
    assert X_query.shape[0] == batch_size

    def _infer_once():
        # One full inference: features -> (unwrap DataFrame-like output) -> prediction.
        features = transformer.transform(X_query)
        features = getattr(features, "values", features)
        classifier.predict(features)

    def _timed_ms():
        # Wall-clock duration of a single inference, in milliseconds.
        start = time.perf_counter()
        _infer_once()
        stop = time.perf_counter()
        return (stop - start) * 1000.0

    # Warm-up calls are executed but never recorded.
    for _ in range(n_warmup):
        _infer_once()

    samples = np.array([_timed_ms() for _ in range(n_runs)], dtype=float)

    return {
        "batch_latency_p50_ms": float(np.percentile(samples, 50)),
        "batch_latency_p90_ms": float(np.percentile(samples, 90)),
        "batch_latency_mean_ms": float(samples.mean()),
        "batch_latency_std_ms":  float(samples.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }

# ---------------------------
# 6) Build a synthetic query batch and run the benchmark
# ---------------------------
# One synthetic query batch in the same [N, C, T] layout and dtype as the training data.
X_query = np.random.randn(BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES).astype(np.float64)
print(f"\nX_query shape: {X_query.shape}")

stats = measure_multirocket_latency(
    multirocket,
    clf,
    X_query=X_query,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

# Millisecond entries get four decimals; the remaining entries are printed as-is.
print("\nCPU latency stats for MultiRocket + Ridge (per batch):")
for stat_name, stat_value in stats.items():
    shown = f"{stat_value:.4f}" if stat_name.endswith("_ms") else f"{stat_value}"
    print(f"  {stat_name}: {shown}")

# One batch element is one HAR window, hence per-window = per-batch / batch_size.
windows_per_batch = stats["batch_size"]
per_window_p50 = stats["batch_latency_p50_ms"] / windows_per_batch
per_window_p90 = stats["batch_latency_p90_ms"] / windows_per_batch
per_window_mean = stats["batch_latency_mean_ms"] / windows_per_batch

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

print("\n[MultiRocket CPU inference latency benchmark completed]")



[MultiRocket CPU Inference Latency Benchmark for HAR Windows]
Using device: cpu
torch.get_num_threads() = 1
NUMBA threads = 1

N_CHANNELS=6, WINDOW_SAMPLES=150, N_CLASSES=8
N_TRAIN=20000, N_WARMUP=20, N_RUNS=100, BATCH_SIZE=1

MultiRocket config: num_kernels=6250, max_dilations_per_kernel=32, n_features_per_kernel=4

X_train shape: (20000, 6, 150), y_train shape: (20000,)

MultiRocket fit + linear head fit time: 524059.676 ms
MultiRocket transformed feature dimension: 49728
Approximate number of linear head parameters: 397,832

X_query shape: (1, 6, 150)

CPU latency stats for MultiRocket + Ridge (per batch):
  batch_latency_p50_ms: 10.2868
  batch_latency_p90_ms: 10.5759
  batch_latency_mean_ms: 10.3077
  batch_latency_std_ms: 0.2091
  n_runs: 100
  batch_size: 1

Approximate CPU latency per window (HAR sample):
  window_latency_p50_ms  ≈ 10.2868
  window_latency_p90_ms  ≈ 10.5759
  window_latency_mean_ms ≈ 10.3077

[MultiRocket CPU inference latency benchmark completed]


3. multirocket (aeon)

In [3]:
# ================ MultiRocket CPU Inference Latency Benchmark for HAR Windows (Standalone, aeon) ================
# IMPORTANT:
#   - Run this cell in a *fresh* Colab runtime, before importing numba / aeon / torch anywhere else.
#   - The script is designed to be directly comparable to your MiniRocket CPU benchmark:
#       * 6 channels, 150 time steps (~3 s @ 50 Hz), 8 classes
#       * N_TRAIN = 20,000 synthetic windows
#       * N_WARMUP = 20, N_RUNS = 100, BATCH_SIZE = 1
#       * Single-core intent: all thread-related env vars set to 1, n_jobs=1 in MultiRocket
#   - Latency = MultiRocket.transform() + RidgeClassifier.predict() on CPU.

!pip -q install aeon scikit-learn

import os
import time
import random
import warnings
import platform

warnings.filterwarnings("ignore")

# ----------------------------------------------------------------------
# 0) Environment: fix thread-related env vars BEFORE importing numba/aeon
# ----------------------------------------------------------------------
# We fix all relevant BLAS / OpenMP / Numba thread counts to 1 to emulate
# a single-core CPU setting. This MUST be done before importing numba.
for var in ["OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "NUMBA_NUM_THREADS"]:
    os.environ[var] = "1"

import numpy as np
import torch
from sklearn.linear_model import RidgeClassifier
import numba
from aeon.transformations.collection.convolution_based import MultiRocket

print("\n[MultiRocket CPU Inference Latency Benchmark for HAR Windows (aeon)]")

# ----------------------------------------------------------------------
# 1) Basic configuration
# ----------------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make PyTorch single-threaded (for consistency with env vars)
torch.set_num_threads(1)

# Enforce the Numba thread cap at runtime as well. NUMBA_NUM_THREADS only takes
# effect if it was set before numba was first imported, so relying on the env
# var alone silently fails when this cell runs in a kernel that already loaded
# numba. The sktime cells in this notebook make the same explicit call.
numba.set_num_threads(1)
numba_threads = numba.get_num_threads()

device = "cpu"
print(f"Using device: {device}")
print(f"torch.get_num_threads()      = {torch.get_num_threads()}")
print(f"Numba effective num_threads  = {numba_threads}")

# Record the host so latency numbers can be interpreted later.
print("Platform :", platform.system(), platform.release())
print("CPU      :", platform.processor())

# HAR-like window settings (aligned with the MiniRocket benchmark)
N_CHANNELS      = 6
WINDOW_SAMPLES  = 150   # ~3 s @ 50 Hz
N_CLASSES       = 8

print(f"N_CHANNELS={N_CHANNELS}, WINDOW_SAMPLES={WINDOW_SAMPLES}, N_CLASSES={N_CLASSES}")

# Training set size (aligned with other baselines)
N_TRAIN   = 20000

# Latency measurement settings
N_WARMUP   = 20     # untimed calls before measuring
N_RUNS     = 100    # timed calls
BATCH_SIZE = 1      # number of query windows per call

print(f"N_TRAIN={N_TRAIN}, N_WARMUP={N_WARMUP}, N_RUNS={N_RUNS}, BATCH_SIZE={BATCH_SIZE}")

# MultiRocket configuration
# NOTE(review): the sktime MultiRocket cell in this notebook requests 6250
# kernels; confirm that 10000 is intended here before comparing the two
# MultiRocket latencies like-for-like.
N_KERNELS                = 10000   # number of convolutional kernels
MAX_DILATIONS_PER_KERNEL = 32
N_FEATURES_PER_KERNEL    = 4

print(f"N_KERNELS (MultiRocket)          = {N_KERNELS}")
print(f"MAX_DILATIONS_PER_KERNEL         = {MAX_DILATIONS_PER_KERNEL}")
print(f"N_FEATURES_PER_KERNEL            = {N_FEATURES_PER_KERNEL}")

# Window duration in seconds; used later to express latency as a real-time factor.
SAMPLE_RATE_HZ = 50.0
WINDOW_SECONDS = WINDOW_SAMPLES / SAMPLE_RATE_HZ
print(f"Assumed sampling rate = {SAMPLE_RATE_HZ} Hz → window length ≈ {WINDOW_SECONDS:.3f} s")

# ----------------------------------------------------------------------
# 2) Create synthetic HAR-like training dataset (3D numpy)
# ----------------------------------------------------------------------
# Shape: (n_instances, n_channels, n_timepoints)
# Synthetic training set, shape (n_instances, n_channels, n_timepoints):
# Gaussian noise windows with uniformly random labels. Features are drawn before
# labels so the seeded RNG stream is consumed in a fixed order.
X_train_np = np.random.randn(N_TRAIN, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
y_train = np.random.randint(0, N_CLASSES, size=(N_TRAIN,), dtype=np.int32)

print(f"\nX_train_np shape: {X_train_np.shape}, y_train shape: {y_train.shape}")

# --- 3) MultiRocket feature extractor + ridge linear head ---
multirocket_params = dict(
    n_kernels=N_KERNELS,
    max_dilations_per_kernel=MAX_DILATIONS_PER_KERNEL,
    n_features_per_kernel=N_FEATURES_PER_KERNEL,
    normalise=False,
    n_jobs=1,           # do not use joblib parallelism; keep it single-process
    random_state=SEED,
)
multirocket = MultiRocket(**multirocket_params)
clf = RidgeClassifier(alpha=1.0, fit_intercept=True)

# --- 4) Fit both stages ---
# The timed region covers transformer fit, the transform of the full training
# set, and the classifier fit.
t0_fit = time.perf_counter()
multirocket.fit(X_train_np, y_train)
X_train_trans = multirocket.transform(X_train_np)
clf.fit(X_train_trans, y_train)
t1_fit = time.perf_counter()
fit_time_ms = 1000.0 * (t1_fit - t0_fit)

n_features_rocket = X_train_trans.shape[1]
# Learned weights plus intercepts of the linear head.
n_linear_params = clf.coef_.size + clf.intercept_.size

print(
    f"\nMultiRocket fit + linear head fit time: {fit_time_ms:.3f} ms\n"
    f"MultiRocket transformed feature dimension: {n_features_rocket}\n"
    f"Approximate number of linear head parameters: {n_linear_params:,}"
)

# ----------------------------------------------------------------------
# 5) Utility: latency measurement for MultiRocket + linear head
# ----------------------------------------------------------------------
def measure_multirocket_latency(
    transformer: "MultiRocket",
    classifier,
    X_query: np.ndarray,
    batch_size: int,
    n_warmup: int = 20,
    n_runs: int = 100,
):
    """
    Measure transformer.transform() + classifier.predict() latency on CPU.

    Parameters
    ----------
    transformer : fitted object exposing ``transform(X)``.
    classifier : fitted object exposing ``predict(features)``.
    X_query : np.ndarray with shape (batch_size, n_channels, n_timepoints).
    batch_size : number of windows in ``X_query``; callers divide the returned
        per-batch latencies by this value to get per-window latency.
    n_warmup : number of untimed calls executed before measuring.
    n_runs : number of timed calls; must be at least 1.

    Returns
    -------
    dict with per-batch latency statistics in milliseconds (p50, p90, mean,
    population std) plus ``n_runs`` and ``batch_size``.

    Raises
    ------
    ValueError
        If ``batch_size`` does not equal ``len(X_query)`` (per-window latency
        would otherwise be silently wrong), or if ``n_runs`` is smaller than 1
        (no samples to summarise).
    """
    n_query = len(X_query)
    if n_query != batch_size:
        raise ValueError(
            f"batch_size={batch_size} does not match the number of query windows ({n_query})"
        )
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    # Warm-up runs (excluded from stats)
    for _ in range(n_warmup):
        X_feat = transformer.transform(X_query)
        classifier.predict(X_feat)

    # Timed runs: each sample covers exactly one transform + predict round trip.
    times_ms = []
    for _ in range(n_runs):
        t0 = time.perf_counter()
        X_feat = transformer.transform(X_query)
        classifier.predict(X_feat)
        t1 = time.perf_counter()
        times_ms.append((t1 - t0) * 1000.0)

    times_ms = np.array(times_ms, dtype=float)
    stats = {
        "batch_latency_p50_ms": float(np.percentile(times_ms, 50)),
        "batch_latency_p90_ms": float(np.percentile(times_ms, 90)),
        "batch_latency_mean_ms": float(times_ms.mean()),
        "batch_latency_std_ms":  float(times_ms.std()),
        "n_runs": int(n_runs),
        "batch_size": int(batch_size),
    }
    return stats

# ----------------------------------------------------------------------
# 6) Build a synthetic query batch and run the benchmark
# ----------------------------------------------------------------------
# One synthetic query batch in the same layout and dtype as the training data.
X_query_np = np.random.randn(BATCH_SIZE, N_CHANNELS, WINDOW_SAMPLES).astype(np.float32)
print(f"\nQuery batch array shape: {X_query_np.shape}")

stats = measure_multirocket_latency(
    multirocket,
    clf,
    X_query=X_query_np,
    batch_size=BATCH_SIZE,
    n_warmup=N_WARMUP,
    n_runs=N_RUNS,
)

# Millisecond entries get four decimals; the remaining entries are printed as-is.
print("\nCPU latency stats for MultiRocket + linear head (per batch):")
for stat_name, stat_value in stats.items():
    shown = f"{stat_value:.4f}" if stat_name.endswith("_ms") else f"{stat_value}"
    print(f"  {stat_name}: {shown}")

# One batch element is one HAR window, hence per-window = per-batch / batch_size.
per_window_p50, per_window_p90, per_window_mean = (
    stats[f"batch_latency_{which}_ms"] / stats["batch_size"]
    for which in ("p50", "p90", "mean")
)

print("\nApproximate CPU latency per window (HAR sample):")
print(f"  window_latency_p50_ms  ≈ {per_window_p50:.4f}")
print(f"  window_latency_p90_ms  ≈ {per_window_p90:.4f}")
print(f"  window_latency_mean_ms ≈ {per_window_mean:.4f}")

# Optional: real-time factor = processing time per window / duration of the window.
window_ms = WINDOW_SECONDS * 1000.0
rt_factor_p50 = per_window_p50 / window_ms
rt_factor_mean = per_window_mean / window_ms

print("\nReal-time factor (MultiRocket, CPU):")
print(f"  p50  RTF ≈ {rt_factor_p50:.4f} (CPU time / wall-clock window length)")
print(f"  mean RTF ≈ {rt_factor_mean:.4f}")

print("\n[MultiRocket CPU inference latency benchmark (aeon, single-core) completed]")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
mizani 0.13.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.
umap-learn 0.5.9.post2 requires scikit-learn>=1.6, but you have scikit-learn 1.4.2 which is incompatible.
plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.
cuml-cu12 25.10.0 requires numba<0.62.0a0,>=0.60.0, but you have numba 0.59.1 which is incompati