In [2]:
# ================================================================
# Step 19: GPU Energy Consumption (NVML Integration Method)
# mJ per 3-second window (treat each inference sample as one window)
# ================================================================
!nvidia-smi
!pip -q install pynvml

import os, json, math, time, multiprocessing as mp, pathlib, sys, subprocess
import numpy as np
import pandas as pd

# ---------------- GPU synchronization (CuPy -> PyTorch) ----------------
def gpu_sync():
    """Block until pending GPU work has finished (best effort).

    CuPy is tried first, then PyTorch. Any failure in either backend (the
    package cannot be imported, or the synchronize call raises) is swallowed
    on purpose so the helper works with whichever library is available.
    """
    def _sync_cupy():
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()

    def _sync_torch():
        import torch
        if not torch.cuda.is_available():
            return
        torch.cuda.synchronize()

    for backend_sync in (_sync_cupy, _sync_torch):
        try:
            backend_sync()
        except Exception:
            # Deliberately ignored: each backend is optional.
            continue

# ---------------- NVML sampling subprocess ----------------
import pynvml

def nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Sample NVML power (mW) every `interval` seconds and push (t_abs, mW).

    Used in this file as the target of a separate ``mp.Process``.

    Args:
        stop_event: event object; sampling ends once it is set.
        q: queue receiving ``(time.perf_counter(), power_mW)`` tuples.
        dev_index: NVML device index to query.
        interval: pause between two consecutive readings, in seconds.
    """
    import time

    import pynvml

    pynvml.nvmlInit()
    try:
        # The handle lookup lives inside the try so NVML is shut down even
        # when `dev_index` is invalid and the lookup raises.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            t = time.perf_counter()
            p_mw = pynvml.nvmlDeviceGetPowerUsage(h)
            q.put((t, p_mw))
            # Waiting on the event (instead of sleeping) paces the loop the
            # same way but returns as soon as a stop is requested.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def integrate_energy_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] -> mJ.

    Args:
        samples: iterable of ``(timestamp_s, power_mW)`` pairs, in any order.
        t0: start of the integration window (same clock as the timestamps).
        t1: end of the integration window.

    Returns:
        Energy in mJ (mW * s) as a float; 0.0 when `samples` is empty.

    The window edges are added as synthetic points whose power is linearly
    interpolated from the trace. ``np.interp`` clamps outside the sampled
    range, so an edge lying before the first / after the last sample takes
    that sample's power.
    """
    if not samples:
        return 0.0
    ordered = sorted(samples, key=lambda s: s[0])
    ts = np.array([t for t, _ in ordered], dtype=np.float64)
    ps = np.array([p for _, p in ordered], dtype=np.float64)

    inside = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[inside], ps[inside]
    if ts_w.size == 0 or ts_w[0] > t0:
        ts_w = np.insert(ts_w, 0, t0)
        ps_w = np.insert(ps_w, 0, np.interp(t0, ts, ps))
    if ts_w[-1] < t1:
        ts_w = np.append(ts_w, t1)
        ps_w = np.append(ps_w, np.interp(t1, ts, ps))

    # Explicit trapezoid rule: np.trapz is deprecated since NumPy 2.0 (renamed
    # np.trapezoid), so spelling the sum out works on every NumPy version.
    return float(np.sum(0.5 * (ps_w[1:] + ps_w[:-1]) * np.diff(ts_w)))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Return mean idle power (mW) and power trace.

    Starts `nvml_sampler` in a child process, waits `duration_s` seconds
    without launching any work, then averages the collected trace.

    Args:
        duration_s: how long to sample, in seconds.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path; when given the raw trace is written there.

    Returns:
        ``(P_idle_mW, samples)`` where `samples` is the time-sorted list of
        ``(t_abs_s, power_mW)`` tuples.

    Raises:
        RuntimeError: if the sampler produced no samples.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        # Always stop the sampler, even if the wait is interrupted.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not proc.is_alive():
                    break
        proc.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples: raise RuntimeError("NVML did not return idle samples.")
    samples = sorted(samples, key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_idle_s  = max(1e-9, t1 - t0)  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=2000):
    """Pick a repeat count so that `repeats` calls of `run_once` last ~target_s.

    One call is timed between two GPU synchronizations; the repeat count is
    ``ceil(target_s / elapsed)`` clipped to ``[min_rep, max_rep]``.
    """
    gpu_sync()
    started = time.perf_counter()
    run_once()
    gpu_sync()
    finished = time.perf_counter()

    # Floor the duration so an extremely fast call cannot blow up the division.
    elapsed = max(1e-4, finished - started)
    wanted = int(math.ceil(target_s / elapsed))
    return int(np.clip(wanted, min_rep, max_rep))

def measure_mJ_per_window_core(run_once, n_windows_per_call, repeats, P_idle_mW,
                               dev_index=0, interval=0.02, save_csv=None):
    """
    Concurrent NVML sampling while `run_once()` is executed `repeats` times.
    Normalize energy by total windows -> mJ per window; also report ms per window.

    Args:
        run_once: zero-argument callable performing one inference pass.
        n_windows_per_call: number of windows processed by one `run_once()`.
        repeats: how many times `run_once()` is called inside the timed region.
        P_idle_mW: idle power (mW); ``P_idle_mW * T_total_s`` is subtracted
            from the integrated energy.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path for the raw power trace.

    Returns:
        dict with per-window energy/latency, throughput and the raw totals.
        ``mJ_per_window`` is clamped at 0.0 when the measured energy falls
        below the idle estimate.

    Raises:
        RuntimeError: if the sampler produced no samples.

    NOTE(review): the window [t0, t1] is taken with time.perf_counter() in
    this process while sample timestamps come from the child process; this
    assumes both read the same clock -- confirm on the target platform.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        gpu_sync()
        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        gpu_sync()
        t1 = time.perf_counter()
    finally:
        # Stop the sampler even when run_once() raises, otherwise the child
        # process would keep running forever.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not proc.is_alive():
                    break
        proc.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples: raise RuntimeError("NVML did not return active samples.")

    E_total_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, t1 - t0)
    E_idle_mJ  = P_idle_mW * T_total_s
    n_windows  = max(1, repeats * n_windows_per_call)

    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)

    return {
        "mJ_per_window": max(0.0, (E_total_mJ - E_idle_mJ) / n_windows),
        "ms_per_window": (T_total_s / n_windows) * 1e3,
        "throughput_windows_per_s": n_windows / T_total_s,
        "n_windows": n_windows,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1,
    }

def measure_per_window_with_bootstrap(name, run_once, n_windows_per_call, repeats, n_runs=5, n_boot=1000):
    """Repeat and bootstrap the per-window mean with 95% CI.

    Runs `measure_mJ_per_window_core` `n_runs` times, resamples the per-run
    ``mJ_per_window`` values `n_boot` times (fixed seed 123) and reports the
    2.5 / 97.5 percentiles of the resampled means.

    Reads the module-level `P_idle_mW`, which must be set before calling.

    Side effects: writes ``logs/power_trace_{name}_run{i}.csv`` for each run
    and ``logs/energy_{name}.json``; prints progress and the final result.

    Raises:
        ValueError: if `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    # Make sure the output directory exists so the function does not depend
    # on the caller having created it first.
    pathlib.Path("logs").mkdir(exist_ok=True)

    results = []
    for i in range(n_runs):
        print(f"[Measure] {name} - run {i+1}/{n_runs} ...")
        res = measure_mJ_per_window_core(
            run_once, n_windows_per_call, repeats,
            P_idle_mW=P_idle_mW, dev_index=0, interval=0.02,
            save_csv=f"logs/power_trace_{name}_run{i+1}.csv"
        )
        results.append(res)

    mJs = np.array([r["mJ_per_window"] for r in results], dtype=np.float64)
    mss = np.array([r["ms_per_window"] for r in results], dtype=np.float64)

    # Bootstrap over runs: resample the run-level values with replacement.
    rng = np.random.default_rng(123)
    boots = [float(np.mean(mJs[rng.integers(0, len(mJs), size=len(mJs))])) for _ in range(n_boot)]
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])

    summary = {
        "model": name,
        "mean_mJ_per_window": float(np.mean(mJs)),
        "ci95_low_mJ": float(ci_low),
        "ci95_high_mJ": float(ci_high),
        "mean_ms_per_window": float(np.mean(mss)),
        "runs": results,
    }
    with open(f"logs/energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_window']:.3f} mJ per window "
          f"(95% CI [{summary['ci95_low_mJ']:.3f}, {summary['ci95_high_mJ']:.3f}]); "
          f"{summary['mean_ms_per_window']:.3f} ms per window")
    return summary

# ---------------- Build synthetic data + cuML models (unchanged model hyperparams) ----------------
# Generates a synthetic tabular classification set, copies it to the GPU as
# CuPy arrays and fits the two cuML models whose inference energy is measured.
print("\n[Info] Preparing data and models (cuML KNN / RandomForest on GPU)")
import cupy as cp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from cuml.neighbors import KNeighborsClassifier as cuKNN
from cuml.ensemble import RandomForestClassifier as cuRF

# If each sample corresponds to one 3 s window, the per-window metric equals mJ/sample.
WINDOW_SAMPLES = 150     # for reporting clarity only
FS = 50.0                # samples per second, so WINDOW_SAMPLES / FS is in seconds
WINDOW_SECONDS = WINDOW_SAMPLES / FS  # 3.0

# 30000 samples x 64 features, 8 classes; 5000 samples are held out as the test set.
X, y = make_classification(n_samples=30000, n_features=64, n_informative=48,
                           n_redundant=0, n_classes=8, random_state=7)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=5000, random_state=42)

# Move features (float32) and labels (int32) to the GPU once, before any timing.
Xtr = cp.asarray(X_tr, dtype=cp.float32); ytr = cp.asarray(y_tr.astype(np.int32))
Xte = cp.asarray(X_te, dtype=cp.float32); yte = cp.asarray(y_te.astype(np.int32))
print(f"Train: {Xtr.shape}, Test: {Xte.shape} (treat 1 sample ≡ 1 window of ~{WINDOW_SECONDS:.1f}s)")

# Fit both models; gpu_sync() makes sure training has finished before moving on.
knn = cuKNN(n_neighbors=5, algorithm="brute", metric="euclidean")
knn.fit(Xtr, ytr); gpu_sync()

rf = cuRF(n_estimators=100, max_depth=16, n_bins=128, bootstrap=True, n_streams=8)
rf.fit(Xtr, ytr); gpu_sync()

def _run_knn_once():
    """Run one KNN prediction over the whole test set and wait for the GPU."""
    knn.predict(Xte)  # predictions are discarded; only the work matters
    gpu_sync()

def _run_rf_once():
    """Run one RandomForest prediction over the whole test set and wait for the GPU."""
    rf.predict(Xte)  # predictions are discarded; only the work matters
    gpu_sync()

# Measurement protocol for the cuML models: warm-up -> idle baseline ->
# repeat calibration -> per-model measurement -> summary CSV.
print("\n[Info] Warming up ...")
for _ in range(30):
    _run_knn_once(); _run_rf_once()
gpu_sync()

pathlib.Path("logs").mkdir(exist_ok=True); pathlib.Path("figures").mkdir(exist_ok=True)

# NOTE(review): the idle baseline is sampled right after the warm-up loop, so
# the GPU may not have settled back to a true idle state yet -- confirm this
# is the intended baseline.
print("\n[Info] Sampling idle power (20 s) ...")
P_idle_mW, idle_trace = sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02,
                                             save_csv="logs/power_idle_trace.csv")
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# Choose repeat counts so that each timed region lasts roughly 8 s.
rep_knn = calibrate_repeats(_run_knn_once, target_s=8.0, min_rep=5, max_rep=2000)
rep_rf  = calibrate_repeats(_run_rf_once,  target_s=8.0, min_rep=5, max_rep=2000)
print(f"[Info] KNN repeats = {rep_knn}, RF repeats = {rep_rf}")

# One predict() call covers the whole test set, i.e. Xte.shape[0] windows.
print("\n[Info] Measuring KNN (per-window) ...")
sum_knn = measure_per_window_with_bootstrap(
    name="knn_cuml_per_window",
    run_once=_run_knn_once,
    n_windows_per_call=int(Xte.shape[0]),
    repeats=rep_knn,
    n_runs=5, n_boot=1000
)

print("\n[Info] Measuring RandomForest (per-window) ...")
sum_rf = measure_per_window_with_bootstrap(
    name="rf_cuml_per_window",
    run_once=_run_rf_once,
    n_windows_per_call=int(Xte.shape[0]),
    repeats=rep_rf,
    n_runs=5, n_boot=1000
)

# One summary row per model, written to CSV and echoed to the cell output.
df_sum = pd.DataFrame([
    {"model":"KNN (cuML)", "mJ_per_3s_window_mean":sum_knn["mean_mJ_per_window"],
     "CI95_low_mJ":sum_knn["ci95_low_mJ"], "CI95_high_mJ":sum_knn["ci95_high_mJ"],
     "ms_per_3s_window_mean":sum_knn["mean_ms_per_window"], "window_seconds": WINDOW_SECONDS},
    {"model":"RandomForest (cuML)", "mJ_per_3s_window_mean":sum_rf["mean_mJ_per_window"],
     "CI95_low_mJ":sum_rf["ci95_low_mJ"], "CI95_high_mJ":sum_rf["ci95_high_mJ"],
     "ms_per_3s_window_mean":sum_rf["mean_ms_per_window"], "window_seconds": WINDOW_SECONDS},
])
df_sum.to_csv("logs/energy_summary_per_3s_window.csv", index=False)
print("\n=== Energy measurement (per 3-second window) completed ===")
print(df_sum)
# List the files produced by this cell.
print("\nLog files:")
print("- logs/power_idle_trace.csv")
print("- logs/power_trace_knn_cuml_per_window_run*.csv")
print("- logs/power_trace_rf_cuml_per_window_run*.csv")
print("- logs/energy_knn_cuml_per_window.json")
print("- logs/energy_rf_cuml_per_window.json")
print("- logs/energy_summary_per_3s_window.csv")

Fri Nov 21 17:18:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P0             27W /   70W |     334MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!pip -q install sktime

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# ================================================================
# Step 19b (FIXED): MiniROCKET / MultiROCKET + GPU Linear Head
# NVML-based GPU inference energy — mJ per 3-second window
# (CPU feature extraction is excluded from GPU energy accounting)
# ================================================================

# Optional: system check and NVML dependency
!nvidia-smi
!pip -q install pynvml sktime

import os, json, time, math, pathlib, warnings, multiprocessing as mp
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ---------------- NVML utilities (per-window reporting) ----------------
import pynvml

def gpu_sync():
    """Wait for outstanding CUDA work in CuPy and PyTorch, whichever is usable.

    Each backend is attempted independently; import errors and runtime errors
    are intentionally ignored so a missing library never breaks the caller.
    """
    def _sync_cupy():
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()

    def _sync_torch():
        import torch
        if not torch.cuda.is_available():
            return
        torch.cuda.synchronize()

    for backend_sync in (_sync_cupy, _sync_torch):
        try:
            backend_sync()
        except Exception:
            # Deliberately ignored: each backend is optional.
            continue

def _nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Sample NVML power (mW) every `interval` seconds and push (t_abs, mW).

    Used in this file as the target of a separate ``mp.Process``.

    Args:
        stop_event: event object; sampling ends once it is set.
        q: queue receiving ``(time.perf_counter(), power_mW)`` tuples.
        dev_index: NVML device index to query.
        interval: pause between two consecutive readings, in seconds.
    """
    import time

    import pynvml

    pynvml.nvmlInit()
    try:
        # The handle lookup lives inside the try so NVML is shut down even
        # when `dev_index` is invalid and the lookup raises.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            q.put((time.perf_counter(), pynvml.nvmlDeviceGetPowerUsage(h)))
            # Waiting on the event (instead of sleeping) paces the loop the
            # same way but returns as soon as a stop is requested.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def _integrate_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] -> mJ.

    Args:
        samples: iterable of ``(timestamp_s, power_mW)`` pairs, in any order.
        t0: start of the integration window (same clock as the timestamps).
        t1: end of the integration window.

    Returns:
        Energy in mJ (mW * s) as a float; 0.0 when `samples` is empty.

    The window edges are added as synthetic points whose power is linearly
    interpolated from the trace. ``np.interp`` clamps outside the sampled
    range, so an edge lying before the first / after the last sample takes
    that sample's power.
    """
    if not samples:
        return 0.0
    ordered = sorted(samples, key=lambda s: s[0])
    ts = np.array([t for t, _ in ordered], dtype=np.float64)
    ps = np.array([p for _, p in ordered], dtype=np.float64)

    inside = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[inside], ps[inside]
    if ts_w.size == 0 or ts_w[0] > t0:
        ts_w = np.insert(ts_w, 0, t0)
        ps_w = np.insert(ps_w, 0, np.interp(t0, ts, ps))
    if ts_w[-1] < t1:
        ts_w = np.append(ts_w, t1)
        ps_w = np.append(ps_w, np.interp(t1, ts, ps))

    # Explicit trapezoid rule: np.trapz is deprecated since NumPy 2.0 (renamed
    # np.trapezoid), so spelling the sum out works on every NumPy version.
    return float(np.sum(0.5 * (ps_w[1:] + ps_w[:-1]) * np.diff(ts_w)))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Measure mean idle power (mW); optionally save the trace.

    Starts `_nvml_sampler` in a child process, waits `duration_s` seconds
    without launching any work, then averages the collected trace.

    Args:
        duration_s: how long to sample, in seconds.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path; when given the raw trace is written there.

    Returns:
        ``(P_idle_mW, samples)`` where `samples` is the time-sorted list of
        ``(t_abs_s, power_mW)`` tuples.

    Raises:
        RuntimeError: if the sampler produced no samples.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval)); p.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        # Always stop the sampler, even if the wait is interrupted.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not p.is_alive():
                    break
        p.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples:
        raise RuntimeError("NVML did not capture idle samples.")
    samples = sorted(samples, key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = _integrate_mJ_between(samples, t0, t1)
    T_idle_s  = max(1e-9, t1 - t0)  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that `repeats` calls of `run_once` last ~target_s seconds.

    One call is timed between two GPU synchronizations; the repeat count is
    ``ceil(target_s / elapsed)`` clipped to ``[min_rep, max_rep]``.
    """
    gpu_sync()
    started = time.perf_counter()
    run_once()
    gpu_sync()
    finished = time.perf_counter()

    # Floor the duration so an extremely fast call cannot blow up the division.
    elapsed = max(1e-4, finished - started)
    wanted = int(math.ceil(target_s / elapsed))
    return int(np.clip(wanted, min_rep, max_rep))

def measure_mJ_per_window(run_once, n_windows_per_call, repeats, P_idle_mW,
                          dev_index=0, interval=0.02, save_csv=None):
    """Concurrent NVML sampling while running `run_once()`; return per-window energy & latency.

    Args:
        run_once: zero-argument callable performing one inference pass.
        n_windows_per_call: number of windows processed by one `run_once()`.
        repeats: how many times `run_once()` is called inside the timed region.
        P_idle_mW: idle power (mW); ``P_idle_mW * T_total_s`` is subtracted
            from the integrated energy.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path for the raw power trace.

    Returns:
        dict with per-window energy/latency, throughput and the raw totals.
        ``mJ_per_window`` is clamped at 0.0 when the measured energy falls
        below the idle estimate.

    Raises:
        RuntimeError: if the sampler produced no samples.

    NOTE(review): the window [t0, t1] is taken with time.perf_counter() in
    this process while sample timestamps come from the child process; this
    assumes both read the same clock -- confirm on the target platform.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval)); p.start()

    samples = []
    try:
        gpu_sync(); t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        gpu_sync(); t1 = time.perf_counter()
    finally:
        # Stop the sampler even when run_once() raises, otherwise the child
        # process would keep running forever.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not p.is_alive():
                    break
        p.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples: raise RuntimeError("NVML did not capture active samples.")
    E_total_mJ = _integrate_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, t1 - t0)
    E_idle_mJ  = P_idle_mW * T_total_s
    n_windows  = max(1, repeats * n_windows_per_call)
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return {
        "mJ_per_window": max(0.0, (E_total_mJ - E_idle_mJ) / n_windows),
        "ms_per_window": (T_total_s / n_windows) * 1e3,
        "throughput_windows_per_s": n_windows / T_total_s,
        "n_windows": n_windows, "repeats": repeats,
        "T_total_s": T_total_s, "E_total_mJ": E_total_mJ, "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW, "t0_abs": t0, "t1_abs": t1
    }

def measure_with_bootstrap_per_window(name, run_once, n_windows, repeats, n_runs=5, n_boot=1000):
    """Repeat measurements and bootstrap the per-window mean with 95% CI.

    Runs `measure_mJ_per_window` `n_runs` times, resamples the per-run
    ``mJ_per_window`` values `n_boot` times (fixed seed 123) and reports the
    2.5 / 97.5 percentiles of the resampled means.

    Reads the module-level `P_idle_mW`, which must be set before calling.

    Side effects: writes ``logs/power_trace_{name}_run{i}.csv`` for each run
    and ``logs/energy_{name}.json``; prints progress and the final result.

    Raises:
        ValueError: if `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    # Make sure the output directory exists so the function does not depend
    # on the caller having created it first.
    pathlib.Path("logs").mkdir(exist_ok=True)

    results = []
    for i in range(n_runs):
        print(f"[Measure] {name} run {i+1}/{n_runs} ...")
        res = measure_mJ_per_window(
            run_once, n_windows, repeats, P_idle_mW=P_idle_mW,
            dev_index=0, interval=0.02, save_csv=f"logs/power_trace_{name}_run{i+1}.csv"
        )
        results.append(res)
    mJ = np.array([r["mJ_per_window"] for r in results], dtype=np.float64)
    ms = np.array([r["ms_per_window"] for r in results], dtype=np.float64)

    # Bootstrap over runs: resample the run-level values with replacement.
    rng = np.random.default_rng(123)
    boots_mJ = [float(np.mean(mJ[rng.integers(0, len(mJ), size=len(mJ))])) for _ in range(n_boot)]
    ci_lo, ci_hi = np.percentile(boots_mJ, [2.5, 97.5])
    summary = {
        "model": name,
        "mean_mJ_per_window": float(mJ.mean()),
        "ci95_low_mJ": float(ci_lo),
        "ci95_high_mJ": float(ci_hi),
        "mean_ms_per_window": float(ms.mean()),
        "runs": results
    }
    with open(f"logs/energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_window']:.4f} mJ per window "
          f"(95% CI [{summary['ci95_low_mJ']:.4f}, {summary['ci95_high_mJ']:.4f}]); "
          f"{summary['mean_ms_per_window']:.3f} ms per window")
    return summary

# ---------------- Synthetic time-series + MiniROCKET / MultiROCKET (CPU) ----------------
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sktime.transformations.panel.rocket import MiniRocketMultivariate, MultiRocketMultivariate

def make_synth_ts(n_samples=2800, n_channels=6, length=150, n_classes=8, seed=2025):
    """Build a synthetic labelled time-series set: X is (N, C, L), y is (N,).

    X starts as float32 standard-normal noise and y as uniformly drawn int32
    labels. For every class present, a class-specific sinusoid is added to
    channel 0 and a cosine at half that frequency to channel 1.
    """
    gen = np.random.default_rng(seed)
    noise_shape = (n_samples, n_channels, length)
    X = gen.normal(0, 1, size=noise_shape).astype(np.float32)
    y = gen.integers(0, n_classes, size=n_samples).astype(np.int32)

    phase = np.linspace(0, 2*np.pi, length, dtype=np.float32)
    for label in range(n_classes):
        members = (y == label)
        if not members.any():
            continue  # no sample drew this class
        freq = 1.0 + 0.2 * label
        X[members, 0] += 0.6 * np.sin(freq * phase)
        X[members, 1] += 0.4 * np.cos(0.5 * freq * phase)
    return X, y

# Build the synthetic windows, hold out 800 of them (stratified) as the test set.
X_ts, y_ts = make_synth_ts()
X_tr_ts, X_te_ts, y_tr_ts, y_te_ts = train_test_split(X_ts, y_ts, test_size=800, random_state=7, stratify=y_ts)
print(f"[Info] Time-series windows: Train={X_tr_ts.shape}, Test={X_te_ts.shape}, Classes={len(np.unique(y_tr_ts))}")

# MiniROCKET: the transform is fed float64 C-contiguous arrays; the resulting
# features are cast to float32 for the linear head.
print("\n[MiniROCKET] feature transform (CPU, float64+C-contiguous) ...")
mini = MiniRocketMultivariate(random_state=42)
Xtr_mini = mini.fit_transform(np.ascontiguousarray(X_tr_ts, dtype=np.float64)).astype(np.float32, copy=False)
Xte_mini = mini.transform(    np.ascontiguousarray(X_te_ts, dtype=np.float64)).astype(np.float32, copy=False)
print(f"[MiniROCKET] features: train {Xtr_mini.shape}, test {Xte_mini.shape}")

# Ridge head trained on the features; test accuracy is printed as a sanity check.
print("[MiniROCKET] train Ridge (CPU) ...")
rc_mini = RidgeClassifier(alpha=1.0).fit(Xtr_mini, y_tr_ts)
acc_mini = (rc_mini.predict(Xte_mini) == y_te_ts).mean()
print(f"[Check] MiniROCKET Ridge Acc: {acc_mini:.3f}")

# MultiROCKET: same pipeline with the MultiRocket transform.
print("\n[MultiROCKET] feature transform (CPU, float64+C-contiguous) ...")
multi = MultiRocketMultivariate(random_state=123)
Xtr_multi = multi.fit_transform(np.ascontiguousarray(X_tr_ts, dtype=np.float64)).astype(np.float32, copy=False)
Xte_multi = multi.transform(    np.ascontiguousarray(X_te_ts, dtype=np.float64)).astype(np.float32, copy=False)
print(f"[MultiROCKET] features: train {Xtr_multi.shape}, test {Xte_multi.shape}")

print("[MultiROCKET] train Ridge (CPU) ...")
rc_multi = RidgeClassifier(alpha=1.0).fit(Xtr_multi, y_tr_ts)
acc_multi = (rc_multi.predict(Xte_multi) == y_te_ts).mean()
print(f"[Check] MultiROCKET Ridge Acc: {acc_multi:.3f}")

# ---------------- Move linear heads to GPU; define run_once() ----------------
import cupy as cp

def make_gpu_linear_runner(X_cpu: np.ndarray, clf: RidgeClassifier, batch: int = 512):
    """
    Copy the test features and the classifier's linear head to the GPU and
    return ``(run_once, N)``.

    X_cpu: (N, D) float32, clf.coef_: (C, D), clf.intercept_: (C,)
    run_once(): scores the whole test set on the GPU in mini-batches of
    `batch` rows, takes the arg-max class, synchronizes, and returns nothing.
    N is the number of rows (windows) handled per call.
    """
    n_rows = X_cpu.shape[0]
    feats_gpu = cp.asarray(X_cpu, dtype=cp.float32)                    # (N, D)
    weights_gpu = cp.asarray(clf.coef_.T.astype(np.float32))           # (D, C)
    bias_gpu = cp.asarray(clf.intercept_.astype(np.float32))           # (C,)

    def run_once():
        for start in range(0, n_rows, batch):
            # Slicing clamps at the end of the array, so the last chunk may be shorter.
            chunk = feats_gpu[start:start + batch]
            scores = chunk.dot(weights_gpu) + bias_gpu                 # (B, C)
            cp.argmax(scores, axis=1)                                  # result discarded
        gpu_sync()

    return run_once, n_rows  # N windows per call

# Build one GPU runner per model; each returns (run_once, windows per call).
run_mini,  N_mini  = make_gpu_linear_runner(Xte_mini,  rc_mini,  batch=1024)
run_multi, N_multi = make_gpu_linear_runner(Xte_multi, rc_multi, batch=512)

# ---------------- Idle power (measure once or reuse) ----------------
Path("logs").mkdir(exist_ok=True)
# Reuse P_idle_mW if it already exists in this session; otherwise measure it here.
# NOTE(review): when the value is reused, logs/power_idle_trace_rocket.csv is
# not written by this cell although it is listed under "Artifacts" below.
if 'P_idle_mW' not in globals():
    print("\n[Info] Sampling idle power for 20 s ...")
    P_idle_mW, _idle = sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02,
                                            save_csv="logs/power_idle_trace_rocket.csv")
    print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")
else:
    print(f"\n[Info] Reusing idle power P_idle_mW = {P_idle_mW:.1f} mW")

# ---------------- Warm-up & repeats (target ≥ 8 s) ----------------
print("\n[Warmup] GPU linear heads warmup ...")
for _ in range(30): run_mini(); run_multi()
gpu_sync()

# Choose repeat counts so that each timed region lasts roughly 8 s.
rep_mini  = calibrate_repeats(run_mini,  target_s=8.0, min_rep=3, max_rep=5000)
rep_multi = calibrate_repeats(run_multi, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats: MiniROCKET={rep_mini}, MultiROCKET={rep_multi}")

# ---------------- NVML measurement (per-window) + bootstrap CI ----------------
sum_mini  = measure_with_bootstrap_per_window("minirocket_gpu_linear_per_window",  run_mini,  N_mini,  rep_mini,  n_runs=5, n_boot=1000)
sum_multi = measure_with_bootstrap_per_window("multirocket_gpu_linear_per_window", run_multi, N_multi, rep_multi, n_runs=5, n_boot=1000)

# ---------------- Summary (mJ/ms per 3-second window) ----------------
WINDOW_SECONDS = 150 / 50.0  # 3.0, for explicit reporting when using 150-sample windows @ 50 Hz
# One summary row per model, including the Ridge test accuracy computed earlier.
df_sum = pd.DataFrame([
    {"model":"MiniROCKET (GPU linear head)",
     "mJ_per_3s_window_mean": sum_mini["mean_mJ_per_window"],
     "ci95_low_mJ":           sum_mini["ci95_low_mJ"],
     "ci95_high_mJ":          sum_mini["ci95_high_mJ"],
     "ms_per_3s_window_mean": sum_mini["mean_ms_per_window"],
     "window_seconds":        WINDOW_SECONDS,
     "acc":                   float(acc_mini)},
    {"model":"MultiROCKET (GPU linear head)",
     "mJ_per_3s_window_mean": sum_multi["mean_mJ_per_window"],
     "ci95_low_mJ":           sum_multi["ci95_low_mJ"],
     "ci95_high_mJ":          sum_multi["ci95_high_mJ"],
     "ms_per_3s_window_mean": sum_multi["mean_ms_per_window"],
     "window_seconds":        WINDOW_SECONDS,
     "acc":                   float(acc_multi)}
])
df_sum.to_csv("logs/energy_summary_rocket_gpuhead_per_3s_window.csv", index=False)

print("\n=== Completed (ROCKET GPU linear head energy; per 3-second window) ===")
print(df_sum)
# List the files associated with this cell.
print("\nArtifacts:")
print("- logs/power_idle_trace_rocket.csv")
print("- logs/power_trace_minirocket_gpu_linear_per_window_run*.csv")
print("- logs/power_trace_multirocket_gpu_linear_per_window_run*.csv")
print("- logs/energy_minirocket_gpu_linear_per_window.json")
print("- logs/energy_multirocket_gpu_linear_per_window.json")
print("- logs/energy_summary_rocket_gpuhead_per_3s_window.csv")

Fri Nov 21 17:21:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P0             27W /   70W |     382MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
!pip -q install tsai fastai torch --upgrade

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.1/324.1 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.0/821.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.8/156.8 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.3/201.3 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.7/155.7 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.3/263.3 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
# ============ Step 19c (No-Train Hotfix · Self-contained):
# InceptionTime & TST Inference Energy Consumption (GPU, NVML)
# Per-window metric: mJ per 3-second window
# ============================================================

import os, math, time, json, pathlib, multiprocessing as mp
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

# ============================================
# 0. NVML-based GPU energy measurement helpers
#     (per-window reporting)
# ============================================

def gpu_sync():
    """Best-effort wait for pending GPU work in CuPy and PyTorch.

    Each backend is attempted independently; any exception (CuPy missing,
    CUDA call failing, ...) is intentionally ignored.
    """
    def _sync_cupy():
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()

    def _sync_torch():
        if not torch.cuda.is_available():
            return
        torch.cuda.synchronize()

    for backend_sync in (_sync_cupy, _sync_torch):
        try:
            backend_sync()
        except Exception:
            # Deliberately ignored: each backend is optional.
            continue

def nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Subprocess: periodically read NVML power (mW) and push (t_abs, power_mW).

    Args:
        stop_event: event object; sampling ends once it is set.
        q: queue receiving ``(perf_counter timestamp, power_mW)`` tuples.
        dev_index: NVML device index to query.
        interval: pause between two consecutive readings, in seconds.
    """
    import time as _time

    import pynvml

    pynvml.nvmlInit()
    try:
        # The handle lookup lives inside the try so NVML is shut down even
        # when `dev_index` is invalid and the lookup raises.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            q.put((_time.perf_counter(), pynvml.nvmlDeviceGetPowerUsage(h)))
            # Waiting on the event (instead of sleeping) paces the loop the
            # same way but returns as soon as a stop is requested.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def integrate_energy_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] -> mJ.

    Args:
        samples: iterable of ``(timestamp_s, power_mW)`` pairs, in any order.
        t0: start of the integration window (same clock as the timestamps).
        t1: end of the integration window.

    Returns:
        Energy in mJ (mW * s) as a float; 0.0 when `samples` is empty.

    The window edges are added as synthetic points whose power is linearly
    interpolated from the trace. ``np.interp`` clamps outside the sampled
    range, so an edge lying before the first / after the last sample takes
    that sample's power.
    """
    if not samples:
        return 0.0
    ordered = sorted(samples, key=lambda s: s[0])
    ts = np.array([t for t, _ in ordered], dtype=np.float64)
    ps = np.array([p for _, p in ordered], dtype=np.float64)

    inside = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[inside], ps[inside]
    if ts_w.size == 0 or ts_w[0] > t0:
        ts_w = np.insert(ts_w, 0, t0)
        ps_w = np.insert(ps_w, 0, np.interp(t0, ts, ps))
    if ts_w[-1] < t1:
        ts_w = np.append(ts_w, t1)
        ps_w = np.append(ps_w, np.interp(t1, ts, ps))

    # Explicit trapezoid rule: np.trapz is deprecated since NumPy 2.0 (renamed
    # np.trapezoid), so spelling the sum out works on every NumPy version.
    return float(np.sum(0.5 * (ps_w[1:] + ps_w[:-1]) * np.diff(ts_w)))

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Return mean idle power (mW) and the trace.

    Starts `nvml_sampler` in a child process, waits `duration_s` seconds
    without launching any work, then averages the collected trace.

    Args:
        duration_s: how long to sample, in seconds.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path; when given the raw trace is written there.

    Returns:
        ``(P_idle_mW, samples)`` where `samples` is the time-sorted list of
        ``(t_abs_s, power_mW)`` tuples.

    Raises:
        RuntimeError: if the sampler produced no samples.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        # Always stop the sampler, even if the wait is interrupted.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not proc.is_alive():
                    break
        proc.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (idle).")
    samples = sorted(samples, key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_idle_s  = max(1e-9, (t1 - t0))  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that `repeats` calls of `run_once` last ~target_s.

    One call is timed between two GPU synchronizations; the repeat count is
    ``ceil(target_s / elapsed)`` clipped to ``[min_rep, max_rep]``.
    """
    gpu_sync()
    started = time.perf_counter()
    run_once()
    gpu_sync()
    finished = time.perf_counter()

    # Floor the duration so an extremely fast call cannot blow up the division.
    elapsed = max(1e-4, (finished - started))
    wanted = int(math.ceil(target_s / elapsed))
    return int(np.clip(wanted, min_rep, max_rep))

def measure_mJ_per_window(run_once, n_windows_per_call: int, repeats: int,
                          P_idle_mW: float, dev_index=0, interval=0.02, save_csv=None):
    """Concurrent NVML sampling during repeated run_once(); normalize by window count.

    Args:
        run_once: zero-argument callable performing one inference pass.
        n_windows_per_call: number of windows processed by one `run_once()`.
        repeats: how many times `run_once()` is called inside the timed region.
        P_idle_mW: idle power (mW); ``P_idle_mW * T_total_s`` is subtracted
            from the integrated energy.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period passed to the sampler, in seconds.
        save_csv: optional path for the raw power trace.

    Returns:
        dict with per-window energy/latency, throughput and the raw totals.
        ``mJ_per_window`` is clamped at 0.0 when the measured energy falls
        below the idle estimate.

    Raises:
        RuntimeError: if the sampler produced no samples.

    NOTE(review): the window [t0, t1] is taken with time.perf_counter() in
    this process while sample timestamps come from the child process; this
    assumes both read the same clock -- confirm on the target platform.
    """
    import queue as queue_mod  # stdlib; only needed for the Empty exception

    q = mp.Queue(); stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        gpu_sync(); t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        gpu_sync(); t1 = time.perf_counter()
    finally:
        # Stop the sampler even when run_once() raises, otherwise the child
        # process would keep running forever.
        stop.set()
        # Drain BEFORE joining: a child that has put items on a Queue does not
        # exit until they are flushed to the pipe, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while True:
            try:
                samples.append(q.get(timeout=0.1))
            except queue_mod.Empty:
                if not proc.is_alive():
                    break
        proc.join()
        # Pick up anything flushed between the last timeout and child exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue_mod.Empty:
                break

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (active).")

    E_total_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, (t1 - t0))
    E_idle_mJ  = P_idle_mW * T_total_s
    n_windows  = max(1, repeats * n_windows_per_call)

    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)

    return {
        "mJ_per_window": max(0.0, (E_total_mJ - E_idle_mJ) / n_windows),
        "ms_per_window": (T_total_s / n_windows) * 1e3,
        "throughput_windows_per_s": n_windows / T_total_s,
        "n_windows": n_windows, "repeats": repeats,
        "T_total_s": T_total_s, "E_total_mJ": E_total_mJ, "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW, "t0_abs": t0, "t1_abs": t1
    }

def measure_with_bootstrap_per_window(name, run_once, n_windows, repeats, n_runs=5, n_boot=1000):
    """Run the per-window measurement `n_runs` times and bootstrap a 95% CI.

    Uses the module-level `P_idle_mW` as the idle baseline. Each run's power
    trace goes to ``logs/power_trace_{name}_run{k}.csv`` and the summary to
    ``logs/energy_{name}.json``. Returns the summary dict.
    """
    pathlib.Path("logs").mkdir(exist_ok=True)

    run_records = []
    for run_no in range(1, n_runs + 1):
        print(f"[Measure] {name} - run {run_no}/{n_runs} ...")
        run_records.append(
            measure_mJ_per_window(
                run_once,
                n_windows_per_call=n_windows,
                repeats=repeats,
                P_idle_mW=P_idle_mW,
                dev_index=0,
                interval=0.02,
                save_csv=f"logs/power_trace_{name}_run{run_no}.csv",
            )
        )

    energy = np.array([rec["mJ_per_window"] for rec in run_records], dtype=np.float64)
    latency = np.array([rec["ms_per_window"] for rec in run_records], dtype=np.float64)

    # Resample the run-level energies with replacement (fixed seed).
    rng = np.random.default_rng(123)
    n_obs = len(energy)
    resampled_means = []
    for _ in range(n_boot):
        picks = rng.integers(0, n_obs, size=n_obs)
        resampled_means.append(float(np.mean(energy[picks])))
    lower, upper = np.percentile(resampled_means, [2.5, 97.5])

    summary = {
        "model": name,
        "mean_mJ_per_window": float(energy.mean()),
        "ci95_low_mJ": float(lower),
        "ci95_high_mJ": float(upper),
        "mean_ms_per_window": float(latency.mean()),
        "runs": run_records
    }
    with open(f"logs/energy_{name}.json", "w") as fh:
        fh.write(json.dumps(summary, indent=2))

    mean_txt = f"{summary['mean_mJ_per_window']:.3f}"
    ci_txt = f"[{summary['ci95_low_mJ']:.3f}, {summary['ci95_high_mJ']:.3f}]"
    ms_txt = f"{summary['mean_ms_per_window']:.3f}"
    print(f"[Result] {name}: {mean_txt} mJ per window (95% CI {ci_txt}); {ms_txt} ms per window")
    return summary

# ==============================
# 1. Synthetic 3D IMU-like data
# ==============================

def make_synth_ts(n_samples=2800, n_channels=6, length=150, n_classes=8, seed=2025):
    """Generate a labelled synthetic multichannel time-series dataset.

    Every sample starts as unit Gaussian noise.  A class-dependent sine is
    added to channel 0 and a class-dependent cosine to channel 1, so at least
    two channels are required whenever any sample exists.

    Returns:
        (X, y): X is float32 of shape (n_samples, n_channels, length);
        y is int64 of shape (n_samples,) with labels in [0, n_classes).
    """
    gen = np.random.default_rng(seed)
    # Draw order (noise first, labels second) fixes the output for a given seed.
    data = gen.normal(0, 1, size=(n_samples, n_channels, length)).astype(np.float32)
    labels = gen.integers(0, n_classes, size=n_samples).astype(np.int64)

    phase = np.linspace(0, 2 * np.pi, length, dtype=np.float32)
    for cls in range(n_classes):
        rows = np.flatnonzero(labels == cls)
        if rows.size == 0:
            continue
        # Each class gets its own base frequency: 1.0, 1.2, 1.4, ...
        freq = 1.0 + 0.2 * cls
        data[rows, 0, :] += 0.6 * np.sin(freq * phase)
        data[rows, 1, :] += 0.4 * np.cos(0.5 * freq * phase)
    return data, labels

# Build the synthetic dataset with the generator's default settings.
X_ts, y_ts = make_synth_ts()
# Hold out 800 windows for testing, stratified on the labels with a fixed seed.
# NOTE(review): train_test_split is presumably scikit-learn's; its import is not visible here.
X_tr_ts, X_te_ts, y_tr_ts, y_te_ts = train_test_split(X_ts, y_ts, test_size=800, random_state=7, stratify=y_ts)
print(f"[Info] Train={X_tr_ts.shape}, Test={X_te_ts.shape}, Classes={len(np.unique(y_tr_ts))}")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Let cuDNN benchmark convolution algorithms and pick the fastest for the shapes it sees.
torch.backends.cudnn.benchmark = True
# Make sure the directory used for power traces and summaries exists.
pathlib.Path("logs").mkdir(exist_ok=True)

# ===========================
# 2. Lightweight InceptionTime
# ===========================

class InceptionBlock1d(nn.Module):
    """Inception-style 1D block: three parallel convolutions plus a pooled branch.

    The input optionally passes through a 1x1 bottleneck, then through three
    convolutions with kernel sizes `ks`.  A fourth branch max-pools the raw
    input and applies a 1x1 convolution.  The four outputs are concatenated
    along the channel axis, batch-normalised and passed through ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Total number of output channels (must be >= 4, one per branch).
        bottleneck: Width of the 1x1 bottleneck; skipped when `in_ch == 1`.
        ks: Three kernel sizes.  They should be odd: with padding `k // 2`
            only odd kernels keep the sequence length, which the channel
            concatenation requires.

    Raises:
        ValueError: If `out_ch` is smaller than the number of branches.
    """
    def __init__(self, in_ch, out_ch, bottleneck=32, ks=(9, 19, 39)):
        super().__init__()
        if out_ch < 4:
            raise ValueError(f"out_ch must be >= 4 (one channel per branch), got {out_ch}")
        use_bottleneck = in_ch > 1
        bott = bottleneck if use_bottleneck else in_ch
        # Split out_ch over the four branches.  The pool branch absorbs the
        # remainder so the concatenation always has exactly out_ch channels;
        # previously any out_ch not divisible by 4 crashed in BatchNorm.
        branch_ch = out_ch // 4
        pool_ch = out_ch - 3 * branch_ch
        self.bottleneck = nn.Conv1d(in_ch, bott, 1, bias=False) if use_bottleneck else nn.Identity()
        self.conv1 = nn.Conv1d(bott, branch_ch, ks[0], padding=ks[0] // 2, bias=False)
        self.conv2 = nn.Conv1d(bott, branch_ch, ks[1], padding=ks[1] // 2, bias=False)
        self.conv3 = nn.Conv1d(bott, branch_ch, ks[2], padding=ks[2] // 2, bias=False)
        self.pool = nn.MaxPool1d(3, stride=1, padding=1)
        self.conv_pool = nn.Conv1d(in_ch, pool_ch, 1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Map (B, in_ch, L) to (B, out_ch, L)."""
        z = self.bottleneck(x)
        # The pool branch reads the raw input, not the bottleneck output.
        y = torch.cat([self.conv1(z), self.conv2(z), self.conv3(z), self.conv_pool(self.pool(x))], dim=1)
        return self.act(self.bn(y))

class InceptionResNetModule(nn.Module):
    """Three stacked InceptionBlock1d layers wrapped in a residual connection.

    When the channel count changes, the shortcut is a 1x1 convolution followed
    by batch norm; otherwise it is the identity.  Extra keyword arguments are
    forwarded to every InceptionBlock1d.
    """
    def __init__(self, in_ch, out_ch, **kw):
        super().__init__()
        self.b1 = InceptionBlock1d(in_ch, out_ch, **kw)
        self.b2 = InceptionBlock1d(out_ch, out_ch, **kw)
        self.b3 = InceptionBlock1d(out_ch, out_ch, **kw)
        if in_ch == out_ch:
            self.short = nn.Identity()
        else:
            # Project the input so it can be added to the block output.
            self.short = nn.Sequential(
                nn.Conv1d(in_ch, out_ch, 1, bias=False),
                nn.BatchNorm1d(out_ch),
            )
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        shortcut = self.short(x)
        out = x
        for block in (self.b1, self.b2, self.b3):
            out = block(out)
        return self.act(out + shortcut)

class InceptionTimeSmall(nn.Module):
    """Small InceptionTime classifier: residual inception modules, GAP, linear head.

    Args:
        c_in: Number of input channels.
        n_classes: Number of output logits.
        nb_filters: Channel width of every residual module.
        n_modules: How many residual modules to stack.
        bottleneck: Bottleneck width forwarded to each module.
    """
    def __init__(self, c_in, n_classes, nb_filters=64, n_modules=2, bottleneck=32):
        super().__init__()
        # Only the first module sees the raw channel count; the rest keep nb_filters.
        widths = [c_in if k == 0 else nb_filters for k in range(n_modules)]
        self.features = nn.Sequential(*[
            InceptionResNetModule(width, nb_filters, bottleneck=bottleneck)
            for width in widths
        ])
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(nb_filters, n_classes)

    def forward(self, x):
        feats = self.features(x)
        # Average over time, then drop the length-1 axis left by the pooling.
        pooled = self.gap(feats).squeeze(-1)
        return self.fc(pooled)

# ================
# 3. Lightweight TST
# ================

class PatchEmbed1D(nn.Module):
    """Embed a (B, C, L) series as patch tokens using one strided Conv1d.

    Args:
        c_in: Number of input channels.
        d_model: Embedding size of each token.
        patch_len: Number of time steps covered by one patch.
        stride: Hop between patches; a falsy value (None or 0) means
            non-overlapping patches, i.e. the hop equals `patch_len`.
    """
    def __init__(self, c_in, d_model=128, patch_len=10, stride=None):
        super().__init__()
        hop = stride if stride else patch_len
        self.proj = nn.Conv1d(c_in, d_model, kernel_size=patch_len, stride=hop, bias=False)

    def forward(self, x):  # (B,C,L) -> (B,N,D)
        tokens = self.proj(x)
        # Conv1d yields (B, D, N); swap the last two axes to get token-major layout.
        return tokens.transpose(1, 2)

class SinPosEncoding(nn.Module):
    """Add a fixed sinusoidal positional encoding to a (B, N, D) token tensor.

    The table is rebuilt on every call from the input's own N and D, so it
    adapts to any sequence length.  Even feature indices hold sines and odd
    indices hold cosines.
    """
    def __init__(self, d_model):
        super().__init__()
        # Stored for reference only; forward() takes D from the input shape.
        self.d = d_model

    def forward(self, x):
        """Return `x` plus the positional table broadcast over the batch axis."""
        B, N, D = x.shape
        device = x.device
        pos = torch.arange(N, device=device).unsqueeze(1)
        div = torch.exp(torch.arange(0, D, 2, device=device) * (-math.log(10000.0) / D))
        angles = pos * div  # (N, ceil(D / 2))
        pe = torch.zeros(N, D, device=device)
        pe[:, 0::2] = torch.sin(angles)
        # For odd D there is one fewer cosine column than sine columns, so
        # trim the angle table; otherwise the assignment fails on shape.
        pe[:, 1::2] = torch.cos(angles[:, : D // 2])
        if x.is_floating_point():
            # Keep the caller's precision: a float32 table would otherwise
            # silently promote half / bfloat16 activations to float32.
            pe = pe.to(x.dtype)
        return x + pe.unsqueeze(0)

class TSTSmall(nn.Module):
    """Small patch-based Transformer classifier for (B, C, L) time series.

    Pipeline: patch embedding -> sinusoidal positions -> pre-norm Transformer
    encoder -> mean over tokens -> linear head.
    """
    def __init__(self, c_in, n_classes, d_model=128, n_heads=4, depth=2, dim_ff=256, patch_len=10, dropout=0.1):
        super().__init__()
        self.embed = PatchEmbed1D(c_in, d_model, patch_len)
        layer_cfg = dict(
            d_model=d_model, nhead=n_heads, dim_feedforward=dim_ff,
            dropout=dropout, batch_first=True, norm_first=True, activation='gelu',
        )
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(**layer_cfg), num_layers=depth)
        self.pos = SinPosEncoding(d_model)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, x):
        tokens = self.pos(self.embed(x))
        encoded = self.encoder(tokens)
        # Average the token representations into one vector per sample.
        return self.head(encoded.mean(dim=1))

# ==========================
# 4. Build models (random)
# ==========================

# Model dimensions are read straight from the training split.
c_in = X_tr_ts.shape[1]
n_classes = int(np.unique(y_tr_ts).size)

# Randomly initialised networks (no training). InceptionTime is constructed
# first, as before, so parameter initialisation consumes the RNG in the same order.
it_model = InceptionTimeSmall(
    c_in=c_in, n_classes=n_classes,
    nb_filters=64, n_modules=2, bottleneck=32,
)
tst_model = TSTSmall(
    c_in=c_in, n_classes=n_classes,
    d_model=128, n_heads=4, depth=2, dim_ff=256, patch_len=10,
)

# ==========================
# 5. Inference runner helpers
# ==========================

def make_runner(model, X_np, bs=512):
    """Build a zero-argument inference callable over a fixed input set.

    The model is moved to the module-level `device` and put in eval mode, and
    `X_np` is copied there once as float32, so each call only runs forward
    passes in batches of `bs` and then waits for the device to finish.

    Returns:
        (run_once, N): the callable and the number of windows it processes per call.
    """
    model = model.to(device).eval()
    X_gpu = torch.as_tensor(X_np, dtype=torch.float32, device=device)
    N = X_np.shape[0]

    def run_once():
        with torch.no_grad():
            for start in range(0, N, bs):
                # Slicing clamps at N, so the final batch may be shorter.
                model(X_gpu[start:start + bs])
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            gpu_sync()

    return run_once, N  # N windows per call

# Bind each model to the held-out test windows; N_it / N_tst are the number of
# windows processed by one call of the corresponding runner.
run_it,  N_it  = make_runner(it_model,  X_te_ts, bs=512)
run_tst, N_tst = make_runner(tst_model, X_te_ts, bs=512)

# ===============
# 6. Idle power
# ===============

# Measure idle power only once per notebook session: if an earlier cell already
# left P_idle_mW in the global namespace, that value is reused as-is.
# NOTE(review): sample_idle_power_mW is defined outside this section; it is
# presumed to return (mean idle power in mW, raw samples) — confirm.
if 'P_idle_mW' not in globals():
    print("\n[Info] Sampling idle power for 20 s ...")
    P_idle_mW, _idle = sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02,
                                            save_csv="logs/power_idle_trace_deepts_hotfix.csv")
    print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")
else:
    print(f"\n[Info] Reusing previously measured idle power P_idle_mW = {P_idle_mW:.1f} mW")

# ===============
# 7. Warm-up
# ===============

# Run both models 20 times before measuring, so one-off start-up work (e.g. the
# cuDNN algorithm search enabled above) is not counted in the measurements.
print("\n[Warmup] warmup ...")
for _ in range(20):
    run_it(); run_tst()
# Wait for any outstanding GPU work before calibration starts.
gpu_sync()

# =====================================
# 8. Adaptive window + NVML measurement
# =====================================

# Choose a repeat count per model.
# NOTE(review): calibrate_repeats is defined outside this section; it presumably
# picks a count in [min_rep, max_rep] so one measurement lasts about target_s seconds — confirm.
rep_it  = calibrate_repeats(run_it,  target_s=8.0, min_rep=3, max_rep=5000)
rep_tst = calibrate_repeats(run_tst, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats: InceptionTime={rep_it}, TST={rep_tst}")

# Five measurement runs per model, summarised with a 1000-resample bootstrap CI.
sum_it  = measure_with_bootstrap_per_window("inceptiontime_torch_eager_per_window", run_it,  N_it,  rep_it,  n_runs=5, n_boot=1000)
sum_tst = measure_with_bootstrap_per_window("tst_torch_eager_per_window",          run_tst, N_tst, rep_tst, n_runs=5, n_boot=1000)

# =========
# 9. Summary (mJ/ms per 3-second window)
# =========

WINDOW_SECONDS = 150 / 50.0  # if you use 150-sample windows @ 50 Hz

# One row per model; the column order follows the key order of the dict below.
df = pd.DataFrame([
    {
        "model": label,
        "mJ_per_3s_window_mean": stats["mean_mJ_per_window"],
        "ci95_low_mJ": stats["ci95_low_mJ"],
        "ci95_high_mJ": stats["ci95_high_mJ"],
        "ms_per_3s_window_mean": stats["mean_ms_per_window"],
        "window_seconds": WINDOW_SECONDS,
    }
    for label, stats in (
        ("InceptionTime (eager, no-train)", sum_it),
        ("TST (eager, no-train)", sum_tst),
    )
])
pathlib.Path("logs").mkdir(exist_ok=True)
df.to_csv("logs/energy_summary_deepts_eager_per_3s_window.csv", index=False)

print("\n=== Completed (InceptionTime & TST inference energy — per 3-second window) ===")
print(df)
print("\nLog files:")
# Emit the list of artefacts in one call; the printed text is unchanged.
print("\n".join([
    "- logs/power_idle_trace_deepts_hotfix.csv",
    "- logs/power_trace_inceptiontime_torch_eager_per_window_run*.csv",
    "- logs/power_trace_tst_torch_eager_per_window_run*.csv",
    "- logs/energy_inceptiontime_torch_eager_per_window.json",
    "- logs/energy_tst_torch_eager_per_window.json",
    "- logs/energy_summary_deepts_eager_per_3s_window.csv",
]))

[Info] Train=(2000, 6, 150), Test=(800, 6, 150), Classes=8

[Info] Reusing previously measured idle power P_idle_mW = 28367.3 mW

[Warmup] warmup ...
[Info] repeats: InceptionTime=227, TST=1441
[Measure] inceptiontime_torch_eager_per_window - run 1/5 ...
[Measure] inceptiontime_torch_eager_per_window - run 2/5 ...
[Measure] inceptiontime_torch_eager_per_window - run 3/5 ...
[Measure] inceptiontime_torch_eager_per_window - run 4/5 ...
[Measure] inceptiontime_torch_eager_per_window - run 5/5 ...
[Result] inceptiontime_torch_eager_per_window: 1.902 mJ per window (95% CI [1.881, 1.924]); 0.046 ms per window
[Measure] tst_torch_eager_per_window - run 1/5 ...
[Measure] tst_torch_eager_per_window - run 2/5 ...
[Measure] tst_torch_eager_per_window - run 3/5 ...
[Measure] tst_torch_eager_per_window - run 4/5 ...
[Measure] tst_torch_eager_per_window - run 5/5 ...
[Result] tst_torch_eager_per_window: 0.262 mJ per window (95% CI [0.259, 0.265]); 0.006 ms per window

=== Completed (InceptionTime & 