In [1]:
# Inspect available GPU devices
!nvidia-smi

# Install RAPIDS (cuML) + CuPy + NVML
# Note: Official RAPIDS wheels are hosted on NVIDIA's PyPI, so --extra-index-url is required
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases.
# NOTE(review): the -cu12 / cuda12x variants presumably target a CUDA 12 runtime — confirm
# against the driver reported by nvidia-smi above.
!pip -q install --extra-index-url=https://pypi.nvidia.com \
    rmm-cu12 cudf-cu12 cuml-cu12 \
    cupy-cuda12x pynvml scikit-learn

Mon Nov 17 15:41:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# ================ Step 19: GPU Energy Consumption (NVML Integration Method) ================
import os, json, math, time, multiprocessing as mp, pathlib, sys, subprocess
import numpy as np
import pandas as pd

# Multi-backend GPU synchronization (prioritize CuPy, then PyTorch)
def gpu_sync():
    """
    Best-effort barrier that waits for outstanding GPU work.

    CuPy is tried first, then PyTorch. Each backend is optional: any exception
    raised while importing or synchronizing it is swallowed, so the call is a
    silent no-op when a backend is missing or unusable.
    """
    def _sync_cupy():
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()

    def _sync_torch():
        import torch
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Backends are independent: a failure in one must not prevent trying the next.
    for sync_backend in (_sync_cupy, _sync_torch):
        try:
            sync_backend()
        except Exception:
            continue

# --------- NVML sampling subprocess ---------
def nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """
    Sampling loop intended to run in a separate process.

    Until ``stop_event`` is set, repeatedly reads the NVML power draw of device
    ``dev_index`` and pushes ``(t_absolute, power_mW)`` tuples onto ``q``.
    ``t_absolute`` comes from ``time.perf_counter()`` so the caller can compare it
    with its own ``perf_counter()`` timestamps.

    Parameters
    ----------
    stop_event : event-like object exposing ``is_set()`` and ``wait(timeout)``.
    q          : queue-like object exposing ``put()``.
    dev_index  : NVML device index to sample.
    interval   : pause between two reads, in seconds.
    """
    import pynvml, time
    pynvml.nvmlInit()
    try:
        # The handle lookup lives inside the try so NVML is shut down even when
        # dev_index is invalid and the lookup raises.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            t = time.perf_counter()
            p_mw = pynvml.nvmlDeviceGetPowerUsage(h)  # milliwatt
            q.put((t, p_mw))
            # wait() paces the loop like sleep() but returns as soon as a stop is requested.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def integrate_energy_mJ_between(samples, t0, t1):
    """
    Trapezoidal integration of the power–time curve over [t0, t1].

    Parameters
    ----------
    samples : list of (t, mW) tuples; t is an absolute perf_counter timestamp.
              The list does not need to be sorted.
    t0, t1  : window boundaries on the same clock as the sample timestamps.

    Returns
    -------
    float : energy in mJ (mW * s). 0.0 when ``samples`` is empty.

    Window endpoints that do not coincide with a sample are filled in with
    ``np.interp``, which holds the first/last power value constant outside the
    sampled time range.
    """
    if not samples:
        return 0.0
    ordered = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t, _ in ordered], dtype=np.float64)
    ps = np.array([p for _, p in ordered], dtype=np.float64)  # mW

    # Keep the samples inside the window, then pin both endpoints by interpolation.
    inside = (ts >= t0) & (ts <= t1)
    ts_win = ts[inside]
    ps_win = ps[inside]
    if ts_win.size == 0 or ts_win[0] > t0:
        ts_win = np.insert(ts_win, 0, t0)
        ps_win = np.insert(ps_win, 0, np.interp(t0, ts, ps))
    if ts_win[-1] < t1:
        ts_win = np.append(ts_win, t1)
        ps_win = np.append(ps_win, np.interp(t1, ts, ps))

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer the new
    # name and fall back to the old one on NumPy 1.x.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    # Integration: mW * s = mJ
    return float(trapezoid(ps_win, ts_win))

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """
    Sample GPU power for ``duration_s`` seconds and return the time-averaged power.

    Parameters
    ----------
    duration_s : how long the sampler process runs, in seconds.
    dev_index  : NVML device index passed to the sampler.
    interval   : sampling period passed to the sampler, in seconds.
    save_csv   : optional path; when given, the raw trace is written there as CSV
                 with columns ``t_abs_s`` and ``power_mW``.

    Returns
    -------
    (P_idle_mW, samples) : mean power in mW (integrated energy divided by the span
    between first and last sample) and the time-sorted list of (t, mW) tuples.

    Raises
    ------
    RuntimeError if the sampler delivered no samples.
    """
    import queue as _queue

    q = mp.Queue()
    stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()
    try:
        time.sleep(duration_s)
    finally:
        stop.set()

    # Drain the queue BEFORE joining: a child that has put items on a
    # multiprocessing.Queue cannot terminate until they are flushed to the pipe, so
    # join-then-read can deadlock once the trace outgrows the pipe buffer.
    samples = []
    while True:
        sampler_alive = proc.is_alive()
        try:
            while True:
                samples.append(q.get(timeout=0.05))
        except _queue.Empty:
            pass
        # If the child had already exited before this sweep, nothing more can arrive.
        if not sampler_alive:
            break
    proc.join()

    if not samples:
        raise RuntimeError("NVML did not return any power samples.")
    samples = sorted(samples, key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_idle_s  = max(1e-9, (t1 - t0))  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        df = pd.DataFrame(samples, columns=["t_abs_s","power_mW"])
        df.to_csv(save_csv, index=False)
    return P_idle_mW, samples

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=2000):
    """
    Estimate how many calls of ``run_once`` fill a window of ``target_s`` seconds.

    A single synchronized call is timed; the repeat count is ``target_s`` divided by
    that duration, rounded up and clamped to ``[min_rep, max_rep]``. Because of the
    upper clamp, very fast workloads may end up with a window shorter than ``target_s``.
    """
    gpu_sync()
    started = time.perf_counter()
    run_once()
    gpu_sync()
    finished = time.perf_counter()

    # Floor the single-call duration at 0.1 ms so the division below stays bounded.
    single_call_s = max(1e-4, finished - started)
    wanted = int(math.ceil(target_s / single_call_s))
    clamped = np.clip(wanted, min_rep, max_rep)
    return int(clamped)

def measure_mJ_per_inference(run_once, n_items_per_call:int, repeats:int,
                             P_idle_mW:float, dev_index=0, interval=0.02,
                             save_csv=None):
    """
    Run ``run_once()`` ``repeats`` times while a sampler process records GPU power,
    then convert the trace into energy per inference.

    Parameters
    ----------
    run_once         : zero-argument callable executing one batch of inferences.
    n_items_per_call : number of inferences performed by one ``run_once()`` call.
    repeats          : how many times ``run_once()`` is called inside the window.
    P_idle_mW        : idle baseline in mW; ``P_idle_mW * T_total_s`` is subtracted
                       from the integrated energy.
    dev_index        : NVML device index passed to the sampler.
    interval         : sampling period passed to the sampler, in seconds.
    save_csv         : optional path for the raw power trace (CSV).

    Returns
    -------
    dict with ``mJ_per_inf`` (clamped at 0), ``ms_per_inf``, ``throughput_inf_per_s``,
    ``n_inferences``, ``repeats``, ``T_total_s``, ``E_total_mJ``, ``E_idle_mJ``,
    ``P_idle_mW`` and the window timestamps ``t0_abs`` / ``t1_abs``.

    Raises
    ------
    RuntimeError if the sampler delivered no samples. Exceptions raised by
    ``run_once`` propagate after the sampler has been stopped.
    """
    import queue as _queue

    q = mp.Queue()
    stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        gpu_sync()
        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        gpu_sync()
        t1 = time.perf_counter()
    finally:
        # Always stop the sampler, even when run_once() raises, so no child is leaked.
        stop.set()
        # Drain before join: the child cannot exit until its queued items are
        # consumed, so join-then-read can deadlock on long traces.
        while True:
            sampler_alive = proc.is_alive()
            try:
                while True:
                    samples.append(q.get(timeout=0.05))
            except _queue.Empty:
                pass
            if not sampler_alive:
                break
        proc.join()

    if not samples:
        raise RuntimeError("NVML did not return any power samples (active phase).")

    E_total_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, (t1 - t0))
    E_idle_mJ  = P_idle_mW * T_total_s
    n_inf      = max(1, repeats * n_items_per_call)
    # Negative net energy (active draw below the idle baseline) is reported as 0.
    mJ_per_inf = max(0.0, (E_total_mJ - E_idle_mJ) / n_inf)
    ms_per_inf = (T_total_s / n_inf) * 1e3
    throughput = n_inf / T_total_s

    if save_csv:
        df = pd.DataFrame(samples, columns=["t_abs_s","power_mW"])
        df.to_csv(save_csv, index=False)

    return {
        "mJ_per_inf": mJ_per_inf,
        "ms_per_inf": ms_per_inf,
        "throughput_inf_per_s": throughput,
        "n_inferences": n_inf,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1,
    }

# ========== Construct synthetic data and train cuML models ==========
print("\n[Info] Preparing data and models (cuML KNN / RandomForest on GPU)")
import cupy as cp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from cuml.neighbors import KNeighborsClassifier as cuKNN
from cuml.ensemble import RandomForestClassifier as cuRF

# Generate a medium-scale dataset (can be increased if needed):
# 30000 samples, 64 features, 8 classes, built on the CPU with scikit-learn.
X, y = make_classification(n_samples=30000, n_features=64, n_informative=48,
                           n_redundant=0, n_classes=8, random_state=7)
# test_size is an absolute count here: 5000 rows are held out, 25000 remain for training.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=5000, random_state=42)

# Move data to GPU (float32 features, int32 labels)
Xtr = cp.asarray(X_tr, dtype=cp.float32); ytr = cp.asarray(y_tr.astype(np.int32))
Xte = cp.asarray(X_te, dtype=cp.float32); yte = cp.asarray(y_te.astype(np.int32))

print(f"Train: {Xtr.shape}, Test: {Xte.shape}")

# KNN (brute-force, the most stable GPU path)
knn = cuKNN(n_neighbors=5, algorithm="brute", metric="euclidean")
knn.fit(Xtr, ytr)
gpu_sync()  # wait for the fit to finish before moving on

# Random Forest (GPU)
# NOTE(review): no random_state is passed, so the fitted forest may differ between runs.
rf = cuRF(n_estimators=100, max_depth=16, n_bins=128, bootstrap=True, n_streams=8)
rf.fit(Xtr, ytr)
gpu_sync()

# ========== Warm-up phase ==========
def _run_knn_once():
    """Run one full KNN prediction over the GPU test set and wait for it to finish."""
    knn.predict(Xte)
    gpu_sync()

def _run_rf_once():
    """Run one full RandomForest prediction over the GPU test set and wait for it to finish."""
    rf.predict(Xte)
    gpu_sync()

# Run both predictors 30 times before any measurement so one-time start-up costs
# are not attributed to the measured window.
print("\n[Info] Warming up ...")
for _ in range(30):
    _run_knn_once()
    _run_rf_once()
gpu_sync()

# ========== Create log directories ==========
pathlib.Path("logs").mkdir(exist_ok=True)
pathlib.Path("figures").mkdir(exist_ok=True)

# ========== Idle-power baseline ==========
# P_idle_mW is read as a global by measure_with_bootstrap, so it must exist before
# any measurement call.
# NOTE(review): the baseline is sampled immediately after the warm-up loop, with no
# settle delay — confirm the GPU has actually returned to idle by then.
print("\n[Info] Sampling idle power (20 s) ...")
P_idle_mW, idle_trace = sample_idle_power_mW(
    duration_s=20.0, dev_index=0, interval=0.02, save_csv="logs/power_idle_trace.csv"
)
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# ========== Adaptively determine repeats (target effective window of about 8 s) ==========
# calibrate_repeats clamps the result to [min_rep, max_rep]; when the max_rep cap is
# hit (very fast workloads) the actual window is shorter than 8 s.
rep_knn = calibrate_repeats(_run_knn_once, target_s=8.0, min_rep=5, max_rep=2000)
rep_rf  = calibrate_repeats(_run_rf_once,  target_s=8.0, min_rep=5, max_rep=2000)
print(f"[Info] KNN repeats = {rep_knn}, RF repeats = {rep_rf}")

# ========== Multiple repeated measurements with bootstrap confidence intervals ==========
def measure_with_bootstrap(name, run_once, n_items, repeats, n_runs=5, n_boot=1000,
                           idle_power_mW=None, dev_index=0, interval=0.02):
    """
    Repeat the energy measurement ``n_runs`` times and summarize mJ/inference with a
    percentile-bootstrap 95% confidence interval of the mean.

    Parameters
    ----------
    name          : label used in log messages and output file names.
    run_once      : zero-argument callable executing one batch of inferences.
    n_items       : number of inferences performed by one ``run_once()`` call.
    repeats       : calls of ``run_once()`` per measurement run.
    n_runs        : number of independent measurement runs (must be >= 1).
    n_boot        : number of bootstrap resamples of the per-run values.
    idle_power_mW : idle baseline in mW. ``None`` (default) falls back to the
                    notebook-level global ``P_idle_mW``, as before.
    dev_index     : NVML device index forwarded to the measurement.
    interval      : sampling period forwarded to the measurement, in seconds.

    Returns
    -------
    dict with ``model``, ``mean_mJ_per_inf``, ``ci95_low``, ``ci95_high`` and the
    per-run result dicts under ``runs``.

    Side effects
    ------------
    Writes ``logs/power_trace_{name}_run{i}.csv`` (via the measurement) and
    ``logs/energy_{name}.json``; creates ``logs/`` if it is missing.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if idle_power_mW is None:
        # Backward-compatible default: the baseline measured earlier in the notebook.
        idle_power_mW = P_idle_mW
    # Both the trace CSVs and the JSON summary live under logs/.
    pathlib.Path("logs").mkdir(exist_ok=True)

    results = []
    for i in range(n_runs):
        print(f"[Measure] {name} - run {i+1}/{n_runs} ...")
        res = measure_mJ_per_inference(
            run_once, n_items_per_call=n_items, repeats=repeats,
            P_idle_mW=idle_power_mW, dev_index=dev_index, interval=interval,
            save_csv=f"logs/power_trace_{name}_run{i+1}.csv"
        )
        results.append(res)

    mJs = np.array([r["mJ_per_inf"] for r in results], dtype=np.float64)
    # Percentile bootstrap confidence interval (simple and robust).
    # Fixed seed: the interval is reproducible for a given set of per-run values.
    rng = np.random.default_rng(123)
    boots = []
    for _ in range(n_boot):
        idx = rng.integers(0, len(mJs), size=len(mJs))
        boots.append(float(np.mean(mJs[idx])))
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])
    summary = {
        "model": name,
        "mean_mJ_per_inf": float(np.mean(mJs)),
        "ci95_low": float(ci_low),
        "ci95_high": float(ci_high),
        "runs": results,
    }
    with open(f"logs/energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_inf']:.3f} mJ/inf "
          f"(95% CI [{summary['ci95_low']:.3f}, {summary['ci95_high']:.3f}])")
    return summary

# Each run_once call predicts the whole test set, so one call counts as
# Xte.shape[0] inferences.
print("\n[Info] Starting KNN measurement ...")
sum_knn = measure_with_bootstrap(
    name="knn_cuml",
    run_once=_run_knn_once,
    n_items=Xte.shape[0],
    repeats=rep_knn,
    n_runs=5, n_boot=1000
)

print("\n[Info] Starting RandomForest measurement ...")
sum_rf = measure_with_bootstrap(
    name="rf_cuml",
    run_once=_run_rf_once,
    n_items=Xte.shape[0],
    repeats=rep_rf,
    n_runs=5, n_boot=1000
)

# Summary table: one row per model with the mean mJ/inference and its bootstrap 95% CI
df_sum = pd.DataFrame([
    {"model":"KNN (cuML)", "mJ/inf_mean":sum_knn["mean_mJ_per_inf"],
     "CI95_low":sum_knn["ci95_low"], "CI95_high":sum_knn["ci95_high"]},
    {"model":"RandomForest (cuML)", "mJ/inf_mean":sum_rf["mean_mJ_per_inf"],
     "CI95_low":sum_rf["ci95_low"], "CI95_high":sum_rf["ci95_high"]},
])
df_sum.to_csv("logs/energy_summary.csv", index=False)
print("\n=== Energy measurement completed ===")
print(df_sum)
# List the artefacts written by this cell and by the helpers it called.
print("\nLog files:")
print("- logs/power_idle_trace.csv")
print("- logs/power_trace_knn_cuml_run*.csv")
print("- logs/power_trace_rf_cuml_run*.csv")
print("- logs/energy_knn_cuml.json")
print("- logs/energy_rf_cuml.json")
print("- logs/energy_summary.csv")


[Info] Preparing data and models (cuML KNN / RandomForest on GPU)
Train: (25000, 64), Test: (5000, 64)


  import pynvml  # type: ignore[import]



[Info] Warming up ...

[Info] Sampling idle power (20 s) ...
[Info] Mean idle power ~ 27726.4 mW
[Info] KNN repeats = 251, RF repeats = 2000

[Info] Starting KNN measurement ...
[Measure] knn_cuml - run 1/5 ...


  E_mJ = float(np.trapz(ps_win, ts_win))


[Measure] knn_cuml - run 2/5 ...
[Measure] knn_cuml - run 3/5 ...
[Measure] knn_cuml - run 4/5 ...
[Measure] knn_cuml - run 5/5 ...
[Result] knn_cuml: 0.117 mJ/inf (95% CI [0.115, 0.118])

[Info] Starting RandomForest measurement ...
[Measure] rf_cuml - run 1/5 ...
[Measure] rf_cuml - run 2/5 ...
[Measure] rf_cuml - run 3/5 ...
[Measure] rf_cuml - run 4/5 ...
[Measure] rf_cuml - run 5/5 ...
[Result] rf_cuml: 0.009 mJ/inf (95% CI [0.008, 0.009])

=== Energy measurement completed ===
                 model  mJ/inf_mean  CI95_low  CI95_high
0           KNN (cuML)     0.116521  0.115064   0.117602
1  RandomForest (cuML)     0.008627  0.008096   0.009497

Log files:
- logs/power_idle_trace.csv
- logs/power_trace_knn_cuml_run*.csv
- logs/power_trace_rf_cuml_run*.csv
- logs/energy_knn_cuml.json
- logs/energy_rf_cuml.json
- logs/energy_summary.csv


In [3]:
# sktime provides the MiniRocketMultivariate / MultiRocketMultivariate transformers
# imported in the next cell.
# NOTE(review): version is unpinned; a re-run may install a different release.
!pip -q install sktime

In [5]:
# ===== Step 19b (FIXED): MiniROCKET / MultiROCKET + GPU Linear Head Energy Consumption =====
import numpy as np, cupy as cp, pandas as pd, pathlib, json, time
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sktime.transformations.panel.rocket import MiniRocketMultivariate, MultiRocketMultivariate

# Reuse the NVML utility functions defined in the previous cell:
# gpu_sync, sample_idle_power_mW, calibrate_repeats,
# measure_mJ_per_inference, measure_with_bootstrap
assert 'gpu_sync' in globals(), "Please run the previous NVML utility cell first (the KNN/RF one)."

pathlib.Path("logs").mkdir(exist_ok=True)

# ---------- 1) Synthetic multivariate time-series dataset (independent from previous X_tr) ----------
def make_synth_ts(n_samples=2800, n_channels=6, length=150, n_classes=8, seed=2025):
    """
    Build a seeded synthetic multivariate time-series classification set.

    Returns
    -------
    X : float32 array of shape (n_samples, n_channels, length) — unit Gaussian noise
        with a class-dependent sine added to channel 0 and cosine added to channel 1.
    y : int32 array of shape (n_samples,) with labels drawn uniformly from
        ``0 .. n_classes - 1``.
    """
    rng = np.random.default_rng(seed)
    noise_shape = (n_samples, n_channels, length)
    X = rng.normal(0, 1, size=noise_shape).astype(np.float32)
    y = rng.integers(0, n_classes, size=n_samples).astype(np.int32)

    # One period of the base waveform spans the whole window.
    phase = np.linspace(0, 2*np.pi, length, dtype=np.float32)
    for label in range(n_classes):
        members = (y == label)
        if not members.any():
            continue
        # Each class gets its own frequency so the classes are separable.
        freq = 1.0 + 0.2 * label
        X[members, 0, :] += 0.6 * np.sin(freq * phase)
        X[members, 1, :] += 0.4 * np.cos(0.5 * freq * phase)
    return X, y

# ALWAYS create a fresh TS dataset here; do NOT reuse global X_tr from the KNN/RF step
X_ts, y_ts = make_synth_ts()
# Hold out 800 windows for testing, stratified by class label.
X_tr_ts, X_te_ts, y_tr_ts, y_te_ts = train_test_split(
    X_ts, y_ts, test_size=800, random_state=7, stratify=y_ts
)
print(f"[Info] Time-series windows: Train={X_tr_ts.shape}, Test={X_te_ts.shape}, "
      f"Classes={len(np.unique(y_tr_ts))}")

# ---------- 2) MiniROCKET: CPU feature extraction (excluded from GPU energy accounting) ----------
print("\n[MiniROCKET] feature transform (CPU) ... (not part of GPU energy)")
mini = MiniRocketMultivariate(random_state=42)

# The sktime MiniROCKET/MultiROCKET transformers are fed float64, C-contiguous input
# here; the resulting features are cast back to float32 for the GPU linear head.
Xtr_mini = mini.fit_transform(
    np.ascontiguousarray(X_tr_ts, dtype=np.float64)
).astype(np.float32, copy=False)
Xte_mini = mini.transform(
    np.ascontiguousarray(X_te_ts, dtype=np.float64)
).astype(np.float32, copy=False)

print(f"[MiniROCKET] features: train {Xtr_mini.shape}, test {Xte_mini.shape}")

# Fit the linear head on the CPU; only its prediction step is later moved to the GPU.
print("[MiniROCKET] train Ridge (CPU) ...")
rc_mini = RidgeClassifier(alpha=1.0)
rc_mini.fit(Xtr_mini, y_tr_ts)
acc_mini = (rc_mini.predict(Xte_mini) == y_te_ts).mean()
print(f"[Check] MiniROCKET Ridge Acc: {acc_mini:.3f}")

# ---------- 3) MultiROCKET: CPU feature extraction (excluded from GPU energy accounting) ----------
# Same pipeline as the MiniROCKET section above, with a different transformer and seed.
print("\n[MultiROCKET] feature transform (CPU, float64+C-contiguous) ... (not part of GPU energy)")
multi = MultiRocketMultivariate(random_state=123)

Xtr_multi = multi.fit_transform(
    np.ascontiguousarray(X_tr_ts, dtype=np.float64)
).astype(np.float32, copy=False)
Xte_multi = multi.transform(
    np.ascontiguousarray(X_te_ts, dtype=np.float64)
).astype(np.float32, copy=False)

print(f"[MultiROCKET] features: train {Xtr_multi.shape}, test {Xte_multi.shape}")

print("[MultiROCKET] train Ridge (CPU) ...")
rc_multi = RidgeClassifier(alpha=1.0)
rc_multi.fit(Xtr_multi, y_tr_ts)
acc_multi = (rc_multi.predict(Xte_multi) == y_te_ts).mean()
print(f"[Check] MultiROCKET Ridge Acc: {acc_multi:.3f}")

# ---------- 4) Move the linear head to GPU and define run_once() ----------
def make_gpu_linear_runner(X_cpu: np.ndarray, clf: RidgeClassifier, batch: int = 512):
    """
    Build a closure that scores ``X_cpu`` with the classifier's linear head on the GPU.

    The features, the transposed ``clf.coef_`` and ``clf.intercept_`` are copied to
    the GPU once as float32. The returned ``run_once()`` walks over the rows in
    chunks of ``batch``, computes ``X @ W + b`` followed by an arg-max per row, and
    synchronizes before returning. Predictions are discarded; only the work matters.

    Returns
    -------
    (run_once, N) : the closure and the number of rows in ``X_cpu``.
    """
    # One-time host -> device transfer of inputs and weights
    feats_gpu = cp.asarray(X_cpu, dtype=cp.float32)                 # (N, D)
    weights_gpu = cp.asarray(clf.coef_.T.astype(np.float32))        # transposed coef_
    bias_gpu = cp.asarray(clf.intercept_.astype(np.float32))
    N = X_cpu.shape[0]

    def run_once():
        # Slicing clamps at the array end, so the last chunk may be shorter than `batch`.
        for lo in range(0, N, batch):
            scores = feats_gpu[lo:lo + batch].dot(weights_gpu) + bias_gpu
            cp.argmax(scores, axis=1)
        gpu_sync()  # make sure every kernel has finished before the caller stops its clock

    return run_once, N

# Build one GPU runner per feature set; N_* is the number of test rows each call scores.
run_mini,  N_mini  = make_gpu_linear_runner(Xte_mini,  rc_mini,  batch=1024)
run_multi, N_multi = make_gpu_linear_runner(Xte_multi, rc_multi, batch=512)

# ---------- 5) Idle power: reuse existing value if available; otherwise, measure once ----------
# measure_with_bootstrap reads the global P_idle_mW, so it must be defined either way.
# NOTE(review): a reused value was measured in an earlier cell; confirm it is still
# representative of the GPU's idle state at this point.
if 'P_idle_mW' not in globals():
    print("\n[Info] Sampling idle power for 20 s ...")
    P_idle_mW, _idle = sample_idle_power_mW(
        duration_s=20.0, dev_index=0, interval=0.02,
        save_csv="logs/power_idle_trace_rocket.csv"
    )
    print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")
else:
    print(f"\n[Info] Reusing previously measured idle power P_idle_mW = {P_idle_mW:.1f} mW")

# ---------- 6) Warm-up ----------
print("\n[Warmup] GPU linear heads warmup ...")
for _ in range(30):
    run_mini()
    run_multi()
gpu_sync()

# ---------- 7) Adaptive determination of the measurement window (target of about 8 s) ----------
# The repeat count is clamped to max_rep=5000, so very fast runners may get a
# window shorter than the 8 s target.
rep_mini  = calibrate_repeats(run_mini,  target_s=8.0, min_rep=3, max_rep=5000)
rep_multi = calibrate_repeats(run_multi, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats: MiniROCKET={rep_mini}, MultiROCKET={rep_multi}")

# ---------- 8) NVML measurement + bootstrap confidence intervals ----------
# Only the GPU linear head is measured; the ROCKET feature transforms ran on the CPU above.
sum_mini = measure_with_bootstrap(
    name="minirocket_gpu_linear",
    run_once=run_mini,
    n_items=N_mini,
    repeats=rep_mini,
    n_runs=5,
    n_boot=1000
)
sum_multi = measure_with_bootstrap(
    name="multirocket_gpu_linear",
    run_once=run_multi,
    n_items=N_multi,
    repeats=rep_multi,
    n_runs=5,
    n_boot=1000
)

# ---------- 9) Summary ----------
# One row per model: mean mJ/inference, bootstrap 95% CI, and the CPU Ridge test accuracy.
df_sum = pd.DataFrame([
    {
        "model": "MiniROCKET (GPU linear head)",
        "mJ/inf_mean": sum_mini["mean_mJ_per_inf"],
        "CI95_low":    sum_mini["ci95_low"],
        "CI95_high":   sum_mini["ci95_high"],
        "acc":         float(acc_mini),
    },
    {
        "model": "MultiROCKET (GPU linear head)",
        "mJ/inf_mean": sum_multi["mean_mJ_per_inf"],
        "CI95_low":    sum_multi["ci95_low"],
        "CI95_high":   sum_multi["ci95_high"],
        "acc":         float(acc_multi),
    },
])
df_sum.to_csv("logs/energy_summary_rocket_gpuhead.csv", index=False)

print("\n=== Completed (ROCKET GPU linear head energy; CPU feature extraction excluded from energy accounting) ===")
print(df_sum)
# List the artefacts written by this cell and by the helpers it called.
print("\nLog files:")
print("- logs/power_trace_minirocket_gpu_linear_run*.csv")
print("- logs/power_trace_multirocket_gpu_linear_run*.csv")
print("- logs/energy_minirocket_gpu_linear.json")
print("- logs/energy_multirocket_gpu_linear.json")
print("- logs/energy_summary_rocket_gpuhead.csv")

[Info] Time-series windows: Train=(2000, 6, 150), Test=(800, 6, 150), Classes=8

[MiniROCKET] feature transform (CPU) ... (not part of GPU energy)
[MiniROCKET] features: train (2000, 9996), test (800, 9996)
[MiniROCKET] train Ridge (CPU) ...
[Check] MiniROCKET Ridge Acc: 0.616

[MultiROCKET] feature transform (CPU, float64+C-contiguous) ... (not part of GPU energy)
[MultiROCKET] features: train (2000, 49728), test (800, 49728)
[MultiROCKET] train Ridge (CPU) ...




[Check] MultiROCKET Ridge Acc: 0.504

[Info] Reusing previously measured idle power P_idle_mW = 27726.4 mW

[Warmup] GPU linear heads warmup ...
[Info] repeats: MiniROCKET=5000, MultiROCKET=3695
[Measure] minirocket_gpu_linear - run 1/5 ...
[Measure] minirocket_gpu_linear - run 2/5 ...


  E_mJ = float(np.trapz(ps_win, ts_win))


[Measure] minirocket_gpu_linear - run 3/5 ...
[Measure] minirocket_gpu_linear - run 4/5 ...
[Measure] minirocket_gpu_linear - run 5/5 ...
[Result] minirocket_gpu_linear: 0.022 mJ/inf (95% CI [0.021, 0.023])
[Measure] multirocket_gpu_linear - run 1/5 ...
[Measure] multirocket_gpu_linear - run 2/5 ...
[Measure] multirocket_gpu_linear - run 3/5 ...
[Measure] multirocket_gpu_linear - run 4/5 ...
[Measure] multirocket_gpu_linear - run 5/5 ...
[Result] multirocket_gpu_linear: 0.092 mJ/inf (95% CI [0.091, 0.092])

=== Completed (ROCKET GPU linear head energy; CPU feature extraction excluded from energy accounting) ===
                           model  mJ/inf_mean  CI95_low  CI95_high      acc
0   MiniROCKET (GPU linear head)     0.021889  0.020916   0.022525  0.61625
1  MultiROCKET (GPU linear head)     0.091602  0.090517   0.092305  0.50375

Log files:
- logs/power_trace_minirocket_gpu_linear_run*.csv
- logs/power_trace_multirocket_gpu_linear_run*.csv
- logs/energy_minirocket_gpu_linear.json

In [7]:
# NOTE(review): --upgrade may replace an already-imported torch in the running kernel,
# and versions are unpinned — confirm a kernel restart is not needed after this cell.
# NOTE(review): the visible part of the next cell defines its own lightweight models
# and does not import tsai/fastai — verify these two packages are still required.
!pip -q install tsai fastai torch --upgrade

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.1/324.1 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.0/821.0 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.2/158.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
# ============ Step 19c (No-Train Hotfix · Self-contained):
# InceptionTime & TST Inference Energy Consumption (GPU, NVML) ============

import os, math, time, json, pathlib, multiprocessing as mp
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

# ============================================
# 0. NVML-based GPU energy measurement helpers
# ============================================

def gpu_sync():
    """
    Best-effort synchronization of every GPU backend in use.

    - CuPy is synchronized first (imported lazily).
    - PyTorch is synchronized afterwards when CUDA is available.

    Errors from either backend are swallowed so that a missing or unusable backend
    never interrupts the caller.
    """
    def _cupy_barrier():
        import cupy as cp
        cp.cuda.runtime.deviceSynchronize()

    def _torch_barrier():
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Each barrier is attempted independently of the other's outcome.
    for barrier in (_cupy_barrier, _torch_barrier):
        try:
            barrier()
        except Exception:
            pass


def nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """
    Subprocess body: periodically read NVML power (mW) and push
    ``(t_absolute, power_mW)`` into the queue until ``stop_event`` is set.

    ``t_absolute`` is taken from ``time.perf_counter()`` so it can be aligned with
    ``perf_counter()`` timestamps taken by the main process.

    Parameters
    ----------
    stop_event : event-like object exposing ``is_set()`` and ``wait(timeout)``.
    q          : queue-like object exposing ``put()``.
    dev_index  : NVML device index to sample.
    interval   : pause between two reads, in seconds.
    """
    import pynvml, time as _time
    pynvml.nvmlInit()
    try:
        # Resolve the handle inside the try block: if dev_index is invalid the lookup
        # raises, and NVML must still be shut down.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            t = _time.perf_counter()
            p_mw = pynvml.nvmlDeviceGetPowerUsage(h)  # milliwatt
            q.put((t, p_mw))
            # Pace the loop with wait() so a stop request ends the pause immediately.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()


def integrate_energy_mJ_between(samples, t0, t1):
    """
    Perform trapezoidal integration of the power–time curve over [t0, t1].

    samples: List[(t, mW)] (t is the absolute time returned by perf_counter);
             the list may be unsorted.
    t0, t1:  window boundaries on the same clock as the sample timestamps.

    Returns energy in mJ (0.0 for an empty sample list). Window endpoints that fall
    between samples are linearly interpolated; endpoints outside the sampled range
    take the first/last power value, as ``np.interp`` clamps at the ends.
    """
    if not samples:
        return 0.0
    samples = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t, _ in samples], dtype=np.float64)
    ps = np.array([p for _, p in samples], dtype=np.float64)  # mW

    # Select the interval and interpolate endpoints
    mask = (ts >= t0) & (ts <= t1)
    ts_win = ts[mask]
    ps_win = ps[mask]
    if ts_win.size == 0 or ts_win[0] > t0:
        p0 = np.interp(t0, ts, ps)
        ts_win = np.insert(ts_win, 0, t0)
        ps_win = np.insert(ps_win, 0, p0)
    if ts_win[-1] < t1:
        p1 = np.interp(t1, ts, ps)
        ts_win = np.append(ts_win, t1)
        ps_win = np.append(ps_win, p1)

    # np.trapz is deprecated since NumPy 2.0 (replaced by np.trapezoid); use the new
    # function when it exists and keep np.trapz as the NumPy 1.x fallback.
    _trapezoid = getattr(np, "trapezoid", None) or np.trapz
    # Integration: mW * s = mJ
    E_mJ = float(_trapezoid(ps_win, ts_win))
    return E_mJ


def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """
    Sample idle-state power and return the mean power (mW) and the full power trace.

    Parameters
    ----------
    duration_s : how long the sampler process runs, in seconds.
    dev_index  : NVML device index passed to the sampler.
    interval   : sampling period passed to the sampler, in seconds.
    save_csv   : optional CSV path for the raw trace (columns ``t_abs_s``, ``power_mW``).

    Returns
    -------
    (P_idle_mW, samples) : time-averaged power in mW and the time-sorted (t, mW) list.

    Raises
    ------
    RuntimeError if no samples were captured.
    """
    import queue as _queue

    q = mp.Queue()
    stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    try:
        time.sleep(duration_s)
    finally:
        stop.set()

    # Consume the queue before joining. A process that has put items on a
    # multiprocessing.Queue waits for them to be flushed before it terminates, so
    # joining first can deadlock once the trace no longer fits the pipe buffer.
    samples = []
    while True:
        sampler_alive = proc.is_alive()
        try:
            while True:
                samples.append(q.get(timeout=0.05))
        except _queue.Empty:
            pass
        # A sweep that started after the child exited has seen everything.
        if not sampler_alive:
            break
    proc.join()

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (idle).")

    samples = sorted(samples, key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_idle_s = max(1e-9, (t1 - t0))  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s

    if save_csv:
        df = pd.DataFrame(samples, columns=["t_abs_s", "power_mW"])
        df.to_csv(save_csv, index=False)

    return P_idle_mW, samples


def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=2000):
    """
    Derive a repeat count from one timed, synchronized call of ``run_once``.

    The count is ``ceil(target_s / duration)`` limited to ``[min_rep, max_rep]``;
    the duration is floored at 0.1 ms. When the upper limit applies, the resulting
    measurement window is shorter than ``target_s``.
    """
    gpu_sync()
    tic = time.perf_counter()
    run_once()
    gpu_sync()
    toc = time.perf_counter()

    per_call_s = max(1e-4, (toc - tic))
    needed = int(math.ceil(target_s / per_call_s))
    bounded = np.clip(needed, min_rep, max_rep)
    return int(bounded)


def measure_mJ_per_inference(run_once, n_items_per_call: int, repeats: int,
                             P_idle_mW: float, dev_index=0, interval=0.02,
                             save_csv=None):
    """
    Concurrently sample power while run_once() is executed 'repeats' times,
    perform integration with idle power subtraction, and return mJ/inf together
    with detailed statistics.

    Parameters
    ----------
    run_once         : zero-argument callable executing one batch of inferences.
    n_items_per_call : number of inferences performed per ``run_once()`` call.
    repeats          : number of ``run_once()`` calls inside the timed window.
    P_idle_mW        : idle baseline in mW; ``P_idle_mW * T_total_s`` is subtracted.
    dev_index        : NVML device index passed to the sampler.
    interval         : sampling period passed to the sampler, in seconds.
    save_csv         : optional CSV path for the raw power trace.

    Returns
    -------
    dict with the per-inference energy (clamped at 0), latency, throughput, the raw
    energy terms and the window timestamps.

    Raises
    ------
    RuntimeError if no samples were captured. Exceptions from ``run_once``
    propagate once the sampler process has been stopped and joined.
    """
    import queue as _queue

    q = mp.Queue()
    stop = mp.Event()
    proc = mp.Process(target=nvml_sampler, args=(stop, q, dev_index, interval))
    proc.start()

    samples = []
    try:
        gpu_sync()
        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        gpu_sync()
        t1 = time.perf_counter()
    finally:
        # Stop the sampler on every path so a failing run_once() cannot leak the child.
        stop.set()
        # Read everything before joining: the child blocks at exit until its queued
        # items are consumed, so join-then-read can deadlock on long traces.
        while True:
            sampler_alive = proc.is_alive()
            try:
                while True:
                    samples.append(q.get(timeout=0.05))
            except _queue.Empty:
                pass
            if not sampler_alive:
                break
        proc.join()

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (active).")

    E_total_mJ = integrate_energy_mJ_between(samples, t0, t1)
    T_total_s = max(1e-9, (t1 - t0))
    E_idle_mJ = P_idle_mW * T_total_s
    n_inf = max(1, repeats * n_items_per_call)

    # Net energy below the idle baseline is reported as 0 rather than a negative value.
    mJ_per_inf = max(0.0, (E_total_mJ - E_idle_mJ) / n_inf)
    ms_per_inf = (T_total_s / n_inf) * 1e3
    throughput = n_inf / T_total_s

    if save_csv:
        df = pd.DataFrame(samples, columns=["t_abs_s", "power_mW"])
        df.to_csv(save_csv, index=False)

    return {
        "mJ_per_inf": mJ_per_inf,
        "ms_per_inf": ms_per_inf,
        "throughput_inf_per_s": throughput,
        "n_inferences": n_inf,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0,
        "t1_abs": t1,
    }


def measure_with_bootstrap(name, run_once, n_items, repeats, n_runs=5, n_boot=1000,
                           idle_power_mW=None, dev_index=0, interval=0.02):
    """
    Perform repeated measurements of mJ/inf and compute a bootstrap confidence
    interval for the mean.

    Parameters
    ----------
    name          : label used in log messages and output file names.
    run_once      : zero-argument callable executing one batch of inferences.
    n_items       : number of inferences performed per ``run_once()`` call.
    repeats       : calls of ``run_once()`` per measurement run.
    n_runs        : number of independent measurement runs (must be >= 1).
    n_boot        : number of bootstrap resamples of the per-run values.
    idle_power_mW : idle baseline in mW. ``None`` (default) falls back to the
                    module-level global ``P_idle_mW``, which then must already exist.
    dev_index     : NVML device index forwarded to the measurement.
    interval      : sampling period forwarded to the measurement, in seconds.

    Returns
    -------
    dict with ``model``, ``mean_mJ_per_inf``, ``ci95_low``, ``ci95_high``, ``runs``.

    Side effects
    ------------
    Creates ``logs/`` if needed and writes ``logs/energy_{name}.json``; the per-run
    traces go to ``logs/power_trace_{name}_run{i}.csv`` via the measurement call.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if idle_power_mW is None:
        # Backward-compatible default: rely on the baseline defined at module level.
        idle_power_mW = P_idle_mW

    pathlib.Path("logs").mkdir(exist_ok=True)
    results = []
    for i in range(n_runs):
        print(f"[Measure] {name} - run {i+1}/{n_runs} ...")
        res = measure_mJ_per_inference(
            run_once,
            n_items_per_call=n_items,
            repeats=repeats,
            P_idle_mW=idle_power_mW,
            dev_index=dev_index,
            interval=interval,
            save_csv=f"logs/power_trace_{name}_run{i+1}.csv"
        )
        results.append(res)

    mJs = np.array([r["mJ_per_inf"] for r in results], dtype=np.float64)
    # Percentile bootstrap of the mean; the fixed seed keeps the interval reproducible
    # for a given set of per-run values.
    rng = np.random.default_rng(123)
    boots = []
    for _ in range(n_boot):
        idx = rng.integers(0, len(mJs), size=len(mJs))
        boots.append(float(np.mean(mJs[idx])))
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])

    summary = {
        "model": name,
        "mean_mJ_per_inf": float(np.mean(mJs)),
        "ci95_low": float(ci_low),
        "ci95_high": float(ci_high),
        "runs": results,
    }
    with open(f"logs/energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)

    print(f"[Result] {name}: {summary['mean_mJ_per_inf']:.3f} mJ/inf "
          f"(95% CI [{summary['ci95_low']:.3f}, {summary['ci95_high']:.3f}])")
    return summary


# ==============================
# 1. Data: synthetic 3D IMU-like
# ==============================

def make_synth_ts(n_samples=2800, n_channels=6, length=150, n_classes=8, seed=2025):
    """
    Create a seeded synthetic multichannel time-series set.

    Returns ``(X, y)`` where ``X`` is float32 of shape (N, C, L) — Gaussian noise plus a
    class-specific sine on channel 0 and cosine on channel 1 — and ``y`` is an int64
    label vector of shape (N,) with values in ``0 .. n_classes - 1``.
    """
    gen = np.random.default_rng(seed)
    X = gen.normal(0, 1, size=(n_samples, n_channels, length)).astype(np.float32)
    y = gen.integers(0, n_classes, size=n_samples).astype(np.int64)

    # Time axis covering one base period across the window
    grid = np.linspace(0, 2 * np.pi, length, dtype=np.float32)
    for cls in range(n_classes):
        rows = (y == cls)
        if not rows.any():
            continue  # no sample drew this label
        freq = 1.0 + 0.2 * cls
        sine_part = 0.6 * np.sin(freq * grid)
        cosine_part = 0.4 * np.cos(0.5 * freq * grid)
        X[rows, 0, :] += sine_part
        X[rows, 1, :] += cosine_part
    return X, y


# Here, we directly use synthetic data as an energy proxy for InceptionTime / TST.
X_ts, y_ts = make_synth_ts()
# Hold out 800 windows for testing, stratified by class label.
X_tr_ts, X_te_ts, y_tr_ts, y_te_ts = train_test_split(
    X_ts, y_ts, test_size=800, random_state=7, stratify=y_ts
)
print(f"[Info] Train={X_tr_ts.shape}, Test={X_te_ts.shape}, Classes={len(np.unique(y_tr_ts))}")

# Fall back to the CPU when CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Let cuDNN auto-tune its convolution algorithms.
# NOTE(review): auto-tuning presumably happens on the first calls with a given input
# shape — confirm a warm-up precedes any energy measurement.
torch.backends.cudnn.benchmark = True
pathlib.Path("logs").mkdir(exist_ok=True)

# ===========================
# 2. Lightweight InceptionTime
# ===========================

class InceptionBlock1d(nn.Module):
    """
    One Inception block for 1-D signals.

    The input is optionally reduced by a 1x1 bottleneck convolution, passed through
    three parallel convolutions with kernel sizes ``ks``, and joined with a
    max-pool + 1x1 convolution branch. The concatenated result goes through
    BatchNorm and ReLU.

    Parameters
    ----------
    in_ch      : number of input channels.
    out_ch     : number of output channels (>= 4). Each of the three convolution
                 branches gets ``out_ch // 4`` channels; the pooling branch gets the
                 remainder, so the branches always sum to ``out_ch``.
    bottleneck : channel width of the bottleneck, used only when ``in_ch > 1``.
    ks         : three odd kernel sizes. Odd sizes are required so that
                 ``padding = k // 2`` preserves the sequence length in every branch.

    Raises
    ------
    ValueError if ``out_ch < 4`` or any kernel size is even.
    """
    def __init__(self, in_ch, out_ch, bottleneck=32, ks=(9, 19, 39)):
        super().__init__()
        if out_ch < 4:
            raise ValueError(f"out_ch must be >= 4 (one channel per branch), got {out_ch}")
        if any(k % 2 == 0 for k in ks):
            raise ValueError(f"kernel sizes must be odd to preserve length, got {tuple(ks)}")

        # Previously all four branches used out_ch // 4, which only summed to out_ch
        # when out_ch was divisible by 4; the pooling branch now takes the remainder.
        branch_ch = out_ch // 4
        pool_ch = out_ch - 3 * branch_ch

        use_bottleneck = in_ch > 1
        bott = bottleneck if use_bottleneck else in_ch
        self.bottleneck = nn.Conv1d(in_ch, bott, 1, bias=False) if use_bottleneck else nn.Identity()
        self.conv1 = nn.Conv1d(bott, branch_ch, ks[0], padding=ks[0] // 2, bias=False)
        self.conv2 = nn.Conv1d(bott, branch_ch, ks[1], padding=ks[1] // 2, bias=False)
        self.conv3 = nn.Conv1d(bott, branch_ch, ks[2], padding=ks[2] // 2, bias=False)
        self.pool = nn.MaxPool1d(3, stride=1, padding=1)
        self.conv_pool = nn.Conv1d(in_ch, pool_ch, 1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Map (B, in_ch, L) to (B, out_ch, L)."""
        z = self.bottleneck(x)
        # The pooling branch reads the raw input, not the bottleneck output.
        y = torch.cat(
            [self.conv1(z), self.conv2(z), self.conv3(z), self.conv_pool(self.pool(x))],
            dim=1
        )
        return self.act(self.bn(y))


class InceptionResNetModule(nn.Module):
    """Three stacked Inception blocks wrapped by a residual connection.

    The shortcut is an identity when ``in_ch == out_ch``; otherwise it is a
    1x1 convolution followed by BatchNorm1d so the channel counts line up.
    Extra keyword arguments are forwarded to every ``InceptionBlock1d``.
    """

    def __init__(self, in_ch, out_ch, **kw):
        super().__init__()
        self.b1 = InceptionBlock1d(in_ch, out_ch, **kw)
        self.b2 = InceptionBlock1d(out_ch, out_ch, **kw)
        self.b3 = InceptionBlock1d(out_ch, out_ch, **kw)
        if in_ch == out_ch:
            self.short = nn.Identity()
        else:
            # Project the input to out_ch channels before the residual add.
            self.short = nn.Sequential(
                nn.Conv1d(in_ch, out_ch, 1, bias=False),
                nn.BatchNorm1d(out_ch),
            )
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ReLU(b3(b2(b1(x))) + shortcut(x))."""
        shortcut = self.short(x)
        out = x
        for block in (self.b1, self.b2, self.b3):
            out = block(out)
        return self.act(out + shortcut)


class InceptionTimeSmall(nn.Module):
    """Small InceptionTime classifier: residual Inception modules -> GAP -> linear.

    Args:
        c_in: number of input channels.
        n_classes: size of the output logit vector.
        nb_filters: channel width of every residual module.
        n_modules: number of stacked ``InceptionResNetModule`` instances.
        bottleneck: bottleneck width forwarded to each module.
    """

    def __init__(self, c_in, n_classes, nb_filters=64, n_modules=2, bottleneck=32):
        super().__init__()
        # Only the first module sees c_in channels; the rest map nb_filters -> nb_filters.
        stages = [
            InceptionResNetModule(
                c_in if i == 0 else nb_filters, nb_filters, bottleneck=bottleneck
            )
            for i in range(n_modules)
        ]
        self.features = nn.Sequential(*stages)
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(nb_filters, n_classes)

    def forward(self, x):
        """Return class logits for an input batch."""
        feats = self.features(x)
        pooled = self.gap(feats)
        pooled = pooled.squeeze(-1)  # drop the length-1 axis left by the pooling
        return self.fc(pooled)


# ==================
# 3. Lightweight TST
# ==================

class PatchEmbed1D(nn.Module):
    """Turn a (B, C, L) series into (B, N, D) patch tokens with one strided Conv1d.

    Args:
        c_in: number of input channels.
        d_model: embedding width D of each token.
        patch_len: convolution kernel size, i.e. samples per patch.
        stride: hop between patches; a falsy value (None / 0) means ``patch_len``.
    """

    def __init__(self, c_in, d_model=128, patch_len=10, stride=None):
        super().__init__()
        hop = stride if stride else patch_len
        self.proj = nn.Conv1d(c_in, d_model, kernel_size=patch_len, stride=hop, bias=False)

    def forward(self, x):  # (B,C,L) -> (B,N,D)
        tokens = self.proj(x)          # (B, D, N)
        return tokens.transpose(1, 2)  # (B, N, D)


class SinPosEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a (B, N, D) token tensor.

    Even feature indices receive ``sin(pos * div)`` and odd indices
    ``cos(pos * div)`` with ``div_k = exp(-2k * ln(10000) / D)``. The table is
    rebuilt on every call from the input's own N and D, on the input's device.

    Works for both even and odd ``D``: with an odd width there is one more sine
    column than cosine columns, so the cosine part uses only the first
    ``D // 2`` frequencies.

    Args:
        d_model: nominal embedding width; stored as ``self.d`` but the forward
            pass derives the width from the input tensor.
    """

    def __init__(self, d_model):
        super().__init__()
        self.d = d_model

    def forward(self, x):
        """Return ``x + pe`` where ``pe`` has shape (N, D) and is broadcast over the batch."""
        _, N, D = x.shape
        device = x.device
        pos = torch.arange(N, device=device).unsqueeze(1)
        div = torch.exp(torch.arange(0, D, 2, device=device) * (-math.log(10000.0) / D))
        angles = pos * div  # (N, ceil(D / 2))
        pe = torch.zeros(N, D, device=device)
        pe[:, 0::2] = torch.sin(angles)
        # Odd D: only floor(D / 2) cosine columns exist, so trim the extra frequency.
        pe[:, 1::2] = torch.cos(angles[:, : D // 2])
        return x + pe.unsqueeze(0)


class TSTSmall(nn.Module):
    """Small patch-based Transformer classifier for (B, C, L) series.

    Pipeline: patch embedding -> sinusoidal positions -> pre-norm Transformer
    encoder (GELU feed-forward, batch-first) -> mean over tokens -> linear head.

    Args:
        c_in: number of input channels.
        n_classes: size of the output logit vector.
        d_model: token embedding width.
        n_heads: attention heads per encoder layer.
        depth: number of encoder layers.
        dim_ff: hidden width of each feed-forward sub-layer.
        patch_len: samples per patch (also the patch stride).
        dropout: dropout probability inside the encoder layers.
    """

    def __init__(self, c_in, n_classes, d_model=128, n_heads=4,
                 depth=2, dim_ff=256, patch_len=10, dropout=0.1):
        super().__init__()
        self.embed = PatchEmbed1D(c_in, d_model, patch_len)
        layer_cfg = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
            activation='gelu',
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_cfg), num_layers=depth
        )
        self.pos = SinPosEncoding(d_model)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, x):
        """Return class logits for an input batch."""
        tokens = self.pos(self.embed(x))
        encoded = self.encoder(tokens)
        pooled = encoded.mean(dim=1)  # average over the token axis
        return self.head(pooled)


# =========================
# 4. Build models (random)
# =========================

# Model input/output sizes are read off the training split.
n_classes = int(np.unique(y_tr_ts).size)
c_in = X_tr_ts.shape[1]

# Both networks keep their random initial weights; they are built only to be
# run in inference mode by the runners created afterwards.
it_model = InceptionTimeSmall(
    c_in=c_in,
    n_classes=n_classes,
    nb_filters=64,
    n_modules=2,
    bottleneck=32,
)
tst_model = TSTSmall(
    c_in=c_in,
    n_classes=n_classes,
    d_model=128,
    n_heads=4,
    depth=2,
    dim_ff=256,
    patch_len=10,
)


# ==========================
# 5. Inference runner helpers
# ==========================

def make_runner(model, X_np, bs=512):
    """Build a zero-argument closure that runs one full inference pass.

    The model is moved to the module-level ``device`` and put in eval mode, and
    the whole input array is copied to that device once as float32, so each
    call to the closure performs only forward passes.

    Args:
        model: ``nn.Module`` to evaluate.
        X_np: array-like whose first axis indexes samples.
        bs: number of samples per forward pass.

    Returns:
        ``(run_once, N)`` where ``run_once()`` feeds every sample through the
        model in batches of ``bs`` under ``torch.no_grad()`` and then waits for
        the GPU to finish, and ``N`` is the number of samples.
    """
    net = model.to(device).eval()
    inputs = torch.as_tensor(X_np, dtype=torch.float32, device=device)
    N = X_np.shape[0]

    def run_once():
        with torch.no_grad():
            for lo in range(0, N, bs):
                # Slicing past the end is clamped, so the last batch may be short.
                net(inputs[lo:lo + bs])
            # Block until all queued kernels are done so callers time real work.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            gpu_sync()

    return run_once, N


# One inference closure per model, both over the same held-out test split and
# with a batch size of 512. N_it / N_tst are the sample counts returned by
# make_runner and are passed on to the measurement calls.
run_it, N_it = make_runner(it_model, X_te_ts, bs=512)
run_tst, N_tst = make_runner(tst_model, X_te_ts, bs=512)

# ===============
# 6. Idle power
# ===============

# The idle baseline is measured only once per kernel session: if an earlier
# cell already left P_idle_mW in the global namespace it is reused as-is.
if 'P_idle_mW' in globals():
    print(f"\n[Info] Reusing previously measured idle power P_idle_mW = {P_idle_mW:.1f} mW")
else:
    print("\n[Info] Sampling idle power for 20 s ...")
    P_idle_mW, _idle = sample_idle_power_mW(
        duration_s=20.0,
        dev_index=0,
        interval=0.02,
        save_csv="logs/power_idle_trace_deepts_hotfix.csv",
    )
    print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# ===============
# 7. Warm-up
# ===============

# Execute each runner 20 times before any measurement is taken.
# NOTE(review): presumably this absorbs one-time costs (lazy CUDA initialisation,
# cuDNN autotuning enabled via cudnn.benchmark) so they do not land inside the
# measured windows — the code here only shows the calls themselves.
print("\n[Warmup] warmup ...")
for _ in range(20):
    run_it()
    run_tst()
# Final barrier: wait for all outstanding GPU work before calibration starts.
gpu_sync()

# =====================================
# 8. Adaptive window + NVML measurement
# =====================================

# Pick a repeat count per model before measuring.
# NOTE(review): calibrate_repeats is defined outside this section; target_s=8.0 is
# presumably the desired duration in seconds of one measurement window, with the
# result kept within [min_rep, max_rep] — confirm against its definition.
rep_it = calibrate_repeats(run_it, target_s=8.0, min_rep=3, max_rep=5000)
rep_tst = calibrate_repeats(run_tst, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats: InceptionTime={rep_it}, TST={rep_tst}")

# Measure each model with its runner, sample count and repeat count; 5 runs and
# 1000 bootstrap resamples are requested per model. The first argument is a tag;
# judging by the log-file list printed at the end of this cell it is used in the
# output file names — TODO confirm in measure_with_bootstrap.
# The returned mappings are read afterwards through the keys
# "mean_mJ_per_inf", "ci95_low" and "ci95_high".
sum_it = measure_with_bootstrap(
    "inceptiontime_torch_eager", run_it, N_it, rep_it, n_runs=5, n_boot=1000
)
sum_ts = measure_with_bootstrap(
    "tst_torch_eager", run_tst, N_tst, rep_tst, n_runs=5, n_boot=1000
)

# =========
# 9. Summary
# =========

# One summary row per model: mean energy per inference plus its 95% CI bounds,
# taken from the dicts returned by the measurement calls.
df = pd.DataFrame([
    {
        "model": label,
        "mJ/inf_mean": stats["mean_mJ_per_inf"],
        "CI95_low": stats["ci95_low"],
        "CI95_high": stats["ci95_high"],
    }
    for label, stats in (
        ("InceptionTime (eager, no-train)", sum_it),
        ("TST (eager, no-train)", sum_ts),
    )
])
df.to_csv("logs/energy_summary_deepts_eager.csv", index=False)

print("\n=== Completed (InceptionTime & TST inference energy, hotfix self-contained version) ===")
print(df)

# List the artefacts associated with this section, one per line.
print("\nLog files:")
print("\n".join((
    "- logs/power_idle_trace_deepts_hotfix.csv",
    "- logs/power_trace_inceptiontime_torch_eager_run*.csv",
    "- logs/power_trace_tst_torch_eager_run*.csv",
    "- logs/energy_inceptiontime_torch_eager.json",
    "- logs/energy_tst_torch_eager.json",
    "- logs/energy_summary_deepts_eager.csv",
)))

[Info] Train=(2000, 6, 150), Test=(800, 6, 150), Classes=8

[Info] Reusing previously measured idle power P_idle_mW = 26257.8 mW

[Warmup] warmup ...




[Info] repeats: InceptionTime=298, TST=1449
[Measure] inceptiontime_torch_eager - run 1/5 ...
[Measure] inceptiontime_torch_eager - run 2/5 ...


  E_mJ = float(np.trapz(ps_win, ts_win))


[Measure] inceptiontime_torch_eager - run 3/5 ...
[Measure] inceptiontime_torch_eager - run 4/5 ...
[Measure] inceptiontime_torch_eager - run 5/5 ...
[Result] inceptiontime_torch_eager: 1.508 mJ/inf (95% CI [1.499, 1.517])
[Measure] tst_torch_eager - run 1/5 ...
[Measure] tst_torch_eager - run 2/5 ...
[Measure] tst_torch_eager - run 3/5 ...
[Measure] tst_torch_eager - run 4/5 ...
[Measure] tst_torch_eager - run 5/5 ...
[Result] tst_torch_eager: 0.275 mJ/inf (95% CI [0.272, 0.277])

=== Completed (InceptionTime & TST inference energy, hotfix self-contained version) ===
                             model  mJ/inf_mean  CI95_low  CI95_high
0  InceptionTime (eager, no-train)     1.508429  1.499234   1.516858
1            TST (eager, no-train)     0.274844  0.272484   0.276661

Log files:
- logs/power_idle_trace_deepts_hotfix.csv
- logs/power_trace_inceptiontime_torch_eager_run*.csv
- logs/power_trace_tst_torch_eager_run*.csv
- logs/energy_inceptiontime_torch_eager.json
- logs/energy_tst_tor