In [1]:
# ================================================================
# RCCMix-HAR++ (Step 11 model as given) × Scheme 1:
# NVML GPU inference energy (mJ/inf, unit = sequence) — single cell, no prereqs
# ================================================================
# 0) System check & deps
!nvidia-smi
!pip -q install pynvml

import os, json, time, math, warnings, multiprocessing as mp
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ---------------- NVML sampling + trapezoidal energy integration ----------------
import pynvml

def _nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Poll GPU power through NVML and stream the readings onto a queue.

    Used as the ``target`` of a ``multiprocessing.Process`` by the
    measurement helpers in this file.

    Args:
        stop_event: event-like object; the loop ends once ``is_set()`` is true.
        q: queue-like object receiving ``(time.perf_counter(), power_mW)`` tuples.
        dev_index: NVML device index to query.
        interval: seconds to sleep between two consecutive readings.
    """
    # Imports are local so the function is self-contained when it runs in a
    # child process.
    import time
    import pynvml

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
    try:
        while True:
            if stop_event.is_set():
                break
            # Timestamp first, then the power reading (same order as the tuple).
            stamp = time.perf_counter()
            power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
            q.put((stamp, power_mw))
            time.sleep(interval)
    finally:
        # Always release NVML, even if a reading raised.
        pynvml.nvmlShutdown()

def _integrate_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] → mJ."""
    if not samples: return 0.0
    samples = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t,_ in samples], dtype=np.float64)
    ps = np.array([p for _,p in samples], dtype=np.float64)
    mask = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[mask], ps[mask]
    if ts_w.size == 0 or ts_w[0] > t0:
        p0 = np.interp(t0, ts, ps); ts_w = np.insert(ts_w, 0, t0); ps_w = np.insert(ps_w, 0, p0)
    if ts_w[-1] < t1:
        p1 = np.interp(t1, ts, ps); ts_w = np.append(ts_w, t1); ps_w = np.append(ps_w, p1)
    return float(np.trapz(ps_w, ts_w))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Measure the average GPU power while this process just sleeps.

    A child process running ``_nvml_sampler`` collects power readings for
    ``duration_s`` seconds; the readings are integrated and divided by the
    time spanned between the first and last sample.

    Args:
        duration_s: how long to sleep while the sampler runs.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period (seconds) passed to the sampler.
        save_csv: optional path; when truthy the raw trace is written there
            with columns ``t_abs_s`` and ``power_mW``.

    Returns:
        ``(P_idle_mW, samples)`` where ``samples`` is the time-sorted list of
        ``(timestamp, power_mW)`` tuples.

    Raises:
        RuntimeError: if the sampler produced no readings.
    """
    import queue  # local import: only queue.Empty is needed here

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        stop.set()
        # Drain BEFORE joining: a child that still has buffered queue items
        # cannot terminate until they are consumed, so join-then-drain can
        # deadlock. Queue.empty() is also not reliable across processes.
        while True:
            try:
                samples.append(q.get(timeout=0.2))
            except queue.Empty:
                if not p.is_alive():
                    break
        # Final sweep for items flushed between the last timeout and exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue.Empty:
                break
        p.join()

    if not samples:
        raise RuntimeError("No NVML samples during idle.")

    samples.sort(key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = _integrate_mJ_between(samples, t0, t1)
    T_idle_s = max(1e-9, t1 - t0)  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def measure_mJ_per_inference(run_once, n_items_per_call, repeats, P_idle_mW,
                             dev_index=0, interval=0.02, save_csv=None):
    """Measure idle-corrected GPU energy per inference for a workload.

    A child process samples GPU power while ``run_once`` is called
    ``repeats`` times. The power trace is integrated over the wall-clock
    window of those calls, the idle energy ``P_idle_mW * T`` is subtracted,
    and the remainder is divided by the number of inferences.

    Args:
        run_once: zero-argument callable executing one pass of the workload.
        n_items_per_call: number of inferences performed by one ``run_once``.
        repeats: how many times ``run_once`` is called inside the window.
        P_idle_mW: idle power (mW) to subtract.
        dev_index: NVML device index passed to the sampler.
        interval: sampling period (seconds) passed to the sampler.
        save_csv: optional path for the raw trace (``t_abs_s``, ``power_mW``).

    Returns:
        dict with ``mJ_per_inf`` (clamped at 0), ``ms_per_inf``,
        ``throughput_inf_per_s``, ``n_inferences``, ``repeats``,
        ``T_total_s``, ``E_total_mJ``, ``E_idle_mJ``, ``P_idle_mW`` and the
        window bounds ``t0_abs`` / ``t1_abs``.

    Raises:
        RuntimeError: if the sampler produced no readings.
    """
    import queue  # local import: only queue.Empty is needed here

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    t0 = time.perf_counter()
    try:
        for _ in range(repeats):
            run_once()
        t1 = time.perf_counter()
    finally:
        # Runs even when run_once() raises, so the sampler child is never
        # left running in the background.
        stop.set()
        # Drain BEFORE joining: a child with buffered queue items cannot exit
        # until they are consumed, so join-then-drain can deadlock.
        while True:
            try:
                samples.append(q.get(timeout=0.2))
            except queue.Empty:
                if not p.is_alive():
                    break
        # Final sweep for items flushed between the last timeout and exit.
        while True:
            try:
                samples.append(q.get_nowait())
            except queue.Empty:
                break
        p.join()

    if not samples:
        raise RuntimeError("No NVML samples during active measurement.")

    E_total_mJ = _integrate_mJ_between(samples, t0, t1)
    T_total_s = max(1e-9, t1 - t0)
    E_idle_mJ = P_idle_mW * T_total_s
    n_inf = max(1, repeats * n_items_per_call)  # avoid division by zero
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)
    return {
        # Clamped at zero: noise can push the active energy below the idle estimate.
        "mJ_per_inf": max(0.0, (E_total_mJ - E_idle_mJ) / n_inf),
        "ms_per_inf": (T_total_s / n_inf) * 1e3,
        "throughput_inf_per_s": n_inf / T_total_s,
        "n_inferences": n_inf,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1
    }

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that a measurement window lasts about ``target_s``.

    ``run_once`` is called twice: the first call is untimed, the second is
    timed. The result is ``ceil(target_s / elapsed)`` limited to
    ``[min_rep, max_rep]``; the elapsed time is floored at 0.1 ms.
    """
    run_once()  # untimed first call

    started = time.perf_counter()
    run_once()
    elapsed = time.perf_counter() - started

    per_call_s = max(1e-4, elapsed)
    wanted = int(np.ceil(target_s / per_call_s))
    # Lower bound first, then upper bound (same precedence as np.clip).
    return int(min(max(wanted, min_rep), max_rep))

def measure_with_bootstrap(name, run_once, n_items, repeats, n_runs=5, n_boot=1000,
                           logdir=Path("logs"), idle_power_mW=None):
    """Run several energy measurements and bootstrap a 95% CI of the mean.

    Args:
        name: label used in log messages and in the output file names.
        run_once: zero-argument callable executing one pass of the workload.
        n_items: inferences performed by one ``run_once`` call.
        repeats: ``run_once`` calls per measurement window.
        n_runs: number of independent measurement windows.
        n_boot: number of bootstrap resamples of the per-run means.
        logdir: directory for per-run power traces and the JSON summary.
        idle_power_mW: idle power (mW) to subtract. When ``None`` the
            module-level ``P_idle_mW`` is used, which is what earlier callers
            of this function relied on implicitly.

    Returns:
        dict with ``model``, ``mean_mJ_per_inf``, ``ci95_low``, ``ci95_high``
        and the list of per-run result dicts under ``runs``.
    """
    if idle_power_mW is None:
        # Backward-compatible fallback to the notebook-level global; raises
        # NameError if idle power has not been measured yet.
        idle_power_mW = P_idle_mW

    logdir = Path(logdir)  # also accept a plain string path
    logdir.mkdir(exist_ok=True, parents=True)

    runs = []
    for i in range(n_runs):
        print(f"[Measure] {name} run {i+1}/{n_runs} ...")
        r = measure_mJ_per_inference(
            run_once, n_items, repeats, idle_power_mW,
            dev_index=0, interval=0.02,
            save_csv=str(logdir/f"power_trace_{name}_run{i+1}.csv")
        )
        runs.append(r)

    mJs = np.array([r["mJ_per_inf"] for r in runs], dtype=np.float64)
    # Fixed seed so the reported interval is reproducible for a given set of runs.
    rng = np.random.default_rng(123)
    boots = [float(np.mean(mJs[rng.integers(0, len(mJs), size=len(mJs))])) for _ in range(n_boot)]
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])

    summary = {"model": name, "mean_mJ_per_inf": float(mJs.mean()),
               "ci95_low": float(ci_low), "ci95_high": float(ci_high), "runs": runs}
    with open(logdir/f"energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_inf']:.3f} mJ/inf (95% CI [{summary['ci95_low']:.3f}, {summary['ci95_high']:.3f}])")
    return summary

# ---------------- Make sure minimal configs/features exist (model structure/params untouched) ----------------
BASE = Path("/content")

configs_dir = BASE / "configs"
configs_dir.mkdir(parents=True, exist_ok=True)
classes_json = configs_dir / "classes.json"
splits_json = configs_dir / "splits.json"

# Write default config files only when they are missing; existing files are left alone.
if not classes_json.exists():
    classes_json.write_text(
        json.dumps(
            {
                "num_classes": 8,
                "id_to_label": {str(i): f"class{i}" for i in range(8)},
                "window_config": {"window_samples": 150},
            },
            indent=2,
        )
    )
if not splits_json.exists():
    splits_json.write_text(
        json.dumps({"folds": [{"fold": 0, "test_subject": "S01"}]}, indent=2)
    )

features_dir = BASE / "features"
features_dir.mkdir(parents=True, exist_ok=True)
npz_path = features_dir / "windows_normalized_fold0.npz"

if not npz_path.exists():
    # Synthetic fold: every window shares one file/segment id and has an
    # increasing start offset, so SEQ_LEN=8 / SEQ_STRIDE=2 yields valid sequences.
    N_train, N_test, T, C, K = 2000, 800, 150, 6, 8
    rng = np.random.default_rng(2025)

    def make_axis(N):
        return rng.normal(0, 1, size=(N, T)).astype(np.float32)

    window_ids = [f"file0.csv:0:{i*T}" for i in range(N_train + N_test)]

    # The six axes are drawn in this fixed order so the seeded stream is reproducible.
    out = {
        axis_name: make_axis(N_train + N_test)
        for axis_name in ("acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z")
    }
    out["labels"] = np.concatenate(
        [rng.integers(0, K, size=N_train), rng.integers(0, K, size=N_test)]
    ).astype(np.int64)
    out["subjects"] = np.array(["S01"] * (N_train + N_test))
    out["splits"] = np.array(["train"] * N_train + ["test"] * N_test)
    out["window_ids"] = np.array(window_ids, dtype=object)
    np.savez(npz_path, **out)

# ---------------- Step 11 model (structure/params unchanged) ----------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

print("\n\nStep 11: RCCMix-HAR++ — inference-only energy measurement")
print("=" * 80)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Class count and window length are read from the classes config file.
with open(classes_json, "r") as f:
    classes_cfg = json.load(f)
NUM_CLASSES = classes_cfg["num_classes"]
WINDOW_SAMPLES = classes_cfg["window_config"]["window_samples"]

# Hyperparameters (unaltered from Step 11)
# -- sequence construction / batching
SEQ_LEN = 8
SEQ_STRIDE = 2
BATCH_SIZE = 32
# -- model size
D_MODEL = 192
N_HEADS = 6
N_LAYERS = 3
D_FF = 4 * D_MODEL
DROPOUT = 0.2
# -- remaining settings; not referenced by the code visible in this cell
EPOCHS = 30
LR = 1e-3
WEIGHT_DECAY = 1e-4
CLIP_NORM = 1.0
VAL_SPLIT = 0.1
LABEL_SMOOTH = 0.05

# ----- Dataset & sequence construction (unaltered semantics) -----
from collections import defaultdict

def parse_window_id(wid: str):
    """Split a window id of the form ``"<file>:<segment>:<start>"``.

    The id is split from the right, so a file part that itself contains
    colons (for example a drive-letter path) is kept intact.

    Args:
        wid: window identifier string.

    Returns:
        ``(file, segment, start)`` with ``segment`` and ``start`` as ints, or
        ``(wid, 0, 0)`` when the id has fewer than three colon-separated parts.

    Raises:
        ValueError: if the segment or start field is not an integer.
    """
    parts = wid.rsplit(':', 2)
    if len(parts) != 3:
        return wid, 0, 0
    return parts[0], int(parts[1]), int(parts[2])

class SeqDataset(Dataset):
    """Dataset of fixed-length sequences of consecutive windows from one split.

    Windows of the requested split are grouped by ``(subject, file, segment)``
    (the latter two parsed from the window id), ordered by start offset, and
    sliced into runs of ``seq_len`` windows every ``seq_stride`` windows. Each
    item is labelled with the label of the window at position
    ``seq_len // 2`` of the run.
    """

    def __init__(self, npz_path: Path, split: str, seq_len=8, seq_stride=2):
        super().__init__()
        # NOTE: allow_pickle=True unpickles object arrays; only load trusted files.
        self.npz = np.load(npz_path, allow_pickle=True)
        self.split = split
        self.seq_len = seq_len
        self.seq_stride = seq_stride

        archive = self.npz
        self.ax, self.ay, self.az = archive['acc_x'], archive['acc_y'], archive['acc_z']
        self.gx, self.gy, self.gz = archive['gyro_x'], archive['gyro_y'], archive['gyro_z']
        self.labels = archive['labels'].astype(np.int64)
        self.splits = np.array(archive['splits']).astype(str)
        self.win_ids = np.array(archive['window_ids']).astype(str)
        self.subjects = np.array(archive['subjects']).astype(str)

        N, T = self.ax.shape
        assert T == WINDOW_SAMPLES, f"Inconsistent window length: {T} vs {WINDOW_SAMPLES}"

        # Bucket the windows of this split by (subject, file, segment).
        groups = defaultdict(list)
        for idx in np.where(self.splits == split)[0]:
            fname, segment, start = parse_window_id(self.win_ids[idx])
            groups[(self.subjects[idx], fname, segment)].append((start, idx))

        # Within each bucket, slide a seq_len-wide window over the
        # start-ordered indices.
        self.seq_items = []
        for members in groups.values():
            members.sort(key=lambda pair: pair[0])
            ordered = [idx for _, idx in members]
            if len(ordered) >= seq_len:
                for first in range(0, len(ordered) - seq_len + 1, seq_stride):
                    run = ordered[first:first + seq_len]
                    center_idx = run[seq_len // 2]
                    self.seq_items.append((run, center_idx, int(self.labels[center_idx])))

    def __len__(self):
        """Number of sequences."""
        return len(self.seq_items)

    def __getitem__(self, i):
        """Return ``(X, label, center_idx)``; ``X`` is float32 of shape (L, 6, T)."""
        seq_idx, center_idx, label = self.seq_items[i]
        channels = (self.ax, self.ay, self.az, self.gx, self.gy, self.gz)
        X = np.zeros((len(seq_idx), 6, self.ax.shape[1]), dtype=np.float32)
        # Channel order: acc x/y/z followed by gyro x/y/z.
        for c, axis_data in enumerate(channels):
            for j, idx in enumerate(seq_idx):
                X[j, c, :] = axis_data[idx]
        return torch.from_numpy(X), torch.tensor(label, dtype=torch.long), center_idx

def collate_fn(batch):
    """Collate ``(x, y, center)`` samples into batched tensors.

    Returns the stacked inputs, the stacked labels and the center indices as
    a ``torch.long`` tensor.
    """
    inputs, targets, center_ids = zip(*batch)
    x_batch = torch.stack(inputs)
    y_batch = torch.stack(targets)
    c_batch = torch.tensor(center_ids, dtype=torch.long)
    return x_batch, y_batch, c_batch

# ----- Model components (unaltered structure/params) -----
class DepthwiseSeparableConv1d(nn.Module):
    """Depthwise conv -> pointwise conv -> BatchNorm -> GELU -> Dropout.

    Padding is ``(k // 2) * dilation``, so an odd kernel keeps the temporal
    length unchanged.
    """

    def __init__(self, in_ch, out_ch, k, dilation=1, dropout=0.0):
        super().__init__()
        # groups=in_ch gives every input channel its own filter (depthwise).
        self.dw = nn.Conv1d(
            in_ch,
            in_ch,
            kernel_size=k,
            padding=(k // 2) * dilation,
            dilation=dilation,
            groups=in_ch,
            bias=False,
        )
        # The 1x1 convolution mixes channels (pointwise).
        self.pw = nn.Conv1d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        return self.drop(self.act(self.bn(self.pw(self.dw(x)))))

class SEBlock(nn.Module):
    """Channel gating: pool over time, pass through a bottleneck MLP with a
    sigmoid, and rescale the input channels by the resulting weights."""

    def __init__(self, ch, reduction=4):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool1d(1)
        # Bottleneck width, never smaller than one unit.
        hidden = max(1, ch // reduction)
        self.fc = nn.Sequential(
            nn.Linear(ch, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, ch, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        batch, channels, _ = x.shape  # unpacking requires a rank-3 input
        pooled = self.pool(x).view(batch, channels)
        gate = self.fc(pooled).view(batch, channels, 1)
        return x * gate

class WindowEncoderV2(nn.Module):
    """Encode one window into a token plus a conditioning vector.

    The input is augmented with two magnitude channels (computed from
    channels 0-2 and 3-5), gated by an SE block, and passed through three
    depthwise-separable conv branches with different kernel/dilation
    settings. The concatenated branches are mixed, pooled over time
    (mean and max) and projected to ``d_model`` to form the token. Eight
    per-window summary statistics of the raw input are projected to
    ``d_model`` to form the conditioning vector ``g``.
    """

    def __init__(self, in_ch=6, d_model=192, dropout=0.2):
        super().__init__()
        self.in_ch = in_ch
        self.aug_ch = in_ch + 2  # raw channels + two magnitude channels
        self.se = SEBlock(self.aug_ch, reduction=4)

        b_dim = d_model // 3
        assert b_dim * 3 == d_model, "D_MODEL must be divisible by 3 for WindowEncoderV2"

        # Three branches, each producing d_model / 3 channels.
        for attr, kernel, dil in (("b1", 7, 1), ("b2", 15, 2), ("b3", 31, 3)):
            setattr(
                self,
                attr,
                DepthwiseSeparableConv1d(self.aug_ch, b_dim, k=kernel, dilation=dil, dropout=dropout),
            )

        self.mix = nn.Conv1d(d_model, d_model, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(d_model)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

        # Input is 2 * d_model wide because mean- and max-pooled features are concatenated.
        self.token_proj = nn.Sequential(
            nn.Linear(2 * d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # Projects the eight summary statistics built in forward().
        self.g_proj = nn.Sequential(
            nn.Linear(8, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model),
        )

    def forward(self, x):
        n_win, n_ch, n_t = x.shape  # unpacking requires a rank-3 input
        acc = x[:, 0:3, :]
        gyr = x[:, 3:6, :]

        # Per-timestep magnitudes; the epsilon keeps sqrt away from exactly zero.
        acc_sq = x[:, 0, :] ** 2 + x[:, 1, :] ** 2 + x[:, 2, :] ** 2
        gyr_sq = x[:, 3, :] ** 2 + x[:, 4, :] ** 2 + x[:, 5, :] ** 2
        acc_norm = torch.sqrt(acc_sq + 1e-8).unsqueeze(1)
        gyr_norm = torch.sqrt(gyr_sq + 1e-8).unsqueeze(1)

        x_aug = self.se(torch.cat([x, acc_norm, gyr_norm], dim=1))

        z = torch.cat([self.b1(x_aug), self.b2(x_aug), self.b3(x_aug)], dim=1)
        z = self.drop(self.act(self.bn(self.mix(z))))

        # Token: mean- and max-pool over time, then project.
        pooled = torch.cat([z.mean(dim=-1), z.max(dim=-1)[0]], dim=-1)
        token = self.token_proj(pooled)

        # Eight summary statistics per window, in this fixed order.
        g_raw = torch.stack(
            [
                acc_norm.squeeze(1).pow(2).mean(dim=-1).sqrt(),   # RMS of acc magnitude
                gyr_norm.squeeze(1).pow(2).mean(dim=-1).sqrt(),   # RMS of gyro magnitude
                acc.pow(2).mean(dim=(1, 2)).sqrt(),               # RMS over acc channels and time
                gyr.pow(2).mean(dim=(1, 2)).sqrt(),               # RMS over gyro channels and time
                acc.mean(dim=-1).pow(2).sum(dim=-1).sqrt(),       # norm of the mean acc vector
                gyr.mean(dim=-1).pow(2).sum(dim=-1).sqrt(),       # norm of the mean gyro vector
                acc.var(dim=-1).mean(dim=-1),                     # mean per-channel acc variance
                gyr.var(dim=-1).mean(dim=-1),                     # mean per-channel gyro variance
            ],
            dim=-1,
        )
        return token, self.g_proj(g_raw)

class CondLayerNorm(nn.Module):
    """LayerNorm whose output is scaled and shifted by linear maps of ``g``.

    Computes ``LN(x) * (1 + gamma(g)) + beta(g)``.
    """

    def __init__(self, d_model):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)
        self.gamma = nn.Linear(d_model, d_model)
        self.beta = nn.Linear(d_model, d_model)

    def forward(self, x, g):
        scale = 1 + self.gamma(g)
        shift = self.beta(g)
        return self.ln(x) * scale + shift

class RCCBlock(nn.Module):
    """Pre-norm transformer block whose two norms are conditioned on ``g``.

    Sub-layer 1: conditional LayerNorm -> self-attention -> dropout -> residual.
    Sub-layer 2: conditional LayerNorm -> feed-forward -> dropout -> residual.
    """

    def __init__(self, d_model=192, n_heads=6, d_ff=768, dropout=0.2):
        super().__init__()
        self.condln1 = CondLayerNorm(d_model)
        self.mha = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.drop1 = nn.Dropout(dropout)
        self.condln2 = CondLayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, g):
        # Attention sub-layer; attention weights are not requested.
        normed = self.condln1(x, g)
        attn_out = self.mha(normed, normed, normed, need_weights=False)[0]
        x = x + self.drop1(attn_out)
        # Feed-forward sub-layer.
        x = x + self.drop2(self.ff(self.condln2(x, g)))
        return x

class GeoContextHARV2(nn.Module):
    """Sequence classifier over windows: per-window encoder + conditioned transformer.

    ``forward`` takes ``x`` of shape (B, L, C, T), encodes each of the B*L
    windows, prepends a learned CLS token, adds learned positions, runs the
    RCC blocks, and classifies from the concatenation of the CLS output and
    the mean of the window outputs.
    """

    def __init__(self, in_ch=6, d_model=192, n_layers=3, n_heads=6, d_ff=768,
                 dropout=0.2, seq_len=8, num_classes=8):
        super().__init__()
        self.seq_len = seq_len
        self.encoder = WindowEncoderV2(in_ch=in_ch, d_model=d_model, dropout=dropout)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        # One position per window plus one for the CLS slot.
        self.pos = nn.Parameter(torch.zeros(1, seq_len + 1, d_model))
        self.blocks = nn.ModuleList(
            [RCCBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        )
        self.norm = nn.LayerNorm(d_model)
        self.head_drop = nn.Dropout(dropout)
        # 2 * d_model: CLS representation concatenated with the mean window representation.
        self.head = nn.Linear(2 * d_model, num_classes)
        nn.init.trunc_normal_(self.pos, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)

    def forward(self, x):
        B, L, C, T = x.shape
        # Fold the sequence axis into the batch so every window is encoded independently.
        token, g = self.encoder(x.view(B * L, C, T))
        token = token.view(B, L, -1)
        g = g.view(B, L, -1)

        # The CLS slot is conditioned on the mean of the window conditioning vectors.
        z = torch.cat([self.cls_token.expand(B, -1, -1), token], dim=1)
        g_all = torch.cat([g.mean(dim=1, keepdim=True), g], dim=1)

        z = z + self.pos
        for blk in self.blocks:
            z = blk(z, g_all)
        z = self.norm(z)

        feat = torch.cat([z[:, 0, :], z[:, 1:, :].mean(dim=1)], dim=-1)
        return self.head(self.head_drop(feat))

# ----- Build test loader (sequence unit) -----
test_ds = SeqDataset(npz_path, split="test", seq_len=SEQ_LEN, seq_stride=SEQ_STRIDE)
if len(test_ds) == 0:
    # With zero sequences every run_once() would be a no-op and the reported
    # mJ/inf would be meaningless, so stop here instead.
    raise RuntimeError(
        f"No test sequences could be built from {npz_path} "
        f"(SEQ_LEN={SEQ_LEN}, SEQ_STRIDE={SEQ_STRIDE})."
    )
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True, collate_fn=collate_fn)

# ----- Instantiate the model exactly as in Step 11; eval mode for inference -----
model = GeoContextHARV2(
    in_ch=6, d_model=D_MODEL, n_layers=N_LAYERS, n_heads=N_HEADS,
    d_ff=D_FF, dropout=DROPOUT, seq_len=SEQ_LEN, num_classes=NUM_CLASSES
).to(device).eval()

# Optional: load trained weights if available (does not affect energy/FLOPs)
models_dir = BASE / "models"; models_dir.mkdir(parents=True, exist_ok=True)
wpath = models_dir / "rccmix_har_step11_fold0.pt"
if wpath.exists():
    # Loading is best-effort: any failure is reported and the randomly
    # initialised model is used instead.
    try:
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        sd = torch.load(wpath, map_location=device)
        if isinstance(sd, dict):
            model.load_state_dict(sd)
            print(f"[Info] Loaded weights: {wpath.name}")
        else:
            # Previously skipped silently; say so explicitly.
            print(f"[Warn] {wpath.name} does not contain a state dict "
                  f"(got {type(sd).__name__}); keeping random initialisation")
    except Exception as e:
        print(f"[Warn] Failed to load weights: {e}")
# ----- Build run_once: one complete forward pass over the test sequences (unit = 1 sequence) -----
def make_runner(model: nn.Module, loader: DataLoader):
    """Build the workload callable used by the energy measurement.

    Returns:
        ``(run_once, n_items)``: ``run_once()`` forwards every batch of
        ``loader`` through ``model`` with gradients disabled and then waits
        for outstanding CUDA work when CUDA is available; ``n_items`` is
        ``len(loader.dataset)``.
    """
    def run_once():
        with torch.no_grad():
            for inputs, _labels, _centers in loader:
                inputs = inputs.to(device, non_blocking=True)  # (B, L, C, T)
                model(inputs)
            # Block until queued GPU work has finished, so callers that time
            # run_once() include the kernels' execution.
            if torch.cuda.is_available():
                torch.cuda.synchronize()

    return run_once, len(loader.dataset)  # number of sequences

run_once, N_items = make_runner(model, test_loader)

# ----- Idle power baseline -----
logs_dir = BASE / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
print("\n[Info] Sampling idle power for 20 s ...")
P_idle_mW, _idle = sample_idle_power_mW(
    duration_s=20.0,
    dev_index=0,
    interval=0.02,
    save_csv=str(logs_dir/'power_idle_trace_rccmixharpp.csv'),
)
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# ----- Warm up, then size the measurement window (target ≥ 8 s) -----
print("\n[Warmup] warmup ...")
run_once()
run_once()
repeats = calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats = {repeats} (items per call = {N_items}, unit = sequence of L={SEQ_LEN} windows)")

# ----- NVML measurement with a bootstrap confidence interval -----
summary = measure_with_bootstrap(
    name="rccmixharpp_inference_sequence_unit",
    run_once=run_once,
    n_items=N_items,
    repeats=repeats,
    n_runs=5,
    n_boot=1000,
    logdir=logs_dir,
)

# ----- One-row CSV summary -----
df = pd.DataFrame(
    [
        {
            "model": "RCCMix-HAR++ (sequence unit)",
            "unit": f"sequence (L={SEQ_LEN} windows)",
            "mJ_per_inf_mean": summary["mean_mJ_per_inf"],
            "ci95_low": summary["ci95_low"],
            "ci95_high": summary["ci95_high"],
            "items_per_call": N_items,
            "repeats": repeats,
            "idle_mW": P_idle_mW,
        }
    ]
)
df.to_csv(logs_dir/"energy_summary_rccmixharpp.csv", index=False)
print("\n=== Done: GPU inference energy (Scheme 1) — RCCMix-HAR++ ===")
print(df)
# The artifact list is emitted as one multi-line print; the text is unchanged.
print(
    "\n".join(
        [
            "\nArtifacts:",
            "- logs/power_idle_trace_rccmixharpp.csv",
            "- logs/power_trace_rccmixharpp_inference_sequence_unit_run*.csv",
            "- logs/energy_rccmixharpp_inference_sequence_unit.json",
            "- logs/energy_summary_rccmixharpp.csv",
        ]
    )
)

Mon Nov 17 17:49:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                