In [2]:
# ================================================================
# DeepConvContext (official architecture-aligned, A100 v2.1) × Scheme 1:
# NVML GPU inference energy (mJ per 3-second window) on Google Colab — single cell, no prereqs
# ================================================================
# 0) System check & deps
!nvidia-smi
!pip -q install pynvml

import os, json, time, math, pathlib, warnings, multiprocessing as mp
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ---------------- NVML sampling + trapezoidal energy integration ----------------
import pynvml

def _nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Sampler loop intended to run as a separate process.

    Until `stop_event` is set, reads the power usage of GPU `dev_index`
    through NVML and pushes one `(perf_counter_timestamp, power_mW)` tuple
    onto `q`, then sleeps `interval` seconds before the next reading.
    NVML is shut down when the loop ends, whether normally or by exception.
    """
    # Imported here so the function is self-contained when it is the target
    # of a child process.
    import time
    import pynvml

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
    try:
        while True:
            if stop_event.is_set():
                break
            # Timestamp is taken first, immediately before the power query.
            stamp = time.perf_counter()
            power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
            q.put((stamp, power_mw))
            time.sleep(interval)
    finally:
        pynvml.nvmlShutdown()

def _integrate_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] → mJ."""
    if not samples: return 0.0
    samples = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t,_ in samples], dtype=np.float64)
    ps = np.array([p for _,p in samples], dtype=np.float64)
    mask = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[mask], ps[mask]
    if ts_w.size == 0 or ts_w[0] > t0:
        p0 = np.interp(t0, ts, ps); ts_w = np.insert(ts_w, 0, t0); ps_w = np.insert(ps_w, 0, p0)
    if ts_w[-1] < t1:
        p1 = np.interp(t1, ts, ps); ts_w = np.append(ts_w, t1); ps_w = np.append(ps_w, p1)
    return float(np.trapz(ps_w, ts_w))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Measure the average GPU power (mW) over roughly `duration_s` seconds.

    A sampler process is started, the caller sleeps for `duration_s`, and the
    collected `(timestamp, power_mW)` samples are integrated and divided by
    the covered time span.

    Args:
        duration_s: how long to sample, in seconds.
        dev_index: NVML device index passed to the sampler.
        interval: sleep between consecutive samples, in seconds.
        save_csv: optional path; when set, the raw trace is written there
            with columns `t_abs_s`, `power_mW`.

    Returns:
        `(P_idle_mW, samples)`: the mean power and the time-sorted samples.

    Raises:
        RuntimeError: if the sampler produced no samples.
    """
    from queue import Empty

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        # Always stop the sampler, even if the sleep is interrupted, so no
        # orphaned process keeps polling NVML.
        stop.set()
        # Drain BEFORE join: a child that still has buffered queue items
        # cannot exit until they are consumed, so join-then-drain can
        # deadlock once the trace outgrows the pipe buffer.
        while p.is_alive():
            try:
                samples.append(q.get(timeout=0.1))
            except Empty:
                pass
        while True:
            try:
                samples.append(q.get_nowait())
            except Empty:
                break
        p.join()

    if not samples:
        raise RuntimeError("No NVML samples during idle.")

    samples.sort(key=lambda s: s[0])
    t0, t1 = samples[0][0], samples[-1][0]
    if t1 > t0:
        E_idle_mJ = _integrate_mJ_between(samples, t0, t1)
        P_idle_mW = E_idle_mJ / (t1 - t0)
    else:
        # A single sample (or identical timestamps) spans no time; fall back
        # to the plain mean instead of reporting 0 mW.
        P_idle_mW = float(np.mean([pw for _, pw in samples]))

    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def measure_mJ_per_window(run_once, n_windows_per_call, repeats, P_idle_mW,
                          dev_index=0, interval=0.02, save_csv=None):
    """Run the workload under concurrent NVML sampling and report per-window cost.

    The sampler process records `(timestamp, power_mW)` while `run_once()` is
    called `repeats` times. Power is integrated over the timed interval, the
    idle contribution `P_idle_mW * T` is subtracted, and the remainder is
    divided by the number of windows processed.

    Args:
        run_once: zero-argument callable executing one workload pass.
        n_windows_per_call: number of windows one `run_once()` call processes.
        repeats: how many times `run_once()` is invoked inside the timed span.
        P_idle_mW: idle power in mW used for the subtraction.
        dev_index: NVML device index passed to the sampler.
        interval: sleep between consecutive samples, in seconds.
        save_csv: optional path for the raw power trace
            (columns `t_abs_s`, `power_mW`).

    Returns:
        dict with `mJ_per_window` (clamped at 0), `ms_per_window`,
        `throughput_windows_per_s`, `n_windows`, `repeats`, `T_total_s`,
        `E_total_mJ`, `E_idle_mJ`, `P_idle_mW`, `t0_abs`, `t1_abs`.

    Raises:
        RuntimeError: if the sampler never delivers a reading.

    NOTE(review): sampler timestamps and t0/t1 both come from
    `time.perf_counter()` in different processes; this assumes the clock is
    shared system-wide. Confirm on the target platform.
    """
    from queue import Empty

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    try:
        # Wait for the first reading before starting the clock; otherwise
        # the sampler's start-up time falls inside [t0, t1] with no data and
        # that stretch has to be extrapolated.
        deadline = time.perf_counter() + 30.0
        while not samples:
            try:
                samples.append(q.get(timeout=0.1))
            except Empty:
                if not p.is_alive() or time.perf_counter() > deadline:
                    raise RuntimeError("No NVML samples during active measurement.") from None

        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        t1 = time.perf_counter()
    finally:
        # Stop the sampler even when run_once() raises, so it is not leaked.
        stop.set()
        # Drain BEFORE join: joining a child that still holds buffered queue
        # items can deadlock.
        while p.is_alive():
            try:
                samples.append(q.get(timeout=0.1))
            except Empty:
                pass
        while True:
            try:
                samples.append(q.get_nowait())
            except Empty:
                break
        p.join()

    if not samples:
        raise RuntimeError("No NVML samples during active measurement.")

    E_total_mJ = _integrate_mJ_between(samples, t0, t1)
    T_total_s = max(1e-9, t1 - t0)
    E_idle_mJ = P_idle_mW * T_total_s
    n_windows = max(1, repeats * n_windows_per_call)
    # Clamp at zero: measurement noise can push active energy below idle.
    mJ_per_window = max(0.0, (E_total_mJ - E_idle_mJ) / n_windows)
    # Amortized wall time per window (total time / windows), not latency.
    ms_per_window = (T_total_s / n_windows) * 1e3

    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)
    return {
        "mJ_per_window": mJ_per_window,
        "ms_per_window": ms_per_window,
        "throughput_windows_per_s": n_windows / T_total_s,
        "n_windows": n_windows,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1
    }

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that `repeats * run_once()` lasts about `target_s` s.

    `run_once` is called twice: once untimed, once timed. The timed duration
    is floored at 1e-4 s, and the result is clipped to `[min_rep, max_rep]`.
    """
    run_once()  # untimed pass

    started = time.perf_counter()
    run_once()
    elapsed = time.perf_counter() - started

    # Floor the duration so a near-instant call cannot blow up the ratio.
    per_call_s = elapsed if elapsed > 1e-4 else 1e-4
    wanted = int(np.ceil(target_s / per_call_s))
    return int(np.clip(wanted, min_rep, max_rep))

def measure_with_bootstrap(name, run_once, n_windows, repeats, n_runs=5, n_boot=1000,
                           logdir=Path("logs"), idle_power_mW=None, dev_index=0, interval=0.02):
    """Repeat the energy measurement and bootstrap a 95% CI of the per-window mean.

    Runs `measure_mJ_per_window` `n_runs` times, then resamples the run-level
    `mJ_per_window` values with replacement `n_boot` times (fixed seed 123)
    and takes the 2.5/97.5 percentiles of the resampled means.

    Args:
        name: label used in log messages and output file names.
        run_once: zero-argument callable executing one workload pass.
        n_windows: number of windows processed by one `run_once()` call.
        repeats: `run_once()` invocations per measurement run.
        n_runs: number of independent measurement runs (>= 1).
        n_boot: number of bootstrap resamples (>= 1).
        logdir: directory for `power_trace_{name}_run{i}.csv` and
            `energy_{name}.json`; created if missing.
        idle_power_mW: idle power in mW to subtract. When None, the
            module-level `P_idle_mW` is used, which was previously an
            implicit requirement of this function.
        dev_index: NVML device index forwarded to the measurement.
        interval: sampling interval in seconds forwarded to the measurement.

    Returns:
        Summary dict with `model`, `mean_mJ_per_window`, `ci95_low_mJ`,
        `ci95_high_mJ`, `mean_ms_per_window` and the per-run dicts in `runs`.

    Raises:
        ValueError: if `n_runs` or `n_boot` is smaller than 1.
        RuntimeError: if no idle power is given and `P_idle_mW` is undefined.
    """
    # Validate before touching the filesystem or the GPU.
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if n_boot < 1:
        raise ValueError("n_boot must be >= 1")
    if idle_power_mW is None:
        try:
            idle_power_mW = P_idle_mW  # backward-compatible global fallback
        except NameError as err:
            raise RuntimeError(
                "Idle power is unknown: pass idle_power_mW=... or define the "
                "module-level P_idle_mW (see sample_idle_power_mW) first."
            ) from err

    logdir = Path(logdir)
    logdir.mkdir(exist_ok=True, parents=True)

    runs = []
    for i in range(n_runs):
        print(f"[Measure] {name} run {i+1}/{n_runs} ...")
        r = measure_mJ_per_window(
            run_once, n_windows, repeats, idle_power_mW,
            dev_index=dev_index, interval=interval,
            save_csv=str(logdir/f"power_trace_{name}_run{i+1}.csv")
        )
        runs.append(r)

    mJ = np.array([r["mJ_per_window"] for r in runs], dtype=np.float64)
    ms = np.array([r["ms_per_window"] for r in runs], dtype=np.float64)

    # Fixed seed keeps the reported interval reproducible for identical runs.
    rng = np.random.default_rng(123)
    boots_mJ = [float(np.mean(mJ[rng.integers(0, len(mJ), size=len(mJ))])) for _ in range(n_boot)]
    ci_low_mJ, ci_high_mJ = np.percentile(boots_mJ, [2.5, 97.5])

    summary = {
        "model": name,
        "mean_mJ_per_window": float(mJ.mean()),
        "ci95_low_mJ": float(ci_low_mJ),
        "ci95_high_mJ": float(ci_high_mJ),
        "mean_ms_per_window": float(ms.mean()),
        "runs": runs
    }
    with open(logdir/f"energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_window']:.3f} mJ per window "
          f"(95% CI [{summary['ci95_low_mJ']:.3f}, {summary['ci95_high_mJ']:.3f}]); "
          f"{summary['mean_ms_per_window']:.3f} ms per window")
    return summary

# ---------------- Step 10 model (official architecture-aligned, A100 v2.1) — structure/params unchanged ----------------
# Environment variables that MUST be set before importing torch
# NOTE(review): confirm that TORCHINDUCTOR_DISABLE_CUDA_GRAPHS is actually read by
# the installed torch/Inductor build; an unrecognized variable would be silently ignored.
os.environ["TORCHINDUCTOR_DISABLE_CUDA_GRAPHS"] = "1"
# Configuration string for the PyTorch CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# setdefault: the CPU thread caps below apply only when the variable is not
# already present in the environment, so a user-provided value wins.
os.environ.setdefault("OMP_NUM_THREADS", "16")
os.environ.setdefault("MKL_NUM_THREADS", "16")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "16")
os.environ.setdefault("NUMEXPR_MAX_THREADS", "16")

# Targeted filter for the "CUDA Graph is empty" message (regex on the warning text).
# All warnings are already ignored globally near the top of this cell, so this
# filter only matters if that global filter is removed.
import warnings as _warnings; _warnings.filterwarnings("ignore", message=r".*CUDA Graph is empty.*")

import json as _json_shadow
import numpy as _np_shadow
from pathlib import Path as _Path_shadow
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.optim as optim
from torch import amp
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Global A100 acceleration switches (unaltered)
# TF32 is permitted for both matmul and cuDNN kernels, so float32 results may
# differ slightly from strict-FP32 runs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# cuDNN benchmark mode is on; the warmup passes later in this cell run before
# any timed measurement.
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('high')

print("\n\nStep 10: DeepConvContext (official architecture-aligned, A100 80GB high-throughput v2.1)")
print("=" * 78)
# Falls back to CPU when CUDA is unavailable. The NVML energy measurement
# further down still targets GPU index 0 regardless of this choice.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Configs (create minimal defaults if missing; does not change model structure/params)
# Existing files are never overwritten: defaults are written only when the
# file is absent, so a real project config takes precedence.
CFG_DIR = Path('/content/configs'); CFG_DIR.mkdir(parents=True, exist_ok=True)
classes_json = CFG_DIR / 'classes.json'
splits_json  = CFG_DIR / 'splits.json'
if not classes_json.exists():
    # Default: 8 placeholder classes named class0..class7.
    classes_json.write_text(json.dumps({"num_classes": 8, "standard_classes": [f"class{i}" for i in range(8)]}, indent=2))
if not splits_json.exists():
    # Default: a single fold whose test subject is "S01".
    splits_json.write_text(json.dumps({"folds": [{"fold": 0, "test_subject": "S01"}]}, indent=2))

# Same two paths as classes_json / splits_json above.
with open(CFG_DIR / 'classes.json', 'r') as f:
    classes_cfg = json.load(f)
with open(CFG_DIR / 'splits.json', 'r') as f:
    splits_cfg = json.load(f)

# 'num_classes' is required (KeyError if absent); class names fall back to
# the stringified indices when 'standard_classes' is missing.
NUM_CLASSES    = int(classes_cfg['num_classes'])
STANDARD_NAMES = classes_cfg.get('standard_classes', [str(i) for i in range(NUM_CLASSES)])

# Windowing & sequence settings (unaltered)
NUM_CHANNELS      = 6
SAMPLES_PER_WIN   = 150  # 3 s @ 50 Hz
STRIDE_SAMPLES    = 75
CONTEXT_LEN_WINS  = 100  # windows per input sequence fed to the model
WINDOW_SECONDS    = 3.0  # reporting only

# Hyperparameters (unaltered)
# NOTE(review): EPOCHS, BASE_LR, WEIGHT_DECAY, TRAIN_BATCH and LEARNING_RATE are
# not referenced by the inference-only code visible in this cell; presumably
# kept for parity with the training step.
EPOCHS        = 30
BASE_LR       = 1e-4
WEIGHT_DECAY  = 1e-6
DROPOUT_P     = 0.5
BIDIRECTIONAL = True

USE_AMP     = True
AMP_DTYPE   = torch.float16
TRAIN_BATCH = 256
EVAL_BATCH  = 256
# Learning rate scaled linearly with batch size relative to a batch of 100.
LEARNING_RATE = BASE_LR * (TRAIN_BATCH / 100.0)

# DataLoader sizing: between 8 and 32 workers, half of the CPU count in between.
CPU = os.cpu_count() or 16
WORKERS  = min(32, max(8, CPU // 2))
PREFETCH = 8
PIN_DEV  = "cuda"

# The banner prints a literal 150 for samples_per_window rather than SAMPLES_PER_WIN.
print(f"\nConfig: num_classes={NUM_CLASSES}, channels={NUM_CHANNELS}, samples_per_window=150, sequence_length={CONTEXT_LEN_WINS}")
print(f"batch={EVAL_BATCH}, lr(base)={BASE_LR}, wd={WEIGHT_DECAY}, AMP(FP16)={USE_AMP}, TF32=on\n")

# Official architecture (unaltered)
class DeepConvLSTM_Intra(nn.Module):
    """Per-window encoder: four Conv1d+ReLU layers followed by a one-layer LSTM.

    `forward` takes a batch of windows shaped (N, in_ch, T) and returns the
    LSTM's final hidden state, shaped (N, lstm_units).
    """

    def __init__(self, in_ch=6, conv_ch=64, kernel_size=9, lstm_units=128):
        super().__init__()
        # kernel_size // 2 padding keeps the time length unchanged for odd kernels.
        same_pad = kernel_size // 2
        self.conv1 = nn.Conv1d(in_ch, conv_ch, kernel_size=kernel_size, padding=same_pad)
        self.conv2 = nn.Conv1d(conv_ch, conv_ch, kernel_size=kernel_size, padding=same_pad)
        self.conv3 = nn.Conv1d(conv_ch, conv_ch, kernel_size=kernel_size, padding=same_pad)
        self.conv4 = nn.Conv1d(conv_ch, conv_ch, kernel_size=kernel_size, padding=same_pad)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(conv_ch, lstm_units, num_layers=1, batch_first=True)

    def forward(self, x_win):
        feats = x_win
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(conv(feats))
        # (N, conv_ch, T) -> (N, T, conv_ch): the LSTM is batch_first.
        as_sequence = feats.permute(0, 2, 1)
        _, (h_final, _) = self.lstm(as_sequence)
        # Hidden state of the last (only) layer.
        return h_final[-1]

class DeepConvContext(nn.Module):
    """Sequence model: per-window encoder -> linear projection -> inter-window LSTM -> classifier.

    `forward` expects input shaped (B, S, C, T) and returns per-window logits
    shaped (B, S, num_classes).
    """

    def __init__(self, num_channels=6, num_classes=8,
                 conv_channels=64, intra_lstm_units=128,
                 inter_lstm_units=128, projection_dim=128,
                 dropout=0.5, bidirectional=True):
        super().__init__()
        self.intra = DeepConvLSTM_Intra(num_channels, conv_channels, 9, intra_lstm_units)
        self.proj = nn.Linear(intra_lstm_units, projection_dim)
        self.inter = nn.LSTM(projection_dim, inter_lstm_units,
                             num_layers=1, batch_first=True, bidirectional=bidirectional)
        # A bidirectional LSTM concatenates both directions, doubling the width.
        n_directions = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(inter_lstm_units * n_directions, num_classes)

    def forward(self, x):
        n_batch, n_seq, n_chan, n_samp = x.shape
        # Fold the sequence axis into the batch so every window is encoded independently.
        windows = x.reshape(n_batch * n_seq, n_chan, n_samp)
        per_window = self.intra(windows).view(n_batch, n_seq, -1)
        context, _ = self.inter(self.proj(per_window))
        return self.fc(self.dropout(context))

# Dataset (unaltered logic)
class HARSequenceDataset(Dataset):
    """Overlapping fixed-length sequences of windows, built per subject.

    The `.npz` file must provide `splits`, `labels`, `subjects` and one array
    per channel (`acc_x`, `acc_y`, `acc_z`, `gyro_x`, `gyro_y`, `gyro_z`).
    Rows whose split equals `split` are kept; for each subject every run of
    `sequence_length` consecutive windows (stride 1) becomes one item.

    Args:
        npz_file: path to the `.npz` archive.
        split: value of the `splits` array to select.
        sequence_length: number of windows per sequence (>= 1).

    Each item is `(windows, labels)`: a float32 tensor (S, C, T) and an
    int64 tensor (S,).

    Raises:
        ValueError: if `sequence_length` is smaller than 1.

    NOTE: all sequences are materialized in memory, so the footprint grows
    with number_of_sequences * sequence_length.
    """

    def __init__(self, npz_file: Path, split='train', sequence_length=100):
        if sequence_length < 1:
            raise ValueError("sequence_length must be >= 1")

        channels = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
        # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code. Only load archives you trust.
        # The context manager closes the archive once the arrays are copied
        # out; previously the file handle was left open.
        with np.load(npz_file, allow_pickle=True) as data:
            mask = data['splits'] == split
            labels = data['labels'][mask]
            subjects = data['subjects'][mask]
            wins = np.stack([data[ch][mask] for ch in channels], axis=1).astype(np.float32)  # (N,C,T)

        sequences, seq_labels = [], []
        for subj in np.unique(subjects):
            own = subjects == subj
            w_subj, y_subj = wins[own], labels[own]
            # Sequences never cross a subject boundary.
            for start in range(len(w_subj) - sequence_length + 1):
                stop = start + sequence_length
                sequences.append(w_subj[start:stop])   # (S,C,T)
                seq_labels.append(y_subj[start:stop])  # (S,)
        self.sequences = np.asarray(sequences)
        self.seq_labels = np.asarray(seq_labels)

    def __len__(self):
        return len(self.seq_labels)

    def __getitem__(self, idx):
        return (torch.from_numpy(self.sequences[idx]),
                torch.from_numpy(self.seq_labels[idx]).long())

# Minimal feature bootstrap if missing (does not change model structure/params)
# When no feature archive exists, a synthetic one is generated so the cell can
# run stand-alone. The data are Gaussian noise with random labels, so only
# timing/energy figures are meaningful with this fallback, not accuracy.
features_dir = Path('/content/features'); features_dir.mkdir(parents=True, exist_ok=True)
npz_path = features_dir / 'windows_normalized_fold0.npz'
if not npz_path.exists():
    # 2000 train + 800 test windows, T=150 samples, C=6 channels, K classes.
    N_train, N_test, T, C, K = 2000, 800, 150, 6, NUM_CLASSES
    rng = np.random.default_rng(2025)  # fixed seed: identical synthetic data each time
    # One (N, T) float32 array of standard-normal values per channel.
    def make_axis(N): return rng.normal(0, 1, size=(N, T)).astype(np.float32)
    out = {
        'acc_x': make_axis(N_train+N_test), 'acc_y': make_axis(N_train+N_test), 'acc_z': make_axis(N_train+N_test),
        'gyro_x': make_axis(N_train+N_test), 'gyro_y': make_axis(N_train+N_test), 'gyro_z': make_axis(N_train+N_test),
        'labels': np.concatenate([rng.integers(0, K, size=N_train), rng.integers(0, K, size=N_test)]).astype(np.int64),
        # All windows belong to a single subject, stored as bytes (b'S01').
        'subjects': np.array([b'S01']*(N_train+N_test)),
        # Train rows come first, then test rows.
        'splits': np.array(['train']*N_train + ['test']*N_test)
    }
    np.savez(npz_path, **out)

# Build test loader (matching Step 10 batch size); fallback if some kwargs unsupported
test_ds  = HARSequenceDataset(npz_path, split='test', sequence_length=CONTEXT_LEN_WINS)
try:
    # Preferred configuration: pinned memory on PIN_DEV, workers kept alive
    # between passes (persistent_workers) and PREFETCH batches queued per worker.
    test_loader  = DataLoader(
        test_ds,
        batch_size=EVAL_BATCH,
        shuffle=False,
        drop_last=False,
        num_workers=WORKERS,
        pin_memory=True,
        pin_memory_device=PIN_DEV,
        persistent_workers=True,
        prefetch_factor=PREFETCH
    )
except TypeError:
    # A TypeError here means this DataLoader version rejects one of the extra
    # keyword arguments; retry with the basic set only.
    test_loader  = DataLoader(
        test_ds,
        batch_size=EVAL_BATCH,
        shuffle=False,
        drop_last=False,
        num_workers=WORKERS,
        pin_memory=True
    )

# Instantiate model with exact hyperparameters; eval mode for inference
model = DeepConvContext(
    num_channels=NUM_CHANNELS, num_classes=NUM_CLASSES,
    conv_channels=64, intra_lstm_units=128,
    inter_lstm_units=128, projection_dim=128,
    dropout=DROPOUT_P, bidirectional=BIDIRECTIONAL
).to(device).eval()

# Optional: load trained weights if present (does not affect energy/FLOPs)
# This happens BEFORE torch.compile on purpose: the compiled wrapper exposes its
# parameters under an "_orig_mod." prefix, so loading a plain checkpoint into
# the wrapper fails on key names and the failure used to be swallowed as a
# warning, leaving the benchmark on random weights.
models_dir = Path('/content/models'); models_dir.mkdir(parents=True, exist_ok=True)
wpath = models_dir / 'deepconvcontext_fold0.pth'
if wpath.exists():
    try:
        # weights_only=True restricts unpickling to tensors and plain containers.
        sd = torch.load(wpath, map_location=device, weights_only=True)
        if isinstance(sd, dict):
            # Accept checkpoints saved from a compiled model as well.
            prefix = "_orig_mod."
            sd = {
                (k[len(prefix):] if isinstance(k, str) and k.startswith(prefix) else k): v
                for k, v in sd.items()
            }
            model.load_state_dict(sd)
            print(f"[Info] Loaded weights: {wpath.name}")
        else:
            print(f"[Warn] Unexpected checkpoint type {type(sd).__name__}; weights not loaded")
    except Exception as e:
        # Best effort: the measurement proceeds with the current weights.
        print(f"[Warn] Failed to load weights: {e}")

# Optional: torch.compile for max-autotune (no change to structure/params)
try:
    model = torch.compile(model, mode="max-autotune", fullgraph=False, dynamic=False)
    print("✓ Enabled torch.compile(max-autotune)")
except Exception as e:
    print(f"⚠ torch.compile unavailable, continuing without: {e}")

# Build run_once: one complete forward over the test loader (AMP dtype as in Step 10)
def make_runner(model: nn.Module, loader: DataLoader):
    """Build the benchmark callable for one full inference pass over `loader`.

    Returns `(run_once, n_windows_per_call)`. `run_once()` iterates the whole
    loader under `torch.no_grad()`, moves each batch to the module-level
    `device`, runs the forward pass under autocast (`AMP_DTYPE`, `USE_AMP`)
    and finally synchronizes CUDA when available. `n_windows_per_call` is the
    dataset length times `CONTEXT_LEN_WINS`; overlapping sequences are counted
    once per sequence they appear in.
    """
    # total windows computed per full pass
    total_windows = len(loader.dataset) * CONTEXT_LEN_WINS

    def run_once():
        with torch.no_grad():
            for batch, _labels in loader:
                batch = batch.to(device, non_blocking=True)  # (B,S,C,T)
                with amp.autocast('cuda', dtype=AMP_DTYPE, enabled=USE_AMP):
                    model(batch)
            # Wait for queued GPU work so callers time the complete pass.
            if torch.cuda.is_available():
                torch.cuda.synchronize()

    return run_once, total_windows

# Bind the benchmark callable and the number of windows one pass processes.
run_once, N_windows_per_call = make_runner(model, test_loader)

# Measure idle power
# The model has already been moved to `device` at this point, so this is the
# baseline with the model resident, not a cold-GPU baseline.
logs_dir = Path('/content/logs'); logs_dir.mkdir(parents=True, exist_ok=True)
print("\n[Info] Sampling idle power for 20 s ...")
P_idle_mW, _idle = sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02,
                                        save_csv=str(logs_dir/'power_idle_trace_dcc_official_v21.csv'))
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# Warmup & calibrate repeats (target ≥ 8 s per measurement)
# Two untimed passes run first; calibrate_repeats then performs one more
# untimed pass and one timed pass to size `repeats`.
print("\n[Warmup] warmup ...")
run_once(); run_once()
repeats = calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats = {repeats} (windows per call = {N_windows_per_call})")

# NVML measurement + bootstrap CI (per-window metrics)
# measure_with_bootstrap reads the module-level P_idle_mW assigned above.
summary = measure_with_bootstrap(
    name="deepconvcontext_official_v21_inference_per_window",
    run_once=run_once,
    n_windows=N_windows_per_call,
    repeats=repeats,
    n_runs=5,
    n_boot=1000,
    logdir=logs_dir
)

# Save a compact CSV summary (mJ/ms per 3-second window)
# ms_per_3s_window_mean is amortized time (total wall time / windows
# processed), not the latency of a single window.
df = pd.DataFrame([{
    "model": "DeepConvContext (official, A100 v2.1)",
    "window_seconds": WINDOW_SECONDS,                   # 3.0 if 150 samples @ 50 Hz
    "mJ_per_3s_window_mean": summary["mean_mJ_per_window"],
    "ci95_low_mJ": summary["ci95_low_mJ"],
    "ci95_high_mJ": summary["ci95_high_mJ"],
    "ms_per_3s_window_mean": summary["mean_ms_per_window"],
    "windows_per_call": N_windows_per_call,
    "repeats": repeats,
    "idle_mW": P_idle_mW
}])
df.to_csv(logs_dir/"energy_summary_deepconvcontext_official_v21_per_3s_window.csv", index=False)
print("\n=== Done: GPU inference energy — mJ per 3-second window (Scheme 1) ===")
print(df)
# The paths printed below are shown relative to /content; the files are
# written under logs_dir (/content/logs).
print("\nArtifacts:")
print("- logs/power_idle_trace_dcc_official_v21.csv")
print("- logs/power_trace_deepconvcontext_official_v21_inference_per_window_run*.csv")
print("- logs/energy_deepconvcontext_official_v21_inference_per_window.json")
print("- logs/energy_summary_deepconvcontext_official_v21_per_3s_window.csv")

Mon Nov 17 18:30:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   33C    P0             60W /  400W |   17855MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

AUTOTUNE addmm(25600x128, 25600x128, 128x128)
strides: [0, 1], [128, 1], [1, 128]
dtypes: torch.float16, torch.float16, torch.float16
  triton_mm_15 0.0174 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=8
  triton_mm_17 0.0174 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_mm_16 0.0184 ms 94.4% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_mm_14 0.0195 ms 89.5% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8
  bias_addmm 0.0205 ms 85.0% 
  triton_mm_13 0.0205 ms 85.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVE

[Info] repeats = 25 (windows per call = 70100)
[Measure] deepconvcontext_official_v21_inference_per_window run 1/5 ...
[Measure] deepconvcontext_official_v21_inference_per_window run 2/5 ...
[Measure] deepconvcontext_official_v21_inference_per_window run 3/5 ...
[Measure] deepconvcontext_official_v21_inference_per_window run 4/5 ...
[Measure] deepconvcontext_official_v21_inference_per_window run 5/5 ...
[Result] deepconvcontext_official_v21_inference_per_window: 0.561 mJ per window (95% CI [0.553, 0.570]); 0.004 ms per window

=== Done: GPU inference energy — mJ per 3-second window (Scheme 1) ===
                                   model  window_seconds  \
0  DeepConvContext (official, A100 v2.1)             3.0   

   mJ_per_3s_window_mean  ci95_low_mJ  ci95_high_mJ  ms_per_3s_window_mean  \
0               0.561497     0.553002      0.569882               0.004416   

   windows_per_call  repeats       idle_mW  
0             70100       25  60598.627309  

Artifacts:
- logs/power_idl