In [1]:
# ================================================================
# DeepConvContext (official architecture-aligned · from-scratch) × Scheme 1:
# NVML GPU inference energy (mJ/inf) on Google Colab — single cell, no prereqs
# ================================================================
# 0) System check & deps
!nvidia-smi
!pip -q install pynvml

import os, json, time, math, pathlib, warnings, multiprocessing as mp
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ---------------- NVML sampling + trapezoidal energy integration ----------------
import pynvml

def _nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Background-process entry point: stream GPU power readings into a queue.

    Pushes ``(time.perf_counter(), power_mW)`` tuples onto ``q`` until
    ``stop_event`` is set.

    Args:
        stop_event: event object; sampling stops once it is set.
        q: queue receiving the ``(timestamp_s, power_mW)`` tuples.
        dev_index: NVML device index to query.
        interval: pause between consecutive readings, in seconds. The actual
            sampling period is this pause plus the time the NVML call takes.

    NOTE(review): the timestamps are compared against ``time.perf_counter()``
    values taken in the parent process; this assumes both processes share the
    same clock origin — confirm for the target platform.
    """
    import time, pynvml
    pynvml.nvmlInit()
    try:
        # The handle lookup is inside the try so that a failure here (e.g. a
        # bad device index) still reaches nvmlShutdown() below.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            q.put((time.perf_counter(), pynvml.nvmlDeviceGetPowerUsage(h)))
            # Wait on the event instead of sleeping so a stop request is
            # noticed immediately rather than after a full interval.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def _integrate_mJ_between(samples, t0, t1):
    """Trapezoidal integrate power (mW) over [t0, t1] → mJ."""
    if not samples: return 0.0
    samples = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t,_ in samples], dtype=np.float64)
    ps = np.array([p for _,p in samples], dtype=np.float64)
    mask = (ts >= t0) & (ts <= t1)
    ts_w, ps_w = ts[mask], ps[mask]
    if ts_w.size == 0 or ts_w[0] > t0:
        p0 = np.interp(t0, ts, ps); ts_w = np.insert(ts_w, 0, t0); ps_w = np.insert(ps_w, 0, p0)
    if ts_w[-1] < t1:
        p1 = np.interp(t1, ts, ps); ts_w = np.append(ts_w, t1); ps_w = np.append(ps_w, p1)
    return float(np.trapz(ps_w, ts_w))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Measure average idle GPU power (mW) over roughly `duration_s` seconds.

    Starts the NVML sampler in a child process, waits `duration_s`, stops it
    and averages the collected trace (trapezoidal energy / elapsed time).

    Args:
        duration_s: how long to let the sampler run, in seconds.
        dev_index: NVML device index forwarded to the sampler.
        interval: sampler pause between readings, in seconds.
        save_csv: optional path; when truthy the raw trace is written there
            with columns ``t_abs_s`` and ``power_mW``.

    Returns:
        ``(P_idle_mW, samples)`` where ``samples`` is the time-sorted list of
        ``(timestamp_s, power_mW)`` tuples.

    Raises:
        RuntimeError: if the sampler produced no readings.
    """
    from queue import Empty

    q = mp.Queue(); stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval)); p.start()
    samples = []

    def _drain():
        # Pull everything currently retrievable; a short timeout replaces
        # q.empty(), which is not a reliable signal for multiprocessing queues.
        while True:
            try:
                samples.append(q.get(timeout=0.05))
            except Empty:
                return

    try:
        time.sleep(duration_s)
    finally:
        stop.set()
        # Drain BEFORE joining: a child that still has queued items buffered
        # may not exit until they are consumed, so join-then-drain can hang.
        while p.is_alive():
            _drain()
        p.join()
        _drain()  # pick up anything flushed just before the child exited

    if not samples: raise RuntimeError("No NVML samples during idle.")
    samples.sort(key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    if t1 > t0:
        E_idle_mJ = _integrate_mJ_between(samples, t0, t1)
        P_idle_mW = E_idle_mJ / max(1e-9, t1 - t0)
    else:
        # All readings share one timestamp (e.g. a single sample): the
        # integral is zero, so fall back to the plain mean of the readings
        # instead of reporting 0 mW.
        P_idle_mW = float(np.mean([pw for _, pw in samples]))
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def measure_mJ_per_inference(run_once, n_items_per_call, repeats, P_idle_mW,
                             dev_index=0, interval=0.02, save_csv=None):
    """Concurrent NVML sampling + integration + idle subtraction → mJ/inf.

    Runs ``run_once()`` `repeats` times while a child process samples GPU
    power, integrates the trace over the timed window, subtracts the idle
    energy ``P_idle_mW * T_total_s`` and divides by the number of inferences.

    Args:
        run_once: zero-argument callable performing one workload pass.
        n_items_per_call: inferences performed by a single ``run_once()``.
        repeats: number of ``run_once()`` calls inside the timed window.
        P_idle_mW: idle power baseline in mW.
        dev_index: NVML device index forwarded to the sampler.
        interval: sampler pause between readings, in seconds.
        save_csv: optional path; when truthy the raw trace is written there.

    Returns:
        dict with keys ``mJ_per_inf`` (clamped at 0), ``ms_per_inf``,
        ``throughput_inf_per_s``, ``n_inferences``, ``repeats``,
        ``T_total_s``, ``E_total_mJ``, ``E_idle_mJ``, ``P_idle_mW``,
        ``t0_abs`` and ``t1_abs``.

    Raises:
        RuntimeError: if the sampler produced no readings.
        Any exception raised by ``run_once`` propagates after the sampler
        process has been stopped.
    """
    from queue import Empty

    q = mp.Queue(); stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval)); p.start()
    samples = []

    def _drain():
        # Short-timeout get() instead of q.empty(), which is unreliable for
        # multiprocessing queues.
        while True:
            try:
                samples.append(q.get(timeout=0.05))
            except Empty:
                return

    try:
        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        t1 = time.perf_counter()
    finally:
        # Always stop the sampler, even when run_once() raised, so no child
        # process is left running.
        stop.set()
        # Drain BEFORE joining: a child with buffered queue items may not
        # exit until they are consumed, so join-then-drain can hang.
        while p.is_alive():
            _drain()
        p.join()
        _drain()  # pick up anything flushed just before the child exited

    if not samples: raise RuntimeError("No NVML samples during active measurement.")
    E_total_mJ = _integrate_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, t1 - t0)
    E_idle_mJ  = P_idle_mW * T_total_s
    n_inf      = max(1, repeats * n_items_per_call)
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s","power_mW"]).to_csv(save_csv, index=False)
    return {
        # Clamp at zero: noise can push the active energy below the idle estimate.
        "mJ_per_inf": max(0.0, (E_total_mJ - E_idle_mJ) / n_inf),
        "ms_per_inf": (T_total_s / n_inf) * 1e3,
        "throughput_inf_per_s": n_inf / T_total_s,
        "n_inferences": n_inf,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1
    }

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that one measurement window lasts about `target_s`.

    Calls ``run_once()`` twice: the first call is untimed, the second is timed
    and used as the per-call duration (floored at 0.1 ms). The resulting
    count is limited to the range ``[min_rep, max_rep]``.
    """
    run_once()  # untimed first pass

    started = time.perf_counter()
    run_once()
    elapsed = time.perf_counter() - started

    # Floor the per-call time so a near-instant call cannot blow up the ratio.
    per_call_s = elapsed if elapsed > 1e-4 else 1e-4
    needed = int(np.ceil(target_s / per_call_s))
    return int(min(max(needed, min_rep), max_rep))

def measure_with_bootstrap(name, run_once, n_items, repeats, n_runs=5, n_boot=1000,
                           logdir=Path("logs"), idle_mW=None):
    """Repeat the energy measurement and bootstrap a 95% CI of the mean mJ/inf.

    Args:
        name: label used in log messages and in the output file names.
        run_once: zero-argument callable performing one workload pass.
        n_items: inferences performed by a single ``run_once()``.
        repeats: ``run_once()`` calls per measurement window.
        n_runs: number of independent measurement windows (must be >= 1).
        n_boot: number of bootstrap resamples of the per-run means.
        logdir: directory for the per-run power traces and the JSON summary;
            created if missing.
        idle_mW: idle power baseline in mW. When omitted, the module-level
            ``P_idle_mW`` is used, which must already have been measured.

    Returns:
        dict with ``model``, ``mean_mJ_per_inf``, ``ci95_low``, ``ci95_high``
        and ``runs`` (the per-run result dicts). The same dict is written to
        ``energy_<name>.json`` inside `logdir`.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
        NameError: if ``idle_mW`` is omitted and no module-level
            ``P_idle_mW`` exists yet.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if idle_mW is None:
        # Backward-compatible fallback for callers that rely on the global.
        idle_mW = P_idle_mW
    logdir = Path(logdir)  # accept plain string paths as well
    logdir.mkdir(exist_ok=True, parents=True)

    runs = []
    for i in range(n_runs):
        print(f"[Measure] {name} run {i+1}/{n_runs} ...")
        r = measure_mJ_per_inference(
            run_once, n_items, repeats, idle_mW,
            dev_index=0, interval=0.02,
            save_csv=str(logdir/f"power_trace_{name}_run{i+1}.csv")
        )
        runs.append(r)

    mJs = np.array([r["mJ_per_inf"] for r in runs], dtype=np.float64)
    # Fixed seed keeps the reported interval reproducible for identical runs.
    rng = np.random.default_rng(123)
    boots = [float(np.mean(mJs[rng.integers(0, len(mJs), size=len(mJs))])) for _ in range(n_boot)]
    ci_low, ci_high = np.percentile(boots, [2.5, 97.5])
    summary = {"model": name, "mean_mJ_per_inf": float(mJs.mean()),
               "ci95_low": float(ci_low), "ci95_high": float(ci_high), "runs": runs}
    with open(logdir/f"energy_{name}.json", "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[Result] {name}: {summary['mean_mJ_per_inf']:.3f} mJ/inf (95% CI [{summary['ci95_low']:.3f}, {summary['ci95_high']:.3f}])")
    return summary

# ---------------- Ensure minimal configs/features exist (does not change model structure/params) ----------------
# Each file below is created only when it is missing, so existing configs and
# features under /content are never overwritten.
BASE = Path('/content')
CFG_DIR = BASE / 'configs'
CFG_DIR.mkdir(parents=True, exist_ok=True)
classes_json = CFG_DIR / 'classes.json'
splits_json = CFG_DIR / 'splits.json'
if not classes_json.exists():
    # Placeholder label set: eight generically named classes.
    classes_json.write_text(json.dumps(
        {"num_classes": 8, "standard_classes": [f"class{i}" for i in range(8)]},
        indent=2,
    ))
if not splits_json.exists():
    # Placeholder split: a single fold with one test subject.
    splits_json.write_text(json.dumps(
        {"folds": [{"fold": 0, "test_subject": "S01"}]},
        indent=2,
    ))

features_dir = BASE / 'features'
features_dir.mkdir(parents=True, exist_ok=True)
npz_path = features_dir / 'windows_normalized_fold0.npz'
if not npz_path.exists():
    # Synthetic stand-in data: standard-normal signals with random labels.
    N_train, N_test, T, C, K = 2000, 800, 150, 6, 8
    rng = np.random.default_rng(2025)

    def make_axis(N):
        return rng.normal(0, 1, size=(N, T)).astype(np.float32)

    # The six axes are drawn first, in this fixed order, and the labels after
    # them, so the generator is consumed in a fixed sequence.
    out = {
        axis: make_axis(N_train + N_test)
        for axis in ('acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z')
    }
    out['labels'] = np.concatenate(
        [rng.integers(0, K, size=N_train), rng.integers(0, K, size=N_test)]
    ).astype(np.int64)
    out['subjects'] = np.array([b'S01'] * (N_train + N_test))
    out['splits'] = np.array(['train'] * N_train + ['test'] * N_test)
    np.savez(npz_path, **out)

# ---------------- Step 10 model (official architecture-aligned · from-scratch) — structure/params unchanged ----------------
import os
# Allocator hint, set ahead of the `import torch` further down.
# NOTE(review): presumably PyTorch reads this when its CUDA caching allocator
# starts up, so it has to stay before any CUDA work — confirm against the
# PyTorch CUDA memory-management notes.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'  # GPU-memory friendly

import json
import math
import numpy as np
from pathlib import Path
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 0) Basic configuration
# Turn on the cuDNN benchmark flag.
torch.backends.cudnn.benchmark = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("\n\nStep 10: DeepConvContext (official architecture-aligned · from-scratch implementation)")
print("=" * 78)
print(f"Device in use: {device}")

SEED = 1


def set_seed(seed=SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices when available)."""
    import random

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(SEED)

# Load the class and split definitions written under /content/configs.
CFG_DIR = Path('/content/configs')
classes_cfg = json.loads((CFG_DIR / 'classes.json').read_text())
splits_cfg = json.loads((CFG_DIR / 'splits.json').read_text())

NUM_CLASSES = int(classes_cfg['num_classes'])
# Fall back to "0", "1", ... when the config carries no class names.
STANDARD_NAMES = classes_cfg.get('standard_classes', list(map(str, range(NUM_CLASSES))))

# Windowing
NUM_CHANNELS = 6
SAMPLES_PER_WIN = 150
WIN_OVERLAP = 0.5
STRIDE_SAMPLES = int(SAMPLES_PER_WIN * (1 - WIN_OVERLAP))  # 75
CONTEXT_LEN_WINS = 100

# Optimisation settings
EPOCHS = 30
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-6
STEP_SIZE = 10
GAMMA = 0.9

# Network shape
DROPOUT_P = 0.5
BIDIRECTIONAL = False
HIDDEN_UNITS = 128
CONV_CHANNELS = 64
KERNEL_SIZE = 9
PADDING = KERNEL_SIZE // 2

# Batch sizes
TRAIN_BATCH = 16
EVAL_BATCH = 100

print(f"\nHyper-parameters: num_classes={NUM_CLASSES}, num_channels={NUM_CHANNELS}, samples_per_window={SAMPLES_PER_WIN}")
print(f"Sequence length (windows)={CONTEXT_LEN_WINS}, train_batch={TRAIN_BATCH}, epochs={EPOCHS}, lr={LEARNING_RATE}, wd={WEIGHT_DECAY}, step@{STEP_SIZE}×{GAMMA}\n")

# 1) Model (official architecture)
class DeepConvLSTM_Intra(nn.Module):
    """Per-window encoder: four Conv1d+ReLU stages followed by a one-layer LSTM.

    The convolutions use ``padding = kernel_size // 2``. The forward pass
    returns the LSTM's final hidden state for each input window.
    """

    def __init__(self, in_ch=6, conv_ch=64, kernel_size=9, hidden=128):
        super().__init__()
        conv_opts = dict(kernel_size=kernel_size, padding=kernel_size // 2)
        # Layers are created in the same order as before so that parameter
        # initialisation under a fixed seed and state_dict keys stay the same.
        self.conv1 = nn.Conv1d(in_ch, conv_ch, **conv_opts)
        self.conv2 = nn.Conv1d(conv_ch, conv_ch, **conv_opts)
        self.conv3 = nn.Conv1d(conv_ch, conv_ch, **conv_opts)
        self.conv4 = nn.Conv1d(conv_ch, conv_ch, **conv_opts)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(input_size=conv_ch, hidden_size=hidden, num_layers=1, batch_first=True)

    def forward(self, x_win):  # (N, C, T)
        feats = x_win
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            feats = self.relu(conv(feats))
        seq = feats.permute(0, 2, 1)  # (N, T, C) — the layout a batch_first LSTM expects
        _, (h_n, _) = self.lstm(seq)
        return h_n[-1]  # (N, hidden)

class DeepConvContext(nn.Module):
    """Sequence-of-windows classifier: per-window encoder → inter-window LSTM → linear head.

    Args:
        num_channels: input channels per window.
        num_classes: size of the output logit vector.
        conv_channels: channel width of the per-window convolutions.
        hidden_intra: hidden size of the per-window encoder's LSTM.
        hidden_inter: hidden size of the inter-window LSTM.
        dropout: dropout probability applied before the linear head.
        bidirectional: whether the inter-window LSTM is bidirectional.
        kernel_size: convolution kernel size of the per-window encoder. When
            omitted, the module-level ``KERNEL_SIZE`` is used, which keeps
            existing call sites unchanged.
    """

    def __init__(self,
                 num_channels=6,
                 num_classes=8,
                 conv_channels=64,
                 hidden_intra=128,
                 hidden_inter=128,
                 dropout=0.5,
                 bidirectional=False,
                 kernel_size=None):
        super().__init__()
        if kernel_size is None:
            # Backward-compatible default: fall back to the notebook constant.
            kernel_size = KERNEL_SIZE
        self.intra = DeepConvLSTM_Intra(num_channels, conv_channels, kernel_size, hidden_intra)
        self.inter = nn.LSTM(input_size=hidden_intra,
                             hidden_size=hidden_inter,
                             num_layers=1,
                             batch_first=True,
                             bidirectional=bidirectional)
        # A bidirectional LSTM concatenates both directions, doubling the width.
        inter_out = hidden_inter * (2 if bidirectional else 1)
        self.dropout = nn.Dropout(dropout)
        self.fc      = nn.Linear(inter_out, num_classes)

    def forward(self, x):               # x: (B, S, C, T)
        """Return per-window logits of shape (B, S, num_classes)."""
        B, S, C, T = x.shape
        # Fold the sequence axis into the batch so every window is encoded
        # independently by the per-window network.
        x = x.reshape(B * S, C, T)
        feats = self.intra(x)           # (B*S, hidden_intra)
        feats = feats.view(B, S, -1)    # (B, S, hidden_intra)
        inter_out, _ = self.inter(feats)  # (B, S, inter_out)
        inter_out = self.dropout(inter_out)
        logits = self.fc(inter_out)     # (B, S, K)
        return logits

# 2) Dataset
class HARSequenceDataset(Dataset):
    """Sliding sequences of consecutive windows, built per subject from an .npz archive.

    The archive must provide the arrays ``splits``, ``labels``, ``subjects``
    and the six channel arrays ``acc_x/y/z`` and ``gyro_x/y/z``. Rows whose
    ``splits`` entry equals `split` are kept, grouped by subject, and cut into
    all stride-1 runs of `sequence_length` consecutive windows. Subjects with
    fewer than `sequence_length` windows contribute nothing.

    Args:
        npz_file: path to the .npz archive.
        split: value of the ``splits`` array selecting the rows to use.
        sequence_length: number of consecutive windows per sequence.

    Each item is ``(x, y)`` with ``x`` of shape (S, C, T) and ``y`` of shape
    (S,) as a long tensor.
    """

    def __init__(self, npz_file: Path, split='train', sequence_length=100):
        channels = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
        # The context manager closes the archive's file handle once the arrays
        # have been read; previously the NpzFile was left open.
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code — only load trusted archives, or
        # switch to allow_pickle=False if no object arrays are stored.
        with np.load(npz_file, allow_pickle=True) as data:
            mask = data['splits'] == split
            labels   = data['labels'][mask]
            subjects = data['subjects'][mask]
            wins = np.stack([data[ch][mask] for ch in channels], axis=1).astype(np.float32)  # (N_wins, C=6, T=150)

        self.sequences, self.seq_labels = [], []
        for subj in np.unique(subjects):
            m = subjects == subj
            w_subj = wins[m]; y_subj = labels[m]
            if len(w_subj) < sequence_length:
                continue
            # Stride-1 sliding window over this subject's windows; sequences
            # never cross a subject boundary.
            for i in range(0, len(w_subj) - sequence_length + 1):
                self.sequences.append(w_subj[i:i + sequence_length])   # (S, C, T)
                self.seq_labels.append(y_subj[i:i + sequence_length])  # (S,)
        self.sequences = np.asarray(self.sequences)
        self.seq_labels = np.asarray(self.seq_labels)

    def __len__(self):
        return len(self.seq_labels)

    def __getitem__(self, idx):
        x = torch.from_numpy(self.sequences[idx])        # (S, C, T)
        y = torch.from_numpy(self.seq_labels[idx]).long()
        return x, y

# Build test loader as in Step 10 (EVAL_BATCH=100, shuffle=False)
test_ds = HARSequenceDataset(npz_path, split='test', sequence_length=CONTEXT_LEN_WINS)
loader_kwargs = dict(
    batch_size=EVAL_BATCH,
    shuffle=False,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=False,
)
test_loader = DataLoader(test_ds, **loader_kwargs)

# Instantiate the model with exact hyperparameters; eval mode for inference
model = DeepConvContext(
    num_channels=NUM_CHANNELS,
    num_classes=NUM_CLASSES,
    conv_channels=CONV_CHANNELS,
    hidden_intra=HIDDEN_UNITS,
    hidden_inter=HIDDEN_UNITS,
    dropout=DROPOUT_P,
    bidirectional=BIDIRECTIONAL,
)
model = model.to(device).eval()

# Optional: load trained weights if available (does not affect energy/FLOPs)
models_dir = BASE / 'models'
models_dir.mkdir(parents=True, exist_ok=True)
wpath = models_dir / 'deepconvcontext_fold0.pth'
if wpath.exists():
    # Best-effort: a failed load is reported and the model keeps its
    # freshly initialised weights.
    try:
        state = torch.load(wpath, map_location=device)
        model.load_state_dict(state)
        print(f"[Info] Loaded weights: {wpath.name}")
    except Exception as e:
        print(f"[Warn] Failed to load weights: {e}")
# Build run_once: one complete forward over the test loader
def make_runner(model: nn.Module, loader: DataLoader):
    """Build the benchmark workload for `model` over `loader`.

    Returns:
        ``(run_once, n_items)`` where ``run_once()`` pushes every batch of
        `loader` through `model` on the module-level `device` with gradients
        disabled, then waits for outstanding CUDA work when CUDA is available;
        ``n_items`` is ``len(loader.dataset)``.
    """
    def run_once():
        with torch.no_grad():
            for batch, _labels in loader:
                model(batch.to(device, non_blocking=True))  # (B, S, C, T)
            # Block until queued kernels finish so that timing around this
            # call covers the actual GPU work.
            if torch.cuda.is_available():
                torch.cuda.synchronize()

    return run_once, len(loader.dataset)

run_once, N_items = make_runner(model, test_loader)

# Measure idle power
logs_dir = BASE / 'logs'
logs_dir.mkdir(parents=True, exist_ok=True)
print("\n[Info] Sampling idle power for 20 s ...")
idle_trace_csv = str(logs_dir / 'power_idle_trace_dcc_fromscratch.csv')
P_idle_mW, _idle = sample_idle_power_mW(
    duration_s=20.0, dev_index=0, interval=0.02, save_csv=idle_trace_csv
)
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# Warmup & calibrate repeats (target ≥ 8 s per measurement)
print("\n[Warmup] warmup ...")
for _ in range(2):
    run_once()
repeats = calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000)
print(f"[Info] repeats = {repeats} (items per call = {N_items})")

# NVML measurement + bootstrap CI
summary = measure_with_bootstrap(
    name="deepconvcontext_fromscratch_inference",
    run_once=run_once,
    n_items=N_items,
    repeats=repeats,
    n_runs=5,
    n_boot=1000,
    logdir=logs_dir,
)

# Save a compact CSV summary
summary_row = {
    "model": "DeepConvContext (official-structure, from-scratch)",
    "mJ_per_inf_mean": summary["mean_mJ_per_inf"],
    "ci95_low": summary["ci95_low"],
    "ci95_high": summary["ci95_high"],
    "items_per_call": N_items,
    "repeats": repeats,
    "idle_mW": P_idle_mW,
}
df = pd.DataFrame([summary_row])
df.to_csv(logs_dir / "energy_summary_deepconvcontext_fromscratch.csv", index=False)
print("\n=== Done: GPU inference energy (Scheme 1) ===")
print(df)
print("\nArtifacts:")
for artifact in (
    "logs/power_idle_trace_dcc_fromscratch.csv",
    "logs/power_trace_deepconvcontext_fromscratch_inference_run*.csv",
    "logs/energy_deepconvcontext_fromscratch_inference.json",
    "logs/energy_summary_deepconvcontext_fromscratch.csv",
):
    print(f"- {artifact}")

Mon Nov 17 16:59:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   34C    P0             55W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                