In [3]:
# ================================================================
# rTsfNet (lightweight in-house version) × Option 1:
# NVML-based GPU inference energy — mJ per window (auto-computed window_seconds)
# ================================================================
# 0) Basic environment and dependencies
!nvidia-smi
!pip -q install pynvml

import os, math, json, time, pathlib, gc, warnings, multiprocessing as mp
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ---------------- NVML sampling & energy integration utilities (per-window reporting) ----------------
import pynvml

def _nvml_sampler(stop_event, q, dev_index=0, interval=0.02):
    """Subprocess entry point: poll GPU power through NVML until asked to stop.

    One ``(t_abs, power_mW)`` tuple is put on ``q`` per poll, where ``t_abs`` is
    ``time.perf_counter()`` read in this process and ``power_mW`` is the value
    returned by ``pynvml.nvmlDeviceGetPowerUsage`` (milliwatts).

    Args:
        stop_event: Event-like object; polling ends once it is set.
        q: Queue-like object that receives the samples via ``put``.
        dev_index: NVML index of the device to monitor.
        interval: Pause between two polls, in seconds.
    """
    # NOTE(review): callers compare these timestamps with perf_counter() values
    # taken in the parent process; that relies on the clock being system-wide
    # (the case for the monotonic clock on Linux) — confirm on other platforms.
    import time, pynvml
    pynvml.nvmlInit()
    try:
        # The handle lookup lives inside the try so that NVML is shut down
        # even when the device index is invalid.
        h = pynvml.nvmlDeviceGetHandleByIndex(dev_index)
        while not stop_event.is_set():
            q.put((time.perf_counter(), pynvml.nvmlDeviceGetPowerUsage(h)))
            # wait() acts as the inter-sample pause but returns immediately
            # once the stop event is set, so shutdown is not delayed.
            stop_event.wait(interval)
    finally:
        pynvml.nvmlShutdown()

def _integrate_mJ_between(samples, t0, t1):
    """Trapezoidal integration of power (mW) over [t0, t1], returning mJ."""
    if not samples:
        return 0.0
    samples = sorted(samples, key=lambda x: x[0])
    ts = np.array([t for t, _ in samples], dtype=np.float64)
    ps = np.array([p for _, p in samples], dtype=np.float64)
    m = (ts >= t0) & (ts <= t1)
    ts_w = ts[m]; ps_w = ps[m]
    if ts_w.size == 0 or ts_w[0] > t0:
        p0 = np.interp(t0, ts, ps); ts_w = np.insert(ts_w, 0, t0); ps_w = np.insert(ps_w, 0, p0)
    if ts_w[-1] < t1:
        p1 = np.interp(t1, ts, ps); ts_w = np.append(ts_w, t1); ps_w = np.append(ps_w, p1)
    return float(np.trapz(ps_w, ts_w))  # mW*s = mJ

def sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02, save_csv=None):
    """Measure the mean GPU power (mW) while this process just sleeps.

    A sampler subprocess records power for ``duration_s`` seconds; the mean is
    the trapezoidal energy between the first and last sample divided by the
    time between them.

    Args:
        duration_s: How long to sample, in seconds.
        dev_index: NVML device index handed to the sampler.
        interval: Sampling period handed to the sampler, in seconds.
        save_csv: Optional path; when given, the raw trace is written there
            with columns ``t_abs_s`` and ``power_mW``.

    Returns:
        Tuple ``(P_idle_mW, samples)`` with ``samples`` sorted by timestamp.

    Raises:
        RuntimeError: If the sampler delivered no samples.
    """
    import time
    from queue import Empty

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    try:
        time.sleep(duration_s)
    finally:
        # Always stop the sampler, even if the sleep is interrupted.
        stop.set()
        # Drain the queue BEFORE joining: a child that still has buffered
        # queue items cannot exit until they are consumed, so joining first
        # can deadlock once the trace outgrows the pipe buffer.
        while p.is_alive() or not q.empty():
            try:
                samples.append(q.get(timeout=0.1))
            except Empty:
                pass
        p.join()

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (idle).")
    samples.sort(key=lambda x: x[0])
    t0, t1 = samples[0][0], samples[-1][0]
    E_idle_mJ = _integrate_mJ_between(samples, t0, t1)
    T_idle_s = max(1e-9, t1 - t0)  # guard against a single-sample trace
    P_idle_mW = E_idle_mJ / T_idle_s
    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)
    return P_idle_mW, samples

def measure_mJ_per_window(run_once, n_windows_per_call, repeats, P_idle_mW,
                          dev_index=0, interval=0.02, save_csv=None):
    """Run a workload while sampling GPU power and report per-window energy and latency.

    ``run_once`` is called ``repeats`` times while a subprocess samples power.
    The power trace is integrated over the timed interval, the idle energy
    (``P_idle_mW`` times the elapsed time) is subtracted, and the remainder is
    divided by the number of processed windows.

    Args:
        run_once: Zero-argument callable executing one pass of the workload.
        n_windows_per_call: Number of windows processed by one ``run_once()``.
        repeats: How many times ``run_once`` is called inside the timed interval.
        P_idle_mW: Idle power in mW used for the baseline subtraction.
        dev_index: NVML device index handed to the sampler.
        interval: Sampling period handed to the sampler, in seconds.
        save_csv: Optional path for the raw power trace.

    Returns:
        Dict with per-window energy (``mJ_per_window``, clamped at 0), latency,
        throughput, and the raw totals used to derive them.

    Raises:
        RuntimeError: If the sampler delivered no samples.
    """
    import time
    from queue import Empty

    q = mp.Queue()
    stop = mp.Event()
    p = mp.Process(target=_nvml_sampler, args=(stop, q, dev_index, interval))
    p.start()

    samples = []
    try:
        # Measurement window
        t0 = time.perf_counter()
        for _ in range(repeats):
            run_once()
        t1 = time.perf_counter()
    finally:
        # Stop the sampler even when run_once() raises, so no subprocess leaks.
        stop.set()
        # Drain the queue BEFORE joining: a child that still has buffered
        # queue items cannot exit until they are consumed, so joining first
        # can deadlock on long traces.
        while p.is_alive() or not q.empty():
            try:
                samples.append(q.get(timeout=0.1))
            except Empty:
                pass
        p.join()

    if not samples:
        raise RuntimeError("NVML did not capture any power samples (measurement).")

    E_total_mJ = _integrate_mJ_between(samples, t0, t1)
    T_total_s  = max(1e-9, t1 - t0)  # avoid division by zero for a no-op workload
    E_idle_mJ  = P_idle_mW * T_total_s
    n_windows  = max(1, repeats * n_windows_per_call)

    if save_csv:
        pd.DataFrame(samples, columns=["t_abs_s", "power_mW"]).to_csv(save_csv, index=False)

    return {
        # Clamped at zero: measurement noise can push the net energy slightly negative.
        "mJ_per_window": max(0.0, (E_total_mJ - E_idle_mJ) / n_windows),
        "ms_per_window": (T_total_s / n_windows) * 1e3,
        "throughput_windows_per_s": n_windows / T_total_s,
        "n_windows": n_windows,
        "repeats": repeats,
        "T_total_s": T_total_s,
        "E_total_mJ": E_total_mJ,
        "E_idle_mJ": E_idle_mJ,
        "P_idle_mW": P_idle_mW,
        "t0_abs": t0, "t1_abs": t1,
    }

def calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000):
    """Pick a repeat count so that one measurement lasts roughly ``target_s`` seconds.

    ``run_once`` is called twice: the first call is untimed, the second is
    timed and its duration (floored at 0.1 ms) is used to scale up to
    ``target_s``. The result is clipped to ``[min_rep, max_rep]``.
    """
    import time

    run_once()  # untimed first call

    started = time.perf_counter()
    run_once()
    elapsed = time.perf_counter() - started

    # Floor the per-call time so a near-instant call cannot blow up the ratio.
    per_call_s = elapsed if elapsed > 1e-4 else 1e-4
    wanted = int(np.ceil(target_s / per_call_s))
    return int(np.clip(wanted, min_rep, max_rep))

def measure_with_bootstrap(name, run_once, n_windows, repeats, n_runs=5, n_boot=1000, logdir=Path("logs"),
                           idle_power_mW=None, dev_index=0, interval=0.02):
    """Repeat the energy measurement and summarise it with a bootstrap 95% CI.

    Each run calls ``measure_mJ_per_window`` and stores its power trace as
    ``power_trace_<name>_run<i>.csv``. The per-run ``mJ_per_window`` values are
    resampled with replacement ``n_boot`` times (fixed seed 123) to obtain a
    percentile confidence interval of the mean. The summary is also written to
    ``energy_<name>.json``.

    Args:
        name: Tag used in log messages and output file names.
        run_once: Zero-argument callable executing one pass of the workload.
        n_windows: Number of windows processed by one ``run_once()``.
        repeats: Calls to ``run_once`` per measurement run.
        n_runs: Number of independent measurement runs (must be >= 1).
        n_boot: Number of bootstrap resamples.
        logdir: Directory for traces and the JSON summary (created if missing).
        idle_power_mW: Idle power in mW for baseline subtraction. When omitted,
            the module-level ``P_idle_mW`` is used.
        dev_index: NVML device index handed to the sampler.
        interval: Sampling period in seconds handed to the sampler.

    Returns:
        The summary dict (mean, CI bounds, mean latency, and the per-run results).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
        NameError: If ``idle_power_mW`` is omitted and ``P_idle_mW`` is undefined.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be >= 1")
    if idle_power_mW is None:
        # Fall back to the module-level value for existing callers.
        try:
            idle_power_mW = P_idle_mW
        except NameError as err:
            raise NameError(
                "P_idle_mW is not defined; pass idle_power_mW or measure idle power first."
            ) from err

    logdir = Path(logdir)
    logdir.mkdir(parents=True, exist_ok=True)

    res_list = []
    for i in range(n_runs):
        print(f"[Measure] {name} run {i+1}/{n_runs} ...")
        r = measure_mJ_per_window(
            run_once, n_windows, repeats,
            P_idle_mW=idle_power_mW, dev_index=dev_index, interval=interval,
            save_csv=str(logdir / f"power_trace_{name}_run{i+1}.csv")
        )
        res_list.append(r)

    mJ = np.array([r["mJ_per_window"] for r in res_list], dtype=np.float64)
    ms = np.array([r["ms_per_window"] for r in res_list], dtype=np.float64)
    # Fixed seed and one draw per resample keep the reported CI reproducible.
    rng = np.random.default_rng(123)
    boots_mJ = [float(np.mean(mJ[rng.integers(0, len(mJ), size=len(mJ))])) for _ in range(n_boot)]
    ci_low, ci_high = np.percentile(boots_mJ, [2.5, 97.5])

    summary = {
        "model": name,
        "mean_mJ_per_window": float(mJ.mean()),
        "ci95_low_mJ": float(ci_low),
        "ci95_high_mJ": float(ci_high),
        "mean_ms_per_window": float(ms.mean()),
        "runs": res_list,
    }
    with open(logdir / f"energy_{name}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(
        f"[Result] {name}: {summary['mean_mJ_per_window']:.3f} mJ per window  "
        f"(95% CI [{summary['ci95_low_mJ']:.3f}, {summary['ci95_high_mJ']:.3f}]);  "
        f"{summary['mean_ms_per_window']:.3f} ms per window"
    )
    return summary

# ---------------- rTsfNet (original architecture and hyperparameters — unchanged) ----------------
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, LeakyReLU, Layer, Lambda, Flatten, GlobalAveragePooling1D, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

# Seed applied to both the TensorFlow and the NumPy global RNGs.
SEED = 42
tf.random.set_seed(SEED); np.random.seed(SEED)
# Sampling rate: default `fs` of the model builder and divisor used to derive window_seconds.
FS = 50.0
# Default number of rotation heads (imu_rot_heads) in r_tsf_net.
IMU_ROT_HEADS = 2
# Default base width (base_kn) and number of blocks (depth) of the dense stack in r_tsf_net.
MLP_BASE = 128
MLP_DEPTH = 3
# Default dropout rate applied after each dense block.
DROPOUT = 0.5
# Default Adam learning rate.
LR = 1e-3
# Coefficient of the l2 kernel regularizer on the dense and logits layers.
WEIGHT_DECAY = 1e-6
# Whether the unrotated input is kept as an additional stream next to the rotated ones.
USE_ORIG_INPUT = True

# Working directories; models/ and logs/ are created on import, features/ is only read.
BASE = Path('/content')
features_dir = BASE / 'features'
models_dir = BASE / 'models'
logs_dir = BASE / 'logs'
models_dir.mkdir(parents=True, exist_ok=True)
logs_dir.mkdir(parents=True, exist_ok=True)

def load_fold_data(fold_k, features_dir: Path):
    """Load one fold's windows from ``windows_normalized_fold<k>.npz``.

    The archive must contain the per-axis arrays ``acc_x``, ``acc_y``,
    ``acc_z``, ``gyro_x``, ``gyro_y``, ``gyro_z`` plus ``labels`` and
    ``splits``; rows whose split equals ``'train'`` / ``'test'`` are returned.

    Args:
        fold_k: Fold index used in the file name.
        features_dir: Directory containing the ``.npz`` file.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``; the six axis arrays are
        stacked on a new last axis (``[N, T, 6]`` for ``[N, T]`` inputs).
    """
    npz_file = Path(features_dir) / f'windows_normalized_fold{fold_k}.npz'
    channel_keys = ('acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z')
    # SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can
    # execute arbitrary code — only load feature files from a trusted source.
    # The context manager closes the underlying zip file once the arrays are read.
    with np.load(npz_file, allow_pickle=True) as data:
        X = np.stack([data[key] for key in channel_keys], axis=-1)  # [N,T,6]
        y = data['labels']
        splits = data['splits']
    train_mask = splits == 'train'
    test_mask = splits == 'test'
    return X[train_mask], y[train_mask], X[test_mask], y[test_mask]

class TSFFeatureLayer(Layer):
    """Keras layer computing 19 per-channel time- and frequency-domain features.

    The input is indexed as a rank-3 tensor ``x[:, t, c]``; every feature is a
    reduction over axis 1 with ``keepdims=True``. The 19 results are
    concatenated along axis 1 and the last two axes are swapped, so the output
    holds one 19-element feature vector per input channel.
    """
    def __init__(self, fs=50.0, **kwargs):
        """Store the sampling rate ``fs`` (frequency axis) and a small ``eps`` guarding divisions/logs."""
        super().__init__(**kwargs); self.fs = float(fs); self.eps = 1e-8
    # Serialise `fs` together with the base layer config.
    def get_config(self): cfg = super().get_config(); cfg.update({'fs': self.fs}); return cfg
    def call(self, x):
        """Return the feature tensor ``[batch, channels, 19]`` for ``x`` indexed ``[batch, time, channels]``."""
        # --- Time-domain statistics over axis 1 ---
        mean = tf.reduce_mean(x, axis=1, keepdims=True)
        std  = tf.math.reduce_std(x, axis=1, keepdims=True) + self.eps
        maxv = tf.reduce_max(x, axis=1, keepdims=True); minv = tf.reduce_min(x, axis=1, keepdims=True)
        ptp  = maxv - minv; rms = tf.sqrt(tf.reduce_mean(tf.square(x), axis=1, keepdims=True))
        energy = tf.reduce_sum(tf.square(x), axis=1, keepdims=True)
        # Third and fourth standardised moments (kurtosis is raw, i.e. no "-3" offset).
        skew = tf.reduce_mean(tf.pow((x-mean)/std, 3), axis=1, keepdims=True)
        kurt = tf.reduce_mean(tf.pow((x-mean)/std, 4), axis=1, keepdims=True)
        # Zero-crossing rate: a flip between +1 and -1 gives |diff| = 2, hence the division by 2.
        signs = tf.sign(x); sign_changes = tf.abs(signs[:,1:,:] - signs[:,:-1,:]); zcr = tf.reduce_mean(sign_changes, axis=1, keepdims=True) / 2.0
        # Lag-1 and lag-2 products, normalised by the energy of the leading slice.
        x_t1 = x[:,:-1,:]; x_tn1 = x[:,1:,:]
        ar1 = tf.reduce_sum(x_t1*x_tn1, axis=1, keepdims=True) / (tf.reduce_sum(tf.square(x_t1), axis=1, keepdims=True) + self.eps)
        x_t2 = x[:,:-2,:]; x_tn2 = x[:,2:,:]
        ar2 = tf.reduce_sum(x_t2*x_tn2, axis=1, keepdims=True) / (tf.reduce_sum(tf.square(x_t2), axis=1, keepdims=True) + self.eps)
        # --- Spectrum: mean-removed signal, rfft over the (transposed-to-last) time axis, power transposed back ---
        xc = x - mean; x_bc_t = tf.transpose(xc, [0,2,1]); fft = tf.signal.rfft(x_bc_t); power = tf.square(tf.abs(fft)) + self.eps; power = tf.transpose(power, [0,2,1])
        # Frequency axis: F evenly spaced values from 0 to fs/2, broadcastable against `power`.
        F = tf.shape(power)[1]; freqs = tf.linspace(0.0, tf.cast(self.fs, tf.float32)/2.0, F); freqs = tf.reshape(freqs, [1,F,1])
        # Power normalised to sum to ~1 over the frequency axis.
        p = power / (tf.reduce_sum(power, axis=1, keepdims=True) + self.eps)
        centroid = tf.reduce_sum(p * freqs, axis=1, keepdims=True)
        # Spectral entropy, divided by log(F).
        entropy  = -tf.reduce_sum(p * tf.math.log(p + self.eps), axis=1, keepdims=True) / (tf.math.log(tf.cast(F, tf.float32) + self.eps))
        # Spectral flatness: geometric mean over arithmetic mean of the power spectrum.
        geo = tf.exp(tf.reduce_mean(tf.math.log(power), axis=1, keepdims=True)); ari = tf.reduce_mean(power, axis=1, keepdims=True)
        flatness = geo / (ari + self.eps)
        # Differentiable stand-in for the peak frequency: softmax(power * 10)-weighted mean frequency.
        w = tf.nn.softmax(power * 10.0, axis=1); soft_peak = tf.reduce_sum(w * freqs, axis=1, keepdims=True)
        def band(low, high):
            # Fraction of the total power whose frequency lies in [low, high).
            mask = tf.cast((freqs >= low) & (freqs < high), tf.float32)
            return tf.reduce_sum(power * mask, axis=1, keepdims=True) / (tf.reduce_sum(power, axis=1, keepdims=True) + self.eps)
        # Band edges are expressed in the same unit as `freqs` (derived from fs).
        bp1 = band(0.5, 3.0); bp2 = band(3.0, 8.0); bp3 = band(8.0, 15.0)
        # 19 features stacked on axis 1, then swapped so features end up on the last axis.
        res = tf.concat([mean,std,maxv,minv,ptp,rms,energy,skew,kurt,zcr,ar1,ar2,centroid,entropy,flatness,soft_peak,bp1,bp2,bp3], axis=1)
        return tf.transpose(res, [0,2,1])

class Multihead3DRotation(Layer):
    """Keras layer that predicts one 3D rotation per head and applies it to the input.

    The input is averaged over axis 1, passed through a small ReLU MLP, and each
    head emits 4 tanh-bounded values: a rotation axis (3) and an angle (1).
    Channels 0-2 and 3-5 of the input are rotated with the same matrix and
    re-concatenated; the layer returns a list with one tensor per head.
    """
    def __init__(self, head_nums=2, base_kn=64, param_depth=2, **kwargs):
        """Create the pooling layer, ``param_depth`` hidden Dense layers of width ``base_kn`` and ``head_nums`` output heads."""
        super().__init__(**kwargs); self.head_nums=head_nums; self.base_kn=base_kn; self.param_depth=param_depth; self.eps=1e-8
        self.gap = GlobalAveragePooling1D(); self.mlp=[Dense(self.base_kn, activation='relu') for _ in range(self.param_depth)]
        # Each head outputs 4 values in (-1, 1): 3 for the axis, 1 for the angle.
        self.out_heads=[Dense(4, activation='tanh') for _ in range(self.head_nums)]
    # Serialise the constructor arguments together with the base layer config.
    def get_config(self): cfg = super().get_config(); cfg.update({'head_nums':self.head_nums,'base_kn':self.base_kn,'param_depth':self.param_depth}); return cfg
    # Every head returns a tensor with the same shape as the input.
    def compute_output_shape(self, input_shape): return [tf.TensorShape(input_shape) for _ in range(self.head_nums)]
    def _axis_angle_to_R(self, axis_raw, angle_raw):
        """Build a batch of 3x3 rotation matrices from raw axis vectors and angles in units of pi."""
        # Normalise the axis to unit length (eps avoids division by zero); scale the angle to radians.
        axis = axis_raw / (tf.norm(axis_raw, axis=-1, keepdims=True) + self.eps); theta = angle_raw * math.pi
        B = tf.shape(axis)[0]; ux,uy,uz = axis[:,0],axis[:,1],axis[:,2]; z = tf.zeros_like(ux)
        # Skew-symmetric cross-product matrix of the unit axis, one per batch element.
        Kmat = tf.stack([z,-uz,uy,uz,z,-ux,-uy,ux,z], axis=-1); Kmat = tf.reshape(Kmat,[B,3,3])
        I = tf.tile(tf.eye(3, dtype=axis.dtype)[None,...],[B,1,1]); u = tf.expand_dims(axis,-1); uuT = tf.matmul(u,u,transpose_b=True)
        cos = tf.reshape(tf.cos(theta),[-1,1,1]); sin = tf.reshape(tf.sin(theta),[-1,1,1])
        # Rodrigues' formula: R = cos(t) I + (1 - cos(t)) u u^T + sin(t) [u]x
        return cos*I + (1.0-cos)*uuT + sin*Kmat
    def call(self, x):
        """Return a list of rotated copies of ``x`` (one per head), each with the input's channel layout."""
        # Channels 0-2 and 3-5 are treated as two 3-vectors per time step; pooling summarises the window.
        acc, gyr = x[:,:,:3], x[:,:,3:6]; pooled = self.gap(x); h=pooled
        for layer in self.mlp: h = layer(h)
        out_list=[]
        for oh in self.out_heads:
            # Split the head output into axis (first 3 values) and angle (last value), then build R.
            p=oh(h); axis=p[:,:3]; angle=tf.expand_dims(p[:,3],-1); R=self._axis_angle_to_R(axis,angle)
            # Apply R to every time step: transpose to [B,3,T], left-multiply, transpose back.
            acc_t=tf.transpose(acc,[0,2,1]); acc_rot_t=tf.matmul(R,acc_t); acc_rot=tf.transpose(acc_rot_t,[0,2,1])
            gyr_t=tf.transpose(gyr,[0,2,1]); gyr_rot_t=tf.matmul(R,gyr_t); gyr_rot=tf.transpose(gyr_rot_t,[0,2,1])
            out_list.append(tf.concat([acc_rot,gyr_rot],axis=-1))
        return out_list

def add_l2_channels(x):
    """Append the Euclidean norms of channels 0-2 and of channels 3-5 as two extra trailing channels."""
    def _magnitude(triplet):
        # L2 norm over the last axis, kept as a size-1 channel so it can be concatenated.
        return tf.sqrt(tf.reduce_sum(tf.square(triplet), axis=-1, keepdims=True))

    first_norm = _magnitude(x[:, :, :3])
    second_norm = _magnitude(x[:, :, 3:6])
    return tf.concat([x, first_norm, second_norm], axis=-1)

def r_tsf_net(x_shape, n_classes, learning_rate=LR, base_kn=MLP_BASE, depth=MLP_DEPTH, dropout_rate=DROPOUT,
              imu_rot_heads=IMU_ROT_HEADS, fs=FS, use_orig_input=USE_ORIG_INPUT):
    """Build and compile the rTsfNet classifier.

    Pipeline: multi-head rotation -> per-stream L2 channels -> channel-wise
    concatenation -> TSF features -> flatten -> dense stack -> softmax.

    Args:
        x_shape: Shape of the full data array; the leading (sample) dimension is dropped.
        n_classes: Number of output classes.
        learning_rate: Adam learning rate.
        base_kn: Width of the last dense block; earlier blocks are wider by powers of two.
        depth: Number of dense blocks.
        dropout_rate: Dropout rate after every dense block.
        imu_rot_heads: Number of rotation heads.
        fs: Sampling rate forwarded to the feature layer.
        use_orig_input: Keep the unrotated input as an additional stream.

    Returns:
        The compiled Keras model.
    """
    net_in = Input(shape=x_shape[1:])

    # Rotation block: yields one rotated copy of the input per head.
    rotator = Multihead3DRotation(head_nums=imu_rot_heads, base_kn=64, param_depth=2, name='multihead_rot')
    rotated_views = rotator(net_in)

    # Every stream gets the two L2-norm channels appended.
    branches = []
    if use_orig_input:
        branches.append(Lambda(add_l2_channels, name='orig_plus_l2')(net_in))
    for idx, view in enumerate(rotated_views):
        branches.append(Lambda(add_l2_channels, name=f'rot{idx}_plus_l2')(view))

    merged = Lambda(lambda lst: tf.concat(lst, axis=-1), name='concat_streams')(branches)
    features = TSFFeatureLayer(fs=fs, name='tsf')(merged)
    hidden = Flatten(name='flatten')(features)

    # Dense stack, widest block first: base_kn * 2**(depth-1) ... base_kn.
    for k in reversed(range(depth)):
        width = base_kn * (2 ** k)
        hidden = Dense(width, kernel_regularizer=l2(WEIGHT_DECAY), name=f'fc_{k}')(hidden)
        hidden = LayerNormalization(epsilon=1e-7, name=f'ln_{k}')(hidden)
        hidden = LeakyReLU(name=f'lrelu_{k}')(hidden)
        hidden = Dropout(dropout_rate, name=f'drop_{k}')(hidden)

    class_logits = Dense(n_classes, kernel_regularizer=l2(WEIGHT_DECAY), name='logits')(hidden)
    class_probs = Activation('softmax', dtype='float32', name='softmax')(class_logits)

    net = Model(net_in, class_probs, name='rTsfNet_officially_aligned_fixed')
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, amsgrad=True)
    net.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

# ---------------- Data: prefer real features; otherwise generate demo data ----------------
def scan_available_folds(features_dir: Path):
    """Return the sorted, de-duplicated fold indices found in ``features_dir``.

    A fold file is named ``windows_normalized_fold<k>.npz`` where ``<k>``
    parses as an integer; files whose suffix is not an integer are ignored.
    """
    prefix = "windows_normalized_fold"
    found = set()
    for f in Path(features_dir).glob(f"{prefix}*.npz"):
        # The glob pattern guarantees the stem starts with the prefix, so slice
        # it off rather than replacing every occurrence inside the name.
        try:
            found.add(int(f.stem[len(prefix):]))
        except ValueError:
            # Suffix is not an integer (e.g. "foldA"): not a fold file, skip it.
            continue
    return sorted(found)

# Discover which folds have feature files on disk. Only the first available fold
# is measured; when none exist, fold 0 is scheduled and later backed by synthetic data.
available_folds = scan_available_folds(features_dir)
ACTIVE_FOLDS = available_folds[:1] if available_folds else [0]
print(f"[Info] Available folds: {available_folds}  |  Planned measurement: {ACTIVE_FOLDS}")

def make_synth_fold(n_train=4000, n_test=800, T=150, C=6, n_classes=8, seed=2025):
    """Generate a random stand-in fold: standard-normal windows and uniform integer labels.

    Returns ``(X_train, y_train, X_test, y_test)`` with float32 windows of
    shape ``[n, T, C]`` and int64 labels in ``[0, n_classes)``.
    """
    gen = np.random.default_rng(seed)

    def _windows(count):
        return gen.normal(0, 1, size=(count, T, C)).astype(np.float32)

    def _labels(count):
        return gen.integers(0, n_classes, size=count).astype(np.int64)

    # Draw order (train X, test X, train y, test y) fixes the random stream — keep it.
    train_x = _windows(n_train)
    test_x = _windows(n_test)
    train_y = _labels(n_train)
    test_y = _labels(n_test)
    return train_x, train_y, test_x, test_y

# ---------------- TF inference wrapper: one full pass (unit = window) ----------------
def make_tf_runner(model: tf.keras.Model, X_test_np: np.ndarray, bs: int = 256):
    """Build a closure that runs one full inference pass over ``X_test_np``.

    The test array is converted to a float32 tensor once, placed on the first
    GPU when one is visible (CPU otherwise). Each call of the returned closure
    feeds it through the model in batches of ``bs`` and then forces a
    host-side read of the last output so the call only returns after the
    computation has finished.

    Args:
        model: Keras model to run in inference mode.
        X_test_np: Array of windows; axis 0 is the window axis.
        bs: Batch size.

    Returns:
        Tuple ``(run_once, N)`` where ``N`` is the number of windows per call.

    Raises:
        ValueError: If ``X_test_np`` has no windows or ``bs`` is not positive.
    """
    N = X_test_np.shape[0]
    # Fail early with a clear message instead of an obscure error inside run_once.
    if N == 0:
        raise ValueError("X_test_np is empty; there is nothing to run inference on.")
    if bs <= 0:
        raise ValueError(f"bs must be a positive integer, got {bs}.")

    device = "/GPU:0" if tf.config.list_physical_devices('GPU') else "/CPU:0"
    with tf.device(device):
        X_gpu = tf.convert_to_tensor(X_test_np.astype(np.float32))  # keep resident

    @tf.function(jit_compile=False)
    def fwd(x): return model(x, training=False)

    def run_once():
        last = None
        for s in range(0, N, bs):
            e = min(N, s + bs)
            last = fwd(X_gpu[s:e])
        # Reading a scalar back to the host blocks until the last batch is done.
        _ = tf.reduce_sum(last).numpy()  # device sync

    return run_once, N  # N windows per call

# ---------------- Idle power ----------------
# Baseline for the idle subtraction: sampled once for 20 s before any fold is processed.
# The result is kept in the module-level name P_idle_mW, which measure_with_bootstrap reads;
# the raw idle trace (_idle) is only saved to CSV and not used afterwards.
print("\n[Info] Measuring idle power for 20 s ...")
P_idle_mW, _idle = sample_idle_power_mW(duration_s=20.0, dev_index=0, interval=0.02,
                                        save_csv=str(logs_dir/'power_idle_trace_rtsfnet.csv'))
print(f"[Info] Mean idle power ~ {P_idle_mW:.1f} mW")

# ---------------- Per-fold measurement ----------------
# One summary row per measured fold; turned into a DataFrame after the loop.
summary_rows = []
for k in ACTIVE_FOLDS:
    print("\n" + "="*72)
    print(f"[rTsfNet] Fold {k} — preparing data and model (original architecture/hyperparameters)")
    if k in available_folds:
        X_train, y_train, X_test, y_test = load_fold_data(k, features_dir)
    else:
        print("[Warn] Real features not found; using synthetic data to demonstrate the measurement pipeline.")
        X_train, y_train, X_test, y_test = make_synth_fold()
    # Class count = largest label + 1 (assumes 0-based integer labels); computed once for both data sources.
    n_classes = int(np.maximum(y_train.max(), y_test.max()) + 1)

    # Derive window_seconds from data length and FS (for explicit reporting)
    window_seconds = float(X_test.shape[1] / FS)

    model = r_tsf_net(x_shape=X_train.shape, n_classes=n_classes,
                      learning_rate=LR, base_kn=MLP_BASE, depth=MLP_DEPTH, dropout_rate=DROPOUT,
                      imu_rot_heads=IMU_ROT_HEADS, fs=FS, use_orig_input=USE_ORIG_INPUT)

    # Trained weights are optional: without them the model keeps its random initialisation.
    wpath = models_dir / f"model_fold{k}.weights.h5"
    if wpath.exists():
        try:
            model.load_weights(wpath); print(f"[Info] Loaded weights: {wpath.name}")
        except Exception as e:
            print(f"[Warn] Failed to load weights: {e}")
    else:
        # Make the fallback visible so the quick accuracy below is not misread.
        print(f"[Warn] Weights file not found: {wpath.name}; continuing with randomly initialised weights.")

    # Optional quick sanity check (not part of energy measurement)
    try:
        acc = (model.predict(X_test, batch_size=256, verbose=0).argmax(1) == y_test).mean()
        print(f"[Check] Fold {k} quick accuracy: {acc:.3f}")
    except Exception as e:
        # Deliberately best-effort: a failed check must not abort the energy measurement.
        print(f"[Warn] Skipping accuracy check: {e}")

    run_once, N_windows_per_call = make_tf_runner(model, X_test, bs=256)

    # Warmup
    for _ in range(3): run_once()

    # Ensure ≥8 s effective window
    repeats = calibrate_repeats(run_once, target_s=8.0, min_rep=3, max_rep=5000)
    print(f"[Info] repeats = {repeats}  (windows per call = {N_windows_per_call})")

    # Measure (per-window) + bootstrap CI
    tag = f"rtsfnet_fold{k}_per_window"
    summ = measure_with_bootstrap(
        name=tag, run_once=run_once, n_windows=N_windows_per_call,
        repeats=repeats, n_runs=5, n_boot=1000, logdir=logs_dir
    )

    summary_rows.append({
        "fold": k,
        "model": f"rTsfNet (fold {k})",
        "window_seconds": window_seconds,
        "mJ_per_window_mean": summ["mean_mJ_per_window"],
        "ci95_low_mJ": summ["ci95_low_mJ"],
        "ci95_high_mJ": summ["ci95_high_mJ"],
        "ms_per_window_mean": summ["mean_ms_per_window"],
    })

    # Reset Keras state and collect garbage before the next fold.
    K.clear_session(); gc.collect()

# ---------------- Summary output ----------------
# Collect the per-fold rows into one table, ordered by fold, and persist it next to the traces.
df_sum = pd.DataFrame(summary_rows).sort_values("fold").reset_index(drop=True)
df_sum.to_csv(logs_dir / "energy_summary_rtsfnet_per_window.csv", index=False)
print("\n=== Completed (rTsfNet GPU inference energy · per window · original architecture and hyperparameters) ===")
print(df_sum)
# The paths below are printed relative to BASE (logs_dir is BASE / 'logs').
print("\nLog files:")
print("- logs/power_idle_trace_rtsfnet.csv")
print("- logs/power_trace_rtsfnet_fold*_per_window_run*.csv")
print("- logs/energy_rtsfnet_fold*_per_window.json")
print("- logs/energy_summary_rtsfnet_per_window.csv")

Mon Nov 17 18:35:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   32C    P0             60W /  400W |   26109MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                