## Computational performance and real-time feasibility

In [1]:
import torch
import torch.nn as nn
import numpy as np
import sys
import os
import random
import time
import matplotlib.pyplot as plt

src_path = os.path.abspath(os.path.join(os.getcwd(), 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)
    
from utils import MIMONetDataset, DeepONetDataset, ChannelScaler
from mimonet_drop import MIMONet_Drop
from mimonet import MIMONet

In [2]:
# Device + name
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dev_name = (torch.cuda.get_device_name(torch.cuda.current_device()) 
            if torch.cuda.is_available() else "cpu")
print("Device:", dev_name)

Device: NVIDIA GH200 120GB


In [3]:
working_dir = "/projects/bcnx/kazumak2/MIMONet/HeatExchanger"
data_dir = os.path.join(working_dir, "data")

In [4]:
# trunk dataset
trunk_input = np.load(os.path.join(data_dir, "share/trunk.npz"))['trunk']

# min-max scaling [-1, 1]
trunk_input[:, 0] = 2 * (trunk_input[:, 0] - np.min(trunk_input[:, 0])) / (np.max(trunk_input[:, 0]) - np.min(trunk_input[:, 0])) - 1
trunk_input[:, 1] = 2 * (trunk_input[:, 1] - np.min(trunk_input[:, 1])) / (np.max(trunk_input[:, 1]) - np.min(trunk_input[:, 1])) - 1

# branch input dataset
branch = np.load(os.path.join(data_dir, "branch.npz"))

branch1 = branch['branch1']
branch2 = branch['branch2']

print("Branch1 shape:", branch1.shape)
print("Branch2 shape:", branch2.shape)


# create a dictionary for the output channel names
# 0: turb-kinetic-energy
# 1: pressure
# 2: temperature
# 3: z-velocity
# 4: y-velocity
# 5: x-velocity
# 6: velocity-magnitude

dict_channel = {
    0: 'turb-kinetic-energy',
    1: 'pressure',
    2: 'temperature',
    3: 'z-velocity',
    4: 'y-velocity',
    5: 'x-velocity',
    6: 'velocity-magnitude'
}

# select the output channel
target_channel = [1, 3, 4, 5, 6]

# print the selected output channel names
# target_label is used to store the names of the selected output channels for further processing (e.g., plotting)
print("Selected output channels:")
target_label = []
for channel in target_channel:
    print(dict_channel[channel])
    target_label.append(dict_channel[channel])    
    
# target dataset
target = np.load(os.path.join(data_dir, "target.npy"))

print("Target shape:", target.shape)

## extract the output channels
## select the desired channels using the list (target_channel)
target = target[..., target_channel]

# print the shape of the extracted target
print("Extracted target shape:", target.shape)

Branch1 shape: (1546, 2)
Branch2 shape: (1546, 100)
Selected output channels:
pressure
z-velocity
y-velocity
x-velocity
velocity-magnitude
Target shape: (1546, 3977, 7)
Extracted target shape: (1546, 3977, 5)


## Set Dataloader for Test

In [5]:
# dataset and dataloader
dataset = MIMONetDataset(
    [branch1, branch2],  # branch_data_list
    trunk_input,         # trunk_data
    target               # target_data
)

# Inspect a single sample from dataset
sample = dataset[0]
branch_data_sample, trunk_data_sample, target_data_sample = sample

for i, b in enumerate(branch_data_sample):
    print(f"Branch {i+1} dtype:", b.dtype, "shape:", b.shape)
print("Trunk dtype:", trunk_data_sample.dtype, "shape:", trunk_data_sample.shape)
print("Target dtype:", target_data_sample.dtype, "shape:", target_data_sample.shape)


Branch 1 dtype: torch.float32 shape: torch.Size([2])
Branch 2 dtype: torch.float32 shape: torch.Size([100])
Trunk dtype: torch.float32 shape: torch.Size([3977, 2])
Target dtype: torch.float32 shape: torch.Size([3977, 5])


In [6]:
# Architecture parameters
dim = 256
branch_input_dim1 = 2
branch_input_dim2 = 100
trunk_input_dim = 2

# Define the model arguments for orig_MIMONet
model_args = {
    'branch_arch_list': [
        [branch_input_dim1, 512, 512, 512, dim],
        [branch_input_dim2, 512, 512, 512, dim]
    ],
    'trunk_arch': [trunk_input_dim, 256, 256, 256, dim],
    'num_outputs': target.shape[-1] -1,  # number of output channels
    'activation_fn': nn.ReLU,
    'merge_type': 'mul',
    'dropout_p': 0.1  # Dropout rate
}

model = MIMONet_Drop(**model_args)
model = model.to(device)

# Print parameter count
num_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {num_params:,}")

# load the model
model.load_state_dict(torch.load('HeatExchanger/checkpoints/best_model_dropout.pt'))

Total number of parameters: 1,762,052


  model.load_state_dict(torch.load('HeatExchanger/checkpoints/best_model_dropout.pt'))


<All keys matched successfully>

In [7]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    num_workers=1,
    pin_memory=True
)

In [8]:
import torch, time, numpy as np, subprocess

# ======================================================
# TRY NVML INIT (fallback to nvidia-smi if unavailable)
# ======================================================
use_nvml = False
try:
    import pynvml
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    use_nvml = True
    print("Using NVML for power logging.")
except Exception as e:
    print("NVML unavailable, using nvidia-smi fallback.")
    print("Reason:", e)

def read_power_mem():
    """Read GPU power (W) and memory (MB) using NVML or nvidia-smi."""
    if use_nvml:
        power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # W
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1e6  # MB
    else:
        cmd = [
            "nvidia-smi",
            "--query-gpu=power.draw,memory.used",
            "--format=csv,noheader,nounits"
        ]
        try:
            output = subprocess.check_output(cmd).decode().strip()
            power, mem = map(float, output.split(","))
        except Exception:
            power, mem = np.nan, np.nan
    return power, mem

# ======================================================
# CONFIGURATION
# ======================================================
n_warmup = 10
n_repeats = 1000
print("Device name:", dev_name)

model = model.to(device)
model.eval()

# ======================================================
# WARMUP
# ======================================================
with torch.no_grad():
    for i, (branch_data, trunk_data, target_data) in enumerate(dataloader):
        branch_data = [b.to(device) for b in branch_data]
        trunk_data = trunk_data.to(device)
        _ = model(branch_data, trunk_data)
        if i >= n_warmup:
            break

torch.cuda.synchronize()
print("Warmup complete.")

# ======================================================
# TIMING + POWER LOGGING
# ======================================================
latencies, powers, mem_used = [], [], []

with torch.no_grad():
    for i, (branch_data, trunk_data, target_data) in enumerate(dataloader):
        branch_data = [b.to(device, non_blocking=True) for b in branch_data]
        trunk_data = trunk_data.to(device, non_blocking=True)

        # --- pre-inference metrics
        power_pre, mem_pre = read_power_mem()

        torch.cuda.synchronize()
        t0 = time.perf_counter()

        pred = model(branch_data, trunk_data)

        torch.cuda.synchronize()
        t1 = time.perf_counter()

        # --- post-inference metrics
        power_post, mem_post = read_power_mem()

        latencies.append(t1 - t0)
        powers.append(np.nanmean([power_pre, power_post]))
        mem_used.append(np.nanmax([mem_pre, mem_post]))

        if i >= n_repeats:
            break

# ======================================================
# AGGREGATE RESULTS
# ======================================================
latencies, powers, mem_used = map(np.array, [latencies, powers, mem_used])

mean_latency, std_latency = latencies.mean(), latencies.std()
mean_power, std_power = powers.mean(), powers.std()
energy_per_inf = latencies * powers
mean_energy, std_energy = energy_per_inf.mean(), energy_per_inf.std()
mean_mem, std_mem, peak_mem = mem_used.mean(), mem_used.std(), mem_used.max()

# ======================================================
# PRINT & SAVE
# ======================================================
print(f"Mean latency per inference: {mean_latency*1000:.3f} ± {std_latency*1000:.3f} ms")
print(f"Throughput: {1/mean_latency:.1f} ± {(std_latency/mean_latency**2):.1f} samples/s")
print(f"Average Power: {mean_power:.2f} ± {std_power:.2f} W")
print(f"Energy per inference: {mean_energy:.3f} ± {std_energy:.3f} J")
print(f"Peak VRAM usage: {peak_mem:.1f} MB (mean {mean_mem:.1f} ± {std_mem:.1f})")

# --- save log ---
output_file = "HeatExchanger/analysis/benchmark_all_devices.txt"
with open(output_file, "a") as f:
    f.write(f"\n{'='*50}\n")
    f.write(f"Device name: {dev_name}\n")
    f.write(f"Mean latency per inference: {mean_latency*1000:.3f} ± {std_latency*1000:.3f} ms\n")
    f.write(f"Throughput: {1/mean_latency:.1f} ± {(std_latency/mean_latency**2):.1f} samples/s\n")
    f.write(f"Average Power: {mean_power:.2f} ± {std_power:.2f} W\n")
    f.write(f"Energy per inference: {mean_energy:.3f} ± {std_energy:.3f} J\n")
    f.write(f"Peak VRAM usage: {peak_mem:.1f} MB (mean {mean_mem:.1f} ± {std_mem:.1f})\n")

print(f"\nAppended results to {output_file}")


Using NVML for power logging.
Device name: NVIDIA GH200 120GB
Warmup complete.
Mean latency per inference: 4.307 ± 2.852 ms
Throughput: 232.2 ± 153.8 samples/s
Average Power: 177.11 ± 19.15 W
Energy per inference: 0.778 ± 0.637 J
Peak VRAM usage: 3451.3 MB (mean 3450.3 ± 0.3)

Appended results to HeatExchanger/analysis/benchmark_all_devices.txt
