# Deployment Benchmarking Script

This script evaluates inference time, model size, and memory usage for a trained PyTorch model under deployment-like CPU conditions.

## What it measures
- Inference time (average over 100 CPU runs)
- Model size (`.pt` file saved via `torch.save`)
- RAM usage during execution (`psutil`)

## Usage
1. Set `MODEL_PATH` to your saved model.
2. Define `INPUT_SHAPE` as used during training (e.g., `(1, 800, 7)`).
3. Run the script. Results are printed to the console.

All benchmarks are performed on CPU to reflect edge deployment scenarios.


In [1]:
import torch
import torch.nn as nn
import time
import os
import numpy as np
import psutil
import torch.nn.functional as F

class GRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.2, bidirectional=True, pooling="attention"):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=bidirectional)
        self.pooling = pooling
        direction_factor = 2 if bidirectional else 1
        if pooling == "attention":
            self.attention = nn.Sequential(
                nn.Linear(hidden_size * direction_factor, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, 1),
            )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * direction_factor, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def attention_pooling(self, gru_out):
        weights = self.attention(gru_out)
        weights = torch.softmax(weights, dim=1)
        return torch.sum(weights * gru_out, dim=1)

    def forward(self, x):
        out, _ = self.gru(x)
        if self.pooling == "attention":
            out = self.attention_pooling(out)
        else:
            out = out[:, -1, :]
        return self.fc(out)



In [3]:
# %%
# --- Load the model ---
MODEL_PATH = "results/GRU_DATA_AUGMENTATION/20250530_233726/model_20250531_195904.pt"

INPUT_SHAPE = (1, 800, 7)
device = torch.device("cpu")

model = GRUClassifier(input_size=INPUT_SHAPE[2], hidden_size=64, num_layers=2, dropout=0.5, bidirectional=False, pooling="attention")
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

print("✅ GRU model loaded on CPU")

✅ GRU model loaded on CPU


In [4]:
# -------------------------------
# Dummy preprocessing function
# -------------------------------
def mock_preprocessing():
    # Simulate any windowing or normalization before inference
    x = np.random.randn(*INPUT_SHAPE).astype(np.float32)
    return torch.tensor(x)

In [5]:
# --- Inference Benchmark ---
from timeit import default_timer as timer
import gc

sample_input = mock_preprocessing().to(device)
for _ in range(10):  # Warm-up
    _ = model(sample_input)

times, latencies = [], []
for _ in range(100):
    gc.collect()
    input_tensor = mock_preprocessing().to(device)

    start_all = time.perf_counter()
    _ = model(input_tensor)
    end_all = time.perf_counter()

    start_infer = time.perf_counter()
    _ = model(sample_input)
    end_infer = time.perf_counter()

    latencies.append((end_all - start_all) * 1000)
    times.append((end_infer - start_infer) * 1000)

avg_time = np.mean(times)
std_time = np.std(times)
avg_latency = np.mean(latencies)

print(f"Inference Time (avg ± std): {avg_time:.2f} ± {std_time:.2f} ms")
print(f"Total Window-to-decision Latency: {avg_latency:.2f} ms")

Inference Time (avg ± std): 29.50 ± 0.51 ms
Total Window-to-decision Latency: 23.86 ms


In [6]:
# -------------------------------
# Model Size
# -------------------------------
model_size_mb = os.path.getsize(MODEL_PATH) / (1024 * 1024)
print(f"Model Size: {model_size_mb:.2f} MB")

Model Size: 0.18 MB


In [18]:
# --- Memory Usage ---
sample_input = torch.randn(INPUT_SHAPE).to(device)
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

with torch.no_grad():
    _ = model(sample_input)

mem_after = process.memory_info().rss
delta_mem_mb = (mem_after - mem_before) / (1024 ** 2)
total_mem_mb = mem_after / (1024 ** 2)

print(f"Δ Memory Used During Inference: {delta_mem_mb:.2f} MB")
print(f"Total Memory Usage After Inference: {total_mem_mb:.2f} MB")

Δ Memory Used During Inference: 0.00 MB
Total Memory Usage After Inference: 282.16 MB


In [9]:
# %%
# --- Energy Estimate ---
ENERGY_ESTIMATED_WATT = 12
inference_energy = (avg_time / 1000) * ENERGY_ESTIMATED_WATT
print(f"Estimated Energy per Inference: {inference_energy:.3f} J")

Estimated Energy per Inference: 0.354 J


In [15]:
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "memory_benchmark_GRU.py"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True  # decode bytes to string
)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)


STDOUT:
 Δ Memory Used During Inference: 5.33 MB
Total Memory Usage After Inference: 181.05 MB

STDERR:
 
