# Deployment Benchmarking Script

This script evaluates inference time, model size, and memory usage for a trained PyTorch model under deployment-like CPU conditions.

## What it measures
- Inference time (average over 100 CPU runs)
- Model size (`.pt` file saved via `torch.save`)
- RAM usage during execution (`psutil`)

## Usage
1. Set `MODEL_PATH` to your saved model.
2. Set `INPUT_SHAPE` to the input shape used during training, ordered as `(batch, sequence_length, features)` (e.g., `(1, 800, 7)`).
3. Run the script. Results are printed to the console.

All benchmarks are performed on CPU to reflect edge deployment scenarios.


In [21]:
import torch
import torch.nn as nn
import time
import os
import numpy as np
import psutil
# Define the Bi-LSTM model
# %%
class LSTMClassifier(nn.Module):
    """(Bi-)LSTM binary classifier with optional attention pooling over time.

    The input sequence is run through an LSTM, the per-step outputs are
    reduced to one vector per sample (attention-weighted sum, or the output at
    the last time step), and that vector is mapped through a small MLP that
    ends in a sigmoid.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden units per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the classifier head and, when
            ``num_layers > 1``, between stacked LSTM layers.
        bidirectional: Whether the LSTM processes the sequence in both
            directions (doubles the width of its output).
        pooling: ``"attention"`` selects attention pooling; any other value
            selects the LSTM output at the last time step.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=1, dropout=0.3, bidirectional=True, pooling="attention"):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.pooling = pooling
        self.direction_factor = 2 if bidirectional else 1

        # nn.LSTM applies dropout only between stacked layers. Passing a
        # non-zero value with a single layer has no effect and makes PyTorch
        # emit a UserWarning, so only forward it when there is a second layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=bidirectional
        )

        lstm_out_features = hidden_size * self.direction_factor

        # The scoring network only exists when attention pooling is requested.
        if pooling == "attention":
            self.attention = nn.Sequential(
                nn.Linear(lstm_out_features, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, 1),
            )

        self.fc = nn.Sequential(
            nn.Linear(lstm_out_features, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def attention_pooling(self, lstm_out):
        """Collapse the time axis with learned softmax weights.

        Args:
            lstm_out: LSTM outputs of shape (batch, seq_len, features).

        Returns:
            Tensor of shape (batch, features): the weighted sum over time.
        """
        weights = self.attention(lstm_out)       # one score per time step
        weights = torch.softmax(weights, dim=1)  # normalise across time
        return torch.sum(weights * lstm_out, dim=1)

    def forward(self, x):
        """Return one sigmoid output per sample.

        Args:
            x: Input of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        lstm_out, _ = self.lstm(x)
        if self.pooling == "attention":
            pooled = self.attention_pooling(lstm_out)
        else:
            # Fallback: use the LSTM output at the final time step.
            pooled = lstm_out[:, -1, :]
        return self.fc(pooled)


In [22]:
# %% [markdown]
# ## Load the Model

# %%
MODEL_PATH = "results/biLSTM_DATA_AUGMENTATION/20250520_183833/model_20250521_024421.pt"
INPUT_SHAPE = (1, 800, 7)  # (batch, sequence length, features per time step)
device = torch.device("cpu")

# INPUT_SHAPE[2] is the per-step feature count expected by the LSTM.
model = LSTMClassifier(input_size=INPUT_SHAPE[2], pooling="attention")

# NOTE(security): torch.load unpickles the file, so only load checkpoints from
# a trusted source. On PyTorch versions that support it, consider passing
# weights_only=True.
state_dict = torch.load(MODEL_PATH, map_location=device)

# Remove a leading 'module.' prefix if present (presumably left by saving a
# wrapped model such as nn.DataParallel). Only the prefix is stripped: a plain
# str.replace would also rewrite keys that contain "module." further in,
# e.g. "submodule.weight".
WRAPPER_PREFIX = "module."
clean_state_dict = {
    (key[len(WRAPPER_PREFIX):] if key.startswith(WRAPPER_PREFIX) else key): value
    for key, value in state_dict.items()
}
model.load_state_dict(clean_state_dict)
model.to(device)
model.eval()  # disable dropout for inference

print("✅ Model loaded successfully on CPU.")

✅ Model loaded successfully on CPU.




In [23]:
# -------------------------------
# Dummy preprocessing function
# -------------------------------
def mock_preprocessing():
    """Stand-in for the real preprocessing step (windowing / normalization).

    Returns:
        A float32 tensor of shape ``INPUT_SHAPE`` filled with standard-normal
        noise drawn from NumPy's global random generator.
    """
    window = np.random.randn(*INPUT_SHAPE)
    return torch.tensor(window.astype(np.float32))

In [31]:
# -------------------------------
# Inference + Latency Benchmark (Improved)
# -------------------------------
from timeit import default_timer as timer
import gc

# Number of untimed warm-up passes and of timed passes.
N_WARMUP_RUNS = 10
N_TIMED_RUNS = 100

sample_input = mock_preprocessing().to(device)

times = []      # model-only inference time per run, in ms
latencies = []  # preprocessing + inference time per run, in ms

with torch.no_grad():
    # Warm-up runs under no_grad too, so they exercise the same code path as
    # the timed runs and do not build autograd state.
    for _ in range(N_WARMUP_RUNS):
        _ = model(sample_input)

    for _ in range(N_TIMED_RUNS):
        # Housekeeping happens outside the timed regions.
        gc.collect()  # Clear Python garbage
        torch.cuda.empty_cache()  # Harmless on a CPU-only run

        # Simulate full latency (windowing + inference)
        start_all = timer()
        input_tensor = mock_preprocessing().to(device)
        _ = model(input_tensor)
        end_all = timer()

        # Isolated model-only inference on the pre-built input
        start_infer = timer()
        _ = model(sample_input)
        end_infer = timer()

        # timer() returns seconds; store milliseconds.
        latencies.append((end_all - start_all) * 1000)
        times.append((end_infer - start_infer) * 1000)

# Inference time stats
avg_time = np.mean(times)
std_time = np.std(times)
print(f"Inference Time (avg ± std over {N_TIMED_RUNS} runs): {avg_time:.2f} ± {std_time:.2f} ms")

# Total latency (preprocessing + inference)
avg_latency = np.mean(latencies)
print(f"Window-to-decision Latency: {avg_latency:.2f} ms")


Inference Time (avg ± std over 100 runs): 16.98 ± 0.29 ms
Window-to-decision Latency: 17.27 ms


In [25]:
# -------------------------------
# Model Size
# -------------------------------
# Size of the saved checkpoint on disk, converted from bytes to MiB.
checkpoint_bytes = os.stat(MODEL_PATH).st_size
model_size_mb = checkpoint_bytes / (1024 * 1024)
print(f"Model Size: {model_size_mb:.2f} MB")

Model Size: 0.70 MB


In [26]:
import psutil
import os
import torch
import gc

def _sync_cuda_if_available():
    """Wait for outstanding CUDA work; does nothing when CUDA is unavailable."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()


# Collect garbage first so leftovers from earlier cells do not skew the baseline.
gc.collect()
torch.cuda.empty_cache()  # Safe even on CPU

# Prepare input
sample_input = torch.randn(INPUT_SHAPE).to(device)

# Ensure any pending memory allocation is flushed
_sync_cuda_if_available()

# Resident set size (RSS) of this Python process before inference.
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

# Run inference
# NOTE(review): if the model has already been run in this process, the delta
# measured here is presumably close to zero — confirm against a fresh process.
with torch.no_grad():
    output = model(sample_input)
    del output  # Explicitly delete output

_sync_cuda_if_available()
gc.collect()

# RSS after inference; report the difference and the absolute value in MiB.
mem_after = process.memory_info().rss
bytes_per_mb = 1024 ** 2
delta_mem_mb = (mem_after - mem_before) / bytes_per_mb
total_mem_mb = mem_after / bytes_per_mb

print(f"Δ Memory Used During Inference: {delta_mem_mb:.2f} MB")
print(f"Total Memory Usage After Inference: {total_mem_mb:.2f} MB")


Δ Memory Used During Inference: 0.00 MB
Total Memory Usage After Inference: 314.69 MB


In [27]:
# Assumed power draw in watts (conservative CPU-only active power draw).
ENERGY_ESTIMATED_WATT = 12

# avg_time is in milliseconds; seconds x watts gives joules.
avg_time_seconds = avg_time / 1000
inference_energy = avg_time_seconds * ENERGY_ESTIMATED_WATT
print(f"Estimated Energy per Inference: {inference_energy:.3f} J")


Estimated Energy per Inference: 0.204 J


In [28]:
import subprocess
import sys

# Run memory_benchmark.py in a separate interpreter (the one running this
# notebook) and capture what it writes to stdout and stderr as text.
benchmark_cmd = [sys.executable, "memory_benchmark.py"]
result = subprocess.run(benchmark_cmd, capture_output=True, text=True)

print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)


STDOUT:
 Δ Memory Used During Inference: 11.05 MB
Total Memory Usage After Inference: 189.03 MB

STDERR:

