In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from optimum_benchmark import Benchmark, BenchmarkConfig, TorchrunConfig, InferenceConfig, PyTorchConfig
from optimum_benchmark.logging_utils import setup_logging

def run_benchmark(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0", sequence_length=128):
    """Run a single-GPU PyTorch inference benchmark and return the report as a dict.

    Args:
        model_name: Model identifier passed to ``PyTorchConfig(model=...)``.
            Defaults to the TinyLlama chat checkpoint used originally.
        sequence_length: Value used for the ``"sequence_length"`` entry of the
            inference scenario's ``input_shapes``. Defaults to 128.

    Returns:
        The result of ``Benchmark.launch(...).to_dict()``.
    """
    backend = "pytorch"

    # Configure logging first so that anything the config objects log while
    # being constructed is emitted at INFO level.
    setup_logging(level="INFO")

    # Configure the launcher for a single GPU (nproc_per_node=1)
    launcher_config = TorchrunConfig(nproc_per_node=1)

    # Configure the inference scenario to measure latency and memory,
    # and set the input shape for the sequence length
    scenario_config = InferenceConfig(
        latency=True,
        memory=True,
        input_shapes={"sequence_length": sequence_length},
    )

    # Configure the backend with model details and GPU device settings.
    # The no_weights=True flag can be used if you don't need to load model weights (for a dry run, etc.)
    backend_config = PyTorchConfig(
        model=model_name,
        device="cuda",
        device_ids="0",
        no_weights=True,
    )

    # Define the overall benchmark configuration
    benchmark_config = BenchmarkConfig(
        name=f"{backend}_{model_name}",
        scenario=scenario_config,
        launcher=launcher_config,
        backend=backend_config,
    )

    # Launch the benchmark and convert the report to a dictionary
    benchmark_report = Benchmark.launch(benchmark_config)
    return benchmark_report.to_dict()

if __name__ == "__main__":
    # Execute the benchmark once; the report dict stays bound to `results`.
    results = run_benchmark()
    # Emit the header and the raw dict on separate lines with a single call.
    print("Benchmark Results:", results, sep="\n")


[2025-03-14 11:07:11,588][backend][INFO] - CUDA_VISIBLE_DEVICES was set to 0.
[2025-03-14 11:07:11,816][torchrun][INFO] - Allocated torchrun launcher
[RANK-PROCESS-0][2025-03-14 11:07:28,561][torchrun][INFO] - 	+ Setting torch.distributed cuda device to 0
[RANK-PROCESS-0][2025-03-14 11:07:28,614][torchrun][INFO] - 	+ Initializing torch.distributed process group
[RANK-PROCESS-0][2025-03-14 11:07:28,668][datasets][INFO] - PyTorch version 2.1.2 available.
[RANK-PROCESS-0][2025-03-14 11:07:30,074][pytorch][INFO] - Allocating pytorch backend
[RANK-PROCESS-0][2025-03-14 11:07:30,074][pytorch][INFO] - 	+ Seeding backend with 42
[RANK-PROCESS-0][2025-03-14 11:07:30,075][pytorch][INFO] - 	+ Benchmarking a Transformers model
[RANK-PROCESS-0][2025-

In [11]:
import json

def display_dict_nicely(d, indent=4):
    """Pretty-print ``d`` as indented JSON on standard output.

    Args:
        d: The object to display, typically a nested dict.
        indent: Number of spaces per nesting level. Defaults to 4.

    Values that ``json`` cannot serialise natively (paths, sets, datetimes,
    ...) are rendered with ``str()`` instead of raising ``TypeError``, since
    this helper is for display only. Unsupported dictionary *keys* still
    raise ``TypeError``.
    """
    print(json.dumps(d, indent=indent, default=str))

# Example usage: a hard-coded sample containing one `load_model` section
# (presumably pasted from an earlier benchmark report — confirm before reuse).
your_dict = {
    'load_model': {
        'memory': {
            'unit': 'MB',
            'max_ram': 842.002432,
            'max_global_vram': 14725.218304,
            'max_process_vram': 4915.724288,
            'max_reserved': 4401.922048,
            'max_allocated': 4400.195072,
        },
        'latency': {
            'unit': 's',
            'values': [0.43676364135742185],
            'count': 1,
            'total': 0.43676364135742185,
            'mean': 0.43676364135742185,
            'p50': 0.43676364135742185,
            'p90': 0.43676364135742185,
            'p95': 0.43676364135742185,
            'p99': 0.43676364135742185,
            'stdev': 0,
            'stdev_': 0,
        },
        'throughput': None,
        'energy': None,
        'efficiency': None,
    },
}

display_dict_nicely(your_dict)


{
    "load_model": {
        "memory": {
            "unit": "MB",
            "max_ram": 842.002432,
            "max_global_vram": 14725.218304,
            "max_process_vram": 4915.724288,
            "max_reserved": 4401.922048,
            "max_allocated": 4400.195072
        },
        "latency": {
            "unit": "s",
            "values": [
                0.43676364135742185
            ],
            "count": 1,
            "total": 0.43676364135742185,
            "mean": 0.43676364135742185,
            "p50": 0.43676364135742185,
            "p90": 0.43676364135742185,
            "p95": 0.43676364135742185,
            "p99": 0.43676364135742185,
            "stdev": 0,
            "stdev_": 0
        },
        "throughput": null,
        "energy": null,
        "efficiency": null
    }
}
