In [None]:
# import torch
# print("PyTorch built with CUDA:", torch.version.cuda)

# print("GPU compute capability:", torch.cuda.get_device_capability(0) if torch.cuda.is_available() else "No GPU detected")
# print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

# print("CUDA runtime version:", torch.version.cuda)
# print("Is CUDA available?", torch.cuda.is_available())


  import pynvml  # type: ignore[import]


PyTorch built with CUDA: 12.6
GPU compute capability: (7, 5)
GPU name: NVIDIA GeForce GTX 1650 Ti with Max-Q Design
CUDA runtime version: 12.6
Is CUDA available? True


In [2]:
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running benchmarks on device: {DEVICE}")

print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


Running benchmarks on device: cuda
2.8.0+cu126
12.6
True


In [2]:
# from app.utils import get_ram_usage_mb


# import sys
# import os

# # Add project root (two levels up from notebooks/)
# project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# sys.path.append(project_root)

import utils


  import pynvml  # type: ignore[import]


In [4]:
import os
import time
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import psutil
import utils  # our helper functions (in app/utils.py)


# -----------------------------
# CONFIG
# -----------------------------
BATCH_SIZES = [1, 4, 8, 16, 32]
INPUT_SHAPE = (3, 224, 224)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESULTS_CSV = os.path.join("results", "benchmark_results.csv")
LOG_DIR = os.path.join("logs", "tensorboard")


def benchmark_densenet():
    os.makedirs("results", exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

    writer = SummaryWriter(LOG_DIR)
    results = []

    print(f"Running benchmarks on device: {DEVICE}")

    # -----------------------------
    # Load model
    # -----------------------------
    start_time = time.time()
    model = models.densenet121(weights=models.DenseNet121_Weights.IMAGENET1K_V1).to(DEVICE)  # no pretrained weights
    model.eval()
    model_load_time = (time.time() - start_time) * 1000  # ms

    # -----------------------------
    # Run for each batch size
    # -----------------------------
    for batch_size in BATCH_SIZES:
        print(f"\nBenchmarking batch size = {batch_size}")
        inputs = torch.randn(batch_size, *INPUT_SHAPE).to(DEVICE)

        # warmup
        with torch.no_grad():
            for _ in range(3):
                _ = model(inputs)

        # profiling
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if DEVICE == "cuda" else [ProfilerActivity.CPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True
        ) as prof:
            with record_function("model_inference"):
                torch.cuda.synchronize() if DEVICE == "cuda" else None
                start = time.time()
                with torch.no_grad():
                    outputs = model(inputs)
                torch.cuda.synchronize() if DEVICE == "cuda" else None
                end = time.time()

        # latency and throughput
        latency_ms = (end - start) * 1000
        throughput = batch_size / (end - start)

        # system stats
        ram_usage = utils.get_ram_usage_mb()
        vram_usage = utils.get_vram_usage_mb() if DEVICE == "cuda" else 0
        cpu_util = utils.get_cpu_utilization()
        gpu_util = utils.get_gpu_utilization() if DEVICE == "cuda" else 0

        # log to tensorboard
        writer.add_scalar(f"Latency/batch_{batch_size}", latency_ms)
        writer.add_scalar(f"Throughput/batch_{batch_size}", throughput)
        writer.add_scalar(f"RAM_Usage_MB/batch_{batch_size}", ram_usage)
        if DEVICE == "cuda":
            writer.add_scalar(f"VRAM_Usage_MB/batch_{batch_size}", vram_usage)
            writer.add_scalar(f"GPU_Utilization/batch_{batch_size}", gpu_util)

        # append to results
        results.append({
            "model_variant": "densenet121_baseline",
            "batch_size": batch_size,
            "device": DEVICE,
            "ram_usage_mb": ram_usage,
            "vram_usage_mb": vram_usage,
            "cpu_utilization_pct": cpu_util,
            "gpu_utilization_pct": gpu_util,
            "latency_ms": latency_ms,
            "throughput_samples_sec": throughput,
            "accuracy_top1": "NA",  # will be added in Part 2
            "accuracy_top5": "NA",
            "model_size_mb": utils.get_model_size_mb(model),
            "optimization_technique": "baseline",
            "model_load_time_ms": model_load_time
        })

    writer.close()

    # save CSV
    df = pd.DataFrame(results)
    df.to_csv(RESULTS_CSV, index=False)
    print(f"\n✅ Results saved to {RESULTS_CSV}")


if __name__ == "__main__":
    benchmark_densenet()


Running benchmarks on device: cuda

Benchmarking batch size = 1

Benchmarking batch size = 4

Benchmarking batch size = 8

Benchmarking batch size = 16

Benchmarking batch size = 32

✅ Results saved to results\benchmark_results.csv


In [None]:
import os
import time
import torch
import pandas as pd
from torch.profiler import profile, record_function, ProfilerActivity
from torchvision import models
import utils
import optimisations  # NEW

# Add optimization techniques
# OPTIMIZATIONS = ["baseline", "amp", "jit", "quantization"]
OPTIMIZATIONS = ["baseline", "amp"]

def benchmark_densenet(batch_sizes=[1, 4, 8, 16, 32], device=None):
    results = []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(device) 
    sample_input = torch.randn(1, 3, 224, 224).to(device)

    for opt in OPTIMIZATIONS:
        for batch_size in batch_sizes:
            input_tensor = torch.randn(batch_size, 3, 224, 224).to(device)

            # Load model with optimization
            model = optimisations.get_model(opt, device, sample_input)
            model.eval()

            # Warmup
            for _ in range(5):
                with torch.no_grad():
                    if opt == "amp" and device == "cuda":
                        with torch.amp.autocast("cuda"):
                            _ = model(input_tensor)
                    else:
                        _ = model(input_tensor)

            # Measure inference time
            torch.cuda.synchronize() if device == "cuda" else None
            start_time = time.time()
            with torch.no_grad():
                if opt == "amp" and device == "cuda":
                    with torch.amp.autocast("cuda"):
                        _ = model(input_tensor)
                else:
                    _ = model(input_tensor)
            torch.cuda.synchronize() if device == "cuda" else None
            latency = (time.time() - start_time) * 1000  # ms

            throughput = batch_size / (latency / 1000)

            # Collect metrics
            ram_usage = utils.get_ram_usage_mb()
            vram_usage = utils.get_vram_usage_mb() if device == "cuda" else None
            cpu_util = utils.get_cpu_utilization()
            gpu_util = utils.get_gpu_utilization() if device == "cuda" else None
            model_size_mb = utils.get_model_size_mb(model)

            results.append({
                "model_variant": "densenet121",
                "batch_size": batch_size,
                "device": device,
                "ram_usage_mb": ram_usage,
                "vram_usage_mb": vram_usage,
                "cpu_utilization_pct": cpu_util,
                "gpu_utilization_pct": gpu_util,
                "latency_ms": latency,
                "throughput_samples_sec": throughput,
                "accuracy_top1": None,   # still NA until dataset added
                "accuracy_top5": None,   # still NA
                "model_size_mb": model_size_mb,
                "optimization_technique": opt,
            })
            
            print(f"optimization: {opt} and batch size : {batch_size}")

    # Save results
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    results_dir = os.path.join(project_root, "results")
    os.makedirs(results_dir, exist_ok=True)

    result_csv = os.path.join(results_dir, "benchmark_results.csv")
    df = pd.DataFrame(results)

    # overwrite safe
    if os.path.exists(result_csv):
        os.remove(result_csv)
    df.to_csv(result_csv, index=False)

    print(f"[INFO] Benchmarking complete. Results saved to {result_csv}")


if __name__ == "__main__":
    benchmark_densenet()


cuda
optimization: baseline and batch size : 1
optimization: baseline and batch size : 4
optimization: baseline and batch size : 8
optimization: baseline and batch size : 16
optimization: baseline and batch size : 32
optimization: amp and batch size : 1
optimization: amp and batch size : 4
optimization: amp and batch size : 8
optimization: amp and batch size : 16
optimization: amp and batch size : 32


NameError: name '__file__' is not defined

In [1]:
import os
import time
import torch
import pandas as pd
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.tensorboard import SummaryWriter
from torchvision import models
import utils
import optimisations  # NEW

# Which optimization techniques to benchmark
OPTIMIZATIONS = ["quantization", "baseline", "amp"]  # add "jit", "quantization" later if needed
BATCH_SIZES = [1, 4, 8, 16, 32]

def benchmark_densenet(batch_sizes=BATCH_SIZES, device=None):
    results = []

    # -----------------------------
    # Setup device
    # -----------------------------
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[INFO] Running benchmarks on device: {device}")

    # Setup TensorBoard writer
    try:
        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    except NameError:
        # fallback for Jupyter notebooks
        project_root = os.getcwd()
        
    log_dir = os.path.join(project_root, "logs", "tensorboard")
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)

    # Sample input for optimisations (needed by JIT / AMP)
    sample_input = torch.randn(1, 3, 224, 224).to(device)

    # -----------------------------
    # Loop over optimizations
    # -----------------------------
    for opt in OPTIMIZATIONS:
        print(f"\n[INFO] === Optimization: {opt} ===")

        # Load model once per optimization
        start_time = time.time()
        
        # model = optimisations.get_model(opt, device, sample_input)
        model_full = optimisations.get_model("baseline", device, sample_input)  # get plain model first
        if opt == "quantization":
            model = optimisations.apply_quantization(model_full.cpu())  # quantize on CPU
            run_device = "cpu"
        else:
            model = optimisations.get_model(opt, device, sample_input)
            run_device = device

        model.eval()
        model_load_time = (time.time() - start_time) * 1000  # ms

        for batch_size in batch_sizes:
            print(f"[INFO] Benchmarking batch size = {batch_size}")
            input_tensor = torch.randn(batch_size, 3, 224, 224).to(run_device)

            # Warmup
            with torch.no_grad():
                for _ in range(3):
                    if opt == "amp" and device == "cuda":
                        with torch.amp.autocast("cuda"):
                            _ = model(input_tensor)
                    else:
                        _ = model(input_tensor)

            # Profiling + inference
            with profile(
                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA] if device == "cuda" else [ProfilerActivity.CPU],
                record_shapes=True,
                profile_memory=True,
                with_stack=True
            ) as prof:
                with record_function("model_inference"):
                    torch.cuda.synchronize() if device == "cuda" else None
                    start = time.time()
                    with torch.no_grad():
                        if opt == "amp" and device == "cuda":
                            with torch.amp.autocast("cuda"):
                                outputs = model(input_tensor)
                        else:
                            outputs = model(input_tensor)
                    torch.cuda.synchronize() if device == "cuda" else None
                    end = time.time()

            # Latency & throughput
            latency_ms = (end - start) * 1000
            throughput = batch_size / (end - start)

            # Collect metrics
            ram_usage = utils.get_ram_usage_mb()
            vram_usage = utils.get_vram_usage_mb() if device == "cuda" else None
            cpu_util = utils.get_cpu_utilization()
            gpu_util = utils.get_gpu_utilization() if device == "cuda" else None
            model_size_mb = utils.get_model_size_mb(model)

            # Log to TensorBoard
            writer.add_scalar(f"{opt}/Latency_batch_{batch_size}", latency_ms)
            writer.add_scalar(f"{opt}/Throughput_batch_{batch_size}", throughput)
            writer.add_scalar(f"{opt}/RAM_MB_batch_{batch_size}", ram_usage)
            if device == "cuda":
                writer.add_scalar(f"{opt}/VRAM_MB_batch_{batch_size}", vram_usage)
                writer.add_scalar(f"{opt}/GPU_util_batch_{batch_size}", gpu_util)

            # Append to results
            results.append({
                "model_variant": "densenet121",
                "batch_size": batch_size,
                "device": device,
                "ram_usage_mb": ram_usage,
                "vram_usage_mb": vram_usage,
                "cpu_utilization_pct": cpu_util,
                "gpu_utilization_pct": gpu_util,
                "latency_ms": latency_ms,
                "throughput_samples_sec": throughput,
                "accuracy_top1": None,   # will be added in Part 2
                "accuracy_top5": None,
                "model_size_mb": model_size_mb,
                "optimization_technique": opt,
                "model_load_time_ms": model_load_time,
            })

            print(f"[DONE] {opt} | batch={batch_size} | "
                  f"latency={latency_ms:.2f}ms | throughput={throughput:.2f}/s")

    # -----------------------------
    # Save results
    # -----------------------------
    results_dir = os.path.join(project_root, "results")
    os.makedirs(results_dir, exist_ok=True)
    result_csv = os.path.join(results_dir, "benchmark_results.csv")

    df = pd.DataFrame(results)
    if os.path.exists(result_csv):
        os.remove(result_csv)
    df.to_csv(result_csv, index=False)

    writer.close()
    print(f"\n✅ Benchmarking complete. Results saved to {result_csv}")


if __name__ == "__main__":
    benchmark_densenet()


[INFO] Running benchmarks on device: cuda

[INFO] === Optimization: quantization ===
[INFO] Benchmarking batch size = 1
[DONE] quantization | batch=1 | latency=118.00ms | throughput=8.47/s
[INFO] Benchmarking batch size = 4
[DONE] quantization | batch=4 | latency=428.99ms | throughput=9.32/s
[INFO] Benchmarking batch size = 8
[DONE] quantization | batch=8 | latency=805.00ms | throughput=9.94/s
[INFO] Benchmarking batch size = 16
[DONE] quantization | batch=16 | latency=1464.00ms | throughput=10.93/s
[INFO] Benchmarking batch size = 32
[DONE] quantization | batch=32 | latency=3228.00ms | throughput=9.91/s

[INFO] === Optimization: baseline ===
[INFO] Benchmarking batch size = 1
[DONE] baseline | batch=1 | latency=88.45ms | throughput=11.31/s
[INFO] Benchmarking batch size = 4
[DONE] baseline | batch=4 | latency=51.00ms | throughput=78.43/s
[INFO] Benchmarking batch size = 8
[DONE] baseline | batch=8 | latency=63.02ms | throughput=126.95/s
[INFO] Benchmarking batch size = 16
[DONE] basel