In [2]:
import os
import time
import torch
from pathlib import Path

# optional libs
try:
    from ptflops import get_model_complexity_info
    HAS_PT = True
except Exception:
    HAS_PT = False
try:
    from thop import profile as thop_profile
    HAS_THOP = True
except Exception:
    HAS_THOP = False
try:
    from torchinfo import summary
    HAS_INFO = True
except Exception:
    HAS_INFO = False

import timm

# CONFIG / auto-find checkpoint
PROJECT_ROOT = Path(r"C:\Local D\Galeri Belajar\Project\Computer Vision\scancer")
pth_files = list(PROJECT_ROOT.rglob("*.pth"))
ptl_files = list(PROJECT_ROOT.rglob("*.ptl")) + list(PROJECT_ROOT.rglob("*.pt"))
if pth_files:
    CKPT = str(pth_files[0])
    CKPT_TYPE = "pth"
elif ptl_files:
    CKPT = str(ptl_files[0])
    CKPT_TYPE = "ptl"
else:
    raise FileNotFoundError(f"No .pth or .ptl found under {PROJECT_ROOT} - place checkpoint or set CKPT manually")

print("Using checkpoint:", CKPT, "type:", CKPT_TYPE)

# model meta (adjust if needed for your training)
MODEL_NAME = "mobilenetv4_hybrid_medium.e500_r224_in1k"
NUM_CLASSES = 10
INPUT_SIZE = (3, 224, 224)  # (C,H,W)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def safe_load_state_dict_into(model, state):
    # Try common checkpoint formats
    try:
        model.load_state_dict(state)
        return model
    except Exception:
        if isinstance(state, dict):
            # common keys
            for k in ("model_state_dict", "state_dict", "model"):
                if k in state:
                    try:
                        model.load_state_dict(state[k])
                        return model
                    except Exception:
                        continue
    raise RuntimeError("Failed to load checkpoint into model - unsupported format")

# If .pth: rebuild nn.Module; if .ptl: load TorchScript and wrap
if CKPT_TYPE == "pth":
    model = timm.create_model(MODEL_NAME, pretrained=False, num_classes=NUM_CLASSES)
    state = torch.load(CKPT, map_location="cpu")
    try:
        model = safe_load_state_dict_into(model, state)
    except Exception as e:
        print("Warning: could not load state_dict into timm model:", e)
    model.eval().to(DEVICE)
    is_script = False
else:
    # load TorchScript / lite
    script_mod = torch.jit.load(CKPT, map_location="cpu")
    script_mod.eval()
    class ScriptWrapper(torch.nn.Module):
        def __init__(self, script_mod):
            super().__init__()
            self.script_mod = script_mod
        def forward(self, x):
            if x.dim() == 3:
                x = x.unsqueeze(0)
            # some mobile modules expect channels_last; make contiguous but keep robust
            try:
                x = x.contiguous(memory_format=torch.channels_last)
            except Exception:
                x = x.contiguous()
            return self.script_mod(x)
    model = ScriptWrapper(script_mod).to(DEVICE)
    is_script = True

# params & size
def count_params(m):
    try:
        return sum(p.numel() for p in m.parameters())
    except Exception:
        try:
            sd = m.state_dict()
            return sum(v.numel() for v in sd.values())
        except Exception:
            return None

params_total = count_params(model if not is_script else script_mod)
print("Total params:", params_total if params_total is not None else "unknown (script/prepacked)")

ckpt_size_mb = Path(CKPT).stat().st_size / (1024*1024)
print("Checkpoint size (MB):", f"{ckpt_size_mb:.3f}")

# FLOPs: prefer ptflops on nn.Module; for script attempt wrapper; fallback to thop
if HAS_PT:
    try:
        # ptflops expects (C,H,W)
        flops, params_pt = get_model_complexity_info(model, (INPUT_SIZE[0], INPUT_SIZE[1], INPUT_SIZE[2]),
                                                    as_strings=False, print_per_layer_stat=False)
        if flops is not None:
            print("FLOPs:", f"{int(flops):,}", "->", f"{flops/1e9:.4f}", "GFLOPs")
        else:
            print("ptflops returned None")
    except Exception as e:
        print("ptflops failed:", e)

elif HAS_THOP:
    try:
        dummy = torch.randn(1, *INPUT_SIZE).to(DEVICE)
        macs, params_thop = thop_profile(model, inputs=(dummy,), verbose=False)
        flops_est = macs * 2
        print("thop -> MACs:", f"{int(macs):,}", "Est FLOPs:", f"{int(flops_est):,}", f"({flops_est/1e9:.4f} GFLOPs)")
    except Exception as e:
        print("thop failed:", e)
else:
    print("ptflops/thop not installed; skipping FLOPs estimation")

# torchinfo summary (only for nn.Module CPU)
if HAS_INFO and not is_script:
    try:
        summary(model.cpu(), input_size=(1, *INPUT_SIZE), col_names=("input_size", "output_size", "num_params"))
    except Exception as e:
        print("torchinfo failed:", e)

# latency & throughput
def measure(device, iters=50, warmup=10, batch=1):
    model.to(device)
    dummy = torch.randn(batch, *INPUT_SIZE).to(device)
    with torch.no_grad():
        for _ in range(warmup):
            _ = model(dummy)
        if device.type == "cuda":
            torch.cuda.synchronize()
        t0 = time.time()
        for _ in range(iters):
            _ = model(dummy)
            if device.type == "cuda":
                torch.cuda.synchronize()
        t1 = time.time()
    avg_ms = (t1 - t0) / iters * 1000
    imgs_per_s = 1000 / avg_ms * batch if avg_ms > 0 else float("inf")
    return avg_ms, imgs_per_s

print("Measuring on device:", DEVICE)
lat_ms, ips = measure(DEVICE, iters=30, warmup=5, batch=1)
print(f"Latency avg: {lat_ms:.2f} ms, Throughput: {ips:.1f} img/s")

# GPU memory peek
if DEVICE.type == "cuda":
    import gc
    torch.cuda.empty_cache(); gc.collect()
    with torch.no_grad():
        _ = model(torch.randn(1, *INPUT_SIZE).to(DEVICE))
    torch.cuda.synchronize()
    print("CUDA allocated (MB):", torch.cuda.memory_allocated() / 1024**2)
    print("CUDA reserved (MB):", torch.cuda.memory_reserved() / 1024**2)

# short profiling (best-effort)
try:
    import torch.profiler
    activities = [torch.profiler.ProfilerActivity.CPU]
    if DEVICE.type == "cuda":
        activities.append(torch.profiler.ProfilerActivity.CUDA)
    with torch.profiler.profile(activities=activities, record_shapes=True, with_stack=False) as prof:
        model.to(DEVICE)
        inp = torch.randn(1, *INPUT_SIZE).to(DEVICE)
        with torch.no_grad():
            model(inp)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
except Exception as e:
    print("profiler skipped:", e)

# Done
print("Done.")

Using checkpoint: C:\Local D\Galeri Belajar\Project\Computer Vision\scancer\model\best_model.pth type: pth
Total params: 9806458
Checkpoint size (MB): 37.922
FLOPs: 949,824,320 -> 0.9498 GFLOPs
Total params: 9806458
Checkpoint size (MB): 37.922
FLOPs: 949,824,320 -> 0.9498 GFLOPs
Measuring on device: cpu
Measuring on device: cpu
Latency avg: 77.01 ms, Throughput: 13.0 img/s
Latency avg: 77.01 ms, Throughput: 13.0 img/s
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::conv2d         0.82%     798.100us        67.75%      66.188ms     551.567us           120  
                           aten::convolution         