In [None]:
import torch
import cv2
import numpy as np
from retinaface.models.retinaface import RetinaFace
from retinaface.utils.box_utils import decode, decode_landm
from retinaface.data import cfg_mnet
from retinaface.layers.functions.prior_box import PriorBox
from retinaface.utils.nms.py_cpu_nms import py_cpu_nms

# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"[INFO] Using device: {device}")
if use_cuda:
    # Report the name of CUDA device index 0.
    print("GPU:", torch.cuda.get_device_name(0))

# Load image
img_path = "test.jpg"
img_raw = cv2.imread(img_path, cv2.IMREAD_COLOR)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with the offending path rather than with an unrelated
# error further down the script.
if img_raw is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

# Preprocess into the network input. img_raw itself is left untouched because
# the detections are drawn onto it later.
img = np.float32(img_raw)
# NOTE(review): presumably the per-channel (B, G, R) means the pretrained
# weights expect -- confirm against the training configuration.
img -= (104, 117, 123)
img = img.transpose(2, 0, 1)  # move the channel axis to the front (H, W, C -> C, H, W)
img = torch.from_numpy(img).unsqueeze(0).to(device)  # add a leading batch axis

# Model config and load
cfg = cfg_mnet
# Build the network with phase='test', move it to the selected device and
# switch it to evaluation mode (Module.to and Module.eval both return the module).
net = RetinaFace(cfg=cfg, phase='test').to(device).eval()

# Load pretrained weights manually
# NOTE(review): torch.load unpickles the file -- only use checkpoints from a
# trusted source.
checkpoint = torch.load("retinaface_mobile025.pth", map_location=device)
net.load_state_dict(checkpoint)
print("[INFO] Model loaded successfully.")

# Inference
# Forward-only pass: no_grad avoids building an autograd graph, and it lets the
# outputs be converted with .cpu().numpy() without the legacy `.data` detour.
with torch.no_grad():
    loc, conf, landms = net(img)

    # Decode outputs
    im_height, im_width, _ = img_raw.shape
    priorbox = PriorBox(cfg, image_size=(im_height, im_width))
    priors = priorbox.forward().to(device)

    boxes = decode(loc.squeeze(0), priors, cfg['variance'])
    landms = decode_landm(landms.squeeze(0), priors, cfg['variance'])

    # The scale vectors below have 4 and 10 entries respectively. Check the
    # decoded widths explicitly so a mismatch is reported by name instead of
    # as an anonymous broadcasting RuntimeError.
    if boxes.shape[-1] != 4:
        raise ValueError(
            f"Expected decoded boxes with 4 columns, got shape {tuple(boxes.shape)}"
        )
    if landms.shape[-1] != 10:
        raise ValueError(
            f"Expected decoded landmarks with 10 columns, got shape {tuple(landms.shape)}"
        )

    # Multiply by image width/height (presumably the decoded values are
    # normalised to [0, 1] -- confirm against box_utils). The original `/ 1`
    # factors were no-ops and have been dropped.
    scale = torch.tensor(
        [im_width, im_height] * 2, dtype=boxes.dtype, device=device
    )
    boxes = (boxes * scale).cpu().numpy()

    landm_scale = torch.tensor(
        [im_width, im_height] * 5, dtype=landms.dtype, device=device
    )
    landms = (landms * landm_scale).cpu().numpy()

    # Column 1 of the confidence output is used as the detection score.
    conf = conf.squeeze(0).cpu().numpy()[:, 1]

# Drop near-zero scores before NMS. Assuming py_cpu_nms is the usual greedy
# NMS (a box can only be suppressed by a higher-scoring one), this leaves the
# surviving higher-scoring detections unchanged while sparing the CPU NMS from
# iterating over every prior. The cutoff is far below the 0.5 score used when
# drawing.
PRE_NMS_CONF_THRESHOLD = 0.02
candidates = conf > PRE_NMS_CONF_THRESHOLD
boxes = boxes[candidates]
landms = landms[candidates]
conf = conf[candidates]

# Each row of dets is (x1, y1, x2, y2, score); 0.4 is the NMS threshold.
dets = np.hstack((boxes, conf[:, np.newaxis])).astype(np.float32)
keep = py_cpu_nms(dets, 0.4)
dets = dets[keep]
landms = landms[keep]

# Draw results
for det, lm in zip(dets, landms):
    # Detections scoring below 0.5 are not drawn.
    if not det[4] < 0.5:
        left, top, right, bottom = (int(v) for v in det[:4])
        # Green (BGR) bounding box, 2 px thick.
        cv2.rectangle(img_raw, (left, top), (right, bottom), (0, 255, 0), 2)
        # The first ten landmark values are read as five (x, y) pairs,
        # each drawn as a filled red (BGR) dot of radius 2.
        for px, py in lm[:10].reshape(5, 2):
            cv2.circle(img_raw, (int(px), int(py)), 2, (0, 0, 255), -1)

# Show result
# Display the annotated image and block until a key is pressed, then close the window.
cv2.imshow("RetinaFace Detection", img_raw)
cv2.waitKey(0)
cv2.destroyAllWindows()


[INFO] Using device: cuda
GPU: NVIDIA GeForce RTX 3060
[INFO] Model loaded successfully.


RuntimeError: The size of tensor a (10) must match the size of tensor b (8) at non-singleton dimension 1