In [2]:
import time
import torch
from torch.backends import cudnn
from matplotlib import colors
import cv2
import numpy as np

from backbone import EfficientDetBackbone
from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import postprocess, STANDARD_COLORS, standard_to_bgr, get_index_label, plot_one_box

# ---------------------------------------------------------------------------
# Script configuration, camera setup and model loading.
# ---------------------------------------------------------------------------
compound_coef = 0
force_input_size = None  # set None to use default size

# Camera setup
cap = cv2.VideoCapture(0)  # 0 for the default camera, or you can use the index for an external camera
if not cap.isOpened():
    print("Error: Could not open webcam.")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive use, and a non-zero status
    # lets callers detect the failure.
    raise SystemExit(1)

# Replace this part with your project's anchor config
anchor_ratios = [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]
anchor_scales = [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]

threshold = 0.2
iou_threshold = 0.2

use_cuda = True
use_float16 = False
if use_cuda and not torch.cuda.is_available():
    # Fall back to the CPU instead of crashing later in model.cuda().
    print("Warning: CUDA is not available, running on CPU instead.")
    use_cuda = False
cudnn.fastest = True
cudnn.benchmark = True

obj_list = ['belt', 'sunglasses', 'boot', 'cowboy_hat', 'jacket']
# obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
#             'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
#             'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie',
#             'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
#             'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
#             'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
#             'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv',
#             'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
#             'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
#             'toothbrush']

color_list = standard_to_bgr(STANDARD_COLORS)
# tf bilinear interpolation is different from any other's, just make do
input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size

# Load the model
# weights = 'weights/efficientdet-d0.pth'
weights = 'efficientdet-d0_19_7520.pth'
try:
    model = EfficientDetBackbone(compound_coef=compound_coef, num_classes=len(obj_list),
                                 ratios=anchor_ratios, scales=anchor_scales)
    # NOTE(security): weights_only=False unpickles arbitrary Python objects, so
    # only load checkpoint files that come from a trusted source.
    model.load_state_dict(torch.load(weights, map_location='cpu', weights_only=False))
    model.requires_grad_(False)
    model.eval()

    if use_cuda:
        model = model.cuda()
    if use_float16:
        model = model.half()
except Exception:
    # The webcam is already open at this point; do not leave it locked when
    # the model cannot be built or its weights cannot be loaded.
    cap.release()
    raise

# Preprocessing utilities
regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()

def preprocess(ori_img, max_size=512, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Letterbox a frame onto a square canvas and normalise it for the model.

    The image is resized so that its longer side fits ``max_size`` while the
    aspect ratio is kept, centred on a black ``max_size`` x ``max_size``
    canvas, converted from BGR to RGB channel order and normalised with
    ``mean`` and ``std``.

    Args:
        ori_img: Image array of shape (height, width, channels).
        max_size: Side length of the square output canvas, in pixels.
        mean: Per-channel mean subtracted after scaling pixels by 1/255.
        std: Per-channel standard deviation the result is divided by.

    Returns:
        A tuple ``(framed_img, framed_meta)``: the normalised image with a
        leading batch axis, and a dict holding the original size (``height``,
        ``width``), the resize factor (``scale``), the resized size
        (``new_w``, ``new_h``) and the total padding per axis
        (``padding_w``, ``padding_h``).
    """
    src_h, src_w, _ = ori_img.shape

    # One common factor for both axes keeps the aspect ratio intact.
    ratio = min(max_size / src_h, max_size / src_w)
    dst_h = int(src_h * ratio)
    dst_w = int(src_w * ratio)
    shrunk = cv2.resize(ori_img, (dst_w, dst_h))

    # Split the leftover space between both sides; an odd remainder puts the
    # extra pixel on the bottom / right edge.
    spare_h = max_size - dst_h
    spare_w = max_size - dst_w
    pad_top, pad_bottom = spare_h // 2, (spare_h + 1) // 2
    pad_left, pad_right = spare_w // 2, (spare_w + 1) // 2
    canvas = cv2.copyMakeBorder(
        shrunk, pad_top, pad_bottom, pad_left, pad_right,
        cv2.BORDER_CONSTANT, value=(0, 0, 0),
    )

    # Reverse the channel axis (BGR -> RGB), then normalise.
    rgb = canvas[..., ::-1]
    normalized = (rgb / 255 - mean) / std

    # Leading axis of length one: the model consumes batches.
    batch = normalized[np.newaxis, ...]

    meta = {
        'height': src_h,
        'width': src_w,
        'scale': ratio,
        'new_w': dst_w,
        'new_h': dst_h,
        'padding_w': pad_left + pad_right,
        'padding_h': pad_top + pad_bottom,
    }
    return batch, meta

def invert_affine(metas, preds):
    """Map predicted boxes from the padded network input back to the original frame.

    ``preprocess`` first resizes a frame to ``new_w`` x ``new_h`` and then
    centres it on a square canvas. A box expressed in canvas coordinates is
    therefore restored by removing the left/top padding offset first and
    undoing the resize afterwards; the offset is measured in canvas pixels,
    so it must not be subtracted after the rescale.

    Args:
        metas: Sequence of dicts, one per entry of ``preds``, providing the
            keys ``width``, ``height``, ``new_w``, ``new_h``, ``padding_w``
            and ``padding_h``.
        preds: Sequence of prediction dicts. Entries that have no ``'rois'``
            key, or whose ``'rois'`` is not a 2-D array, are left untouched.
            Columns 0 and 2 are treated as x coordinates, columns 1 and 3 as
            y coordinates.

    Returns:
        ``preds``, with every converted ``'rois'`` array replaced by boxes in
        original-frame pixels, clipped to the frame and rounded.
    """
    for i, pred in enumerate(preds):
        meta = metas[i]
        if 'rois' not in pred or pred['rois'].ndim != 2:
            continue

        new_w, new_h = meta['new_w'], meta['new_h']
        old_w, old_h = meta['width'], meta['height']
        # The padding is centred; the top/left share is the floored half.
        top, left = meta['padding_h'] // 2, meta['padding_w'] // 2

        # Work on a floating-point copy so integer inputs are not truncated.
        boxes = pred['rois']
        boxes = boxes.astype(np.result_type(boxes.dtype, np.float32))

        # 1) drop the padding offset, 2) undo the resize.
        boxes[:, [0, 2]] = (boxes[:, [0, 2]] - left) * (old_w / new_w)
        boxes[:, [1, 3]] = (boxes[:, [1, 3]] - top) * (old_h / new_h)

        # Keep boxes inside the frame, using each axis' own limit.
        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, old_w)
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, old_h)

        # Round away floating-point noise before the boxes become pixel indices.
        pred['rois'] = np.round(boxes)

    return preds


# Main loop: grab webcam frames, run the detector on each one and show the
# annotated result until the stream ends or the user presses 'q'.
# The try/finally guarantees that the camera and the preview window are
# released on every exit path, including exceptions and Ctrl+C.
input_dtype = torch.float16 if use_float16 else torch.float32
try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            print("Error: Failed to capture image.")
            break

        # Draw on a copy so the captured frame itself stays untouched.
        ori_img = frame.copy()
        framed_img, framed_meta = preprocess(ori_img, max_size=input_size)

        # Cast on the CPU first so that only the smaller tensor is copied to
        # the GPU, then move the channel axis in front of height and width.
        x = torch.from_numpy(framed_img).to(input_dtype)
        if use_cuda:
            x = x.cuda()
        x = x.permute(0, 3, 1, 2)

        # Run inference
        with torch.no_grad():
            _, regression, classification, anchors = model(x)
            out = postprocess(x, anchors, regression, classification,
                              regressBoxes, clipBoxes, threshold, iou_threshold)
        out = invert_affine([framed_meta], out)

        # Draw every detection of every image in the batch.
        for detections in out:
            if len(detections['rois']) == 0:
                continue

            for box, class_id, raw_score in zip(detections['rois'],
                                                detections['class_ids'],
                                                detections['scores']):
                x1, y1, x2, y2 = box.astype(int)
                score = float(raw_score)
                obj = obj_list[class_id]

                print(f" Class ID: {class_id}, Label: {obj}, Score: {score} , Bounding Box: [{x1}, {y1}, {x2}, {y2}]")
                plot_one_box(ori_img, [x1, y1, x2, y2], label=obj, score=score,
                             color=color_list[get_index_label(obj, obj_list)])

        # Display the processed frame with detections
        cv2.imshow('EfficientDet Inference', ori_img)

        # Press 'q' to exit the loop
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

#


ModuleNotFoundError: No module named 'backbone'