In [8]:
import time
import torch
from torch.backends import cudnn
from matplotlib import colors
import cv2
import numpy as np

from backbone import EfficientDetBackbone
from efficientdet.utils import BBoxTransform, ClipBoxes
from utils.utils import postprocess, STANDARD_COLORS, standard_to_bgr, get_index_label, plot_one_box

compound_coef = 0
force_input_size = None  # set None to use default size

# Camera setup
cap = cv2.VideoCapture(0)  # 0 for the default camera, or you can use the index for an external camera
if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

# Replace this part with your project's anchor config
anchor_ratios = [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]
anchor_scales = [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]

threshold = 0.2
iou_threshold = 0.2

use_cuda = True
use_float16 = False
cudnn.fastest = True
cudnn.benchmark = True

obj_list = ['belt', 'sunglasses', 'boot', 'cowboy_hat', 'jacket']

color_list = standard_to_bgr(STANDARD_COLORS)
# tf bilinear interpolation is different from any other's, just make do
input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size

# Load the model
model = EfficientDetBackbone(compound_coef=compound_coef, num_classes=len(obj_list),
                             ratios=anchor_ratios, scales=anchor_scales)
model.load_state_dict(torch.load(f'efficientdet-d0_19_7520.pth', map_location='cpu', weights_only=False))
model.requires_grad_(False)
model.eval()

if use_cuda:
    model = model.cuda()
if use_float16:
    model = model.half()

# Preprocessing utilities
regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()

def preprocess(ori_img, max_size=512, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    # Resize and pad the image to maintain aspect ratio
    h, w, _ = ori_img.shape
    scale = min(max_size / h, max_size / w)
    new_h, new_w = int(h * scale), int(w * scale)
    resized_img = cv2.resize(ori_img, (new_w, new_h))

    # Padding
    top, bottom = (max_size - new_h) // 2, (max_size - new_h + 1) // 2
    left, right = (max_size - new_w) // 2, (max_size - new_w + 1) // 2
    padded_img = cv2.copyMakeBorder(resized_img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

    # Normalize the image
    normalized_img = (padded_img[..., ::-1] / 255 - mean) / std  # BGR -> RGB normalization

    # Add a batch dimension for the model
    framed_img = np.expand_dims(normalized_img, axis=0)
    
    # Return the image with meta information (including padding)
    framed_meta = {
        'height': h,
        'width': w,
        'scale': scale,
        'new_w': new_w,
        'new_h': new_h,
        'padding_w': left + right,
        'padding_h': top + bottom
    }

    return framed_img, framed_meta

def invert_affine(metas, preds):
    for i in range(len(preds)):
        # Ensure metas is a list or dict
        meta = metas[i]
        
        # Check the structure of rois
        if 'rois' in preds[i]:
            print(f"Preds Rois: {preds[i]['rois'].shape}")
        else:
            print(f"No 'rois' key in preds[i], structure: {preds[i]}")
        
        # If rois is a 1D array, we need to handle it accordingly
        if preds[i]['rois'].ndim == 1:
            preds[i]['rois'] = np.expand_dims(preds[i]['rois'], axis=0)

        # Now, proceed with the inverse transformation if rois is 2D
        new_w, new_h = meta['new_w'], meta['new_h']
        old_w, old_h = meta['width'], meta['height']
        padding_w, padding_h = meta['padding_w'], meta['padding_h']

        preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] * (new_w / old_w)
        preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] * (new_h / old_h)

        # Remove padding
        preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] - padding_w
        preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] - padding_h

    return preds

# Main loop to capture frames from webcam
while True:
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        print("Error: Failed to capture image.")
        break

    # Preprocess the frame (resize, normalize, etc.)
    ori_img = frame.copy()
    framed_img, framed_meta = preprocess(ori_img, max_size=input_size)

    # Prepare the input for the model (convert to tensor)
    if use_cuda:
        x = torch.from_numpy(framed_img).cuda()
    else:
        x = torch.from_numpy(framed_img)
    
    x = x.to(torch.float32 if not use_float16 else torch.float16).permute(0, 3, 1, 2)

    # Run inference
    with torch.no_grad():
        features, regression, classification, anchors = model(x)
        out = postprocess(x, anchors, regression, classification, regressBoxes, clipBoxes, threshold, iou_threshold)
        out = invert_affine([framed_meta], out)

    # Display results
    for i in range(len(out)):
        if len(out[i]['rois']) == 0:
            continue

        for j in range(len(out[i]['rois'])):
            x1, y1, x2, y2 = out[i]['rois'][j].astype(int)
            obj = obj_list[out[i]['class_ids'][j]]
            score = float(out[i]['scores'][j])
            plot_one_box(ori_img, [x1, y1, x2, y2], label=obj, score=score, color=color_list[get_index_label(obj, obj_list)])

    # Display the processed frame with detections
    cv2.imshow('EfficientDet Inference', ori_img)

    # Press 'q' to exit the loop
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the camera and close OpenCV windows
cap.release()
cv2.destroyAllWindows()


Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: (2, 4)
Preds Rois: 

KeyboardInterrupt: 