In [74]:
import glob
import json
import os
import random
from pathlib import Path

import cv2
import numpy as np

import sys
from os.path import abspath, join, dirname
# Put the local YOLOv7 directory on sys.path so the `utils.datasets` import
# below can be resolved. The relative path is resolved by abspath() against
# the current working directory, so this must run before the import.
base_path = abspath('./../training/model/yolov7/')
sys.path.append(base_path)

from utils.datasets import letterbox

# In the code visible in this cell the list is only interpolated into
# LoadImages' "No images found" message; no file filtering by suffix happens here.
img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng', 'webp', 'mpo']  # acceptable image suffixes

class LoadImages:
    """Iterate over the images referenced by an annotation JSON file.

    The file at ``path`` must contain a JSON object whose keys are image IDs.
    Every ID is mapped to ``<grandparent directory of path>/like/<id>.jpg``;
    the files themselves are only opened lazily while iterating.

    Each iteration step yields ``(path, img, img0, cap)``:

    * ``path`` -- image path as a string,
    * ``img``  -- the letterboxed image with the channel axis reversed and
      moved to the front, stored contiguously,
    * ``img0`` -- the array returned by ``cv2.imread`` (BGR),
    * ``cap``  -- always ``None`` (kept so callers can unpack four values).

    Args:
        path: Path to the annotation JSON file.
        img_size: Target size forwarded to ``letterbox``.
        stride: Stride forwarded to ``letterbox``.

    Raises:
        AssertionError: If the annotation file contains no image IDs, or
            (during iteration) if an image cannot be read.
    """

    def __init__(self, path, img_size=640, stride=32):
        # The annotation file is a JSON object keyed by image ID.
        with open(path, 'r') as f:
            image_ids = list(json.load(f).keys())

        # De-duplicate while preserving the order of the file. Iterating a
        # plain set of strings would shuffle the images differently on every
        # interpreter run, which makes logs impossible to compare.
        unique_ids = list(dict.fromkeys(image_ids))
        print(f'found {len(image_ids)} image IDs in {path}')
        print(f'ended up with {len(unique_ids)} IDs after converting to a set')

        # Images live in the 'like' directory two levels above the JSON file.
        image_dir = join(dirname(dirname(path)), 'like')
        images = [join(image_dir, image_id + '.jpg') for image_id in unique_ids]
        print(f'list image path length = {len(images)}')

        self.img_size = img_size
        self.stride = stride
        self.files = images
        self.nf = len(images)  # number of files
        self.video_flag = False
        self.mode = 'image'
        self.cap = None
        assert self.nf > 0, f'No images found in {path}. ' \
                            f'Supported formats are:\nimages: {img_formats}'

    def __len__(self):
        """Return the number of images, which lets tqdm display a total."""
        return self.nf

    def __iter__(self):
        """Restart iteration from the first image and return ``self``."""
        self.count = 0
        return self

    def __next__(self):
        """Load, letterbox and re-layout the next image.

        Raises:
            StopIteration: When every file has been yielded.
            AssertionError: If ``cv2.imread`` cannot read the current file.
        """
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        # Advance before reading so that a failing image is not retried forever.
        self.count += 1
        img0 = cv2.imread(path)  # BGR
        assert img0 is not None, 'Image Not Found ' + path

        # Padded resize
        img = letterbox(img0, self.img_size, stride=self.stride)[0]

        # Reverse the channel axis (BGR -> RGB) and move it first (HWC -> CHW).
        img = img[:, :, ::-1].transpose(2, 0, 1)
        img = np.ascontiguousarray(img)

        return path, img, img0, self.cap

In [104]:
def hagrid_xywh_to_xyxy(bboxes, img_shape):
    """
    Convert normalized Hagrid boxes (xywh) to pixel-space corner boxes (xyxy).

    Args:
        bboxes (list of list): Bounding boxes in the format
                               [[top left X, top left Y, width, height], ...],
                               each value expressed as a fraction of the image
                               width (X, width) or height (Y, height).
        img_shape (tuple): Shape of the image as (height, width, ...). Only the
                           first two entries are used, so both colour
                           (height, width, channels) and grayscale
                           (height, width) shapes are accepted.

    Returns:
        list of list: Converted bounding boxes in pixels, in the format
                       [[x_min, y_min, x_max, y_max], ...]. Values are not
                       rounded or clipped.
    """
    # Slicing instead of 3-way unpacking keeps 2-D (grayscale) shapes working.
    height, width = img_shape[:2]

    return [
        [x * width, y * height, (x + w) * width, (y + h) * height]
        for x, y, w, h in bboxes
    ]

In [117]:
from pathlib import Path
import torch
# import torch.backends.cudnn as cudnn
from numpy import random
import sys
from os.path import abspath
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt

# Add the new path to the system path
base_path = abspath('./../training/model/yolov7/')
sys.path.append(base_path)

from models.experimental import attempt_load
# from utils.datasets import LoadImages
from utils.general import check_img_size, non_max_suppression, scale_coords
from utils.torch_utils import select_device, TracedModel
from utils.plots import plot_one_box


def detect(opt):
    """Detect people in the images of a HaGRID annotation file and report the boxes.

    For every image the highest-confidence detection is kept as the "human"
    box; when nothing is detected the full frame is used instead. That box is
    printed together with the ground-truth gesture boxes converted to pixel
    xyxy coordinates.

    Args:
        opt: Namespace-like object. Attributes read here: ``source`` (path to
            the annotation JSON file), ``weights``, ``img_size``, ``device``,
            ``augment``, ``conf_thres``, ``iou_thres``, ``classes`` and
            ``agnostic_nms``.

    Returns:
        None. All results are printed.
    """
    source, weights, imgsz = opt.source, opt.weights, opt.img_size

    # Initialize
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size

    if half:
        model.half()  # to FP16

    # Set Dataloader
    dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # Ground-truth annotations, keyed by image ID (same file the loader reads).
    with open(source, 'r') as f:
        hagrid_annotations = dict(json.load(f).items())
    print(f'hagrid_annotations dictionary length = {len(hagrid_annotations)}')

    # Run once so the first real image does not pay the start-up cost.
    # Gradients are never needed in this function, so every forward pass is
    # wrapped in no_grad to avoid building autograd graphs.
    if device.type != 'cpu':
        with torch.no_grad():
            model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))
    old_img_w = old_img_h = imgsz
    old_img_b = 1

    for path, img, im0s, _ in tqdm(dataset):
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Warmup again whenever the input shape changes
        if device.type != 'cpu' and (old_img_b != img.shape[0] or old_img_h != img.shape[2] or old_img_w != img.shape[3]):
            old_img_b = img.shape[0]
            old_img_h = img.shape[2]
            old_img_w = img.shape[3]
            with torch.no_grad():
                for _warmup in range(3):
                    model(img, augment=opt.augment)[0]

        # Inference
        with torch.no_grad():   # Calculating gradients would cause a GPU memory leak
            pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)

        # Process detections (one entry of `pred` per image in the batch)
        for i, det in enumerate(pred):
            print(f"i {i}")
            p, s, im0 = Path(path), '', im0s  # p.name=img.jpg

            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results; the label is always "person" regardless of class index
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += f"{n} person{'s' * (n > 1)}, "  # add to string
                    print(s)

                # Keep the single most confident detection
                best_bbox = None
                highest_conf = 0
                for *xyxy, conf, _cls in det:
                    conf = conf.item()  # Convert confidence tensor to float
                    if conf > highest_conf:
                        highest_conf = conf
                        best_bbox = xyxy
            else:
                # Fall back to the whole frame. The tensor is created on the
                # device selected above rather than a hard-coded 'cuda:0', so
                # CPU and non-default GPU runs work as well.
                print("no human detected")
                print(im0.shape)
                best_bbox = torch.tensor([0, 0, im0.shape[1], im0.shape[0]], device=device)
                print(best_bbox)

            # Reported inside the per-image loop so p, im0 and best_bbox always
            # belong to the detection that was just processed.
            gestures_info = hagrid_annotations[p.stem]
            gestures_bbxs = hagrid_xywh_to_xyxy(gestures_info['bboxes'], im0.shape)

            print(f'Image {p.name}') # p - path to image
            print(f'image loaded, shape: {im0.shape}') # im0 - original image
            print(f'human detected, bbox: {best_bbox}') # best bbox - human bbx for cropping in xyxy
            print(f'ground truth gestures loaded, bboxes: {gestures_bbxs}') # hand_bbxs - hands bbxs groud truth in xyxy
        

In [118]:
import argparse

# Run configuration handed to detect(). Fields such as `project`, `name`,
# `exist_ok`, `update`, `save_conf` and `nosave` are not referenced by the
# detect() defined in this notebook; they are carried along unchanged.
opt = argparse.Namespace(**{
    'weights': ['./../training/runs/train/MIAP_person_detection/weights/best.pt'],
    'source': "./../datasets/HaGRID_test/val/like.json",  # annotation JSON, not an image folder
    'img_size': 640,
    'conf_thres': 0.317,
    'iou_thres': 0.45,
    'device': '0',
    'view_img': False,
    'save_txt': False,
    'save_conf': False,
    'nosave': False,
    'classes': None,
    'agnostic_nms': False,
    'augment': False,
    'update': False,
    'project': 'runs/detect',
    'name': 'reducing_detect_func',
    'exist_ok': False,
    'no_trace': False,
})

detect(opt)

Fusing layers... 
IDetect.fuse
found 100 image IDs in ./../datasets/HaGRID_test/val/like.json
ended up with 100 IDs after converting to a set
list image path length = 100
hagrid_annotations dictionary length = 100


2it [00:00, 11.83it/s]

i 0
2 persons, 
Image 002cc8a1-7a8b-4b2f-bf0f-7f74548c37b4.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(224., device='cuda:0'), tensor(1266., device='cuda:0'), tensor(1030., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[381.702096, 1482.3213696, 532.6340112, 1697.596128]]
i 0
1 person, 
Image 0082629f-d111-424f-8a65-ae8314a7f5ec.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(0., device='cuda:0'), tensor(712., device='cuda:0'), tensor(922., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[332.163864, 1002.2755775999999, 582.5210976, 1366.3686911999998]]
i 0
1 person, 
Image 00c15276-437f-49ef-84ac-80cab0a01f26.jpg
image loaded, shape: (640, 480, 3)
human detected, bbox: [tensor(4., device='cuda:0'), tensor(4., device='cuda:0'), tensor(478., device='cuda:0'), tensor(638., device='cuda:0')]
ground truth gestures loaded, bboxes: [[123.9533472, 229.656966400

7it [00:00, 13.59it/s]

i 0
2 persons, 
Image 000f0e41-9f04-4d07-b328-2be30e3563ef.jpg
image loaded, shape: (1080, 1920, 3)
human detected, bbox: [tensor(646., device='cuda:0'), tensor(307., device='cuda:0'), tensor(1269., device='cuda:0'), tensor(979., device='cuda:0')]
ground truth gestures loaded, bboxes: [[1006.6659840000001, 445.83298560000003, 1154.6716416, 654.0506352]]
i 0
1 person, 
Image 005762c1-ddc8-4ea3-becd-05f3ec326b03.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(343., device='cuda:0'), tensor(730., device='cuda:0'), tensor(846., device='cuda:0'), tensor(1888., device='cuda:0')]
ground truth gestures loaded, bboxes: [[347.9302224, 1402.051488, 421.1425872, 1535.5348608], [568.5979968, 934.9836096, 662.0827824, 1074.2397312]]
i 0
1 person, 
Image 00277cca-b999-4527-bba3-9461fd2eb45b.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(393., device='cuda:0'), tensor(400., device='cuda:0'), tensor(965., device='cuda:0'), tensor(1662., device='cuda:0')]
g

9it [00:00, 12.92it/s]

i 0
1 person, 
Image 007c575a-9c00-4034-9e6b-3ad43ffd1285.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(16., device='cuda:0'), tensor(449., device='cuda:0'), tensor(1433., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[925.2946800000001, 959.5262784, 1298.1508992, 1545.4515456]]
i 0
1 person, 
Image 00283977-caa9-4259-9629-d60d52352e37.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(16., device='cuda:0'), tensor(663., device='cuda:0'), tensor(1428., device='cuda:0'), tensor(1902., device='cuda:0')]
ground truth gestures loaded, bboxes: [[399.30461280000003, 835.5189312, 566.24976, 1057.8582144]]
i 0
1 person, 
Image 0097b35b-dbda-40e3-82c7-70df32d39e8e.jpg
image loaded, shape: (885, 1920, 3)
human detected, bbox: [tensor(559., device='cuda:0'), tensor(251., device='cuda:0'), tensor(1280., device='cuda:0'), tensor(878., device='cuda:0')]
ground truth gestures loaded, bboxes: [[832.5265535999999, 4

13it [00:00, 13.42it/s]

i 0
1 person, 
Image 002d9ca5-18d5-4731-965a-37f0ea3ddd71.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(440., device='cuda:0'), tensor(1234., device='cuda:0'), tensor(1038., device='cuda:0'), tensor(1918., device='cuda:0')]
ground truth gestures loaded, bboxes: [[552.9581424, 1390.5638208, 692.8692192, 1587.8487167999997]]
i 0
1 person, 
Image 00928418-f7a2-41fe-a664-977d7a925c24.jpg
image loaded, shape: (720, 1280, 3)
human detected, bbox: [tensor(319., device='cuda:0'), tensor(118., device='cuda:0'), tensor(1027., device='cuda:0'), tensor(718., device='cuda:0')]
ground truth gestures loaded, bboxes: [[487.2786176, 338.895936, 636.2341504000001, 576.6089472000001]]
i 0
1 person, 
Image 00ac838c-5078-4af8-96e4-9deea13d2449.jpg
image loaded, shape: (885, 1920, 3)
human detected, bbox: [tensor(556., device='cuda:0'), tensor(144., device='cuda:0'), tensor(1310., device='cuda:0'), tensor(880., device='cuda:0')]
ground truth gestures loaded, bboxes: [[765.4828223999

15it [00:01, 12.73it/s]

i 0
1 person, 
Image 00bb069d-3f5a-4a97-b1d1-b9f0cd2355ac.jpg
image loaded, shape: (1920, 1437, 3)
human detected, bbox: [tensor(104., device='cuda:0'), tensor(674., device='cuda:0'), tensor(1329., device='cuda:0'), tensor(1870., device='cuda:0')]
ground truth gestures loaded, bboxes: [[632.75959875, 1555.9462656, 865.71822147, 1784.7566208], [902.7131541900001, 1022.7254975999999, 1055.53339083, 1280.636544]]
i 0
1 person, 
Image 0069d29e-4e71-433b-96e1-09657ca3d7eb.jpg
image loaded, shape: (1080, 1920, 3)
human detected, bbox: [tensor(521., device='cuda:0'), tensor(94., device='cuda:0'), tensor(1408., device='cuda:0'), tensor(1072., device='cuda:0')]
ground truth gestures loaded, bboxes: [[810.2366016, 349.963632, 976.1693184, 588.1658616]]
i 0
1 person, 
Image 00bdb0b3-b1d6-4a44-a69a-23d5ca6a1363.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(200., device='cuda:0'), tensor(420., device='cuda:0'), tensor(1300., device='cuda:0'), tensor(1917., device='cuda:0')]

19it [00:01, 12.19it/s]

i 0
1 person, 
Image 007e3be7-4095-469b-81aa-427c13454d77.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(282., device='cuda:0'), tensor(223., device='cuda:0'), tensor(1248., device='cuda:0'), tensor(1638., device='cuda:0')]
ground truth gestures loaded, bboxes: [[294.785928, 1253.1305664, 492.767064, 1439.4936192], [808.4094623999999, 424.2133248, 1005.2552447999999, 741.6840768]]
i 0
1 person, 
Image 009c5053-dd12-4d34-b394-747addb2624c.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(76., device='cuda:0'), tensor(208., device='cuda:0'), tensor(1312., device='cuda:0'), tensor(1881., device='cuda:0')]
ground truth gestures loaded, bboxes: [[681.6880368, 745.16304, 955.1661839999999, 1098.1962624]]
i 0
1 person, 
Image 0022cb1a-e7c1-48aa-b48c-027dca084de9.jpg
image loaded, shape: (1440, 1920, 3)
human detected, bbox: [tensor(776., device='cuda:0'), tensor(686., device='cuda:0'), tensor(1210., device='cuda:0'), tensor(1439., device='cuda:0')]

21it [00:01, 12.05it/s]

i 0
2 persons, 
Image 00c5b3b6-420d-481d-bd80-123a3c86b6ae.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(303., device='cuda:0'), tensor(1046., device='cuda:0'), tensor(1194., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[715.4166240000001, 1458.4248768, 892.8457920000001, 1729.9729536]]
i 0
1 person, 
Image 005cff15-454f-48cd-a16c-38dcedd9c092.jpg
image loaded, shape: (1920, 932, 3)
human detected, bbox: [tensor(227., device='cuda:0'), tensor(619., device='cuda:0'), tensor(782., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[422.26694384, 750.0123456, 538.21595476, 917.7186624]]
i 0
1 person, 
Image 0058a6b3-41d2-4500-b6a2-e2b922d3459b.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(446., device='cuda:0'), tensor(98., device='cuda:0'), tensor(1094., device='cuda:0'), tensor(1910., device='cuda:0')]
ground truth gestures loaded, bboxes: [[472.03388640000

25it [00:01, 11.87it/s]

i 0
2 persons, 
Image 00644ab6-291c-43b2-9308-9f832b25e5af.jpg
image loaded, shape: (1088, 1920, 3)
human detected, bbox: [tensor(712., device='cuda:0'), tensor(281., device='cuda:0'), tensor(1454., device='cuda:0'), tensor(1084., device='cuda:0')]
ground truth gestures loaded, bboxes: [[1095.7829376, 433.9949312, 1210.5086975999998, 608.01341568]]
i 0
1 person, 
Image 002f1d76-250f-4322-b1c3-4dd4906780d4.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(447., device='cuda:0'), tensor(344., device='cuda:0'), tensor(1106., device='cuda:0'), tensor(1905., device='cuda:0')]
ground truth gestures loaded, bboxes: [[795.2474016, 643.0511232, 930.6647279999999, 871.5605952]]
i 0
1 person, 
Image 004ae8d7-e079-4665-a5cf-0d9b3fbc9398.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(366., device='cuda:0'), tensor(564., device='cuda:0'), tensor(1196., device='cuda:0'), tensor(1916., device='cuda:0')]
ground truth gestures loaded, bboxes: [[622.4306976, 9

27it [00:02, 12.33it/s]

i 0
1 person, 
Image 00d33495-6758-405f-b01a-01bc90111f77.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(249., device='cuda:0'), tensor(673., device='cuda:0'), tensor(1137., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[503.6707008, 883.9590528, 634.4064432, 1112.295456]]
i 0
1 person, 
Image 0014182f-d756-42b3-896c-c52de76926bb.jpg
image loaded, shape: (1920, 886, 3)
human detected, bbox: [tensor(160., device='cuda:0'), tensor(353., device='cuda:0'), tensor(825., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[337.17857878, 684.4971072000001, 490.6545732, 885.350496]]
i 0
1 person, 
Image 00bf711f-e2cc-4cb5-b0ac-573198cd6455.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(98., device='cuda:0'), tensor(685., device='cuda:0'), tensor(1097., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[466.1243568, 1038.5552256, 

31it [00:02, 12.55it/s]

i 0
1 person, 
Image 00ba83da-684b-4e7c-b24d-704c49878cca.jpg
image loaded, shape: (720, 1280, 3)
human detected, bbox: [tensor(421., device='cuda:0'), tensor(226., device='cuda:0'), tensor(1079., device='cuda:0'), tensor(717., device='cuda:0')]
ground truth gestures loaded, bboxes: [[658.8876288, 385.0527744, 794.696704, 580.2317856]]
i 0
1 person, 
Image 00bbfad5-cb07-4511-9a9b-c81708a4f35a.jpg
image loaded, shape: (1440, 1920, 3)
human detected, bbox: [tensor(485., device='cuda:0'), tensor(225., device='cuda:0'), tensor(1629., device='cuda:0'), tensor(1436., device='cuda:0')]
ground truth gestures loaded, bboxes: [[979.0283903999999, 789.6855024, 1180.7371584, 1069.5068928]]
i 0
1 person, 
Image 009f2d36-52cc-4564-8db8-4586a41243f7.jpg
image loaded, shape: (1920, 885, 3)
human detected, bbox: [tensor(150., device='cuda:0'), tensor(758., device='cuda:0'), tensor(716., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[151.57923345, 1500.8573376,

33it [00:02, 12.46it/s]

i 0
1 person, 
Image 0035dfca-3c94-49f4-93fb-df03dc1514ae.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(396., device='cuda:0'), tensor(748., device='cuda:0'), tensor(1039., device='cuda:0'), tensor(1870., device='cuda:0')]
ground truth gestures loaded, bboxes: [[559.7073072000001, 1233.4553088, 668.201256, 1346.4320256], [664.3968048, 974.6405952, 765.256896, 1096.68672]]
i 0
1 person, 
Image 0091d4ef-861c-415a-85eb-b238c217d771.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(138., device='cuda:0'), tensor(590., device='cuda:0'), tensor(1440., device='cuda:0'), tensor(1902., device='cuda:0')]
ground truth gestures loaded, bboxes: [[506.5164144, 1014.2867136, 767.6384975999999, 1381.4408448]]
i 0
1 person, 
Image 004f1c70-1735-49e9-ad16-4b176d555d89.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(532., device='cuda:0'), tensor(1192., device='cuda:0'), tensor(1074., device='cuda:0'), tensor(1920., device='cuda:0')]
gr

38it [00:02, 15.96it/s]

i 0
2 persons, 
Image 00569fde-a9fb-4ed4-ae4d-583242e31fa9.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(513., device='cuda:0'), tensor(387., device='cuda:0'), tensor(910., device='cuda:0'), tensor(1728., device='cuda:0')]
ground truth gestures loaded, bboxes: [[701.02332, 664.8698688, 792.7378127999999, 803.0334144], [845.8509888, 1009.5675648000001, 902.9325167999999, 1112.7648768000001]]
i 0
1 person, 
Image 003be7e7-0725-4e19-b81b-2ffaf11b33b7.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(11., device='cuda:0'), tensor(599., device='cuda:0'), tensor(1250., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[318.2838912, 1202.665056, 707.4460367999999, 1713.6924672]]
i 0
1 person, 
Image 006d1874-d4ca-4ec1-b2e2-2e3ee7f7f64c.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(330., device='cuda:0'), tensor(630., device='cuda:0'), tensor(1158., device='cuda:0'), tensor(1918., devi

43it [00:03, 19.01it/s]

i 0
1 person, 
Image 00bdebb7-70c9-4a49-b6a4-a904b7ca8eb5.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(148., device='cuda:0'), tensor(202., device='cuda:0'), tensor(1410., device='cuda:0'), tensor(1908., device='cuda:0')]
ground truth gestures loaded, bboxes: [[725.0528448, 814.5486143999999, 1053.0614736000002, 1239.6759935999999]]
i 0
1 person, 
Image 004b66dd-19e0-47cd-968e-c1b56f9e9179.jpg
image loaded, shape: (480, 640, 3)
human detected, bbox: [tensor(118., device='cuda:0'), tensor(20., device='cuda:0'), tensor(500., device='cuda:0'), tensor(479., device='cuda:0')]
ground truth gestures loaded, bboxes: [[244.685056, 172.5472128, 303.1488192, 266.2874064]]
i 0
3 persons, 
Image 005621d3-5e1f-4833-9abe-a2dfbd8216a6.jpg
image loaded, shape: (480, 640, 3)
human detected, bbox: [tensor(168., device='cuda:0'), tensor(205., device='cuda:0'), tensor(315., device='cuda:0'), tensor(479., device='cuda:0')]
ground truth gestures loaded, bboxes: [[319.18197760000004,

48it [00:03, 18.48it/s]

i 0
1 person, 
Image 0055ba40-8dd6-4227-bd60-b26b2367db1d.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(191., device='cuda:0'), tensor(618., device='cuda:0'), tensor(896., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[320.95314720000005, 951.1572288, 481.6436112, 1191.8189568], [368.43141599999996, 1405.7592960000002, 530.3260655999999, 1634.5139904000002]]
i 0
1 person, 
Image 0077139f-ca88-4a68-bf1d-6cdca0d25144.jpg
image loaded, shape: (1200, 1600, 3)
human detected, bbox: [tensor(493., device='cuda:0'), tensor(278., device='cuda:0'), tensor(1212., device='cuda:0'), tensor(1200., device='cuda:0')]
ground truth gestures loaded, bboxes: [[875.892432, 584.846376, 1002.316096, 766.8444719999999]]
i 0
1 person, 
Image 00d3d9f2-3f88-4061-b066-7db3fd65efec.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(53., device='cuda:0'), tensor(270., device='cuda:0'), tensor(1316., device='cuda:0'), tensor(1917

52it [00:03, 18.72it/s]

i 0
2 persons, 
Image 00aeb6ef-475b-4ff9-b706-3076f5125ea2.jpg
image loaded, shape: (720, 1280, 3)
human detected, bbox: [tensor(381., device='cuda:0'), tensor(187., device='cuda:0'), tensor(952., device='cuda:0'), tensor(717., device='cuda:0')]
ground truth gestures loaded, bboxes: [[605.8656, 441.5113224, 724.2945792, 609.2530056]]
i 0
1 person, 
Image 00665c3e-a39b-4c9d-bc38-2a09476c7f83.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(8., device='cuda:0'), tensor(10., device='cuda:0'), tensor(1436., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[26.392204800000002, 773.3356224, 613.5726096, 1780.8293952]]
i 0
1 person, 
Image 00a00d26-2ccb-4827-be61-05c880ec612c.jpg
image loaded, shape: (1920, 1434, 3)
human detected, bbox: [tensor(441., device='cuda:0'), tensor(228., device='cuda:0'), tensor(1043., device='cuda:0'), tensor(1755., device='cuda:0')]
ground truth gestures loaded, bboxes: [[628.5531744, 689.8607615999999

54it [00:03, 18.37it/s]

i 0
1 person, 
Image 00706134-abfb-4b54-8795-f4add9ac6eaf.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(33., device='cuda:0'), tensor(239., device='cuda:0'), tensor(1435., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[475.859736, 875.0003519999999, 750.6302544, 1310.6259072]]
i 0
1 person, 
Image 0079bfd0-b778-4cc1-9041-0e3f9e8c60f8.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(422., device='cuda:0'), tensor(266., device='cuda:0'), tensor(1382., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[409.73837760000004, 1243.8501311999999, 546.0495552, 1415.9732736], [924.0881327999999, 409.88238720000004, 1073.6085024, 633.2989056]]
i 0
1 person, 
Image 00325ded-ad36-4df3-8f05-c68cd7d274de.jpg
image loaded, shape: (1920, 1080, 3)
human detected, bbox: [tensor(350., device='cuda:0'), tensor(819., device='cuda:0'), tensor(950., device='cuda:0'), tensor(1917., d

59it [00:03, 18.57it/s]

i 0
1 person, 
Image 0036c112-1ede-4c28-9096-74e050a5bf95.jpg
image loaded, shape: (1920, 1920, 3)
human detected, bbox: [tensor(312., device='cuda:0'), tensor(902., device='cuda:0'), tensor(1272., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[908.6524608, 915.4513728, 1088.51232, 1154.9406528]]
i 0
1 person, 
Image 003a02c6-6c42-48cb-b013-7c2d12dea782.jpg
image loaded, shape: (1024, 1280, 3)
human detected, bbox: [tensor(209., device='cuda:0'), tensor(122., device='cuda:0'), tensor(1267., device='cuda:0'), tensor(1020., device='cuda:0')]
ground truth gestures loaded, bboxes: [[736.4281344, 543.52603136, 950.717504, 867.08103168]]
i 0
1 person, 
Image 003ae419-fa37-409e-8b7c-bfc7e9cebd2b.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(283., device='cuda:0'), tensor(618., device='cuda:0'), tensor(1167., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[338.2033968, 1190.4145728, 52

64it [00:04, 19.54it/s]

i 0
1 person, 
Image 005c9c2f-1c90-4355-9b63-870680f23809.jpg
image loaded, shape: (960, 1280, 3)
human detected, bbox: [tensor(450., device='cuda:0'), tensor(127., device='cuda:0'), tensor(694., device='cuda:0'), tensor(882., device='cuda:0')]
ground truth gestures loaded, bboxes: [[548.8754944, 271.7381952, 600.9015039999999, 344.2490016]]
i 0
1 person, 
Image 0015865c-be5c-4966-9c49-1a1f2e511ffb.jpg
image loaded, shape: (1920, 1434, 3)
human detected, bbox: [tensor(23., device='cuda:0'), tensor(628., device='cuda:0'), tensor(1028., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[444.45483474, 1056.0956351999998, 627.91137336, 1345.0950527999998]]
i 0
1 person, 
Image 00a9cb1b-ce93-4953-a6ec-5a666ff5f1e5.jpg
image loaded, shape: (1920, 1437, 3)
human detected, bbox: [tensor(166., device='cuda:0'), tensor(428., device='cuda:0'), tensor(1302., device='cuda:0'), tensor(1916., device='cuda:0')]
ground truth gestures loaded, bboxes: [[735.72951504

68it [00:04, 19.01it/s]

i 0
2 persons, 
Image 00bc44f3-0284-4a6a-892f-2131f68faf3d.jpg
image loaded, shape: (1440, 1920, 3)
human detected, bbox: [tensor(305., device='cuda:0'), tensor(368., device='cuda:0'), tensor(1350., device='cuda:0'), tensor(1438., device='cuda:0')]
ground truth gestures loaded, bboxes: [[727.2115584000001, 813.7504224, 894.9230592000001, 1060.7527008], [893.8959936, 482.3967456, 1093.0324416, 704.4994944]]
i 0
1 person, 
Image 001fccec-f203-4a22-8dfb-4f44d7b9a3a1.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(27., device='cuda:0'), tensor(202., device='cuda:0'), tensor(1334., device='cuda:0'), tensor(1908., device='cuda:0')]
ground truth gestures loaded, bboxes: [[326.143368, 693.6516672, 574.3526544, 1085.5747392]]
i 0
1 person, 
Image 007df56b-f02c-437b-899a-1b2723e2987c.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(283., device='cuda:0'), tensor(713., device='cuda:0'), tensor(1236., device='cuda:0'), tensor(1920., device='cuda:0')]
gr

73it [00:04, 18.35it/s]

i 0
1 person, 
Image 006864d0-7f14-4a22-8f98-e949c1bd6166.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(156., device='cuda:0'), tensor(540., device='cuda:0'), tensor(1437., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[932.2145280000001, 1111.16016, 1287.6542496, 1703.7303936]]
i 0
1 person, 
Image 009b4a76-9223-4801-a7dc-31d45cab3df9.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(202., device='cuda:0'), tensor(850., device='cuda:0'), tensor(880., device='cuda:0'), tensor(1918., device='cuda:0')]
ground truth gestures loaded, bboxes: [[432.58255199999996, 1246.222848, 572.2641072, 1458.0861696000002]]
i 0
1 person, 
Image 00c4bf83-2adb-42dc-8a22-e126a731cba3.jpg
image loaded, shape: (1920, 1434, 3)
human detected, bbox: [tensor(405., device='cuda:0'), tensor(910., device='cuda:0'), tensor(979., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[415.4164064

77it [00:04, 18.09it/s]

i 0
1 person, 
Image 00155881-1a8f-4541-9ad8-8323675d8c46.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(321., device='cuda:0'), tensor(748., device='cuda:0'), tensor(1352., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[810.5732208000001, 1251.2231616, 1020.8800656000001, 1585.3124352]]
i 0
1 person, 
Image 00818540-8153-41d3-aba7-2369eaea25b9.jpg
image loaded, shape: (1080, 1920, 3)
human detected, bbox: [tensor(564., device='cuda:0'), tensor(164., device='cuda:0'), tensor(1192., device='cuda:0'), tensor(1068., device='cuda:0')]
ground truth gestures loaded, bboxes: [[818.3738496000001, 441.114336, 962.1173952, 636.2727336]]
i 0
1 person, 
Image 00b0f3a6-37c7-48cd-9aa3-99ecdfbef18b.jpg
image loaded, shape: (1920, 841, 3)
human detected, bbox: [tensor(97., device='cuda:0'), tensor(591., device='cuda:0'), tensor(689., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[316.108032420

81it [00:05, 18.00it/s]

i 0
1 person, 
Image 002d020f-7843-4018-a085-484e7319b00a.jpg
image loaded, shape: (1440, 1920, 3)
human detected, bbox: [tensor(344., device='cuda:0'), tensor(446., device='cuda:0'), tensor(1478., device='cuda:0'), tensor(1438., device='cuda:0')]
ground truth gestures loaded, bboxes: [[703.9152, 869.0328, 924.3612096, 1213.0883999999999]]
i 0
1 person, 
Image 0020abdb-2c4e-4805-9b7f-b3a854d1e519.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(335., device='cuda:0'), tensor(119., device='cuda:0'), tensor(1075., device='cuda:0'), tensor(1920., device='cuda:0')]
ground truth gestures loaded, bboxes: [[615.1041216, 793.7803968000001, 802.6770384, 1020.9702144]]
i 0
2 persons, 
Image 00130511-f659-4015-b405-5f6d0d9f26c7.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(582., device='cuda:0'), tensor(538., device='cuda:0'), tensor(790., device='cuda:0'), tensor(1151., device='cuda:0')]
ground truth gestures loaded, bboxes: [[661.5069695999999, 664

84it [00:05, 18.70it/s]

i 0
1 person, 
Image 009d26fd-738a-4950-9ef3-8684dc02d446.jpg
image loaded, shape: (960, 1280, 3)
human detected, bbox: [tensor(468., device='cuda:0'), tensor(518., device='cuda:0'), tensor(869., device='cuda:0'), tensor(960., device='cuda:0')]
ground truth gestures loaded, bboxes: [[579.5610368, 676.9852992, 656.6782976000001, 794.7171648]]
i 0
1 person, 
Image 00b153d1-c817-41d4-a22d-ca08e9f221ab.jpg
image loaded, shape: (1425, 1920, 3)
human detected, bbox: [tensor(748., device='cuda:0'), tensor(566., device='cuda:0'), tensor(1193., device='cuda:0'), tensor(1421., device='cuda:0')]
ground truth gestures loaded, bboxes: [[882.332928, 820.4832224999999, 963.202272, 934.4227739999999]]
i 0
1 person, 
Image 00985f6d-aab5-49c6-a2de-e39d6045bce0.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(192., device='cuda:0'), tensor(365., device='cuda:0'), tensor(1114., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[609.0340896, 987.

89it [00:05, 19.25it/s]

i 0
1 person, 
Image 0007edf9-a94e-46f7-98a4-a0fd6ba59654.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(341., device='cuda:0'), tensor(454., device='cuda:0'), tensor(1090., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[527.5002528, 754.3153536000001, 689.4203904, 970.2878784], [951.3349775999999, 1375.9332096, 1100.09916, 1601.7481728]]
i 0
1 person, 
Image 00ae4da4-9462-4804-ab99-5d2e409cd4b2.jpg
image loaded, shape: (1920, 1434, 3)
human detected, bbox: [tensor(0., device='cuda:0'), tensor(717., device='cuda:0'), tensor(1401., device='cuda:0'), tensor(1918., device='cuda:0')]
ground truth gestures loaded, bboxes: [[803.10974976, 1068.4888128, 1065.92350422, 1409.6502719999999]]
i 0
2 persons, 
Image 005258eb-43a0-4339-952e-ce60286f85a9.jpg
image loaded, shape: (720, 1280, 3)
human detected, bbox: [tensor(12., device='cuda:0'), tensor(368., device='cuda:0'), tensor(128., device='cuda:0'), tensor(517., device='cuda:0'

95it [00:05, 22.58it/s]

i 0
1 person, 
Image 00b6c2e6-f283-4f7a-ab51-0e4e2da05b53.jpg
image loaded, shape: (1920, 1080, 3)
human detected, bbox: [tensor(252., device='cuda:0'), tensor(472., device='cuda:0'), tensor(914., device='cuda:0'), tensor(1914., device='cuda:0')]
ground truth gestures loaded, bboxes: [[440.61947999999995, 982.6191935999999, 595.5329628, 1190.515872]]
i 0
1 person, 
Image 004881c0-533c-46f1-b7a8-e7633dd41f68.jpg
image loaded, shape: (480, 640, 3)
human detected, bbox: [tensor(11., device='cuda:0'), tensor(57., device='cuda:0'), tensor(585., device='cuda:0'), tensor(479., device='cuda:0')]
ground truth gestures loaded, bboxes: [[184.94428159999998, 212.64363360000002, 297.2277888, 380.8932048]]
i 0
1 person, 
Image 00aa414d-2b1e-4504-8ca8-b1da1ebb7538.jpg
image loaded, shape: (1440, 1920, 3)
human detected, bbox: [tensor(633., device='cuda:0'), tensor(146., device='cuda:0'), tensor(1724., device='cuda:0'), tensor(1438., device='cuda:0')]
ground truth gestures loaded, bboxes: [[1089.13355

98it [00:05, 21.13it/s]

i 0
1 person, 
Image 000484ab-5fd0-49b8-9253-23a22b71d7b1.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(282., device='cuda:0'), tensor(444., device='cuda:0'), tensor(1430., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[501.62999040000005, 1657.4670336, 875.1230064000001, 1906.5631872], [837.903888, 859.3820352, 1075.1846256000001, 1172.0039424]]
i 0
1 person, 
Image 0030c39d-f220-4be5-9ae5-fe176c17a821.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(317., device='cuda:0'), tensor(530., device='cuda:0'), tensor(1420., device='cuda:0'), tensor(1916., device='cuda:0')]
ground truth gestures loaded, bboxes: [[756.5215968, 836.610816, 936.5750495999999, 1107.1706304]]
i 0
1 person, 
Image 00d56515-03a6-45a9-853e-5ae7ea46e374.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(206., device='cuda:0'), tensor(508., device='cuda:0'), tensor(1108., device='cuda:0'), tensor(1917., device

100it [00:06, 16.29it/s]


i 0
1 person, 
Image 001c5c43-c9fe-4766-b791-d6bf309c7f95.jpg
image loaded, shape: (1920, 1440, 3)
human detected, bbox: [tensor(19., device='cuda:0'), tensor(441., device='cuda:0'), tensor(1401., device='cuda:0'), tensor(1917., device='cuda:0')]
ground truth gestures loaded, bboxes: [[461.4044688, 1131.9921408, 769.1532768, 1596.0994176]]


In [None]:
from utils.general import xyxy2xywh

# prep HaGRID_test

In [27]:
import os
import json
import shutil

# Paths
original_dataset_path = './../datasets/HaGRID/'
test_dataset_path = './../datasets/HaGRID_test/'

# Collect the .jpg file names found in the test subset's 'like' folder.
# os.listdir returns entries in arbitrary order, so sort them to get the
# same list on every run and on every machine.
test_images = sorted(
    f for f in os.listdir(os.path.join(test_dataset_path, 'like')) if f.endswith('.jpg')
)

# Image IDs (file names without their extension) as a set for fast membership tests
test_image_set = {os.path.splitext(f)[0] for f in test_images}

In [28]:
# Number of .jpg files found in the test subset's 'like' folder
len(test_images)

100

In [29]:
# Number of distinct image IDs; matches len(test_images) when no two files share a stem
len(test_image_set)

100

In [35]:
# test_image_set

In [37]:
# Root folders of the full HaGRID dataset and of the reduced test subset
# (relative paths, resolved against the current working directory).
original_dataset_path, test_dataset_path = (
    './../datasets/HaGRID/',
    './../datasets/HaGRID_test/',
)

In [53]:
# image_id -> annotation record for every image that belongs to the test subset
# (a dict keyed by image ID)
matched_labels = {}

# Walk the 'like' label file of each split in the original dataset
for split in ('train', 'val', 'test'):
    label_path = os.path.join(original_dataset_path, split)
    with open(os.path.join(label_path, "like.json"), 'r') as f:
        data = json.load(f)

    # Keep only the entries whose ID is one of the test-subset images.
    # An ID present in more than one split ends up with the later split's record.
    for image_id, info in data.items():
        if image_id not in test_image_set:
            continue
        matched_labels[image_id] = info

In [52]:
# Spot check: print a random handful (at most ten) of the matched entries
sample_keys = random.sample(list(matched_labels), k=min(10, len(matched_labels)))
for image_id in sample_keys:
    print(f'{{{image_id} : {matched_labels[image_id]}}}')

{002d020f-7843-4018-a085-484e7319b00a : {'bboxes': [[0.3666225, 0.603495, 0.11481563, 0.2389275]], 'user_id': '25bc760d508911e2c3453c3dd593092a0bf1b83c01817182eea3899fa76964c2', 'labels': ['like']}}
{00818540-8153-41d3-aba7-2369eaea25b9 : {'bboxes': [[0.42623638, 0.4084392, 0.07486643, 0.18070222]], 'user_id': 'a1b0bb3f79eb35269ae7e804eea8557fb1c7e97259cb8b5d98bba7fc870d91da', 'labels': ['like']}}
{00bf711f-e2cc-4cb5-b0ac-573198cd6455 : {'bboxes': [[0.32369747, 0.54091418, 0.11421507, 0.12234949], [0.59147623, 0.87560417, 0.13760348, 0.09844989]], 'user_id': '22ce972ece766dc47a240644048fb8399e4e69075f225c634bfaacb2a84e7423', 'labels': ['like', 'no_gesture']}}
{0069d29e-4e71-433b-96e1-09657ca3d7eb : {'bboxes': [[0.42199823, 0.3240404, 0.08642329, 0.22055762]], 'user_id': '31ef5349bc0bdc95cb2dcede7fc6bdce4c99b4ea4e70bb495f50082ec1f32afe', 'labels': ['like']}}
{009b4a76-9223-4801-a7dc-31d45cab3df9 : {'bboxes': [[0.30040455, 0.6490744, 0.09700108, 0.11034548]], 'user_id': 'dd7f74b0a40f86ca

In [54]:
# Save the matched labels as <test root>/val/like.json. This layout is compatible
# with LoadImages, which looks for the images in dirname(dirname(path))/like.
output_path = os.path.join(test_dataset_path, 'val', 'like.json')

# open(..., 'w') does not create missing parent folders, so make sure 'val' exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w') as outfile:
    json.dump(matched_labels, outfile, indent=4)

print(f"\nMatched labels saved to {output_path}.")


Matched labels saved to ./../datasets/HaGRID_test/val/like.json.


In [58]:
# Read the saved label file back and collect its image IDs (the top-level keys)
with open(output_path, 'r') as f:
    data = json.load(f)

# Iterating a dict yields its keys; they are unique by construction, so the
# list and the set built from it always have the same length.
test_labels = list(data)
test_label_set = set(test_labels)

In [57]:
# Number of image IDs read back from the saved label file
len(test_labels)

100

In [61]:
# Number of distinct image IDs in the saved label file
len(test_label_set)

100

In [62]:
# Sanity check: True only if the label file covers exactly the image IDs found on disk
test_image_set == test_label_set

True