In [1]:
from PIL import Image
from ultralytics import YOLO
import numpy as np
import pandas as pd
import cv2
import sys
import os
import time
from copy import deepcopy
from model.SiameseReId import SiameseReId

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = YOLO("yolov8x.pt") # COCO128 classes https://github.com/ultralytics/yolov5/blob/master/data/coco128.yaml
# model = YOLO("yolov8n-seg.pt")
siamese_net = SiameseReId(os.path.join('model','weights','model_final.pt'))

In [3]:
def detection(frames: np.array, should_save: bool = False) -> dict:
    results = model(deepcopy(frames), save=should_save)
    detected = {}

    for frame in range(len(results)):
        # print(frame, results[frame].boxes.numpy(), results[frame].boxes.cls)
        boxes = results[frame].boxes
        detected[frame] = []
        for box in boxes:
            box = box.cpu().numpy()
            # print(box, box.cls, box.conf)
            if int(box.cls[0]) == 0 and box.conf[0] > 0.6:
                detected[frame].append({
                    "xyxy": box.xyxy,
                    "xywh": box.xywh,
                    "conf": box.conf[0],
                    "center": box.xywh[0][0:2]
                })
        # print('\n')
    return detected

In [4]:
def distance(point1: np.array, point2: np.array) -> float:
    return np.linalg.norm(point1 - point2)

def is_closer_enough(dist: float, epsilon: float = 30) -> bool:
    return dist <= epsilon

In [5]:
def mean_bbox(frames: dict) -> dict:
    same_bb = []
    mean_bb = []

    if len(frames) > 1:
        for bbox0 in frames[0]: # bbox nel frame 0
            same_bb.append([])
            same_bb[-1].append(bbox0)
            # print(same_bbox)
            # print("0: ", bbox0["center"])

            for bbox1 in frames[1]: # bbox nel frame 1
                # print("\t1: ", bbox1["center"], "DIST: ", distance(bbox0["center"], bbox1["center"]))
                if is_closer_enough(distance(bbox0["center"], bbox1["center"])): # bbox0 <-> bbox1
                    same_bb[-1].append(bbox1)
                    # print("FOUND IT \t1: ", bbox1["center"])

            for bbox2 in frames[2]: # bbox nel frame 2
                # print("\t2: ", bbox2["center"], "DIST: ", distance(same_bbox[-1][-1]["center"], bbox2["center"]))
                if is_closer_enough(distance(same_bb[-1][-1]["center"], bbox2["center"])): # (bbox1 or bbox0) <-> bbox2
                    same_bb[-1].append(bbox2)
                    # print("FOUND IT \t2: ", bbox2["center"])

        for bboxes in same_bb:
            bbs = {
                "xyxy": np.mean(np.array([frame["xyxy"] for frame in bboxes]), axis=0),
                "xywh": np.mean(np.array([frame["xywh"] for frame in bboxes]), axis=0),
                "conf": np.mean(np.array([frame["conf"] for frame in bboxes])),
                "center": np.mean(np.array([frame["center"] for frame in bboxes]), axis=0),
            }
            # print([frame["xyxy"] for frame in bboxes])
            # mean_bb.append(np.mean(np.array([frame["xyxy"] for frame in bboxes]), axis=0))
            mean_bb.append(bbs)
    else:
        for bbox0 in frames[0]:
            mean_bb.append({
                "xyxy": bbox0["xyxy"],
                "xywh": bbox0["xywh"],
                "conf": bbox0["conf"],
                "center": bbox0["center"],
            })

    return mean_bb

In [6]:
file_path = 'test_aula.mp4'
delay = 1
window_name = 'frame'

cap = cv2.VideoCapture(file_path)
# cap = cv2.VideoCapture(0)

if not cap.isOpened():
    sys.exit()

count = 3
# frames = []
round = 0

video_out = cv2.VideoWriter("./video_output.mp4", cv2.VideoWriter_fourcc(*'DIVX'), int(cap.get(cv2.CAP_PROP_FPS)), (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

features = pd.DataFrame(columns=["features","color"])
# similarities = pd.DataFrame(columns=["track_id", "prob"])

while round < 100:
# while True:
    reads = [cap.read() for i in range(count)] # [(True, [...]] if frame read else [(False, None)]
    frames = [read[1] for read in reads if read[0]]
    rets = [read[0] for read in reads]

    # os.makedirs("./runs/detect/predict/og/")
    # frames_to_detect = list(frames.copy())

    det = detection(frames)
    mean_bboxes = mean_bbox(det)

    # mean_bboxes = det

    for i in range(len(frames)):
        if rets[i]:
            # os.makedirs("./runs/detect/predict/og_f%d/" % i)
            # cv2.imwrite("./runs/detect/predict/og_f%d/image.jpg" %i, frames[i])

            drew_frame = frames[i].copy()


            for j in range(len(mean_bboxes)):
                # print(mean_bboxes[i][j])
                xyxy = mean_bboxes[j]["xyxy"][0].astype(int).tolist()

                cropped = Image.fromarray(cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB)).crop(xyxy)
                # cropped.save(f"./runs/detect/predict/og_f{i}/cropped{j}.jpg")

                fv = siamese_net.fv_encoding(cropped).cpu()

                track_id = None
                if features.empty:
                    color = np.random.randint(0,255,3).tolist()
                    features = pd.concat([features, pd.DataFrame({"features": [fv], "color": [color]})], ignore_index=True)
                    track_id = features.index[-1]
                    # print("FIRST: => ", track_id, "\n")
                else:
                    similarities = pd.DataFrame(columns=["track_id", "prob"])

                    for id, row in features.iterrows():
                        prob = siamese_net.similarity(cropped, row["features"]).data[0].cpu().numpy()
                        # print(prob)

                        similarities = pd.concat([similarities, pd.DataFrame({"track_id": id, "prob": prob})], ignore_index=True)

                    # print("SIMILARITIES: \n", similarities, "\n", similarities["prob"].idxmax(), "\n", similarities.loc[similarities["prob"].idxmax()])

                    track_id = similarities.loc[similarities["prob"].idxmax()]["track_id"]
                    # print(similarities)

                    if similarities.loc[track_id]["prob"] > 0.85: # nel caso aggiorniamo le features con quelle nuove trovate?
                        features.loc[track_id]["features"] = fv
                        color = features.loc[track_id]["color"]
                        # print("TROVATO:  ==> ", track_id, "\n")
                        # display(cropped)
                        # print(similarities.iloc[track_id]["prob"])
                    else:
                        color = np.random.randint(0,255,3).tolist()
                        features = pd.concat([features, pd.DataFrame({"features": [fv], "color": [color]})], ignore_index=True)
                        track_id = features.index[-1]
                        # print("NUOVO: => ", track_id, "\n")
                    # print(df, "\n")

                # print(xyxy)
                # print(color)
                drew_frame = cv2.rectangle(drew_frame, (xyxy[0], xyxy[1]), (xyxy[2], xyxy[3]), color, 3)
                drew_frame = cv2.putText(drew_frame, str(track_id), (xyxy[0], xyxy[1]), cv2.FONT_HERSHEY_SIMPLEX, 1.5, color, 2, cv2.LINE_AA)

            # cv2.imwrite(f"./prova/img_r{round}_f{i}.jpg", drew_frame)
            video_out.write(drew_frame.copy())

    # os.rename("./runs/detect/predict", "./runs/detect/predict%d" % round)

    round += 1

    if rets[2]:
        # cv2.imshow(window_name, frame)
        if cv2.waitKey(delay) & 0xFF == ord('q'):
            break

    else:
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        break

cap.release()
video_out.release()
# cv2.destroyWindow(window_name)

OpenCV: FFMPEG: tag 0x58564944/'DIVX' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
Ultralytics YOLOv8.0.30 🚀 Python-3.10.8 torch-1.13.0+cu117 CUDA:0 (NVIDIA GeForce GTX 1650, 3909MiB)
YOLOv8x summary (fused): 268 layers, 68200608 parameters, 0 gradients, 257.8 GFLOPs

0: 384x640 2 persons, 9 chairs, 1 dining table, 1 laptop, 1: 384x640 2 persons, 9 chairs, 1 laptop, 2: 384x640 2 persons, 11 chairs, 1 laptop, 215.3ms
Speed: 0.2ms pre-process, 71.8ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 9 chairs, 1 laptop, 1: 384x640 2 persons, 8 chairs, 1 dining table, 1 laptop, 2: 384x640 2 persons, 10 chairs, 1 dining table, 1 laptop, 198.8ms
Speed: 0.2ms pre-process, 66.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 9 chairs, 1 laptop, 1 clock, 1: 384x640 2 persons, 8 chairs, 1 laptop, 2: 384x640 2 persons, 10 chairs, 1 lapt