# **Pipeline for object detection and tracking in 3D:**

### **Required packages:**

In [7]:
#!pip install -r requirements.txt 
#!pip install setuptools==69.2.0
import torch
import numpy as np
import cv2
import os
from itertools import zip_longest
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import random
import math
from ultralytics import YOLO

from pathlib import Path
from utils.general import (Profile, cv2, scale_boxes)
from utils.augmentations import letterbox
from utils.torch_utils import select_device, smart_inference_mode
from sort import Sort

### **1. Definition of Objects and Functions**

##### 2D object detection: YOLO11 (Ultralytics)

In [8]:
# ---------------------------------------------------------------------------
# Notebook-wide settings. ObjectDetector and the helper functions below read
# several of these as module-level globals, so every name is kept as-is.
# ---------------------------------------------------------------------------

# Model checkpoint, input folders (one per camera view) and output location.
weights = 'weights/yolo11l.pt'
source_1 = "data_2/view1"
source_2 = "data_2/view2"
device = "cpu"
project = "runs"
name = "exp"
save_path = "outputs"

# Inference / drawing options.
image_size = (512, 512)
conf_thres = 0.25
iou_thres = 0.45
max_det = 20
line_thickness = 2

# Output toggles.
save_txt = True
save_csv = False
nosave = False
hide_labels = False
hide_conf = True

# SORT tracker settings (consumed when ObjectDetector builds its trackers).
sort_max_age, sort_min_hits, sort_iou_thresh = 40, 2, 0.1
track_color_id = 0

# Camera intrinsics: a single focal length is shared by both image axes.
focal_length = 707.0493
fx = fy = focal_length
cx, cy = 640.0, 360.0
# Distance between the two cameras, used for depth-from-disparity.
baseline = 0.06

# 3x3 intrinsic matrix assembled from the scalars above.
camera_matrix = np.array([
    [fx, 0, cx],
    [0, fy, cy],
    [0, 0, 1],
])

# Extrinsics applied by ObjectDetector.transform_points (rotation, then shift).
rotation_matrix = np.array([
    [0.99, 0.01, 0.04],
    [-0.01, 0.99, -0.01],
    [-0.04, 0.01, 0.99],
])
translation_vector = np.array([0.06, 0.93, 1.65])

In [9]:
class ObjectDetector():
    """Detect, track and annotate objects in two folders of images.

    The detector loads a YOLO model once, keeps one SORT tracker per view and
    exposes helpers to draw tracks, derive per-object geometry and assemble
    the annotated frames into a video.

    Iterating over an instance yields ``(img_1, img_2)`` pairs, one per frame,
    read from ``source_1`` and ``source_2`` in natural file-name order.
    """

    # File extensions (lower-case) treated as images.
    IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')
    # Detector class indices forwarded to the tracker; others are dropped.
    TRACKED_CLASS_IDS = (0, 1, 2)
    # Assumed object length per class name (see convert_bbox_to_3d).
    CLASS_LENGTHS = {"person": 0.5, "car": 4.5, "bicycle": 1.8}
    DEFAULT_LENGTH = 2.0

    def __init__(self, device, weights, source_1, source_2, image_size, save_path):
        """Load the model, index the input images and create the trackers.

        Args:
            device: device specifier handed to ``select_device``.
            weights: path of the checkpoint handed to ``YOLO``.
            source_1: folder with the images of the first view.
            source_2: folder with the images of the second view.
            image_size: target size handed to ``letterbox``.
            save_path: folder where annotated frames are written; created
                if it does not exist.

        The SORT settings are read from the module-level ``sort_max_age``,
        ``sort_min_hits`` and ``sort_iou_thresh``.
        """
        self.device = select_device(device)
        self.model = YOLO(weights)
        self.stride, self.names = 32, self.model.names
        self.imgsz = image_size
        self.dt = Profile(device=self.device)
        self.source_1 = Path(source_1)
        self.source_2 = Path(source_2)
        # glob() order is arbitrary; sort so frame i of view 1 is paired with
        # frame i of view 2 and frames are processed in sequence.
        self.files_1 = self._list_images(self.source_1)
        self.files_2 = self._list_images(self.source_2)

        self.save_path = save_path
        os.makedirs(self.save_path, exist_ok=True)

        self.track_color_id_1 = 0
        self.track_color_id_2 = 0
        self.sort_tracker_1 = Sort(max_age=sort_max_age, min_hits=sort_min_hits, iou_threshold=sort_iou_thresh)
        self.sort_tracker_2 = Sort(max_age=sort_max_age, min_hits=sort_min_hits, iou_threshold=sort_iou_thresh)

    @staticmethod
    def _natural_key(path):
        """Return a sort key that orders digit runs numerically (2 < 10)."""
        import re  # local import: the file header does not import re

        tokens = re.split(r'([0-9]+)', Path(path).name)
        # re.split with a capture group alternates text / digits / text ...,
        # so odd positions are always the digit runs.
        return [int(tok) if idx % 2 else tok.lower() for idx, tok in enumerate(tokens)]

    def _list_images(self, folder):
        """Return the image files directly inside *folder* in natural order."""
        return sorted(
            (f for f in folder.glob('*') if f.suffix.lower() in self.IMAGE_SUFFIXES),
            key=self._natural_key,
        )

    def __iter__(self):
        """Yield ``(img_1, img_2)`` for every frame present in both views.

        Iteration stops at the shorter of the two file lists, so a missing
        frame in one view never produces a half-empty pair.

        Raises:
            OSError: if OpenCV cannot decode one of the two images.
        """
        for file_1, file_2 in zip(self.files_1, self.files_2):
            img_1 = cv2.imread(str(file_1))
            img_2 = cv2.imread(str(file_2))
            if img_1 is None or img_2 is None:
                unreadable = file_1 if img_1 is None else file_2
                raise OSError(f"Could not read image: {unreadable}")
            yield (img_1, img_2)

    def draw_boxes(self, img, bbox, identities=None):
        """Draw every box in *bbox* on *img* (in place) and return *img*.

        Args:
            img: image to annotate.
            bbox: iterable of ``[x1, y1, x2, y2]`` boxes.
            identities: optional sequence of track ids, parallel to *bbox*;
                id 0 is used for every box when omitted.
        """
        for idx, box in enumerate(bbox):
            identity = identities[idx] if identities is not None else None
            self.draw_box(img, box, identity)

        return img

    def draw_box(self, img, bbox, identitie=None):
        """Draw one labelled box plus its centre point on *img* and return it.

        Args:
            img: image to annotate (modified in place).
            bbox: ``[x1, y1, x2, y2]`` box.
            identitie: track id used for the label and colour; 0 when None.
        """
        x1, y1, x2, y2 = [int(coord) for coord in bbox]
        track_id = int(identitie) if identitie is not None else 0
        center = (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2))
        label = str(track_id)

        color = self.compute_color_for_labels(track_id)
        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        # Filled background strip behind the id label.
        cv2.rectangle(img, (x1, y1 - 20), (x1 + w, y1), (255, 191, 0), -1)
        cv2.putText(img, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.6,
                    [255, 255, 255], 1)
        cv2.circle(img, center, 3, color, -1)

        return img

    @smart_inference_mode()
    def detect_object_2D(self, im, frame_num, save_images, sort_tracker, track_color_id):
        """Detect objects in one frame, update the tracker and build records.

        Args:
            im: image as read by ``cv2.imread``; it is annotated in place.
            frame_num: frame index written into each record and used as the
                output file name when *save_images* is true.
            save_images: when true, draw track trajectories and write the
                annotated frame to ``self.save_path``.
            sort_tracker: SORT tracker instance holding this view's state.
            track_color_id: first colour index for the trajectories; it is
                incremented per track inside this call only.

        Returns:
            ``(lines, x_centers_list, y_list)``: one text record, box-centre
            x and box-centre y per live track, in tracker order.
        """
        with self.dt:
            image_original = im
            im = letterbox(im, self.imgsz, stride=self.stride, auto=True)[0]
            # HWC -> CHW, then reverse the channel axis (BGR -> RGB).
            im = im.transpose((2, 0, 1))[::-1]
            im = np.ascontiguousarray(im)
            im = torch.from_numpy(im).to(self.device)
            im = im.float().unsqueeze(0)
            im /= 255

            pred = self.model(im)

            lines = []
            x_centers_list = []
            y_list = []

            for det in pred:
                detection = det.boxes.data
                # Rescale boxes from the letterboxed size to the original image.
                detection[:, :4] = scale_boxes(im.shape[2:], detection[:, :4], image_original.shape).round()

                # Collect rows first and build the array once instead of
                # re-allocating with vstack for every detection.
                rows = []
                for *xyxy, conf, cls in reversed(detection.tolist()):
                    c = int(cls)
                    if c not in self.TRACKED_CLASS_IDS:
                        continue
                    x1, y1, x2, y2 = (int(coord) for coord in xyxy)
                    rows.append([x1, y1, x2, y2, conf, c])
                dets_to_sort = np.array(rows, dtype=float) if rows else np.empty((0, 6))

                # Run SORT:
                tracked_dets = sort_tracker.update(dets_to_sort)
                tracks = sort_tracker.getTrackers()

                for track in tracks:
                    if save_images:
                        color = self.compute_color_for_labels(track_color_id)
                        centroids = track.centroidarr
                        # Connect consecutive centroids to show the trajectory.
                        for start, end in zip(centroids, centroids[1:]):
                            cv2.line(image_original,
                                     (int(start[0]), int(start[1])),
                                     (int(end[0]), int(end[1])),
                                     color, thickness=3)
                        track_color_id += 1

                    box = track.get_state()[0, :4]
                    x_center, y = self.convert_bbox_to_center(box)
                    alpha = self.get_alpha(x_center, fx, cx)
                    x_centers_list.append(x_center)

                    class_name = self.names[track.detclass]
                    world_coord = self.convert_bbox_to_3d(box, camera_matrix, rotation_matrix,
                                                          translation_vector, class_name)

                    # Every numeric column uses fixed six-decimal formatting.
                    line = (f"{frame_num} {track.id} {class_name} {0 if track.time_since_update < 5 else 1} "
                            f"- {alpha:.6f} {box[0]:.6f} {box[1]:.6f} {box[2]:.6f} {box[3]:.6f} "
                            f"{world_coord[0]:.6f} {world_coord[1]:.6f} {world_coord[2]:.6f}")

                    lines.append(line)
                    y_list.append(y)

                if len(tracked_dets) > 0:
                    bbox_xyxy = tracked_dets[:, :4]
                    # NOTE(review): column 8 is taken as the track id; this
                    # depends on the layout returned by the project's Sort.
                    identities = tracked_dets[:, 8]
                    self.draw_boxes(image_original, bbox_xyxy, identities)

            if save_images:
                cv2.imwrite(str(self.save_path + f"/{frame_num}.png"), image_original)

            return lines, x_centers_list, y_list

    def compute_color_for_labels(self, label, palette=(2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)):
        """Map an integer label to a deterministic 3-channel colour tuple."""
        color = [int(int(p * (label ** 2 - label + 1)) % 255) for p in palette]
        return tuple(color)

    def convert_bbox_to_center(self, bbox):
        """Return ``[x, y]``, the centre of an ``[x1, y1, x2, y2]`` box."""
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        x = bbox[0] + w / 2.
        y = bbox[1] + h / 2.
        return [x, y]

    def get_alpha(self, object_center_x, fx, cx):
        """Return the horizontal bearing of an image column, in radians.

        Args:
            object_center_x: x pixel coordinate of the object centre.
            fx: focal length along x, in pixels.
            cx: x pixel coordinate of the principal point.

        Returns:
            ``arctan((object_center_x - cx) / fx)`` wrapped to [-pi, pi).
        """
        norm_x = (object_center_x - cx) / fx
        bearing = np.arctan(norm_x)

        bearing = (bearing + np.pi) % (2 * np.pi) - np.pi

        return bearing

    def convert_bbox_to_3d(self, bbox_2d, camera_matrix, rotation_matrix, translation_vector, object_class):
        """Derive ``(height, width, length)`` for a 2D box.

        The four box corners are unprojected with *camera_matrix*, moved with
        *rotation_matrix* / *translation_vector*, and the y and x extents of
        the result are returned as height and width. The length is a fixed
        per-class value.

        NOTE(review): corners are unprojected at depth 1.0, so height and
        width are extents on that plane, not metric sizes at the object's
        real distance - confirm this is what consumers expect.

        Args:
            bbox_2d: ``[x1, y1, x2, y2]`` box in pixels.
            camera_matrix: 3x3 intrinsic matrix.
            rotation_matrix: 3x3 rotation applied to the unprojected points.
            translation_vector: length-3 offset added after the rotation.
            object_class: class name used to look up the assumed length.

        Returns:
            Tuple ``(obj_height, obj_width, obj_length)``.
        """
        # Box corners as a 4x2 array: top-left, bottom-left, bottom-right, top-right.
        corners_2d = np.array([[bbox_2d[0], bbox_2d[1]],
                               [bbox_2d[0], bbox_2d[3]],
                               [bbox_2d[2], bbox_2d[3]],
                               [bbox_2d[2], bbox_2d[1]]])

        # Pixel coordinates -> camera coordinates -> transformed coordinates.
        corners_camera = self.unproject_points(corners_2d, camera_matrix)
        corners_world = self.transform_points(corners_camera, rotation_matrix, translation_vector)

        obj_height = np.max(corners_world[:, 1]) - np.min(corners_world[:, 1])
        obj_width = np.max(corners_world[:, 0]) - np.min(corners_world[:, 0])

        # Length cannot be observed from a single box; use a per-class value.
        obj_length = self.CLASS_LENGTHS.get(object_class, self.DEFAULT_LENGTH)

        return obj_height, obj_width, obj_length

    def unproject_points(self, points_2d, camera_matrix):
        """Unproject (N, 2) pixel points to homogeneous points at depth 1.

        Returns:
            (N, 4) array ``[x_norm, y_norm, 1, 1]`` per point.
        """
        points_2d = np.asarray(points_2d, dtype=float)
        points_3d = np.ones((points_2d.shape[0], 4))
        points_3d[:, 0] = (points_2d[:, 0] - camera_matrix[0, 2]) / camera_matrix[0, 0]
        points_3d[:, 1] = (points_2d[:, 1] - camera_matrix[1, 2]) / camera_matrix[1, 1]

        return points_3d

    def transform_points(self, points_3d, rotation_matrix, translation_vector):
        """Apply ``R @ p + t`` to every row of *points_3d*.

        Args:
            points_3d: (N, 3) or (N, 4) array; a fourth homogeneous column,
                if present, is ignored.
            rotation_matrix: 3x3 matrix R.
            translation_vector: length-3 vector t.

        Returns:
            (N, 3) array of transformed points.
        """
        points = np.asarray(points_3d, dtype=float)[:, :3]
        # Row-wise R @ p is the same as P @ R.T for the whole array.
        return points @ np.asarray(rotation_matrix).T + np.asarray(translation_vector)

    def create_video(self, images_dir, output_video_path, fps=30):
        """Write the images found in *images_dir* into an MP4 video.

        Frames are taken in natural file-name order (``2.png`` before
        ``10.png``), which matches the ``{frame_num}.png`` names written by
        ``detect_object_2D``. Unreadable images are skipped.

        Args:
            images_dir: folder containing the frames.
            output_video_path: destination video file.
            fps: frames per second of the output video.

        Raises:
            ValueError: if the folder has no images or the first image
                cannot be read.
            RuntimeError: if the video writer cannot be opened.
        """
        image_files = sorted(
            (f for f in os.listdir(images_dir) if f.lower().endswith(self.IMAGE_SUFFIXES)),
            key=self._natural_key,
        )

        if not image_files:
            raise ValueError("No se encontraron imágenes en el directorio especificado.")

        # The first image fixes the frame size of the video.
        first_image_path = os.path.join(images_dir, image_files[0])
        first_image = cv2.imread(first_image_path)
        if first_image is None:
            raise ValueError(f"Could not read image: {first_image_path}")
        height, width = first_image.shape[:2]

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # codec for .mp4 files
        video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
        if not video_writer.isOpened():
            raise RuntimeError(f"Could not open video writer for: {output_video_path}")

        try:
            for image_file in image_files:
                frame = cv2.imread(os.path.join(images_dir, image_file))
                if frame is None:
                    continue
                video_writer.write(frame)
        finally:
            # Always finalize the file, even if a frame fails to load/write.
            video_writer.release()

        print(f"Video creado exitosamente: {output_video_path}")
        

In [10]:
# Build the detector from the notebook-level settings; keyword arguments make
# the mapping between settings and constructor parameters explicit.
detector = ObjectDetector(
    device=device,
    weights=weights,
    source_1=source_1,
    source_2=source_2,
    image_size=image_size,
    save_path=save_path,
)

YOLOv5  f84e0aa Python-3.10.5 torch-2.5.1+cpu CPU



##### AUXILIARY FUNCTIONS:

In [11]:
def calculate_2d_to_3d_camera_coordinates(left_center_x, right_center_x, center_y, focal_length, baseline, fy, cx, cy,
                                          max_disparity=10):
    """
    Calculate the 3D position (X, Y, Z) of an object using stereo vision.

    Parameters:
    left_center_x: x-coordinate of object center in left image
    right_center_x: x-coordinate of object center in right image
    center_y: y-coordinate of object (assumed same in both images)
    focal_length: focal length in x direction (pixels); used for depth and X
    baseline: distance between cameras in meters
    fy: focal length in y direction (pixels)
    cx: principal point x-coordinate (pixels)
    cy: principal point y-coordinate (pixels)
    max_disparity: largest disparity (pixels) accepted as a valid match

    Returns:
    tuple: (X, Y, Z) coordinates in meters
    """
    # Disparity: horizontal pixel difference between the left and right image.
    disparity = abs(left_center_x - right_center_x)
    if disparity > max_disparity or disparity == 0:
        # NOTE(review): an implausible or zero disparity is replaced by a
        # random value, so the returned depth is NOT a measurement in this
        # case and differs between runs. Kept for compatibility with the
        # existing output; consider reporting such objects as unmatched.
        disparity = random.uniform(0.01, 2)

    # Depth from disparity.
    Z = (focal_length * baseline) / disparity

    # Horizontal position, triangulated from the left image. Uses the
    # function's own focal length rather than a module-level global.
    X = ((left_center_x - cx) * Z) / focal_length

    # Vertical position.
    Y = ((center_y - cy) * Z) / fy

    return X, Y, Z

def align_detections(lines2, x_centers_list1, x_centers_list2):
    """
    Align the detections of camera 2 with those of camera 1 using the
    Hungarian algorithm on the horizontal distance between box centres.

    Args:
        lines2: List of records detected in camera 2
        x_centers_list1: List of X centres detected in camera 1
        x_centers_list2: List of X centres detected in camera 2 (parallel
            to lines2)

    Returns:
        lines2_aligned: Camera-2 records reordered so that index k matches
            index k of camera 1; None where camera 1 has no partner
        x_centers_list2_aligned: Camera-2 centres in the same order;
            NaN where camera 1 has no partner

    Both returned lists always have len(x_centers_list1) entries, so they can
    be zipped with the camera-1 lists without shifting the pairing.
    """
    n_left = len(x_centers_list1)
    lines2_aligned = [None] * n_left
    x_centers_list2_aligned = [float("nan")] * n_left

    # Nothing to match: every camera-1 detection stays unpaired.
    if n_left == 0 or len(x_centers_list2) == 0:
        return lines2_aligned, x_centers_list2_aligned

    x_centers1 = np.asarray(x_centers_list1, dtype=float).reshape(-1, 1)
    x_centers2 = np.asarray(x_centers_list2, dtype=float).reshape(-1, 1)

    # Pairwise distance between centres of both cameras.
    D = cdist(x_centers1, x_centers2)

    # With a rectangular matrix only min(n1, n2) pairs are returned, so the
    # row index is needed to know WHICH camera-1 detection each pair is for.
    row_ind, col_ind = linear_sum_assignment(D)

    for row, col in zip(row_ind, col_ind):
        lines2_aligned[row] = lines2[col]
        x_centers_list2_aligned[row] = x_centers_list2[col]

    return lines2_aligned, x_centers_list2_aligned

def calculate_rotation_y(x_camera, z_camera):
    """
    Return the object's rotation around the camera Y-axis as formatted text.

    Args:
        x_camera: x coordinate in camera frame
        z_camera: z coordinate in camera frame
    Returns:
        A string made of one leading space followed by the rotation in
        radians, wrapped to [-π, π) and printed with six decimals.
    """
    half_turn = math.pi
    full_turn = 2 * math.pi

    # Viewing direction towards the object, then a quarter turn because the
    # object's heading is taken as perpendicular to that direction.
    heading = math.atan2(x_camera, z_camera) + half_turn / 2

    # Wrap the angle back into [-π, π).
    wrapped = (heading + half_turn) % full_turn - half_turn

    return " " + format(wrapped, ".6f")



In [12]:
# Process both views frame by frame and print one record per camera-1 object,
# extended with its stereo-derived camera coordinates and Y rotation.
for frame_idx, (view_1, view_2) in enumerate(detector):
    # View 1 only produces records; view 2 also saves the annotated frame.
    lines1, x_centers_list1, y_list = detector.detect_object_2D(
        view_1, frame_idx, False, detector.sort_tracker_1, detector.track_color_id_1)
    lines2, x_centers_list2, _ = detector.detect_object_2D(
        view_2, frame_idx, True, detector.sort_tracker_2, detector.track_color_id_2)

    # Reorder the camera-2 detections to follow the camera-1 order.
    lines2_aligned, x_centers_list2_aligned = align_detections(lines2, x_centers_list1, x_centers_list2)

    matched = zip(x_centers_list1, x_centers_list2_aligned, y_list, lines1)
    for x_center_left, x_center_right, y, line in matched:
        cam_x, cam_y, cam_z = calculate_2d_to_3d_camera_coordinates(
            x_center_left, x_center_right, y, focal_length, baseline, fy, cx, cy)
        rotation_y = calculate_rotation_y(cam_x, cam_z)

        print(f"{line} {cam_x:.6f} {cam_y:.6f} {cam_z:.6f}{rotation_y}")


0: 160x512 8 persons, 1 bicycle, 4 cars, 1 motorcycle, 1 bench, 285.6ms
Speed: 0.0ms preprocess, 285.6ms inference, 1.0ms postprocess per image at shape (1, 3, 160, 512)

0: 160x512 6 persons, 3 bicycles, 4 cars, 1 motorcycle, 213.4ms
Speed: 0.0ms preprocess, 213.4ms inference, 10.7ms postprocess per image at shape (1, 3, 160, 512)
0 0 person 0 - -0.554617 193.000000 161.000000 211.000000 195.000000 0.047861 0.0256842 0.5 -18.108364 -7.524480 29.231749 1.016179
0 1 person 0 - -0.388625 339.000000 155.000000 362.000000 221.000000 0.092738 0.0331377 0.5 -23.221148 -13.796330 56.713287 1.182172
0 2 car 0 - -0.584207 152.000000 180.000000 193.000000 198.000000 0.025783 0.0576622 4.5 -2.805000 -1.026000 4.242296 0.986589
0 3 car 0 - 0.502977 962.000000 185.000000 1096.000000 221.000000 0.052302 0.188134 4.5 32.513710 -13.122500 59.097162 2.073773
0 4 car 0 - 0.637329 1130.000000 187.000000 1197.000000 222.000000 0.049954 0.0943074 4.5 10.470000 -3.110000 14.140986 2.208126
0 5 person 0 - -

In [13]:
# Assemble the annotated frames written to ./outputs into a 10 fps video.
detector.create_video("./outputs", "./output_video.mp4", fps=10)

Video creado exitosamente: ./output_video.mp4
