### Object detection

In [1]:
# !pip install ultralytics (Don't forget to install it)

import cv2
import time
import numpy as np
from ultralytics import YOLO

In [2]:
# Load the YOLOv11 model for segmentation (or make sure to download 'yolo11n-seg.pt')
# You can check https://docs.ultralytics.com/tasks/segment/ for other models

model = YOLO("yolo11n-seg")

# Define the video source (can be a file path or a camera index)
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)

# try/finally guarantees that the capture device and the preview window are
# released even when inference/drawing raises or the cell is interrupted;
# without it the camera may stay held by the kernel after a failure.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Time only the model call so the overlay reports inference latency
        start_time = time.time()
        results = model(
            frame,
            conf=0.7,     # Confidence threshold: keep detections over 70%
            classes=[0],  # "0" to focus only on people
        )
        latency = (time.time() - start_time) * 1000  # milliseconds

        # Draw bounding boxes and labels
        boxes_obj = results[0].boxes
        if boxes_obj is not None and len(boxes_obj) > 0:
            bboxes = boxes_obj.xyxy.cpu().numpy()   # [x1, y1, x2, y2]
            confs = boxes_obj.conf.cpu().numpy()    # Confidence scores
            classes = boxes_obj.cls.cpu().numpy()   # Class indices

            for i, box in enumerate(bboxes):
                x1, y1, x2, y2 = map(int, box)

                # Use the class name when the model exposes one, else the raw index
                class_id = int(classes[i])
                class_name = model.names[class_id] if hasattr(model, 'names') else str(class_id)
                label = f'{class_name} {confs[i]:.2f}'

                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                # Clamp the label baseline so it stays visible when the box
                # touches the top edge of the frame.
                label_y = max(y1 - 10, 15)
                cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

        # Paint every detected mask with a random colour
        masks_obj = results[0].masks
        if masks_obj is not None and len(masks_obj) > 0:
            # masks_obj.data is moved to NumPy when it offers .cpu(); otherwise used as-is
            masks = masks_obj.data.cpu().numpy() if hasattr(masks_obj.data, 'cpu') else masks_obj.data
            frame_size = (frame.shape[1], frame.shape[0])  # cv2.resize expects (width, height)
            for mask in masks:
                # Binarise (threshold 0.5), scale to 0/255 and bring to frame resolution.
                # INTER_NEAREST keeps the values strictly 0 or 255.
                mask_bin = (mask > 0.5).astype(np.uint8) * 255
                mask_bin = cv2.resize(mask_bin, frame_size, interpolation=cv2.INTER_NEAREST)

                # Random BGR colour for this mask
                random_color = np.random.randint(0, 256, size=(3,), dtype=np.uint8)

                # Boolean indexing recolours only the masked pixels in place, which
                # avoids allocating a full-frame colour image and a frame copy per mask.
                frame[mask_bin == 255] = random_color

            # Display the number of detected masks
            cv2.putText(frame, f'Masks: {len(masks)}', (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # Display the latency on the frame
        cv2.putText(frame, f'Latency: {latency:.1f}ms', (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)

        # Show the processed frame in real time; press 'q' to stop
        cv2.imshow("YOLOv11-Seg - Real-time Segmentation", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()



0: 384x640 1 person, 289.2ms
Speed: 11.1ms preprocess, 289.2ms inference, 22.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 186.8ms
Speed: 4.7ms preprocess, 186.8ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 130.7ms
Speed: 2.3ms preprocess, 130.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 107.4ms
Speed: 2.4ms preprocess, 107.4ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 104.3ms
Speed: 2.4ms preprocess, 104.3ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 113.7ms
Speed: 2.3ms preprocess, 113.7ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 113.6ms
Speed: 2.5ms preprocess, 113.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 112.3ms
Speed: 1.9ms preprocess, 112.3ms inference, 

### Video Detection

In [1]:
# Input video read by the line-crossing counter (passed to cv2.VideoCapture)
video_path = "Video Useful/people-detection.mp4"

In [2]:
# Destination of the annotated video (passed to cv2.VideoWriter)
output_path = "Save Images & Videos/output_video.avi"

In [3]:
import cv2
import numpy as np
from ultralytics import YOLO

Use https://www.photopea.com/ to determine distinctive points: take a frame from the video and, using the ruler tool in the program, determine where to place the counting lines.

In [9]:
# To know whether the person in the video crosses the determined digital limit or not

def signed_distance(point, line):
    """
    Signed perpendicular distance from `point` to the line through two points.

    The sign of the result tells which side of the line the point is on;
    a point lying exactly on the line yields zero.

    point: (x, y)
    line: ((x1, y1), (x2, y2))

    Returns 0 when both line endpoints coincide (the line is undefined).
    """
    px, py = point
    start, end = line
    ax, ay = start
    bx, by = end

    # Direction components of the line
    dx = bx - ax
    dy = by - ay

    length = np.sqrt(dy ** 2 + dx ** 2)
    if length == 0:
        # Degenerate line: both endpoints are the same point
        return 0

    numerator = dy * px - dx * py + bx * ay - by * ax
    return numerator / length

# Counting lines, each given as ((x1, y1), (x2, y2)) in pixel coordinates
line1 = ((130, 120), (15, 250))  # previous setting: ((130, 180), (25, 300))
line2 = ((650, 175), (720, 275))

# Counters for each line
count_line1 = 0
count_line2 = 0

# Maximum centroid displacement (pixels) for a detection to be associated
# with a detection from the previous frame
distance_threshold = 25  # previously 50

# Centroids from the previous frame
prev_centroids = []

# Load YOLO model (assumes "person" is class 0 in COCO)
model = YOLO("yolo11n.pt")

# Open input video and configure VideoWriter for output video
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise ValueError("Could not open input video.")

fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some sources report 0 (or NaN) FPS; fall back to a nominal rate so the
    # writer is not configured with an invalid frame rate.
    fps = 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # XVID: lightweight codec for .avi output
writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
if not writer.isOpened():
    # Without this check every writer.write() call is silently dropped and the
    # cell would still report success.
    cap.release()
    raise ValueError("Could not open output video for writing.")

# try/finally makes sure both the reader and the writer are released (and the
# output file finalised) even if detection or drawing raises mid-video.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Detection with YOLOv11; only the first result is used
        results = model(frame, conf=0.7)
        boxes_obj = results[0].boxes

        current_centroids = []
        if boxes_obj is not None and len(boxes_obj) > 0:
            bboxes = boxes_obj.xyxy.cpu().numpy()  # Array of shape (N,4)
            classes = boxes_obj.cls.cpu().numpy()  # Array of shape (N,)
            # Keep only "person" detections (class 0)
            for i in range(len(bboxes)):
                if int(classes[i]) == 0:
                    x1, y1, x2, y2 = map(int, bboxes[i])
                    centroid = ((x1 + x2) // 2, (y1 + y2) // 2)
                    current_centroids.append(centroid)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.circle(frame, centroid, 4, (0, 255, 0), -1)

        # Draw the counting lines
        cv2.line(frame, line1[0], line1[1], (255, 0, 0), 2)
        cv2.line(frame, line2[0], line2[1], (0, 0, 255), 2)

        # Associate each current centroid with the nearest previous centroid
        # (within distance_threshold) and look for a change of side.
        # NOTE: signed_distance is measured against the infinite line through the
        # two endpoints, so a side change is counted even when the movement passes
        # beyond the ends of the drawn segment.
        for curr in current_centroids:
            best_distance = float('inf')
            best_prev = None
            for prev in prev_centroids:
                d = np.linalg.norm(np.array(curr) - np.array(prev))
                if d < best_distance and d < distance_threshold:
                    best_distance = d
                    best_prev = prev
            if best_prev is None:
                continue

            # Opposite signs before/after mean the centroid changed side of line 1
            if signed_distance(best_prev, line1) * signed_distance(curr, line1) < 0:
                count_line1 += 1
            # Same test for line 2
            if signed_distance(best_prev, line2) * signed_distance(curr, line2) < 0:
                count_line2 += 1

        # Draw the counters on the frame
        cv2.putText(frame, f"Clothes Section: {count_line1}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        cv2.putText(frame, f"Sport Section: {count_line2}", (10, 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Write the processed frame to the output video
        writer.write(frame)

        # The current detections become the reference for the next frame
        prev_centroids = current_centroids.copy()
finally:
    cap.release()
    writer.release()

print(f"Video processed and saved to: {output_path}")



0: 384x640 (no detections), 115.1ms
Speed: 6.8ms preprocess, 115.1ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 79.7ms
Speed: 2.5ms preprocess, 79.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 69.7ms
Speed: 2.3ms preprocess, 69.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 83.9ms
Speed: 2.3ms preprocess, 83.9ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 143.7ms
Speed: 2.4ms preprocess, 143.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 78.2ms
Speed: 2.9ms preprocess, 78.2ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 89.5ms
Speed: 2.3ms preprocess, 89.5ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 86.2ms
Speed: 3.7ms preprocess, 86.2

In [1]:
import cv2

In [None]:

# Path to your image. Forward slashes are used so the path is not tied to
# Windows and matches the other paths in this notebook.
imagen_path = "Image Useful/pose_landmarks_index.png"  # <-- change this to your image

# Load image; cv2.imread returns None instead of raising when the file
# cannot be read, so fail early with a clear message.
imagen = cv2.imread(imagen_path)
if imagen is None:
    raise ValueError(f"Could not read image: {imagen_path}")

def mostrar_coordenadas(event, x, y, flags, param):
    """
    Mouse callback: on a left-button click, print the clicked pixel
    coordinates and mark them on the displayed image.

    event: OpenCV mouse event code
    x, y: pixel position of the event
    flags, param: passed by OpenCV; unused here
    """
    if event == cv2.EVENT_LBUTTONDOWN:
        print(f"Coordinates: x={x}, y={y}")
        # Draw a filled circle and the coordinate text at the click position
        cv2.circle(imagen, (x, y), 5, (0, 255, 0), -1)
        cv2.putText(imagen, f"({x},{y})", (x+10, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
        # Refresh the window so the new annotation is visible
        cv2.imshow("Interactive Image", imagen)

# try/finally closes the window even if showing the image or registering the callback fails
try:
    # Show the image and start listening for mouse events
    cv2.imshow("Interactive Image", imagen)
    cv2.setMouseCallback("Interactive Image", mostrar_coordenadas)

    # Wait until a key is pressed
    cv2.waitKey(0)
finally:
    cv2.destroyAllWindows()
