In [13]:
import cv2
import numpy as np
import csv
from ultralytics import YOLO

# Count players per team in a football video using fixed jersey colours.
#
# Pipeline: detect persons with YOLO, classify each detection as Team A (red)
# or Team B (white) by the fraction of its bounding box that falls inside the
# corresponding HSV range, then write an annotated video and a per-frame CSV.

# Load the YOLOv11 model
model = YOLO("yolo11l.pt")

# Open the video file
video_path = "input_videos/football.mp4"
cap = cv2.VideoCapture(video_path)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Timestamps are computed as frame_count / fps, so an unreported (zero) FPS
# would otherwise crash with ZeroDivisionError on the first frame.
if fps <= 0:
    print("Error: Could not determine video FPS.")
    cap.release()
    exit()

# Define codec and create VideoWriter object to save output video
out = cv2.VideoWriter('output_videos/annotated_football.mp4',
                      cv2.VideoWriter_fourcc(*'mp4v'), fps,
                      (frame_width, frame_height))

# HSV colour ranges for the jerseys (loop-invariant, so defined once).
# Red wraps around the hue axis and therefore needs two ranges.
team_a_lower1 = np.array([0, 100, 100])    # Team A red, low-hue end
team_a_upper1 = np.array([10, 255, 255])
team_a_lower2 = np.array([170, 100, 100])  # Team A red, high-hue end
team_a_upper2 = np.array([180, 255, 255])

# White is low saturation, high brightness
team_b_lower = np.array([0, 0, 200])       # Team B white
team_b_upper = np.array([180, 30, 255])

# Minimum fraction of ROI pixels that must match a team colour
min_color_ratio = 0.1

frame_count = 0

try:
    # The context manager guarantees the CSV is flushed and closed even if
    # detection or drawing raises part-way through the video.
    with open('output_data/team_counts.csv', mode='w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['video_time', 'team_a_count', 'team_b_count'])  # Write header

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            video_time = frame_count / fps  # Calculate time in seconds
            print(f"Processing frame {frame_count} at time {video_time:.2f} seconds")

            # Run YOLOv11 model to detect persons (class 0 corresponds to 'person')
            results = model.predict(source=frame, save=False, conf=0.5, iou=0.5, classes=[0])

            # Initialize counters for each team
            team_a_count = 0
            team_b_count = 0

            for result in results:
                for box in result.boxes:
                    # Extract bounding box coordinates
                    x1, y1, x2, y2 = map(int, box.xyxy[0])

                    # Region of interest (ROI) corresponding to the detected person
                    roi = frame[y1:y2, x1:x2]
                    if roi.size == 0:
                        # Degenerate (zero-area) box: cvtColor would raise on it
                        continue

                    hsv_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)

                    # Create masks for each team's colour
                    mask_a = cv2.bitwise_or(
                        cv2.inRange(hsv_roi, team_a_lower1, team_a_upper1),
                        cv2.inRange(hsv_roi, team_a_lower2, team_a_upper2),
                    )
                    mask_b = cv2.inRange(hsv_roi, team_b_lower, team_b_upper)

                    # inRange marks matching pixels with 255, so count the
                    # non-zero entries instead of summing them; summing would
                    # inflate the ratio by a factor of 255.
                    pixel_total = roi.shape[0] * roi.shape[1]
                    team_a_ratio = np.count_nonzero(mask_a) / pixel_total
                    team_b_ratio = np.count_nonzero(mask_b) / pixel_total

                    # Determine team affiliation based on colour dominance
                    if team_a_ratio > team_b_ratio and team_a_ratio > min_color_ratio:
                        team_a_count += 1
                        label = "Team A"
                        color = (0, 0, 255)  # Red (BGR) bounding box
                    elif team_b_ratio > team_a_ratio and team_b_ratio > min_color_ratio:
                        team_b_count += 1
                        label = "Team B"
                        color = (255, 255, 255)  # White bounding box
                    else:
                        label = "Unknown"
                        color = (0, 255, 255)  # Yellow bounding box

                    # Draw bounding box and label on the frame
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            # Display team counts on the frame
            cv2.putText(frame, f"Team A: {team_a_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            cv2.putText(frame, f"Team B: {team_b_count}", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            # Write the annotated frame to the output video
            out.write(frame)

            # Write to CSV
            csv_writer.writerow([video_time, team_a_count, team_b_count])

            # Optional: Display the frame (comment out if not needed)
            # cv2.imshow('Frame', frame)
            # if cv2.waitKey(1) & 0xFF == ord('q'):
            #     break
finally:
    # Release resources even when processing fails mid-video
    cap.release()
    out.release()
    cv2.destroyAllWindows()

Processing frame 1 at time 0.10 seconds

0: 384x640 10 persons, 299.1ms
Speed: 12.0ms preprocess, 299.1ms inference, 14.6ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 2 at time 0.20 seconds

0: 384x640 10 persons, 233.5ms
Speed: 1.4ms preprocess, 233.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 3 at time 0.30 seconds

0: 384x640 10 persons, 215.8ms
Speed: 1.6ms preprocess, 215.8ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 4 at time 0.40 seconds

0: 384x640 10 persons, 377.7ms
Speed: 1.8ms preprocess, 377.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 5 at time 0.50 seconds

0: 384x640 11 persons, 261.6ms
Speed: 2.4ms preprocess, 261.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 6 at time 0.60 seconds

0: 384x640 11 persons, 235.9ms
Speed: 2.2ms preprocess, 235.9ms inference, 0.6ms postprocess per image at s

If the team colors are not known beforehand, get the dominant non-green color from each bounding box, then use KMeans to cluster those colors into two teams.

The following code, which is based on color clustering, is not very accurate. In production code, the team colors can be given as parameters to get a more accurate result.

The colors of Team A and Team B can alternate between frames in annotated_football_2.mp4.


In [20]:
import cv2
import numpy as np
import csv
from ultralytics import YOLO
from sklearn.cluster import KMeans
from collections import Counter

# Count players per team in a football video without knowing jersey colours.
#
# Pipeline: detect persons with YOLO, take the mean non-green colour of each
# bounding box, split those colours into two groups with KMeans, then write an
# annotated video and a per-frame CSV.
#
# KMeans numbers its clusters arbitrarily on every fit, so the labels are
# normalised per frame: the darker cluster is always Team A (drawn in red) and
# the brighter cluster is always Team B (drawn in white).

# Load the YOLOv11 model
model = YOLO("yolo11l.pt")

# Open the video file
video_path = "input_videos/football.mp4"
cap = cv2.VideoCapture(video_path)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Get video properties
fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Timestamps are computed as frame_count / fps, so an unreported (zero) FPS
# would otherwise crash with ZeroDivisionError on the first frame.
if fps <= 0:
    print("Error: Could not determine video FPS.")
    cap.release()
    exit()

# Define codec and create VideoWriter object to save output video
out = cv2.VideoWriter('output_videos/annotated_football_2.mp4',
                      cv2.VideoWriter_fourcc(*'mp4v'), fps,
                      (frame_width, frame_height))

frame_count = 0

# Drawing colours (BGR) per normalised team label: 0 = Team A, 1 = Team B
color_assignment = {0: (0, 0, 255), 1: (255, 255, 255)}

# Define HSV range for green color to filter out football ground
green_lower = np.array([35, 40, 40])   # Lower bound for green color
green_upper = np.array([85, 255, 255]) # Upper bound for green color

try:
    # The context manager guarantees the CSV is flushed and closed even if
    # detection or clustering raises part-way through the video.
    with open('output_data/team_count_2.csv', mode='w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['video_time', 'team_a_count', 'team_b_count'])  # Write header

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            video_time = frame_count / fps  # Calculate time in seconds
            print(f"Processing frame {frame_count} at time {video_time:.2f} seconds")

            # Run YOLOv11 model to detect persons (class 0 corresponds to 'person')
            results = model.predict(source=frame, save=False, conf=0.5, iou=0.5, classes=[0])

            # Boxes and colours are appended together so that index i of one
            # always describes the same detection as index i of the other,
            # even when some detections are skipped.
            player_boxes = []
            player_colors = []

            for box in results[0].boxes:
                # Extract bounding box coordinates
                x1, y1, x2, y2 = map(int, box.xyxy[0])

                # Region of interest (ROI) corresponding to the detected person
                roi = frame[y1:y2, x1:x2]
                if roi.size == 0:
                    # Degenerate (zero-area) box: cv2.resize would raise on it
                    continue

                # Resize ROI for consistency in color analysis
                roi_resized = cv2.resize(roi, (50, 50), interpolation=cv2.INTER_AREA)

                # Convert ROI to HSV color space and mask out the pitch
                hsv_roi = cv2.cvtColor(roi_resized, cv2.COLOR_BGR2HSV)
                green_mask = cv2.inRange(hsv_roi, green_lower, green_upper)
                non_green_pixels = roi_resized[green_mask == 0]  # Keep only non-green pixels

                # Nothing but grass in the box: no colour to cluster on
                if len(non_green_pixels) == 0:
                    continue

                # Mean BGR colour of the non-green pixels (what a one-cluster
                # KMeans fit would return as its single centre)
                dominant_color_bgr = non_green_pixels.mean(axis=0).astype(int)

                player_boxes.append((x1, y1, x2, y2))
                player_colors.append(dominant_color_bgr)

            team_a_count = 0
            team_b_count = 0

            if len(player_colors) < 2:
                # Two clusters need at least two samples. The frame is still
                # written below (with zero counts) so the output video and CSV
                # stay in step with the input.
                print("Not enough players detected to determine teams.")
            else:
                # Perform KMeans clustering on player colors to identify team groups
                kmeans_teams = KMeans(n_clusters=2, n_init=10, random_state=0).fit(np.array(player_colors))

                # Normalise the arbitrary cluster ids: darker cluster -> 0 (Team A)
                cluster_brightness = kmeans_teams.cluster_centers_.mean(axis=1)
                darker_cluster = int(np.argmin(cluster_brightness))
                team_labels = np.where(kmeans_teams.labels_ == darker_cluster, 0, 1)

                team_a_count = int(np.count_nonzero(team_labels == 0))
                team_b_count = int(np.count_nonzero(team_labels == 1))

                # Draw bounding boxes for exactly the detections that were clustered
                for (x1, y1, x2, y2), team_label in zip(player_boxes, team_labels):
                    team_label = int(team_label)
                    color = color_assignment[team_label]
                    label = f"Team {'A' if team_label == 0 else 'B'}"

                    # Draw bounding box and label on the frame
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            # Display team counts on the frame
            cv2.putText(frame, f"Team A: {team_a_count}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, color_assignment[0], 2)
            cv2.putText(frame, f"Team B: {team_b_count}", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, color_assignment[1], 2)

            # Write the annotated frame to the output video
            out.write(frame)

            # Write to CSV
            csv_writer.writerow([video_time, team_a_count, team_b_count])

            # Optional: Display the frame (comment out if not needed)
            # cv2.imshow('Frame', frame)
            # if cv2.waitKey(1) & 0xFF == ord('q'):
            #     break
finally:
    # Release resources even when processing fails mid-video
    cap.release()
    out.release()
    cv2.destroyAllWindows()

Processing frame 1 at time 0.10 seconds

0: 384x640 10 persons, 254.7ms
Speed: 2.6ms preprocess, 254.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 2 at time 0.20 seconds

0: 384x640 10 persons, 218.7ms
Speed: 2.0ms preprocess, 218.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 3 at time 0.30 seconds

0: 384x640 10 persons, 264.7ms
Speed: 1.5ms preprocess, 264.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 4 at time 0.40 seconds

0: 384x640 10 persons, 248.1ms
Speed: 2.3ms preprocess, 248.1ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 5 at time 0.50 seconds

0: 384x640 11 persons, 252.6ms
Speed: 2.1ms preprocess, 252.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
Processing frame 6 at time 0.60 seconds

0: 384x640 11 persons, 236.1ms
Speed: 2.9ms preprocess, 236.1ms inference, 0.8ms postprocess per image at sha