# Video inference with a Hugging Face object detection model


In [33]:
# Checkpoint to load for detection. Alternatives that can be swapped in:
#   "theButcher22/deformable-detr-football-finetuned"
#   "../data/models/deta-swin/checkpoint_7450/"
MODEL_CHECKPOINT = "theButcher22/deta-swin-large"

# Input video: directory and file name (an earlier run used 'esa_01.mov').
PATH = "../data/input_videos"
VIDEO_NAME = "esa_04.mp4"

# Annotated output video: directory and file name.
OUTPUT_PATH = "../data/output_videos"
OUTPUT_VIDEO_NAME = "esa_04_deta.mp4"

## Video Data Preparation

In [None]:
path = PATH

import os

video_files = [
    ("football_02.mp4", "https://drive.google.com/file/d/1t6agoqggZKx6thamUuPAIdN_1zR9v9S"),
    ("football_03.mp4", "https://drive.google.com/uc?id=12TqauVZ9tLAv8kWxTTBFWtgt2hNQ4_ZF"),
    ("football_04.mp4", "https://drive.google.com/uc?id=19PGw55V8aA6GZu5-Aac5_9mCy3fNxmEf"),
    ("football_05.mp4", "https://drive.google.com/uc?id=1OG8K6wqUw9t7lp9ms1M48DxRhwTYciK-"),
    ("football_06.mp4", "https://drive.google.com/uc?id=1yYPKuXbHsCxqjA9G-S6aeR2Kcnos8RPU"),
    ("football_07.mp4", "https://drive.google.com/uc?id=1vVwjW1dE1drIdd4ZSILfbCGPD4weoNiu"),
]

for filename, url in video_files:
    file_path = os.path.join(path, filename)
    print(file_path)
    if not os.path.exists(file_path):
        print(f"Downloading {filename} from {url}")
        !gdown -O "{file_path}" "{url}"

# Inference on images

In [35]:
import os

# Resolve the Hugging Face token: use the Colab secret store when the
# google.colab package is importable, otherwise read it from a .env file.
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
except ImportError:
    from dotenv import load_dotenv
    load_dotenv(dotenv_path='../config/.env')
    hf_token = os.environ.get("HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token resolved above.
login(token=hf_token)

In [None]:
import torch
import supervision as sv
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Load the image processor and the detection model from the same checkpoint.
processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
model = AutoModelForObjectDetection.from_pretrained(MODEL_CHECKPOINT)

# Full path of the input video.
video_path = os.path.join(path, VIDEO_NAME)

# Class id <-> class name lookup tables used by the annotation code.
id2label = {1: "ball", 2: "goalkeeper", 3: "player", 4: "referee"}
label2id = dict(zip(id2label.values(), id2label.keys()))
print(label2id)


In [38]:
def frame_inference(frame, threshold=0.5):
    """
    This function performs inference on a single video frame to detect objects.

    Parameters:
    frame (numpy.ndarray): The input video frame as a NumPy array. Colour frames are expected in
        BGR channel order (the order OpenCV-based video readers produce).
    threshold (float): Minimum confidence score a detection must reach to be kept. Default is 0.5.

    Returns:
    detections (sv.Detections): An object containing the detected objects and their properties.

    The function processes the input frame using a pre-trained object detection model and returns the detections.
    """
    # PIL interprets 3-channel arrays as RGB, so reverse the channel axis of
    # BGR frames first; 2-D (single channel) frames are passed through as-is.
    if frame.ndim == 3:
        img = Image.fromarray(frame[..., ::-1])
    else:
        img = Image.fromarray(frame)

    inputs = processor(images=img, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); post-processing expects (height, width).
    width, height = img.size
    target_size = torch.tensor([[height, width]])
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=threshold, target_sizes=target_size)[0]

    return sv.Detections.from_transformers(
        transformers_results=results,
        id2label=model.config.id2label
    )


In [39]:
def annotate_frame(frame, detections, threshold=0.5):
    """
    Annotates a given frame with detected objects.

    Parameters:
    frame (numpy.ndarray): The input frame to be annotated. It is copied, not modified.
    detections (sv.Detections): The detections obtained from the model.
    threshold (float): Threshold passed to the class-agnostic non-maximum suppression that is
        applied to the non-ball detections before their ellipses are drawn. Default is 0.5.

    Returns:
    numpy.ndarray: The annotated frame with visual indicators for detected objects.

    Non-ball detections are drawn as ellipses, every detection gets a text label, and the 'ball'
    detections are highlighted with a triangle.
    """
    ellipse_annotator = sv.EllipseAnnotator(
        color=sv.ColorPalette.from_hex(['#00BFFF', '#FF1493', '#FFD700']),
        thickness=2
    )
    label_annotator = sv.LabelAnnotator(
        color=sv.ColorPalette.from_hex(['#FF8C00', '#00BFFF', '#FF1493', '#FFD700']),
        text_color=sv.Color.from_hex('#000000'),
        text_position=sv.Position.BOTTOM_CENTER,
        text_scale=0.5,
        text_thickness=1,
        text_padding=10,
        smart_position=True
    )
    triangle_annotator = sv.TriangleAnnotator(
        color=sv.Color.from_hex('#FFD700'),
        base=25,
        height=21,
        outline_thickness=1
    )

    # One text label per detection, in the same order as `detections`.
    labels = [
        model.config.id2label[class_id]
        for class_id
        in detections.class_id
    ]

    # Ball detections get their own marker; pad their boxes by 10 px.
    ball_id = label2id["ball"]
    ball_detections = detections[detections.class_id == ball_id]
    ball_detections.xyxy = sv.pad_boxes(xyxy=ball_detections.xyxy, px=10)

    # Everything that is not the ball: drop overlapping boxes regardless of class.
    all_detections = detections[detections.class_id != ball_id]
    all_detections = all_detections.with_nms(threshold=threshold, class_agnostic=True)
    # NOTE(review): presumably shifts the class ids to re-index the ellipse
    # colour palette once the ball class is removed - confirm the intended mapping.
    all_detections.class_id -= 1

    annotated_frame = frame.copy()

    # Ellipses are drawn from the NMS-filtered non-ball set; previously the raw
    # `detections` were used, which left `all_detections` (and `threshold`) unused
    # and drew an ellipse on the ball in addition to its triangle.
    annotated_frame = ellipse_annotator.annotate(
        scene=annotated_frame,
        detections=all_detections)

    annotated_frame = label_annotator.annotate(
        scene=annotated_frame,
        detections=detections,
        labels=labels)

    annotated_frame = triangle_annotator.annotate(
        scene=annotated_frame,
        detections=ball_detections)

    return annotated_frame

In [None]:
# Plot one annotated frame from the beginning of the video.
frame_generator = sv.get_video_frames_generator(video_path)
# Read six frames: the first five are discarded, the sixth is the one plotted.
for _ in range(6):
    frame = next(frame_generator)
detections = frame_inference(frame, threshold=0.2)
annotated_frame = annotate_frame(frame, detections, threshold=0.2)
print(detections)
sv.plot_image(annotated_frame)

## Video Inference

In [41]:
import numpy as np
import supervision as sv

def inference_video(video_path: str, output_video_path: str, threshold=0.5, verbose=False):
    """
    This function performs video inference using a specified video path and outputs the annotated video to a given output path.

    Parameters:
    - video_path (str): The path to the input video file.
    - output_video_path (str): The path where the output video will be saved.
    - threshold (float): Value forwarded as `threshold` to both `frame_inference` and `annotate_frame`
      for every frame. Default is 0.5.
    - verbose (bool): A flag indicating whether to print detailed processing information for each frame. Default is False.

    The function utilizes a tracker (ByteTrack) to maintain object identities across frames and a smoother to enhance detection stability.
    It processes each frame of the video, applies detections, annotates the frames accordingly, and saves the annotated video to the specified output path.
    """
    from collections import Counter

    tracker = sv.ByteTrack()
    smoother = sv.DetectionsSmoother()

    # Always bound so the callback never depends on a conditionally defined name.
    total_frames = None
    if verbose:
        video_info = sv.VideoInfo.from_video_path(video_path)
        total_frames = video_info.total_frames
        print("Start processing video: ", video_info)

    def callback(frame: np.ndarray, index: int) -> np.ndarray:
        # Detect, then track and smooth, then draw.
        detections = frame_inference(frame, threshold=threshold)
        detections = tracker.update_with_detections(detections)
        detections = smoother.update_with_detections(detections)
        annotated_frame = annotate_frame(frame, detections, threshold=threshold)
        if verbose:
            # The last element of each iterated detection is its data dict.
            class_counts = Counter(
                detection[-1]['class_name'] for detection in detections
            )
            detected_classes = dict(class_counts)
            # `index` is the 0-based frame index supplied by sv.process_video.
            print(f"Processing frame {index + 1} of {total_frames}: size {frame.shape}, detections: {detected_classes}")
        return annotated_frame

    sv.process_video(
        source_path=video_path,
        target_path=output_video_path,
        callback=callback
    )

In [None]:
# Build the input/output paths the same way as the rest of the notebook.
video_path = os.path.join(PATH, VIDEO_NAME)
output_video_path = os.path.join(OUTPUT_PATH, OUTPUT_VIDEO_NAME)

# Make sure the output directory exists before the annotated video is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Reads the video metadata; the result is not stored or printed here.
sv.VideoInfo.from_video_path(video_path)

inference_video(video_path, output_video_path, verbose=True, threshold=0.3)