In [2]:
import json
import subprocess
from collections import Counter
from typing import Optional

import cv2
import easyocr
import numpy as np
import pandas as pd
from tqdm import tqdm
from ultralytics import YOLO

In [3]:
def get_video_rotation(video_path):
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream_tags=rotate:stream_side_data=rotation",
        "-of",
        "json",
        video_path,
    ]
    result = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    info = json.loads(result.stdout)

    # First, try the rotate tag (classic)
    try:
        return int(info["streams"][0]["tags"]["rotate"])
    except (KeyError, ValueError):
        pass

    # Then, check side_data_list
    try:
        side_data = info["streams"][0].get("side_data_list", [])
        for item in side_data:
            if "rotation" in item:
                return int(item["rotation"])
    except (KeyError, ValueError):
        pass

    return 0  # default: no rotation

In [None]:
def rotate_frame(frame, rotation) -> np.ndarray:
    if rotation == 90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation == -90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
    elif rotation == 180:
        frame = cv2.rotate(frame, cv2.ROTATE_180)

    return frame


def time_to_seconds(timestr: str) -> int:
    parts = timestr.split(":")
    parts = [int(p) for p in parts]
    if len(parts) == 2:
        minutes, seconds = parts
        return minutes * 60 + seconds
    elif len(parts) == 3:
        hours, minutes, seconds = parts
        return hours * 3600 + minutes * 60 + seconds
    else:
        raise ValueError(f"Invalid time format: {timestr}")


def get_start_and_end_frames(
    segment_time_range: tuple[str, str], start_frame: int, end_frame: int, fps: float
) -> tuple[int, int]:
    start_sec, end_sec = map(time_to_seconds, segment_time_range)
    start_sec = max(start_sec - 1, start_frame)
    end_sec = min(end_sec + 1, end_frame)
    start_frame = int(start_sec * fps)
    end_frame = int(end_sec * fps)

    return start_frame, end_frame


def detect_airplane_ids(
    video_path: str,
    model,
    every_nth_frame: int,
    plane_confidence_threshold: float,
    ocr_reader,
    ocr_confidence_threshold: float,
    segment_time_range: Optional[tuple[str, str]] = None,
):
    # Get video rotation
    rotation = get_video_rotation(video_path)

    # Load video
    cap = cv2.VideoCapture(video_path)

    fps = cap.get(cv2.CAP_PROP_FPS)
    end_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    start_frame = 0

    # Limit to segment
    if segment_time_range is not None:
        start_frame, end_frame = get_start_and_end_frames(
            segment_time_range, start_frame, end_frame, fps
        )

    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

    # List of detected IDs
    detected_ids = []
    frame_idx = start_frame
    while cap.isOpened() and frame_idx < end_frame:
        ret, frame = cap.read()

        if not ret:
            break

        if frame_idx % every_nth_frame != 0:
            frame_idx += 1
            continue

        frame = rotate_frame(frame, rotation)

        # Run object detection
        results = model(frame, verbose=False)[0]
        for box in results.boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])

            label = model.names[cls]
            if label != "airplane":
                continue

            if conf < plane_confidence_threshold:
                continue

            # Get bounding box
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            crop = frame[y1:y2, x1:x2]

            # Run OCR on cropped region
            ocr_results = ocr_reader.readtext(crop)
            for _, text, text_conf in ocr_results:
                if float(text_conf) < ocr_confidence_threshold:
                    continue

                cleaned = text.strip().upper()
                detected_ids.append(cleaned)

        frame_idx += 1

    cap.release()
    return detected_ids

In [100]:
CONFIDENCE_THRESHOLD = 0.5
TEXT_MIN_CONF = [0.1, 0.3, 0.5, 0.7]
EVERY_NTH_FRAME = 15

model = YOLO("yolov8m.pt", verbose=False)
ocr_reader = easyocr.Reader(["en"])

In [101]:
df = pd.read_csv("dataset.csv")

# Drop duplicates for easier segment handling
df = df.drop_duplicates(subset=["Video file"], keep="first")

# Drop rows without registration
df = df.dropna(subset=["Registration"])

df[[f"Prediction {conf}" for conf in TEXT_MIN_CONF]] = None

In [103]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    file = row["Video file"]
    segment_time_range = row["Segment start"], row["Segment end"]

    for conf in TEXT_MIN_CONF:
        all_ids = detect_airplane_ids(
            f"data/{file}",
            model,
            EVERY_NTH_FRAME,
            CONFIDENCE_THRESHOLD,
            ocr_reader,
            conf,
            segment_time_range,
        )
        counter_ids = Counter(all_ids)
        if len(counter_ids) > 0:
            df.at[idx, f"Prediction {conf}"] = str(counter_ids)
        else:
            break

100%|██████████| 21/21 [05:22<00:00, 15.34s/it]


In [104]:
df

Unnamed: 0,Video file,Usable,Registration,Segment start,Segment end,Comment,Prediction 0.1,Prediction 0.3,Prediction 0.5,Prediction 0.7
0,IMG_3353.MOV,Yes,OK-LTY,00:02,00:02,,"Counter({'OK-LTY': 1, '0K-LI': 1, '0': 1})",Counter({'OK-LTY': 1}),,
1,IMG_3354.MOV,Yes,OK-BIT,00:15,00:17,,"Counter({'OK-BIT': 2, '0R-BIT': 1})",Counter({'0R-BIT': 1}),,
2,IMG_3355.MOV,Yes,OK-BIT,00:01,00:02,,"Counter({'LK-BIT': 2, 'UK-BIT': 1})",Counter({'LK-BIT': 1}),,
3,IMG_3358.MOV,Yes,OK-LTY,00:02,00:03,,"Counter({'@K-LIY': 3, 'K-LIY': 1})","Counter({'@K-LIY': 3, 'K-LIY': 1})",Counter({'@K-LIY': 2}),Counter({'@K-LIY': 1})
4,IMG_3359.MOV,Yes,OK-BIT,00:20,00:23,,"Counter({'O-BI': 1, 'O4-BN': 1, 'OL-BII': 1})",,,
5,IMG_3360.MOV,Yes,OK-FAH,00:12,00:16,,"Counter({'ON FAH': 1, 'ON FA': 1})",,,
6,IMG_3363.MOV,Yes,OK-LTY,00:07,00:12,,Counter({'OKL': 1}),Counter({'OKL': 1}),,
7,IMG_3367.MOV,Yes,OK-BIT,00:01,00:03,Úplně perfektní,Counter({'OK-BIT': 4}),Counter({'OK-BIT': 4}),Counter({'OK-BIT': 4}),Counter({'OK-BIT': 4})
8,IMG_3368.MOV,Yes,OK-BIT,00:03,00:05,,Counter({'OK-BIT': 6}),Counter({'OK-BIT': 6}),Counter({'OK-BIT': 5}),Counter({'OK-BIT': 5})
9,IMG_3370.MOV,Yes,OK-COK,00:10,00:12,,"Counter({'AERO': 7, 'OK COK': 5, 'JUE': 2, 'OK...","Counter({'AERO': 7, 'OK COK': 5, 'JUE': 2, 'OK...","Counter({'AERO': 7, 'JUE': 2, 'OK COK': 1})","Counter({'AERO': 7, 'JUE': 1})"


In [None]:
cap = cv2.VideoCapture("data/IMG_3353.MOV")
rotation = get_video_rotation("data/IMG_3353.MOV")

# Process every n-th frame
frame_skip = 20
frame_count = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue  # skip this frame without processing

    if rotation == 90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation == -90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
    elif rotation == 180:
        frame = cv2.rotate(frame, cv2.ROTATE_180)

    results = model(frame)
    for result in results:
        for x1, y1, x2, y2 in map(lambda b: map(int, b), result.boxes.xyxy):
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    cv2.imshow("Detection", frame)
    if cv2.waitKey(1) == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

Python(63139) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



0: 640x384 (no detections), 451.4ms
Speed: 23.0ms preprocess, 451.4ms inference, 8.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 588.8ms
Speed: 2.3ms preprocess, 588.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 789.0ms
Speed: 2.1ms preprocess, 789.0ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 434.9ms
Speed: 1.5ms preprocess, 434.9ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 456.3ms
Speed: 3.2ms preprocess, 456.3ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 358.9ms
Speed: 1.6ms preprocess, 358.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 airplane, 270.7ms
Speed: 2.6ms preprocess, 270.7ms inference, 17.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 airplane, 252.9ms
Speed: 1.3ms preprocess, 2

: 

In [3]:
get_video_rotation("data/IMG_3353.MOV")

-90