In [7]:
import json
import subprocess
from collections import Counter

import cv2
import easyocr
import pandas as pd
from tqdm import tqdm
from ultralytics import YOLO

In [8]:
def get_video_rotation(video_path):
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream_tags=rotate:stream_side_data=rotation",
        "-of",
        "json",
        video_path,
    ]
    result = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    info = json.loads(result.stdout)

    # First, try the rotate tag (classic)
    try:
        return int(info["streams"][0]["tags"]["rotate"])
    except (KeyError, ValueError):
        pass

    # Then, check side_data_list
    try:
        side_data = info["streams"][0].get("side_data_list", [])
        for item in side_data:
            if "rotation" in item:
                return int(item["rotation"])
    except (KeyError, ValueError):
        pass

    return 0  # default: no rotation

In [9]:
def detect_airplane_ids(
    video_path: str,
    model,
    target_fps: int,
    plane_confidence_threshold: float,
    ocr_reader,
    ocr_confidence_threshold: float,
):
    # Get video rotation
    rotation = get_video_rotation(video_path)

    # Load video
    cap = cv2.VideoCapture(video_path)

    # Limit FPS
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_skip = fps // target_fps

    frame_idx = 0

    # List of detected IDs
    detected_ids = []

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        # Rotate frame
        if rotation == 90:
            frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif rotation == -90:
            frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
        elif rotation == 180:
            frame = cv2.rotate(frame, cv2.ROTATE_180)

        if frame_idx % frame_skip != 0:
            frame_idx += 1
            continue

        # Run object detection
        results = model(frame, verbose=False)[0]
        for box in results.boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])
            if conf < plane_confidence_threshold:
                continue

            label = model.names[cls]
            if label != "airplane":
                continue

            # Get bounding box
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            crop = frame[y1:y2, x1:x2]

            # Run OCR on cropped region
            ocr_results = ocr_reader.readtext(crop)
            for _, text, text_conf in ocr_results:
                if float(text_conf) < ocr_confidence_threshold:
                    continue
                cleaned = text.strip().upper()
                if len(cleaned) > 3:
                    detected_ids.append(cleaned)

        frame_idx += 1

    cap.release()
    return detected_ids

In [10]:
CONFIDENCE_THRESHOLD = 0.5
TEXT_MIN_CONF = 0.4
TARGET_FPS = 4

model = YOLO("yolov8x.pt", verbose=False)
ocr_reader = easyocr.Reader(["en"])

In [11]:
df = pd.read_csv("dataset.csv")
df["Prediction"] = None

In [None]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    file = row["Video file"]

    all_ids = detect_airplane_ids(
        f"data/{file}",
        model,
        TARGET_FPS,
        CONFIDENCE_THRESHOLD,
        ocr_reader,
        TEXT_MIN_CONF,
    )
    counter_ids = Counter(all_ids)

    if len(counter_ids) > 0:
        df.at[idx, "Prediction"] = counter_ids.most_common(1)[0][0]

100%|██████████| 40/40 [12:36<00:00, 18.91s/it]


In [None]:
cap = cv2.VideoCapture("data/IMG_3353.MOV")
rotation = get_video_rotation("data/IMG_3353.MOV")

# Process every n-th frame
frame_skip = 20
frame_count = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    if frame_count % frame_skip != 0:
        continue  # skip this frame without processing

    if rotation == 90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation == -90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
    elif rotation == 180:
        frame = cv2.rotate(frame, cv2.ROTATE_180)

    results = model(frame)
    for result in results:
        for x1, y1, x2, y2 in map(lambda b: map(int, b), result.boxes.xyxy):
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    cv2.imshow("Detection", frame)
    if cv2.waitKey(1) == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

Python(63139) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



0: 640x384 (no detections), 451.4ms
Speed: 23.0ms preprocess, 451.4ms inference, 8.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 588.8ms
Speed: 2.3ms preprocess, 588.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 789.0ms
Speed: 2.1ms preprocess, 789.0ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 434.9ms
Speed: 1.5ms preprocess, 434.9ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 456.3ms
Speed: 3.2ms preprocess, 456.3ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 358.9ms
Speed: 1.6ms preprocess, 358.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 airplane, 270.7ms
Speed: 2.6ms preprocess, 270.7ms inference, 17.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 airplane, 252.9ms
Speed: 1.3ms preprocess, 2

: 

In [3]:
get_video_rotation("data/IMG_3353.MOV")

-90