In [1]:
from ultralytics import YOLO
import cv2
import easyocr
from glob import glob
from collections import Counter
import pandas as pd
import subprocess
import json
from tqdm import tqdm

In [4]:
def get_video_rotation(video_path):
    cmd = [
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream_tags=rotate:stream_side_data=rotation",
        "-of", "json", video_path
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    info = json.loads(result.stdout)

    # First, try the rotate tag (classic)
    try:
        return int(info["streams"][0]["tags"]["rotate"])
    except (KeyError, ValueError):
        pass

    # Then, check side_data_list
    try:
        side_data = info["streams"][0].get("side_data_list", [])
        for item in side_data:
            if "rotation" in item:
                return int(item["rotation"])
    except (KeyError, ValueError):
        pass

    return 0  # default: no rotation

In [None]:
def detect_airplane_ids(
    video_path: str,
    model,
    target_fps: int,
    plane_confidence_threshold: float,
    ocr_reader,
    ocr_confidence_threshold: float,
):
    rotation = get_video_rotation(video_path)

    cap = cv2.VideoCapture(video_path)

    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_skip = fps // target_fps

    frame_idx = 0
    detected_ids = []

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        if rotation == 90:
            frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
        elif rotation == -90:
            frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
        elif rotation == 180:
            frame = cv2.rotate(frame, cv2.ROTATE_180)

        if frame_idx % frame_skip != 0:
            frame_idx += 1
            continue

        # Run object detection
        results = model(frame, verbose=False)[0]
        for box in results.boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])
            if conf < plane_confidence_threshold:
                continue

            label = model.names[cls]
            if label != "airplane":
                continue

            # Get bounding box
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            crop = frame[y1:y2, x1:x2]

            # Run OCR on cropped region
            ocr_results = ocr_reader.readtext(crop)
            for _, text, text_conf in ocr_results:
                if float(text_conf) < ocr_confidence_threshold:
                    continue
                cleaned = text.strip().upper()
                if len(cleaned) > 3:
                    detected_ids.append(cleaned)

        frame_idx += 1

    cap.release()
    return detected_ids

In [6]:
CONFIDENCE_THRESHOLD = 0.5
TEXT_MIN_CONF = 0.4
TARGET_FPS = 4

model = YOLO("yolov8x.pt", verbose=False)
ocr_reader = easyocr.Reader(['en'])

In [7]:
df = pd.read_csv('dataset.csv')
df['Prediction'] = None

In [10]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    file = row['Video file']

    all_ids = detect_airplane_ids(
        f'data/{file}',
        model,
        TARGET_FPS,
        CONFIDENCE_THRESHOLD,
        ocr_reader,
        TEXT_MIN_CONF
    )
    counter_ids = Counter(all_ids)

    if len(counter_ids) > 0:
        df.at[idx, 'Prediction'] = counter_ids.most_common(1)[0][0]

100%|██████████| 40/40 [12:36<00:00, 18.91s/it]


In [11]:
df

Unnamed: 0,Video file,Usable,Registration,Segment start,Segment end,Comment,Prediction
0,IMG_3353.MOV,Yes,OK-LTY,00:02,00:02,,
1,IMG_3354.MOV,Yes,OK-BIT,00:15,00:17,,
2,IMG_3355.MOV,Yes,OK-BIT,00:01,00:02,,
3,IMG_3358.MOV,Yes,OK-LTY,00:02,00:03,,@K-LIY
4,IMG_3359.MOV,Yes,OK-BIT,00:20,00:23,,
5,IMG_3360.MOV,Yes,OK-FAH,00:12,00:16,,
6,IMG_3363.MOV,Yes,OK-LTY,00:07,00:12,,
7,IMG_3367.MOV,Yes,OK-BIT,00:01,00:03,Úplně perfektní,OK-BIT
8,IMG_3368.MOV,Yes,OK-BIT,00:03,00:05,,OK-BIT
9,IMG_3370.MOV,Yes,OK-COK,00:10,00:12,,AERO


In [8]:
cap = cv2.VideoCapture("data/IMG_3353.MOV")
rotation = get_video_rotation("data/IMG_3353.MOV")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if rotation == 90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation == -90:
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
    elif rotation == 180:
        frame = cv2.rotate(frame, cv2.ROTATE_180)

    results = model(frame)
    for result in results:
        boxes = result.boxes.xyxy  # Bounding boxes
        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
    cv2.imshow("Detection", frame)
    if cv2.waitKey(1) == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()


0: 640x384 (no detections), 595.3ms
Speed: 16.4ms preprocess, 595.3ms inference, 16.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 585.8ms
Speed: 2.9ms preprocess, 585.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 940.8ms
Speed: 2.0ms preprocess, 940.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 507.6ms
Speed: 2.3ms preprocess, 507.6ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 314.3ms
Speed: 1.7ms preprocess, 314.3ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 291.8ms
Speed: 1.3ms preprocess, 291.8ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 300.4ms
Speed: 1.0ms preprocess, 300.4ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 294.6ms
Speed: 1.0ms pre

In [3]:
get_video_rotation("data/IMG_3353.MOV")

-90