## Imports

In [9]:
import os, shutil, random, cv2, kaggle
import pandas as pd
from pathlib import Path

## Downloading database

In [10]:
if not os.path.exists("Aircrafts"):
    kaggle.api.dataset_download_files('a2015003713/militaryaircraftdetectiondataset', path='Aircrafts', unzip=True)

## Pathes definitions

In [11]:
base_dir = Path("Dataset")
dataset_dir = Path("Aircrafts") / "dataset"
train_dir = base_dir / "train"
valid_dir = base_dir / "valid"
test_dir = base_dir / "test"

## Loading all images

In [12]:
images = list(dataset_dir.glob("*.jpg"))

## Generating annotations

In [13]:
current_id = 0
class_mapping = {}
for img in images:
    csv_path = img.with_suffix(".csv")
    if not csv_path.exists():
        os.remove(img)
        continue

    df, im = pd.read_csv(csv_path), cv2.imread(str(img))
    if im is None:
        continue

    h, w = im.shape[:2]
    for _, row in df.iterrows():
        if row['class'] not in class_mapping:
            class_mapping[row['class']] = current_id
            current_id += 1

    annotations = [
        f"{class_mapping[row['class']]} "
        f"{(row['xmin'] + row['xmax']) / (2 * w):.6f} {(row['ymin'] + row['ymax']) / (2 * h):.6f} "
        f"{(row['xmax'] - row['xmin']) / w:.6f} {(row['ymax'] - row['ymin']) / h:.6f}"
        for _, row in df.iterrows()
        ]

    with open(img.with_suffix(".txt"), "w") as f:
        f.write("\n".join(annotations))
    os.remove(csv_path)

## Creating the new dataset folders

In [14]:
(train_dir / "images").mkdir(parents=True, exist_ok=True)
(train_dir / "labels").mkdir(parents=True, exist_ok=True)

(valid_dir / "images").mkdir(parents=True, exist_ok=True)
(valid_dir / "labels").mkdir(parents=True, exist_ok=True)

(test_dir / "images").mkdir(parents=True, exist_ok=True)
(test_dir / "labels").mkdir(parents=True, exist_ok=True)

## Generating .yaml file for YOLO

In [15]:
with open(base_dir / "data.yaml", "w") as f:
    f.write(f"train: {'train/images'}\nval: {'valid/images'}\ntest: {'test/images'}\n")
    f.write(f"nc: {len(class_mapping)}\nnames: {sorted(class_mapping, key=class_mapping.get)}\n")

## Loading all images

In [16]:
images = list(dataset_dir.glob("*.jpg"))

## Splitting the data into 80% train, 10% validation and 10% test

In [17]:
random.shuffle(images)
splits = {"train": 0.8, "valid": 0.1, "test": 0.1}
n = len(images)
split_map = {
    "train": images[:int(splits["train"] * n)],
    "valid": images[int(splits["train"] * n):int((splits["train"] + splits["valid"]) * n)],
    "test": images[int((splits["train"] + splits["valid"]) * n):]
}

## Moving the images and annotations to the new dataset

In [18]:
for s, imgs in split_map.items():
    for img in imgs:
        shutil.copy(img, base_dir / s / "images" / img.name)
        shutil.copy(img.with_suffix(".txt"), base_dir / s / "labels" / img.with_suffix(".txt").name)

## Removing the old database

In [19]:
shutil.rmtree("Aircrafts")

In [20]:
class_mapping

{'Mi28': 0,
 'UH60': 1,
 'F16': 2,
 'H6': 3,
 'F18': 4,
 'F22': 5,
 'F35': 6,
 'JAS39': 7,
 'B52': 8,
 'JH7': 9,
 'KF21': 10,
 'US2': 11,
 'Be200': 12,
 'EF2000': 13,
 'C2': 14,
 'P3': 15,
 'CL415': 16,
 'E2': 17,
 'Rafale': 18,
 'Mig31': 19,
 'Su25': 20,
 'XB70': 21,
 'YF23': 22,
 'Su57': 23,
 'F15': 24,
 'KC135': 25,
 'An72': 26,
 'An22': 27,
 'B2': 28,
 'An124': 29,
 'Tu160': 30,
 'C5': 31,
 'J20': 32,
 'B1': 33,
 'C130': 34,
 'A10': 35,
 'V22': 36,
 'C390': 37,
 'F117': 38,
 'WZ7': 39,
 'E7': 40,
 'CH47': 41,
 'AH64': 42,
 'Mi24': 43,
 'U2': 44,
 'AG600': 45,
 'KJ600': 46,
 'RQ4': 47,
 'Tu22M': 48,
 'SR71': 49,
 'J10': 50,
 'Mirage2000': 51,
 'AV8B': 52,
 'Tu95': 53,
 'A400M': 54,
 'MQ9': 55,
 'F14': 56,
 'Su34': 57,
 'F4': 58,
 'Mi8': 59,
 'Mig29': 60,
 'Z10': 61,
 'JF17': 62,
 'TB001': 63,
 'KAAN': 64,
 'Ka52': 65,
 'Il76': 66,
 'C17': 67,
 'EMB314': 68,
 'An225': 69,
 'Y20': 70,
 'Su24': 71,
 'TB2': 72,
 'Ka27': 73,
 'Tornado': 74,
 'Vulcan': 75,
 'Z19': 76,
 'V280': 77,
 'Mi26'