In [1]:
import os
import glob
from tqdm import tqdm

import pandas as pd
import cv2

from sklearn.model_selection import StratifiedGroupKFold

In [2]:
working_dir = 'synthesis-car-od'
dir_len = len(working_dir)

path = os.getcwd().replace('\\', '/')
index = path.find(working_dir)
working_dir = path[:index + dir_len + 1]
working_dir

'/Users/a16/Desktop/JJ/self_study/projects/synthesis-car-od/'

In [3]:
DATA_PATH = os.path.join(working_dir, 'data/')
TRAIN_PATH = os.path.join(DATA_PATH, 'train/')
TEST_PATH = os.path.join(DATA_PATH, 'test/')
DEST_PATH = os.path.join(DATA_PATH, 'yolo/')
DEST_IMAGE_PATH = os.path.join(DEST_PATH, 'images/')
DEST_LABEL_PATH = os.path.join(DEST_PATH, 'labels/')

os.makedirs(DEST_IMAGE_PATH, exist_ok=True)
os.makedirs(DEST_LABEL_PATH, exist_ok=True)

# Make Yolo Data Structure

In [5]:
def make_yolo_dataset(image_paths, txt_paths=None):
    for image_path, txt_path in zip(tqdm(image_paths), txt_paths):
        img = cv2.imread(image_path)

        dest_image_path = os.path.join(DEST_IMAGE_PATH, os.path.basename(image_path))
        cv2.imwrite(dest_image_path, img)

        if txt_paths:
            with open(txt_path, 'r') as txt_file:
                yolo_labels = []

                img_h, img_w, _ = img.shape
                lines = txt_file.readlines()

                for line in lines:
                    line = list(map(float, line.strip().split(' ')))

                    label = int(line[0])

                    bbox = line[1:]

                    xs = [bbox[i] for i in range(0, 8, 2)]
                    ys = [bbox[i] for i in range(1, 8, 2)]

                    x_min = min(xs)
                    y_min = min(ys)
                    x_max = max(xs)
                    y_max = max(ys)

                    box_w = x_max - x_min 
                    box_h = y_max - y_min

                    normalized_x = (x_min + x_max) / (2 * img_w)
                    normalized_y = (y_min + y_max) / (2 * img_h)

                    normalized_box_w = box_w / img_w
                    normalized_box_h = box_h / img_h

                    yolo_labels.append(f'{label} {normalized_x} {normalized_y} {normalized_box_w} {normalized_box_h}')

            dest_label_path = os.path.join(DEST_LABEL_PATH, os.path.basename(txt_path))

            with open(dest_label_path, 'w') as label_file:
                for yolo_label in yolo_labels:
                    label_file.write(f'{yolo_label}\n')

In [6]:
train_imgs = sorted(glob.glob(os.path.join(TRAIN_PATH, '*.png')))
train_txts = sorted(glob.glob(os.path.join(TRAIN_PATH, '*.txt')))

test_imgs = sorted(glob.glob(os.path.join(TEST_PATH, '*.png')))

make_yolo_dataset(train_imgs, train_txts)
make_yolo_dataset(test_imgs)

100%|██████████| 6481/6481 [11:33<00:00,  9.34it/s]
  0%|          | 0/3400 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not iterable

# Stratified Group KFold Split & Make YAML Config

In [6]:
img_names = []
txt_names = []
labels = []
bboxes = []

for img, txt in zip(train_imgs, train_txts):
    img_name = img.replace('\\', '/').split('/')[-1]
    txt_name = txt.replace('\\', '/').split('/')[-1]

    with open(txt, 'r') as t:
        lines = t.readlines()

        for line in lines:
            line = line.strip()
            label = int(float(line.split(' ')[0]))
            bbox = ' '.join(line.split(' ')[1:])

            img_names.append(img_name)
            txt_names.append(txt_name)
            labels.append(label)
            bboxes.append(bbox)

In [9]:
train_df = pd.DataFrame({
    'img' : img_names,
    'txt' : txt_names,
    'label' : labels,
    'bbox' : bboxes
})
train_df.head()

Unnamed: 0,img,label,bbox
0,syn_00000.png,9,1037 209 1312 209 1312 448 1037 448
1,syn_00000.png,25,804 425 1127 425 1127 783 804 783
2,syn_00000.png,12,330 250 583 250 583 511 330 511
3,syn_00001.png,16,1000 98 1295 98 1295 405 1000 405
4,syn_00001.png,14,678 175 926 175 926 421 678 421


In [10]:
SEED = 41

sgkf = StratifiedGroupKFold(
    n_splits=4,
    shuffle=True,
    random_state=SEED
)

In [11]:
train_indices = []
val_indices = []

for train_idx, val_idx in sgkf.split(train_df['img'], train_df['label'], train_df['img']):
    train_indices.append(train_idx)
    val_indices.append(val_idx)