In [None]:
from pathlib import Path
import shutil
# Input: the original JPEG images. Output: everything this notebook derives
# is written beneath `ratio_folder`.
src_folder = Path("origin_jpg_data")
ratio_folder = Path("dataset_by_aspect_ratio")
general_folder = ratio_folder / "general"

# Start from a clean slate: remove whatever a previous run left behind.
# ignore_errors=True means a missing output folder is not an error.
shutil.rmtree(ratio_folder, ignore_errors=True)

In [None]:
import json
# Load the label configuration.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because the
# label names handled by this notebook are non-ASCII; relying on the platform
# default encoding can raise UnicodeDecodeError or mis-decode them.
# NOTE(review): assumes labels_config.json is saved as UTF-8 — confirm.
with open("labels_config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

In [None]:
# Pull the two per-language label tables out of the loaded config.
en_config = config['en']
zh_config = config['zh']

In [None]:
# Display both label tables side by side (as a tuple) for a visual check.
en_config, zh_config

In [None]:
# Map each `zh` label to the `en` label stored under the same config key.
labels_map = {zh_label: en_config[key] for key, zh_label in zh_config.items()}

In [None]:
# Add (or override) the mapping for this label by hand.
labels_map.update({'合格': 'valid'})

In [None]:
# Display the final label-text -> English-label mapping for a visual check.
labels_map

In [None]:
from collections import defaultdict
# Group every source image by the single label whose text appears in its path.
# Result: general_files[<english label>] -> list of source Paths.
general_files = defaultdict(list)
# rglob() is already recursive, so the pattern needs no extra "**/" prefix.
# Sorting makes the traversal order independent of the filesystem.
for file in sorted(src_folder.rglob("*.jpg")):
    # Match against the path *relative to the dataset root* so that directories
    # above `src_folder` can never contribute a spurious label match.
    rel_path = file.relative_to(src_folder).as_posix()
    labels = [label for key, label in labels_map.items() if key in rel_path]
    if len(labels) == 1:
        general_files[labels[0]].append(file)
    else:
        # Zero or several labels matched: the image cannot be classified
        # unambiguously, so it is reported and left out of the dataset.
        print(file)

In [None]:
# Copy every classified image into `general_folder/<label>/`, flattening the
# source directory structure down to the bare file name.
for label, files in general_files.items():
    label_dir = general_folder / label
    # One mkdir per label is enough; no need to repeat it for every file.
    label_dir.mkdir(exist_ok=True, parents=True)
    # file name -> source path already copied under this label, used to make
    # name collisions visible instead of silently losing an image.
    copied_from = {}
    for file in files:
        tgt_path = label_dir / file.name
        if file.name in copied_from:
            # Flattening maps two different sources to the same target; the
            # later copy still wins (as before), but the loss is now reported.
            print(f"WARNING: {file} overwrites {copied_from[file.name]} at {tgt_path}")
        copied_from[file.name] = file
        print(tgt_path)
        shutil.copyfile(file, tgt_path)

In [None]:
from PIL import Image
# Normalise every copied image to landscape: images that are taller than wide
# are rotated by 90 degrees and written back over the same file.
# rglob() is already recursive, so the pattern needs no extra "**/" prefix.
for file in sorted(general_folder.rglob("*.jpg")):
    # The context manager closes the underlying file handle, which the bare
    # Image.open() call would otherwise leave open for every image.
    with Image.open(file) as img:
        if img.height <= img.width:
            continue
        # transpose() loads the pixel data and returns a new image, so the
        # result stays usable after the source handle is closed.
        rotated = img.transpose(Image.Transpose.ROTATE_90)
    print(file)
    # Save only after the source handle is closed, so the file is never
    # overwritten while still open for reading.
    rotated.save(file)

In [None]:
# Output roots for the two dataset splits, both beneath `ratio_folder`.
train_folder = ratio_folder.joinpath("train")
valid_folder = ratio_folder.joinpath("valid")

In [None]:
from sklearn.model_selection import train_test_split
# Split each label's images into train/valid, bucketed by integer aspect ratio
# (width // height) so that every ratio bucket is represented in both splits.

# Fixed seed so that re-running the cell reproduces exactly the same split.
SPLIT_SEED = 42

# Clear previous split output first. Without this, re-running the cell could
# leave a file from an earlier (different) split behind, so the same image
# would end up in both train and valid.
for split_folder in (train_folder, valid_folder):
    shutil.rmtree(split_folder, ignore_errors=True)

# Sorted iteration keeps the input order (and therefore the seeded split)
# independent of the filesystem's directory ordering.
for label_folder in sorted(general_folder.iterdir()):
    if not label_folder.is_dir():
        # Stray files directly under `general_folder` are not label folders.
        continue
    label_name = label_folder.name

    # integer aspect ratio -> files of this label with that ratio
    ratio_dict = defaultdict(list)
    for file in sorted(label_folder.iterdir()):
        # Close each image right after reading its size instead of leaking
        # one open handle per file.
        with Image.open(file) as img:
            proportion = img.width // img.height
        ratio_dict[proportion].append(file)

    for files in ratio_dict.values():
        if len(files) > 2:
            # Uses train_test_split's default test share.
            train_files, test_files = train_test_split(files, random_state=SPLIT_SEED)
        else:
            # Buckets of one or two images go to train in full.
            train_files, test_files = files, []

        for split_folder, split_files in ((train_folder, train_files), (valid_folder, test_files)):
            for split_file in split_files:
                tgt_file = split_folder / label_name / split_file.name
                tgt_file.parent.mkdir(exist_ok=True, parents=True)
                shutil.copyfile(split_file, tgt_file)


In [None]:
# Collect the bare file name of every training image (label folder dropped).
# rglob() is already recursive, so the pattern needs no extra "**/" prefix.
train_file_names = [file.name for file in train_folder.rglob("*.jpg")]

# Print every name that shows up again after its first occurrence, i.e. names
# that exist under more than one label folder of the training split.
seen = set()
for name in train_file_names:
    if name in seen:
        print(name)
    else:
        seen.add(name)

In [None]:
# Collect the bare file name of every validation image (label folder dropped).
# rglob() is already recursive, so the pattern needs no extra "**/" prefix.
valid_file_names = [file.name for file in valid_folder.rglob("*.jpg")]

In [None]:
# Display the full list of training file names for a visual check.
train_file_names

In [None]:
# Rebind both name collections as sets so they can be intersected;
# repeated names collapse to a single entry.
train_file_names, valid_file_names = set(train_file_names), set(valid_file_names)

In [19]:
# File names present in both splits; a non-empty result means the same file
# name exists under train and under valid.
train_file_names.intersection(valid_file_names)

{'4190-041-6001-002-C515-D30G-05.dcm.jpg'}

In [None]:
# Display the training file names again; the name was rebound to a set in an
# earlier cell, so duplicates are collapsed and the order is arbitrary.
train_file_names