In [None]:
from pathlib import Path
import shutil
import os

def prepare_dbnet_dataset_from_vietnamese(root_dir: Path, output_dir: Path = Path("datasets")):
    """Copy images and labels into a DBNet-style folder layout and write list files.

    Input layout read from ``root_dir``:
        train_images/   training images (``.jpg``, ``.jpeg``, ``.png``)
        test_image/     test images
        labels/         label files named ``gt_<N>.txt``

    An image whose stem is ``im0001`` is paired with ``labels/gt_1.txt``: the
    text ``"im"`` is removed from the stem and the remainder is parsed as an
    integer. Images whose id cannot be parsed, or whose label file is missing,
    are reported on stdout and skipped.

    Output layout written under ``output_dir``:
        train/img, train/gt, test/img, test/gt   copied images / labels
        train.txt, test.txt                      one tab-separated
                                                 "<image path> <label path>" per line

    Paths in the list files are written as ``./<output_dir name>/<split>/...``,
    i.e. relative to the directory that contains ``output_dir``. With the
    default ``output_dir`` this yields ``./datasets/...``.

    Args:
        root_dir: Root folder of the raw dataset (``Path`` or ``str``).
        output_dir: Destination folder (``Path`` or ``str``).

    Raises:
        FileNotFoundError: If ``train_images`` or ``test_image`` does not exist
            under ``root_dir``. Raised before anything is copied.
    """
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)

    # Source directories of the raw dataset.
    train_img_dir = root_dir / "train_images"
    test_img_dir = root_dir / "test_image"
    # To also use unseen_test_images as test data, merge it in here:
    # unseen_img_dir = root_dir / "unseen_test_images"
    label_dir = root_dir / "labels"

    # Fail early: otherwise a missing test directory would only surface after
    # the whole train split had been copied and before any list file existed.
    for src_dir in (train_img_dir, test_img_dir):
        if not src_dir.exists():
            raise FileNotFoundError(f"Image directory not found: {src_dir}")

    # DBNet-style output directories.
    out_train_img = output_dir / "train" / "img"
    out_train_gt = output_dir / "train" / "gt"
    out_test_img = output_dir / "test" / "img"
    out_test_gt = output_dir / "test" / "gt"

    for d in (out_train_img, out_train_gt, out_test_img, out_test_gt):
        d.mkdir(parents=True, exist_ok=True)

    # Prefix for list-file entries. Derived from the actual output folder name
    # (previously hard-coded to "./datasets"); .absolute() gives a usable name
    # even when output_dir is given as ".".
    list_prefix = f"./{output_dir.absolute().name}"
    image_suffixes = (".jpg", ".jpeg", ".png")

    def process_images(img_dir, out_img_dir, out_gt_dir):
        """Copy every labelled image of one split; return its (image, label) list entries."""
        split = out_img_dir.parent.name  # "train" or "test"
        samples = []
        for img_name in sorted(os.listdir(img_dir)):
            if not img_name.lower().endswith(image_suffixes):
                continue
            if not (img_dir / img_name).is_file():
                # A directory that merely looks like an image name cannot be copied.
                continue

            img_stem = Path(img_name).stem
            # Convert "im0001" => 1
            try:
                num_id = int(img_stem.replace("im", ""))
            except ValueError:
                print(f"Skip {img_name}, cannot parse id")
                continue

            label_path = label_dir / f"gt_{num_id}.txt"
            if not label_path.exists():
                print(f"Label not found for {img_name} at {label_path}")
                continue

            # Copy the image and its label; the label is renamed to match the image stem.
            shutil.copy(img_dir / img_name, out_img_dir / img_name)
            shutil.copy(label_path, out_gt_dir / f"{img_stem}.txt")

            samples.append((f"{list_prefix}/{split}/img/{img_name}",
                            f"{list_prefix}/{split}/gt/{img_stem}.txt"))
        return samples

    print("Processing training images...")
    train_samples = process_images(train_img_dir, out_train_img, out_train_gt)
    print(f"Train samples: {len(train_samples)}")

    print("Processing test images...")
    test_samples = process_images(test_img_dir, out_test_img, out_test_gt)
    print(f"Test samples: {len(test_samples)}")

    # Write train.txt and test.txt.
    for list_name, samples in (("train.txt", train_samples), ("test.txt", test_samples)):
        with open(output_dir / list_name, "w", encoding="utf-8") as f:
            for img_path, gt_path in samples:
                f.write(f"{img_path}\t{gt_path}\n")

    print("Data preparation finished.")
    print(f"train.txt: {(output_dir / 'train.txt').absolute()}")
    print(f"test.txt: {(output_dir / 'test.txt').absolute()}")

if __name__ == "__main__":
    # Location of the raw dataset, relative to the current working directory.
    root_path = Path(r"../data/vietnamese")
    # output_dir is left at its default.
    prepare_dbnet_dataset_from_vietnamese(root_dir=root_path)

In [4]:
from pathlib import Path
import cv2

# Root folder of the prepared dataset; train.txt is read from directly inside it.
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
dataset_root = Path(r"C:\Users\ts834\Downloads\datasets")

def check_sample(txt_path):
    """Print the label file's path followed by its first five lines, whitespace-stripped.

    The file is read completely as UTF-8 before anything is printed.
    """
    with open(txt_path, "r", encoding="utf-8") as label_file:
        all_rows = list(label_file)

    print(f"Label file: {txt_path}")
    preview = [row.strip() for row in all_rows[:5]]
    for row in preview:
        print(row)

def check_image(img_path):
    """Try to load an image with cv2.imread and report the outcome on stdout.

    Prints the image's shape when cv2.imread returns an image, or a
    "cannot be read" message when it returns None.

    Returns:
        True if the image was loaded, False otherwise.
    """
    img = cv2.imread(str(img_path))
    readable = img is not None
    if readable:
        print(f"Ảnh {img_path} kích thước: {img.shape}")
    else:
        print(f"Ảnh {img_path} không đọc được.")
    return readable

# Open the first 5 samples listed in train.txt and check each image/label pair.
train_txt = dataset_root / "train.txt"
with open(train_txt, "r", encoding="utf-8") as f:
    # Ignore blank lines (e.g. a trailing empty line) so the two-value unpack
    # below cannot fail on them.
    lines = [line for line in f if line.strip()]

for line in lines[:5]:
    img_path_str, gt_path_str = line.strip().split("\t")
    # Entries look like "./datasets/train/img/im0001.jpg", i.e. they are relative
    # to the folder that contains dataset_root. pathlib drops the leading "./"
    # when joining. The previous str.strip("./") removed any run of "." and "/"
    # characters from BOTH ends rather than the "./" prefix, which would corrupt
    # entries such as "../x" or names ending in ".".
    img_path = dataset_root.parent / img_path_str
    gt_path = dataset_root.parent / gt_path_str
    check_image(img_path)
    check_sample(gt_path)
    print("-" * 30)


Ảnh C:\Users\ts834\Downloads\datasets\train\img\im0001.jpg kích thước: (288, 512, 3)
Label file: C:\Users\ts834\Downloads\datasets\train\gt\im0001.txt
65,35,82,35,82,39,65,39,###
94,10,117,10,117,41,93,41,CHẤT
118,15,147,15,148,46,118,46,LƯỢNG
149,9,165,9,165,43,150,43,TỐT
167,9,180,9,179,43,167,42,ĐỂ
------------------------------
Ảnh C:\Users\ts834\Downloads\datasets\train\img\im0002.jpg kích thước: (600, 800, 3)
Label file: C:\Users\ts834\Downloads\datasets\train\gt\im0002.txt
196,194,287,201,289,267,196,263,CỔNG
291,207,419,211,420,271,294,266,TRƯỜNG
425,220,471,220,472,273,425,271,AN
475,219,561,219,562,275,477,274,TOÀN
567,224,642,228,644,279,567,277,GIAO
------------------------------
Ảnh C:\Users\ts834\Downloads\datasets\train\img\im0003.jpg kích thước: (3456, 4608, 3)
Label file: C:\Users\ts834\Downloads\datasets\train\gt\im0003.txt
2013,273,2040,273,2041,283,2015,284,CÔNG
2044,271,2078,271,2081,282,2043,283,TRƯỜNG
2083,272,2107,272,2107,282,2083,282,ĐANG
2110,272,2122,273,212