In [15]:
import json
import numpy as np
from PIL import Image, ImageDraw
from pathlib import Path
from tools import polygon_utility



In [16]:
class Logger:
    def __init__(self):
        self.on = True
    
    def print(self, text):
        if self.on:
            print(text)

In [21]:
def load_ppocrlabel(path,
                    label_file_name = "Label.txt",
                    deprecated_labels = ["(한자)", "(일본어)"],
                    only_internal_text_label = True,
                    print_log = False
                    ):
    """_summary_

    Args:
        path (str): ppocrlabel 프로그램을 사용해 만든 레이블 파일의 경로
        deprecated_labels (list): 안쓰는 레이블 리스트 (삭제됨)
        only_internal_text_label (bool): 모든 텍스트가 간판 내에만 있는지 확인하고 그렇지 않은 이미지는 제외
        print_log (bool): 진행 상황 로그를 출력할 지 여부
        
    Returns:
        list: 레이블 정보를 로드하여 리스트 형태로 반환
        
        result = [image_label, ...]
        image_label = {"sign":label, "text":[label, ...]}
        label = {"transcription":str , "points": polygon}
        polygon: [(x1, y1), (x2, y2), ... , (xn, yn)]
        
    """

    logger = Logger()
    logger.on = print_log
    
    
    # 데이터 로드
    lable_file_path = Path(path)/label_file_name
    def load_data(path):
        with open(path) as f:
            lines = [line.rstrip("\n") for line in f.readlines()]
        return lines
    logger.print("Load all data")
    lines = load_data(lable_file_path)
    logger.print("\n")


    # 각 줄을 샘플로 변환
    def line_to_sample(line):
        image_path, labels = line.split("\t")
        labels = json.loads(labels)
        return {"image_path":image_path, "labels": labels}
    logger.print("Convert text data into sample data")
    samples = [line_to_sample(line) for line in lines]
    logger.print("\n")
    
    
    
    # 이미지가 존재하지 않는 샘플이 있는지 확인
    def check_image_exist(samples, print_log = False):
        remove_num = 0
        new_samples = []
        for sample in samples:
            image_path = Path(path)/sample["image_path"]
            print(image_path)
            if not image_path.exists():
                remove_num += 1
                if print_log:
                    logger.print(f"Removed {sample['image_path']}")
            else:
                new_samples.append(sample)
        return new_samples, remove_num
    
    logger.print("Check the label has valid image path")
    samples, remove_num = check_image_exist(samples)
    logger.print(f"\tTotal {remove_num} of samples without image was removed\n")
    
    # 간판과 텍스트 레이블 구분
    def devide_sign_and_text(sample, print_log = False):
        sign_labels, text_labels = [], []
        for label in sample["labels"]:
            if label["transcription"] == "@@@":
                sign_labels.append(label)
            else:
                text_labels.append(label)
        return {"image_path":sample["image_path"], "sign_labels":sign_labels, "text_labels":text_labels}
    logger.print("Distingush sign and test label")
    samples = [devide_sign_and_text(sample) for sample in samples]
    logger.print("\n")

    # 모든 텍스트가 간판 안에 있는 이미지 외에 제거
    def remain_only_internal_samples(samples, print_log = False):
        remove_num = 0
        new_samples = []
        for sample in samples:
            if not all([any([polygon_utility.is_polygon_inside_polygon(text_label["points"], sign_label["points"]) for sign_label in sample["sign_labels"]]) for text_label in sample["text_labels"]]):
                remove_num += 1                
                if print_log:
                    logger.print(f"유효하지 않아 제거됨: {sample['image_path']}")
            else:
                new_samples.append(sample)
        return samples, remove_num

    if only_internal_text_label:
        logger.print("Check and remove the image whose all text labels are included in the sign label")
        samples, remove_num = remain_only_internal_samples(samples)
        logger.print(f"\t{remove_num} of images were removed for including one more not internal text lable\n")
        

    # 제거할 텍스트 레이블 제거
    def remove_target_label(samples, print_log = False):
        remove_num = 0
        samples = samples.copy()
        for sample in samples:
            text_labels = []
            for text_label in sample["text_labels"]:
                # 제거할 텍스트 레이블 제거
                if text_label["transcription"] in deprecated_labels:
                    if print_log:
                        logger.print(f"Removed {text_label}")
                    remove_num += 1
                else:
                    text_labels.append(text_label)
            sample["text_labels"] = text_labels
        return samples, remove_num    
        
    if deprecated_labels is not None:
        logger.print("Remove the text labels corresponding the text labels to remove")
        samples, remove_num = remove_target_label(samples)
        logger.print(f"\t{remove_num} of text labels were removed\n")
    
    
    
    def group_sign_and_text_lables(samples):
        samples = samples.copy()
        for sample in samples:
            sign_and_text_labels = []        
            for sign_label in sample["sign_labels"]:
                sign_text_labels = []
                for text_label in sample["text_labels"]:
                    if polygon_utility.is_polygon_inside_polygon(text_label["points"], sign_label["points"]):
                        sign_text_labels.append(text_label)
                
                sign_and_text_labels.append({"sign":sign_label, "text":sign_text_labels}) 
            del sample["sign_labels"]
            del sample["text_labels"]
            sample["labels"] =  sign_and_text_labels
        return samples
    logger.print("Group sign and internal text labels")
    samples = group_sign_and_text_lables(samples)
    logger.print("\n")
    
    logger.print(f"Total image labels: {len(samples)}")
    logger.print(f"Total sign labels: {sum([len(sample['labels']) for sample in samples])}")
    logger.print(f"Total text labels: {sum([sum([len(label['text']) for label in sample['labels']]) for sample in samples])}")
    return samples

def make_and_save_detection_dataset(label_dir, samples, save_dir, dir_size=1000, label_file_name = "label.txt"):
    assert not save_dir.exists(), f"please remove {save_dir}"
    label_path = save_dir/label_file_name
    image_idx = 0

    for sample in samples:  
        image_path, labels = sample["image_path"], sample["labels"]  
        image = Image.open(label_dir/image_path)
        for label in labels:
            sign_label = label["sign"]
            text_labels = label["text"]

            cropped_image = polygon_utility.crop_by_polygon(image, sign_label["points"])
            image_path = save_dir/f"{(image_idx//dir_size + 1)}"/f"{image_idx+1}.png"
            image_path.parent.mkdir(parents=True, exist_ok=True)
            cropped_image.save(image_path)
            with open(label_path, "a") as f:
                image_file = str(image_path.relative_to(save_dir)).replace('\\', '/')
                f.write(f"{image_file}\t{json.dumps(text_labels, ensure_ascii=False)}\n")

            image_idx += 1

def make_and_save_recognition_dataset(label_dir, samples, save_dir, dir_size=1000, label_file_name = "label.txt"):
    assert not save_dir.exists(), f"please remove {save_dir}"
    label_path = save_dir/label_file_name
    image_idx = 0

    for sample in samples:  
        image_path, labels = sample["image_path"], sample["labels"]
         
        image = Image.open(label_dir/image_path)
        for label in labels:
            sign_label = label["sign"]
            text_labels = label["text"]
            
            for text_label in text_labels:
                cropped_image = polygon_utility.crop_by_polygon(image, text_label["points"])
                image_path = save_dir/f"{(image_idx//dir_size + 1)}"/f"{image_idx+1}.png"
                image_path.parent.mkdir(parents=True, exist_ok=True)
                cropped_image.save(image_path)
                with open(label_path, "a") as f:
                    image_file = str(image_path.relative_to(save_dir)).replace('\\', '/')
                    f.write(f"{image_file}\t{text_label['transcription']}\n")

                image_idx += 1

In [18]:
# label_file_path = "./sample"
# samples = load_ppocrlabel(label_file_path, print_log=True)

# make_and_save_detection_dataset(samples, Path("./result/det"))
# make_and_save_recognition_dataset(samples, Path("./result/rec"))

In [22]:

label_dir = Path("E:/workspace/paddleocr/origin_datasets/outsourcing1")
std_result_path = "E:/workspace/paddleocr/datasets/outsourcing1_std"
str_result_path = "E:/workspace/paddleocr/datasets/outsourcing1_str"

samples = load_ppocrlabel(label_dir, print_log=True, label_file_name="images/Label.txt")
make_and_save_detection_dataset(label_dir, samples, Path(std_result_path))
make_and_save_recognition_dataset(label_dir, samples, Path(str_result_path))


label_dir = Path("E:/workspace/paddleocr/origin_datasets/outsourcing1(exception)")
std_result_path = "E:/workspace/paddleocr/datasets/outsourcing1(exception)_std"
str_result_path = "E:/workspace/paddleocr/datasets/outsourcing1(exception)_str"

samples = load_ppocrlabel(label_dir, print_log=True, label_file_name="images/Label.txt")
make_and_save_detection_dataset(label_dir, samples, Path(std_result_path))
make_and_save_recognition_dataset(label_dir, samples, Path(str_result_path))


Load all data


Convert text data into sample data


Check the label has valid image path
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\86180de0-0434-4093-bda9-568bc43617da-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\86323f34-bd2d-4c1b-af19-be725b0aa81a-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\89437e8e-2ee4-44f2-9eaf-9605456d0b4b-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\94178dd8-55de-4b1f-92e8-16f56d54ccbe-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\94868dbe-9cdb-4988-b0ac-23f08908e6a4-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\95864f88-d086-452d-b62b-bfa95f480218-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\99653e89-d7da-479f-95d9-732d6d766836-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsourcing1\images\179117bc-26eb-4bbf-9ac5-12456dae58d8-aligned.jpg
E:\workspace\paddleocr\origin_datasets\outsour