# Split (EASY, HARD)

1. label_files 변수에 학습 데이터 정보와 난이도를 구분할 평가 데이터 정보를 담는다. {"별칭": "레이블 경로"}
2. 코드를 쭉 수행한다.
3. 결과는 자동으로 저장됨

In [1]:
import pandas as pd

# Change the DataFrame display settings
pd.set_option("display.max_columns", None)  # print every column
pd.set_option("display.max_rows", None)     # print every row
pd.set_option("display.max_colwidth", None) # remove the column-width limit
pd.set_option("display.width", 0)           # intended to disable automatic line wrapping (per the original note)

In [18]:
from database import *
import pandas as pd
from collections import Counter

def method_cache(func):
    """Memoize a method's results on the instance it is called on.

    Results are kept in a dict stored on the instance under
    ``_<method name>_cache`` and keyed by the call arguments, so each
    instance and each method has its own independent cache and calls with
    different arguments no longer share one stale result.

    Args:
        func: The method to wrap. Its first parameter must be ``self``.

    Returns:
        A wrapper with the same name and docstring as ``func``. Calls whose
        arguments are not hashable are executed without caching.
    """
    import functools  # local import keeps this block self-contained

    cache_name = f"_{func.__name__}_cache"  # unique cache attribute per method

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # Create the per-instance, per-method cache lazily.
        cache = getattr(self, cache_name, None)
        if cache is None:
            cache = {}
            setattr(self, cache_name, cache)

        # Keyword names are unique, so sorting never compares the values.
        key = (args, tuple(sorted(kwargs.items())))
        try:
            hash(key)
        except TypeError:
            # Unhashable arguments cannot be used as a key: skip the cache.
            return func(self, *args, **kwargs)

        if key not in cache:
            cache[key] = func(self, *args, **kwargs)
        return cache[key]

    return wrapper
    
class SingleLabelsetAnalizer:
    """Analysis helpers for a single labelset record, used like a wrapper.

    Every derived value is cached on the instance by ``method_cache``.
    """

    def __init__(self, record):
        """Open the labelset database and resolve ``record`` to a labelset.

        Args:
            record: A ``Labelset2`` instance, or a ``str`` / ``int`` that is
                passed to ``LabelsetDB2.get_record`` (per the original note,
                a labelset name or id).
        """
        self.labelsetdb = LabelsetDB2()
        self.record = self.ensure_labelset(record)

    def ensure_labelset(self, labelset):
        """Return ``labelset`` as a ``Labelset2`` instance.

        Args:
            labelset: A ``Labelset2`` instance (returned unchanged) or a
                ``str`` / ``int`` looked up with ``self.labelsetdb.get_record``.

        Raises:
            Exception: If ``labelset`` is none of the accepted types.
        """
        if isinstance(labelset, Labelset2):
            return labelset
        if isinstance(labelset, (str, int)):
            return self.labelsetdb.get_record(labelset)

        raise Exception(f"{labelset} is Invalid work")

    @method_cache
    def statistic(self):
        """Return the number of labels in the record.

        NOTE(review): the previous body referenced undefined names and could
        never run; returning the label count is the closest reading of its
        final ``len(...)`` — confirm this is the intended statistic.
        """
        return len(self.labels)

    @property
    @method_cache
    def labels(self):
        """The ``labels`` attribute of the wrapped record."""
        return self.record.labels

    @property
    @method_cache
    def text_labels(self):
        """The second element of every label.

        Per the original note each label is ``[image_path, text_label]``, so
        this yields only the text labels.
        """
        return [label[1] for label in self.labels]

    @property
    @method_cache
    def char_set(self):
        """The set of distinct characters over all text labels."""
        char_set = set()
        for text_label in self.text_labels:
            char_set.update(text_label)
        return char_set

    @property
    @method_cache
    def char_num(self):
        """Occurrence count of every character over all text labels, as a dict."""
        total_counts = Counter()  # accumulates the counts across labels

        # Counter.update counts each character of the string directly.
        for text_label in self.text_labels:
            total_counts.update(text_label)
        return dict(total_counts)
    
# Build an analyzer for the record looked up with key 14 and display its per-character counts.
analizer = SingleLabelsetAnalizer(14)
analizer.char_num

{'여': 3558,
 '성': 11879,
 '전': 17855,
 '용': 6242,
 '아': 19651,
 '린': 3454,
 '이': 39728,
 '의': 24634,
 '체': 2448,
 '험': 947,
 '방': 8983,
 '공': 10879,
 '감': 1976,
 '론': 970,
 '신': 9083,
 '서': 9189,
 '만': 4321,
 '화': 9731,
 '그': 4346,
 '리': 30160,
 '스': 32335,
 '도': 10114,
 ' ': 69363,
 '없': 496,
 '는': 8524,
 '문': 14541,
 '학': 17981,
 '교': 9418,
 '킨': 1866,
 '점': 11938,
 '심': 2374,
 '특': 1029,
 '선': 5581,
 '정': 12461,
 '수': 19521,
 '영': 10065,
 '어': 23990,
 '원': 29709,
 '흰': 49,
 '돌': 1179,
 '치': 13316,
 '과': 19776,
 '알': 1156,
 '콩': 647,
 '박': 2675,
 '대': 16730,
 '포': 6292,
 '베': 2437,
 '프': 6683,
 '멀': 223,
 '티': 4022,
 '샵': 1743,
 '김': 5490,
 '택': 1172,
 '시': 11558,
 '집': 6928,
 '쓰': 579,
 '고': 11517,
 '연': 9051,
 '구': 11305,
 '소': 14151,
 '볶': 1320,
 '떡': 1873,
 '빵': 740,
 '카': 7248,
 '페': 4637,
 '가': 15164,
 '압': 293,
 '중': 8756,
 '살': 2747,
 '림': 4049,
 '법': 3088,
 '환': 1248,
 '상': 9881,
 '적': 1091,
 '인': 22302,
 '청': 4233,
 '마': 15696,
 '루': 3155,
 '비': 11627,
 '안': 6165,
 '필': 221

In [24]:
# Turn the per-character count dict into a DataFrame with one row per (character, count) pair.
x = analizer.char_num
x = pd.DataFrame(x.items())
len(x)  # number of rows, i.e. number of distinct characters

1741

In [7]:
class MyClass:
    """Toy class used to try out the ``cache`` decorator on methods."""

    @cache
    def compute_sum(self, x, y):
        """Print a trace line and return ``x + y``."""
        print(f"Computing sum: {x} + {y}")
        total = x + y
        return total

    @cache
    def compute_mul(self, x, y):
        """Print a trace line and return ``x * y``."""
        print(f"Computing product: {x} * {y}")
        product = x * y
        return product


# Test
obj = MyClass()
print(obj.compute_sum(1, 2))  # computed
print(obj.compute_sum(1, 2))  # returned from the cache

print(obj.compute_mul(2, 3))  # computed
print(obj.compute_mul(2, 3))  # returned from the cache


Computing sum: 1 + 2
3
3
Computing product: 2 * 3
6
6


In [9]:
# Open the labelset database and display the labels of the record fetched with key 14.
labelsetdb = LabelsetDB2()
labelsetdb.record_id_to_name  # evaluated only; not the last expression, so the notebook does not display it
labelsetdb.get_record(14).labels

[['aihub_rec/557/556890.png', '여성전용'],
 ['aihub_rec/767/766677.png', '아린이의'],
 ['aihub_rec/119/118561.png', '체험방'],
 ['aihub_rec/880/879043.png', '공감이론신서'],
 ['aihub_rec/213/212530.png', '만화'],
 ['aihub_rec/860/859901.png', '그리스도 없는'],
 ['aihub_rec/39/38121.png', '전문학교'],
 ['aihub_rec/543/542375.png', '스킨'],
 ['aihub_rec/765/764848.png', '점심특선'],
 ['aihub_rec/513/512210.png', '정수정영어학원'],
 ['aihub_rec/49/48991.png', '흰돌치과의원'],
 ['aihub_rec/279/278264.png', '알콩'],
 ['aihub_rec/46/45942.png', '박대포'],
 ['aihub_rec/105/104156.png', '베이프멀티샵'],
 ['aihub_rec/823/822065.png', '김용택 시집'],
 ['aihub_rec/681/680534.png', '쓰리고'],
 ['aihub_rec/265/264646.png', '연구소'],
 ['aihub_rec/208/207645.png', '볶떡빵'],
 ['aihub_rec/317/316365.png', '카페'],
 ['aihub_rec/567/566730.png', '가압중'],
 ['aihub_rec/808/807791.png', '살림법'],
 ['aihub_rec/880/879581.png', '환상적인'],
 ['aihub_rec/588/587807.png', '대청마루'],
 ['aihub_rec/288/287921.png', '비비안'],
 ['aihub_rec/419/418929.png', '떡'],
 ['aihub_rec/756/755656.png', '필로드 수

In [171]:
# Alias -> list of label-file paths to load.
# The commented-out entries are other alias/path combinations (presumably kept from earlier runs).
label_files = {
    # "aihub_train": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/train_label.txt"],
    # "zero_shot_train": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1691_char___v2.txt"],
    # "aihub_eval": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/eval_label.txt"],
    # "aihub_verified_test": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/verified_testset/verified_test_label.txt"],
    # "GIST_test": ["/home/labelsets/GIST_rec_full_test/test_label_horizontal_only_korean.txt"],
    # "KAIST_test_horizontal": ["/home/datasets/KAIST_rec/label_only_korean_horizontal.txt"],
    # f"aihub_train": [f"/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1000_char___v2.txt"]
    # "aihub_train_remove50": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1691_char___v2.txt"],
    # "aihub_train_remove40": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1701_char___v2.txt"],
    # "aihub_train_remove30": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1711_char___v2.txt"],
    # "aihub_train_remove20": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1721_char___v2.txt"],
    # "aihub_train_remove10": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1731_char___v2.txt"],
    "remove10": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1731_char___v2.txt"],
    "remove20": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1721_char___v2.txt"],
    "remove30": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1711_char___v2.txt"],
    "remove40": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1701_char___v2.txt"],
    "remove50": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1691_char___v2.txt"],
    "text": ["/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/verified_testset/verified_test_label.txt"],
}


In [164]:
# Display the current alias -> path mapping.
label_files

{'remove10': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1731_char___v2.txt',
 'remove20': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1721_char___v2.txt',
 'remove30': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1711_char___v2.txt',
 'remove40': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1701_char___v2.txt',
 'remove50': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1691_char___v2.txt',
 'text': '/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/verified_testset/verified_test_label.txt'}

In [None]:
with open() as f:
    


In [172]:
def load_label(label_files):
    """Load and concatenate the tab-separated label lines of one or more files.

    Args:
        label_files: An iterable of label-file paths, or a single path
            (``str`` / ``os.PathLike``), which is treated as a one-element
            list instead of being iterated character by character.

    Returns:
        A list with one entry per line, in file order. Each entry is the line
        with trailing whitespace removed and then split on tab characters.
    """
    import os  # local import: only needed for the single-path check

    if isinstance(label_files, (str, os.PathLike)):
        label_files = [label_files]

    labels = []
    for file in label_files:
        print(file)
        with open(file, "r", encoding="utf-8") as f:
            # extend rather than assign, so earlier files are not discarded
            labels.extend(line.rstrip().split("\t") for line in f)
    return labels


# Load the labels of every alias configured in label_files.
# {task: labels}
labels_dict = {name: load_label(files) for name, files in label_files.items()}

/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1731_char___v2.txt
/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1721_char___v2.txt
/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1711_char___v2.txt
/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1701_char___v2.txt
/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/aihub_train_with_1691_char___v2.txt
/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/verified_testset/verified_test_label.txt


In [173]:

import itertools


def get_char_set_from_labels(labels):
    """Return the set of distinct characters found in the text of all labels.

    Args:
        labels: Iterable of label entries; element ``[1]`` of each entry is
            the text whose characters are collected.

    Returns:
        A ``set`` of characters (empty when ``labels`` is empty).
    """
    texts = (entry[1] for entry in labels)
    return set().union(*texts)

# Character set of every task, keyed like labels_dict.
char_set_dict = {task: get_char_set_from_labels(labels) for task, labels in labels_dict.items()}

# Number of distinct characters per task.
for task, char_set in char_set_dict.items():
    print(f"{task}: {len(char_set)}")
    
# Every unordered pair of tasks.
task_combinations = list(itertools.combinations(list(char_set_dict.keys()), 2))
for task1, task2 in task_combinations: # print, for each pair: size of the intersection and of both set differences
    print(f"{task1} & {task2} = {len(char_set_dict[task1] & char_set_dict[task2])} | {task1} - {task2} = {len(char_set_dict[task1] - char_set_dict[task2])} | {task2} - {task1} = {len(char_set_dict[task2] - char_set_dict[task1])}")

remove10: 784
remove20: 778
remove30: 774
remove40: 770
remove50: 767
text: 1354
remove10 & remove20 = 778 | remove10 - remove20 = 6 | remove20 - remove10 = 0
remove10 & remove30 = 774 | remove10 - remove30 = 10 | remove30 - remove10 = 0
remove10 & remove40 = 770 | remove10 - remove40 = 14 | remove40 - remove10 = 0
remove10 & remove50 = 767 | remove10 - remove50 = 17 | remove50 - remove10 = 0
remove10 & text = 784 | remove10 - text = 0 | text - remove10 = 570
remove20 & remove30 = 774 | remove20 - remove30 = 4 | remove30 - remove20 = 0
remove20 & remove40 = 770 | remove20 - remove40 = 8 | remove40 - remove20 = 0
remove20 & remove50 = 767 | remove20 - remove50 = 11 | remove50 - remove20 = 0
remove20 & text = 778 | remove20 - text = 0 | text - remove20 = 576
remove30 & remove40 = 770 | remove30 - remove40 = 4 | remove40 - remove30 = 0
remove30 & remove50 = 767 | remove30 - remove50 = 7 | remove50 - remove30 = 0
remove30 & text = 774 | remove30 - text = 0 | text - remove30 = 580
remove40 

In [147]:
# Check whether one specific character occurs in the "aihub_train" character set.
"뗑" in char_set_dict["aihub_train"]

False

In [174]:
def char_num_report(labels):
    """Count how often each character occurs in the label texts.

    Args:
        labels: Iterable of label entries; element ``[1]`` of each entry is
            the text that is counted.

    Returns:
        A dict ``{char: count}`` ordered from the most to the least frequent
        character; characters with equal counts keep first-seen order.
    """
    counts = {}
    for entry in labels:
        text = entry[1]
        for ch in text:
            counts[ch] = counts.get(ch, 0) + 1

    # Stable sort on the negated count == descending by count.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    return dict(ranked)


# Per-task character frequency report, keyed like labels_dict.
char_num_report_dict = {task: char_num_report(labels) for task, labels in labels_dict.items()}


# with open(f"/home/code/PaddleOCR/ppocr/metrics/remove_50_zero_char_set.txt", "w") as f:
#     for char in unseen_char:
#         f.write(f"{char}\n")
    # f.write(f"unseen_char: {len(unseen_char)}")

In [187]:
# Count the characters of the "text" split that never occur in the "remove50" split:
# char_type = number of distinct such characters, char_num = their total occurrences in "text".
trained_char = list(char_num_report_dict["remove50"].keys())
char_type = 0
char_num = 0
for char, num in char_num_report_dict["text"].items():
    if char not in trained_char:

        char_type += 1
        char_num += num
print(char_type, char_num)

587 6264


In [156]:
def easy_hard(sample):
    """Classify a ``(char, count)`` pair into a frequency bucket.

    Args:
        sample: A two-element ``(char, num)`` pair; only ``num`` is used.

    Returns:
        ``"many"`` for ``num >= 1500``, ``"medium"`` for ``num >= 100``,
        ``"few"`` for ``num >= 1`` and ``"zero"`` otherwise.
    """
    _, count = sample
    # Thresholds are checked from the highest down; the first match wins.
    for threshold, bucket in ((1500, "many"), (100, "medium"), (1, "few")):
        if threshold <= count:
            return bucket
    return "zero"

def get_easy_hard_report_dict(char_num_report_dict):
    """Group the characters of every split by their ``easy_hard`` bucket.

    Args:
        char_num_report_dict: ``{split: {char: count}}``.

    Returns:
        ``{split: {bucket: [char, ...]}}``. A bucket key exists only when at
        least one character of that split falls into it.
    """
    report_by_split = {}
    for split_name, counts in char_num_report_dict.items():
        buckets = {}
        for pair in counts.items():
            bucket_name = easy_hard(pair)
            if bucket_name not in buckets:
                buckets[bucket_name] = []
            buckets[bucket_name].append(pair[0])
        report_by_split[split_name] = buckets
    return report_by_split

# {task: {bucket: [char, ...]}} built from the per-task character counts.
many_few_char_set_dict = get_easy_hard_report_dict(char_num_report_dict)

# Print the number of characters per (task, bucket).
# The loop variable is named `level` so that it does not rebind the
# `easy_hard` function, which would break later calls that rely on it.
for task, report in many_few_char_set_dict.items():
    for level, samples in report.items():
        print(f"({task}, {level}): {len(samples)}")
    print()
    
# Apart from the train split, the many / medium / few buckets are probably not meaningful.


(aihub_train, many): 257
(aihub_train, medium): 529
(aihub_train, few): 955

(aihub_verified_test, many): 34
(aihub_verified_test, medium): 335
(aihub_verified_test, few): 985



In [95]:
# Write one file per frequency bucket of "aihub_train", one character per line.
for k, v in many_few_char_set_dict["aihub_train"].items():
    with open(f"/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/char_set_remove10_{k}.txt", "w") as f:
        for char in v:
            f.write(char + "\n")


In [157]:
def get_split_with_char_set(labels, char_set):
    """Split samples by whether their label uses any character of ``char_set``.

    Args:
        labels: Iterable of ``(img_path, label)`` pairs.
        char_set: Iterable of characters. It is materialized into a set once,
            so one-shot iterables work and the set is not rebuilt per sample.

    Returns:
        A dict with two lists of ``(img_path, label)`` tuples: ``"used"``
        holds the samples whose label shares at least one character with
        ``char_set``; ``"unused"`` holds the samples that share none.
    """
    target_chars = set(char_set)  # hoisted out of the loop
    report = {
        "used": [],
        "unused": []
    }

    for img_path, label in labels:
        key = "unused" if target_chars.isdisjoint(label) else "used"
        report[key].append((img_path, label))
    return report


def get_task_level_sample_dict(many_few_char_set_dict, char_set_dict, labels_dict, criterion = "train"):
    """Split the samples of every non-criterion task into difficulty levels.

    The levels are assigned in order, each one taking samples from what the
    previous level left over:

    * ``"unseen"``: the label contains a character that is absent from the
      criterion task's character set.
    * ``"hard"``: of the rest, the label contains a ``"few"`` character.
    * ``"normal"``: of the rest, the label contains a ``"medium"`` character.
    * ``"easy"``: everything that remains.

    Args:
        many_few_char_set_dict: ``{task: {bucket: [char, ...]}}``; only the
            criterion task's ``"few"`` and ``"medium"`` buckets are read. A
            missing bucket is treated as empty, because buckets only exist
            when at least one character falls into them.
        char_set_dict: ``{task: set of characters}``.
        labels_dict: ``{task: [(img_path, label), ...]}``.
        criterion: Name of the reference task; it is excluded from the result.

    Returns:
        ``{task: {"unseen": [...], "hard": [...], "normal": [...], "easy": [...]}}``.

    Raises:
        KeyError: If ``criterion`` is missing from ``many_few_char_set_dict``
            or ``char_set_dict``, or a task is missing from ``labels_dict``.
    """
    report = dict()

    criterion_buckets = many_few_char_set_dict[criterion]
    medium = criterion_buckets.get("medium", [])
    few = criterion_buckets.get("few", [])
    criterion_chars = char_set_dict[criterion]

    for task, char_set in char_set_dict.items():
        if task == criterion:
            continue

        # Characters this task uses that the criterion task never contains.
        unseen = char_set - criterion_chars

        report[task] = dict()

        use_report = get_split_with_char_set(labels_dict[task], unseen)
        report[task]["unseen"] = use_report["used"]

        use_report = get_split_with_char_set(use_report["unused"], few)
        report[task]["hard"] = use_report["used"]

        use_report = get_split_with_char_set(use_report["unused"], medium)
        report[task]["normal"] = use_report["used"]
        report[task]["easy"] = use_report["unused"]

    return report

# Samples per task and per level
task_level_sample_dict = get_task_level_sample_dict(many_few_char_set_dict, char_set_dict, labels_dict, criterion="aihub_train")

In [158]:
# Print the number of samples per (task, level).
# The loop variable is named `level` so that it does not rebind the
# `easy_hard` function defined earlier in the notebook.
for task, report in task_level_sample_dict.items():
    for level, samples in report.items():
        print(f"({task}, {level}): {len(samples)}")
    print()

(aihub_verified_test, unseen): 20
(aihub_verified_test, hard): 2165
(aihub_verified_test, normal): 23961
(aihub_verified_test, easy): 36708



In [159]:
# Nothing is written unless the user types "y".
if input() == "y":

    from pathlib import Path

    # Output root; <root>/<task>/<level>/ is created for every task and level.
    dir_path = Path("/home/test_dataset")
    dir_path.mkdir(exist_ok=True, parents=True)

    for task, level_sample_dict in task_level_sample_dict.items():
        for level, samples in level_sample_dict.items():
            # label.txt: one "<image path>\t<label>" line per sample.
            file_path = dir_path/task/level/"label.txt"
            file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(file_path, "w") as f:
                for img_path, label in samples:
                    f.write(f"{img_path}\t{label}\n")

            # infer.txt: the image paths only, one per line.
            file_path = dir_path/task/level/"infer.txt"
            file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(file_path, "w") as f:
                for img_path, label in samples:
                    f.write(f"{img_path}\n")

# Train split by char num
위 결과 이어서 train 데이터를 easy, normal에 대해서만 char 개수에 따라 구분

for zero-shot

In [32]:
# Candidate samples: the "easy" and "normal" levels of "aihub_zero_shot".
data = task_level_sample_dict["aihub_zero_shot"]["easy"]+task_level_sample_dict["aihub_zero_shot"]["normal"]
char = list(char_num_report_dict["aihub_zero_shot"].keys())

import random
random.seed(0)  # fixed seed so the shuffled character order is reproducible

random.shuffle(char)
# interval = 100
# char_dict = {f"aihub_train_with_{interval*(i+1)}_char": char[:len(char)-interval*(i+1)] for i in range(int(len(char)/interval))}


# Five nested prefixes of the shuffled characters (100, 200, ..., 500 characters).
# Each key is named after the number of characters that remain once that prefix is taken out.
interval = 100
char_dict = {f"aihub_train_with_{len(char)-interval*(i+1)}_char": char[:interval*(i+1)] for i in range(5)}



for k, v in char_dict.items():
    print(f"{k}: {len(v)}")

# For every prefix keep only the samples whose label contains none of its characters.
data_dict = {}
for key, value in char_dict.items():
    data_dict[key] = get_split_with_char_set(data, value)["unused"]


aihub_train_with_1641_char: 100
aihub_train_with_1541_char: 200
aihub_train_with_1441_char: 300
aihub_train_with_1341_char: 400
aihub_train_with_1241_char: 500


In [31]:
# Split the "aihub_test" samples for every character group in char_dict:
# "seen" = label contains none of the group's characters, "unseen" = label contains at least one.
seen_unseen_dict = {}
for k, v in char_dict.items():
    temp = get_split_with_char_set(labels_dict["aihub_test"], v)
    seen = temp["unused"]
    unseen = temp["used"]
    seen_unseen_dict[k] = dict()
    seen_unseen_dict[k]["seen"] = seen
    seen_unseen_dict[k]["unseen"] = unseen

for k, v in seen_unseen_dict.items():
    print(f"{k}: seen: {len(v['seen'])}, unseen: {len(v['unseen'])}")

from pathlib import Path   

# Files are written as <dir_path>/<group>/<seen|unseen>_label.txt, only after the user types "y".
dir_path = Path("/home/test_dataset/removed_char/")
if input() == "y":
    for k, v in seen_unseen_dict.items():
        for key, value in v.items():
            file_path = dir_path/k/f"{key}_label.txt"
            file_path.parent.mkdir(exist_ok=True, parents=True)
            with open(file_path, "w") as f:
                for img_path, label in value:
                    f.write(f"{img_path}\t{label}\n")

aihub_train_with_1731_char: seen: 63428, unseen: 156
aihub_train_with_1721_char: seen: 62772, unseen: 812
aihub_train_with_1711_char: seen: 62053, unseen: 1531
aihub_train_with_1701_char: seen: 61367, unseen: 2217
aihub_train_with_1691_char: seen: 59718, unseen: 3866


[('aihub_rec/557/556890.png', '여성전용'),
 ('aihub_rec/767/766677.png', '아린이의'),
 ('aihub_rec/213/212530.png', '만화'),
 ('aihub_rec/39/38121.png', '전문학교'),
 ('aihub_rec/543/542375.png', '스킨'),
 ('aihub_rec/513/512210.png', '정수정영어학원'),
 ('aihub_rec/46/45942.png', '박대포'),
 ('aihub_rec/265/264646.png', '연구소'),
 ('aihub_rec/317/316365.png', '카페'),
 ('aihub_rec/808/807791.png', '살림법'),
 ('aihub_rec/588/587807.png', '대청마루'),
 ('aihub_rec/288/287921.png', '비비안'),
 ('aihub_rec/419/418929.png', '떡'),
 ('aihub_rec/756/755656.png', '필로드 수영복'),
 ('aihub_rec/140/139449.png', '나나무스'),
 ('aihub_rec/635/634361.png', '노동부지정'),
 ('aihub_rec/180/179612.png', '가정용'),
 ('aihub_rec/173/172989.png', '바른부동산'),
 ('aihub_rec/582/581867.png', '아로마 건강관리'),
 ('aihub_rec/41/40583.png', '라이스'),
 ('aihub_rec/885/884015.png', '이해'),
 ('aihub_rec/359/358158.png', '비단'),
 ('aihub_rec/402/401005.png', '미소약국'),
 ('aihub_rec/144/143001.png', '부동산'),
 ('aihub_rec/457/456723.png', '한의원'),
 ('aihub_rec/337/336237.png', '전문점'),
 (

In [33]:
from pathlib import Path
# Write every subset of data_dict to "<key>___v3.txt" as "<image path>\t<label>" lines
# (joined with newlines, so the file has no trailing newline).
for key, value in data_dict.items():
    print(key, len(value))
    with open(Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10")/f"{key}___v3.txt", "w") as f: 
        f.write("\n".join([f"{img_path}\t{label}" for img_path, label in value]))
    

aihub_train_with_1641_char 426406
aihub_train_with_1541_char 334936
aihub_train_with_1441_char 277027
aihub_train_with_1341_char 232124
aihub_train_with_1241_char 179000


In [185]:
from pathlib import Path
# Write every subset of data_dict to "<key>___v2.txt" as "<image path>\t<label>" lines
# (joined with newlines, so the file has no trailing newline).
# The file name is built as f"{key}___v2.txt": joining `key` and '___v2.txt' as two
# path components would target a file inside a per-key directory that is never created.
for key, value in data_dict.items():
    print(key, len(value))
    with open(Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10")/f"{key}___v2.txt", "w") as f: 
        f.write("\n".join([f"{img_path}\t{label}" for img_path, label in value]))
    

aihub_train_with_50_char 647
aihub_train_with_100_char 2707
aihub_train_with_150_char 5641
aihub_train_with_200_char 10696
aihub_train_with_250_char 16615
aihub_train_with_300_char 30831
aihub_train_with_350_char 49097
aihub_train_with_400_char 77975
aihub_train_with_450_char 106829
aihub_train_with_500_char 125050
aihub_train_with_550_char 152812
aihub_train_with_600_char 224206
aihub_train_with_650_char 294739
aihub_train_with_700_char 329296
aihub_train_with_750_char 400239


# Shape distinguish
이미지가 가로형인지 세로형인지 대략적으로 구분

In [3]:
from pathlib import Path
from PIL import Image
import multiprocessing
from tqdm import tqdm

def chunk_list(data, num_chunks):
    """Split ``data`` into ``num_chunks`` contiguous, near-equal slices.

    Chunk boundaries are computed with integer arithmetic
    (``len(data) * i // num_chunks``), so exactly ``num_chunks`` chunks are
    produced and floating-point drift can no longer add an extra one.

    Args:
        data: A sliceable sequence.
        num_chunks: Number of chunks to produce; must be a positive integer.

    Returns:
        A list of ``num_chunks`` slices that concatenate back to ``data``.
        Some slices are empty when ``num_chunks > len(data)``. An empty
        ``data`` yields an empty list.

    Raises:
        ValueError: If ``num_chunks`` is not positive.
    """
    if num_chunks <= 0:
        raise ValueError(f"num_chunks must be positive, got {num_chunks}")

    total = len(data)
    if total == 0:
        return []

    bounds = [total * i // num_chunks for i in range(num_chunks + 1)]
    return [data[start:end] for start, end in zip(bounds, bounds[1:])]

def get_shape_from_size(image, label):
    """Classify an image as ``"horizontal"`` or ``"others"`` from its size.

    Args:
        image: Object whose ``size`` attribute unpacks to ``(width, height)``.
        label: Unused; accepted so the signature matches the caller.

    Returns:
        ``"horizontal"`` when the width is at least 1.5 times the height,
        otherwise ``"others"``.
    """
    width, height = image.size
    is_wide = width >= height*1.5
    return "horizontal" if is_wide else "others"

def add_size(work_list, shared_list, data_dir):
    """Classify the shape of every image in ``work_list``.

    Args:
        work_list: Iterable of ``(image_path, label)`` pairs.
        shared_list: List-like object; ``[image_path, label, shape]`` is
            appended to it for every processed image.
        data_dir: Directory that ``image_path`` is joined to with ``/``.
    """
    for image_path, label in tqdm(work_list):
        # The context manager closes the image file as soon as its size has
        # been read, so file handles do not pile up over a long work list.
        with Image.open(data_dir/image_path) as image:
            shape = get_shape_from_size(image, label)
        shared_list.append([image_path, label, shape])

def get_shape_reportf(data_dir, label_path, worker_num = 10):
    """Group the samples of a label file by image shape, using worker processes.

    Args:
        data_dir: Directory the image paths of the label file are relative to
            (passed through to ``add_size``).
        label_path: Tab-separated label file; only lines with exactly two
            fields after stripping are used.
        worker_num: Number of chunks requested from ``chunk_list``; one
            process is started per returned chunk.

    Returns:
        ``{shape: [[image_path, label], ...]}`` with the shapes produced by
        ``add_size``.
    """
    with open(label_path) as f:
        lines = [line.strip().split("\t") for line in f.readlines() if len(line.strip().split("\t")) == 2]

    # Manager-backed list that every worker process appends its results to.
    manager = multiprocessing.Manager()
    shared_list = manager.list()
    
    data_parts = chunk_list(lines, worker_num)
    processes = []
            
    # Start one process per chunk.
    for part in data_parts:
        p = multiprocessing.Process(target=add_size, args=(part, shared_list, data_dir))
        processes.append(p)
        p.start()
        
    # Wait until every worker has finished before reading the results.
    for p in processes:
        p.join()
        
    # NOTE(review): the order of entries in shared_list presumably depends on process scheduling.
    shape_report = {}
    for image_path, label, shape in tqdm(shared_list):
        shape_report.setdefault(shape, []).append([image_path, label])
    
    return shape_report




In [56]:
# Alternative input (aihub), currently disabled:
# data_dir = Path("/home/datasets/aihub_rec/")
# label_path = Path("/home/datasets/aihub_rec/clean_label.txt")

# Classify the KAIST images by shape using 50 worker processes.
data_dir = Path("/home/datasets/KAIST_rec")
label_path = Path("/home/datasets/KAIST_rec/label_only_korean.txt")

shape_report = get_shape_reportf(data_dir, label_path, worker_num = 50)

100%|██████████| 59/59 [00:00<00:00, 3093.03it/s]


100%|██████████| 59/59 [00:00<00:00, 2985.99it/s]
100%|██████████| 59/59 [00:00<00:00, 2277.10it/s]
100%|██████████| 59/59 [00:00<00:00, 2217.24it/s]
100%|██████████| 59/59 [00:00<00:00, 1634.55it/s]
100%|██████████| 59/59 [00:00<00:00, 1291.18it/s]
100%|██████████| 59/59 [00:00<00:00, 1245.70it/s]
100%|██████████| 59/59 [00:00<00:00, 1222.66it/s]
100%|██████████| 59/59 [00:00<00:00, 243.05it/s]
100%|██████████| 60/60 [00:00<00:00, 98.76it/s]
100%|██████████| 59/59 [00:00<00:00, 88.48it/s]
100%|██████████| 59/59 [00:00<00:00, 85.77it/s]
100%|██████████| 59/59 [00:00<00:00, 103.86it/s]
100%|██████████| 59/59 [00:00<00:00, 75.97it/s]
100%|██████████| 59/59 [00:00<00:00, 186.33it/s]
100%|██████████| 59/59 [00:00<00:00, 228.50it/s]
100%|██████████| 59/59 [00:00<00:00, 263.38it/s]
100%|██████████| 59/59 [00:00<00:00, 367.12it/s]
100%|██████████| 59/59 [00:00<00:00, 373.48it/s]
100%|██████████| 59/59 [00:00<00:00, 364.49it/s]
100%|██████████| 60/60 [00:00<00:00, 377.34it/s]
100%|██████████| 

In [57]:
# Print the number of samples per shape.
for k, samples in shape_report.items():
    print(k, len(samples))

others 327
horizontal 2628


In [58]:
# Save only the "horizontal" samples as "<image path>\t<label>" lines.
target_label_path = Path("/home/datasets/KAIST_rec/label_only_korean_horizontal.txt")
with open(target_label_path, "w") as f:
    for img_path, label in shape_report["horizontal"]:
        f.write(f"{img_path}\t{label}\n")
    



# 한글로만 이루어진 샘플 추출

In [38]:
import re

# File paths


input_file_path = "/home/datasets/KAIST_rec/label.txt"
output_file_path = "/home/datasets/KAIST_rec/label_only_korean.txt"

# Hangul pattern (used to check that a label consists of Hangul syllables only)
hangul_pattern = re.compile("^[가-힣]+$")

# Write only the lines whose label is Hangul-only to the output file
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        try:
            # Take the label: the second tab-separated field
            label = line.strip().split('\t')[1]
            # Check that it is made of Hangul only
            if hangul_pattern.match(label):
                outfile.write(line)
        except Exception as e:
            # Lines without a second field end up here; they are reported and skipped.
            print(e)
            print(line)

print(f"한글로만 이루어진 레이블의 샘플이 '{output_file_path}'에 저장되었습니다.")

list index out of range
2\1523.png	

list index out of range
5\4591.png	

list index out of range
5\4592.png	

list index out of range
5\4593.png	

list index out of range
6\5141.png	

한글로만 이루어진 레이블의 샘플이 '/home/datasets/KAIST_rec/label_only_korean.txt'에 저장되었습니다.


# label filtering

In [None]:
from PIL import Image
from pathlib import Path
from collections import Counter
import pandas as pd

def load_label(label_file_path):
    """Read a tab-separated label file into a list of field lists.

    Args:
        label_file_path: Path of the label file.

    Returns:
        One list per line, obtained by stripping the line and splitting it on
        tabs. A line that yields a single field gets an empty string appended,
        which covers samples without a label (e.g. inferred as blank).
    """
    records = []
    with open(label_file_path) as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) == 1:  # no label present
                fields.append("")
            records.append(fields)
    return records
        
def text_check(text):
    """Return whether ``text`` is usable as a transcription.

    Args:
        text: The transcription to check.

    Returns:
        ``False`` when ``text`` equals one of the ``IGNORE_TEXT`` entries or
        contains any ``IGNORE_MASK`` entry as a substring; ``True`` otherwise.
    """
    IGNORE_TEXT = ["(한자)", "((한자))", "(((한자)))", "(일본어)", "((일본어))", "(((일본어)))", "(외국어)","((외국어))","(((외국어)))",  "(영어)", "((영어))", "(((영어)))", "xx", "xxx", "xxxx", "xxxxx", "XX", "XXX", "XXXX", "XXXXX"]
    IGNORE_MASK = ["xx", "xxx", "xxxx", "xxxxx", "XX", "XXX", "XXXX", "XXXXX"]+["ㄱ","ㄴ","ㄷ","ㄹ","ㄺ","ㅁ","ㅂ","ㅅ","ㅆ","ㅇ","ㅈ","ㅊ","ㅋ","ㅌ","ㅍ","ㅎ","ㅏ","ㅑ","ㅓ","ㅕ","ㅗ","ㅛ","ㅜ","ㅠ","ㅡ","ㅣ","ㅐ","ㅒ","ㅔ","ㅖ","ㅘ","ㅙ","ㅚ","ㅝ","ㅞ","ㅟ","ㅢ"]+["!",'"',"#","$","%","&","'","(",")","*","+","-","/","0","1","2","3","4","5","6","7","8","9",":",";","<","=",">","?","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","[","\\","]","^","_","`","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","{","|","}","~","ㄱ","ㄴ","ㄷ","ㄹ","ㄺ","ㅁ","ㅅ","ㅆ","ㅇ","ㅈ","ㅊ","ㅋ","ㅌ","ㅍ","ㅎ"]+[",", ".", "º"]
    # An exact match with a forbidden text rejects the sample outright.
    if text in IGNORE_TEXT:
        return False
    # Otherwise the sample is rejected as soon as it contains any mask.
    return not any(mask in text for mask in IGNORE_MASK)




def filter_label_with_mask(labels):
    """Split samples by whether their label passes ``text_check``.

    Args:
        labels: Iterable of ``(image, label)`` pairs.

    Returns:
        ``(filtered, removed)``: two lists of ``[image, label]`` entries, the
        first with the samples that pass the check and the second with the
        ones that fail it, each in input order.
    """
    kept, dropped = [], []
    for image, label in labels:
        bucket = kept if text_check(label) else dropped
        bucket.append([image, label])
    return kept, dropped

def filter_by_length(labels, max_len):
    """Split samples by label length.

    Args:
        labels: Iterable of ``(image, label)`` pairs.
        max_len: Largest label length that is still kept.

    Returns:
        ``(filtered, removed)``: two lists of ``[image, label]`` entries, the
        first with labels of at most ``max_len`` characters and the second
        with the longer ones, each in input order.
    """
    kept, dropped = [], []
    for image, label in labels:
        bucket = dropped if len(label) > max_len else kept
        bucket.append([image, label])
    return kept, dropped

def get_char_num_report(labels):
    """Count how often each character occurs in the label texts.

    Args:
        labels: Iterable of ``(image, label)`` pairs.

    Returns:
        A dict ``{char: count}`` ordered from the least to the most frequent
        character; characters with equal counts keep first-seen order.
    """
    tally = Counter()
    for _image, text in labels:
        tally.update(list(text))
    ascending = sorted(tally.items(), key=lambda pair: pair[1])
    return dict(ascending)
    

def get_length_report(labels):
    """Build a histogram of label lengths.

    Args:
        labels: Iterable of ``(image, text)`` pairs.

    Returns:
        A dict ``{length: number of labels with that length}`` ordered by
        ascending length.
    """
    histogram = Counter(len(text) for _image, text in labels)
    return {length: histogram[length] for length in sorted(histogram)}

def get_df(image_text_pairs):
    """Build a DataFrame with ``image`` and ``text`` columns from pairs.

    Args:
        image_text_pairs: Iterable of ``(image, text)`` pairs.

    Returns:
        A ``pandas.DataFrame`` with one row per pair.
    """
    rows = [{"image":image, "text":text} for image, text in image_text_pairs]
    return pd.DataFrame(rows)


def make_char_set_file(char_num_report, file_path = "/home/char_set.txt"):
    """Write the characters of a count report to a file, one per line.

    Args:
        char_num_report: Mapping whose keys are the characters to save.
        file_path: Destination file; it is overwritten.

    The characters are written in sorted order and a summary line is printed.
    """
    char_list = sorted(char_num_report.keys())

    with open(file_path, "w") as f:
        f.writelines(f"{c}\n" for c in char_list)
    print(f"{len(char_list)} characters are saved in '{file_path}'")


In [None]:
# Input locations for the label-filtering pipeline.
data_dir = "/home/datasets/aihub_rec"
label_file_path = "/home/datasets/aihub_rec/label.txt"
infer_file_path = "/home/datasets/aihub_rec/clean_infer_result.txt"

#################################################
# Step 1: load all labels.
labels = load_label(label_file_path)
print(f"Totel label num: {len(labels)}")

#################################################
# Step 2: drop the samples whose text fails text_check.
labels, removed_labels = filter_label_with_mask(labels)
print(f"Label num after char_set filtering = {len(labels)}       ... {len(removed_labels)} samples are removed")

#################################################
# Step 3: drop the samples whose label is longer than MAX_LENGTH
# (removed_labels is overwritten and now refers to this step only).
MAX_LENGTH = 20
labels, removed_labels = filter_by_length(labels, MAX_LENGTH)
print(f"Label num after max length filtering = {len(labels)}       ... {len(removed_labels)} samples are removed")

#################################################
# This was meant to separate horizontal from vertical samples, but it is probably not needed
infers = load_label(infer_file_path)
label_df = get_df(labels)
infer_df = get_df(infers)
df = pd.merge(label_df, infer_df, on="image", suffixes=["_label", "_infer"])

#################################################
# Step 4: character-frequency and label-length statistics of the remaining labels.
char_num_report = get_char_num_report(labels)
length_report = get_length_report(labels)

#################################################
# Step 5: save the character set (to make_char_set_file's default path).
make_char_set_file(char_num_report)

In [None]:
from pathlib import Path

def get_sample_seen_unseen_report(labels, seen_char):
    """Split samples by whether every character of their label is known.

    Args:
        labels: Iterable of ``(image, label)`` pairs.
        seen_char: Iterable of known characters. It is materialized into a
            set once, so one-shot iterables work and the set is not rebuilt
            for every sample.

    Returns:
        A dict with two lists of ``[image, label]`` entries: ``"seen"`` holds
        the samples whose label uses only characters of ``seen_char`` (an
        empty label counts as seen); ``"unseen"`` holds the samples with at
        least one other character.
    """
    known_chars = set(seen_char)  # hoisted out of the loop
    seen_unseen_report = {
        "seen": [],
        "unseen": []
    }

    for image, label in labels:
        key = "seen" if known_chars.issuperset(label) else "unseen"
        seen_unseen_report[key].append([image, label])

    return seen_unseen_report



# Training label file and the test label files, keyed by task name.
trainset_path = Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/train_label.txt")

testset_path_dict = {
    "aihub_eval": Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/eval_label.txt"),
    "aihub_test": Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/test_label.txt"),
    "GIST_test": Path("/home/labelsets/GIST_rec_full_test/test_label.txt"),
    "KAIST_test": Path("/home/labelsets/KAIST_rec_full_test/test_label.txt"),
}

# NOTE(review): only the first 10000 training samples are used — confirm this truncation is intended.
trainset = load_label(trainset_path)[:10000]
testset_dict = {task: load_label(file_path) for task, file_path in testset_path_dict.items()}

# Character frequencies of the (truncated) train set and of every test set.
trainset_char_num_report = get_char_num_report(trainset)
testset_char_num_report_dict = {task: get_char_num_report(labels) for task, labels in testset_dict.items()}

# Split every test set into samples made only of train-set characters ("seen") and the rest ("unseen").
trainset_unique_char = set(trainset_char_num_report.keys())
seen_unseen_report_dict = {task: get_sample_seen_unseen_report(testset, trainset_unique_char) for task,  testset in testset_dict.items()}



# Dataset ratio split

In [65]:

import random
from pathlib import Path
random.seed(1000)  # fixed seed so the sampled subsets are reproducible

label_file_path = Path("/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10/train_label.txt")

with open(label_file_path, "r") as f:
    labels = [line.rstrip().split("\t") for line in f.readlines()]


# Write random subsets holding 5%, 10%, ..., 50% of the labels to train_<percent>.txt
# next to the source label file. Each subset is sampled independently from the full list.
for ratio in range(1, 11):
    ratio = 0.05*ratio  # the loop index is rebound to the fraction it stands for
    print(ratio)
    num = int(ratio*len(labels))
    print(num)
    sub_labels = random.sample(labels, num)
    with open(label_file_path.parent/f"train_{int(ratio*100)}.txt", "w") as f:
        for path, label in sub_labels:
            f.write(f"{path}\t{label}\n")




0.05
25433
0.1
50866
0.15000000000000002
76299
0.2
101733
0.25
127166
0.30000000000000004
152599
0.35000000000000003
178033
0.4
203466
0.45
228899
0.5
254333
