# Split (EASY, HARD)

In [2]:
# Label files per dataset split; each split maps to a list of label-file paths.
_LABELSET_DIR = "/home/labelsets/aihub_rec_full_horizontal_clean_80:10:10"
label_files = {
    split: [f"{_LABELSET_DIR}/{split}_label.txt"]
    for split in ("train", "eval", "test")
}

In [3]:
def load_label(label_files):
    """Load tab-separated label rows from one or more label files.

    Args:
        label_files: Iterable of paths to text files. Every line is stripped of
            trailing whitespace and split on tab characters.

    Returns:
        One list containing the rows of all files, in file order. Each row is
        the list of tab-separated fields of one line.
    """
    labels = []
    for file in label_files:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file, "r", encoding="utf-8") as f:
            # Accumulate across files; plain assignment here would keep only
            # the rows of the last file.
            labels.extend(line.rstrip().split("\t") for line in f)
    return labels


# Mapping of split name -> label rows loaded from that split's files.
labels_dict = {split: load_label(paths) for split, paths in label_files.items()}

In [4]:

import itertools


def get_char_set_from_labels(labels):
    """Return the set of distinct characters used across all label texts.

    Args:
        labels: Iterable of rows whose element at index 1 is the label text.

    Returns:
        A set with every character that appears in at least one label text.
    """
    return {ch for row in labels for ch in row[1]}

# Distinct characters found in each split's labels.
char_set_dict = {split: get_char_set_from_labels(rows) for split, rows in labels_dict.items()}

# Size of each split's character set.
for task, char_set in char_set_dict.items():
    print(f"{task}: {len(char_set)}")

# For every pair of splits, print the overlap and both one-sided differences.
task_combinations = list(itertools.combinations(char_set_dict, 2))
for task1, task2 in task_combinations:
    print(
        f"{task1} & {task2} = {len(char_set_dict[task1] & char_set_dict[task2])}"
        f" | {task1} - {task2} = {len(char_set_dict[task1] - char_set_dict[task2])}"
        f" | {task2} - {task1} = {len(char_set_dict[task2] - char_set_dict[task1])}"
    )

train: 1741
eval: 1353
test: 1357
train & eval = 1328 | train - eval = 413 | eval - train = 25
train & test = 1335 | train - test = 406 | test - train = 22
eval & test = 1221 | eval - test = 132 | test - eval = 136


In [5]:
def char_num_report(labels):
    """Count how often each character occurs in the label texts.

    Args:
        labels: Iterable of rows whose element at index 1 is the label text.

    Returns:
        A dict mapping character -> occurrence count, ordered from the most
        frequent to the least frequent character. Characters with equal counts
        keep their first-seen order.
    """
    counts = {}
    for row in labels:
        for ch in row[1]:
            counts[ch] = counts.get(ch, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


# Per-split character frequency tables (split name -> {char: count}).
char_num_report_dict = {split: char_num_report(rows) for split, rows in labels_dict.items()}

In [6]:
def easy_hard(sample, many_threshold=1500, medium_threshold=100):
    """Classify a character into a frequency bucket by its occurrence count.

    Args:
        sample: A ``(char, num)`` pair; only ``num`` (the count) is inspected.
        many_threshold: Minimum count for the "many" bucket. Defaults to 1500,
            the value that used to be hard-coded.
        medium_threshold: Minimum count for the "medium" bucket. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        "many" if ``num >= many_threshold``, otherwise "medium" if
        ``num >= medium_threshold``, otherwise "few".
    """
    _, num = sample
    if num >= many_threshold:
        return "many"
    if num >= medium_threshold:
        return "medium"
    return "few"

def get_easy_hard_report_dict(char_num_report_dict):
    """Group each split's characters by the bucket easy_hard() assigns them.

    Args:
        char_num_report_dict: Mapping of split name -> {char: count}.

    Returns:
        Mapping of split name -> {bucket name: [chars]}. A bucket key is only
        present when at least one character of that split falls into it.
    """
    report_by_split = {}
    for split, counts in char_num_report_dict.items():
        buckets = {}
        for char, num in counts.items():
            bucket = easy_hard((char, num))
            if bucket not in buckets:
                buckets[bucket] = []
            buckets[bucket].append(char)
        report_by_split[split] = buckets
    return report_by_split

many_few_char_set_dict = get_easy_hard_report_dict(char_num_report_dict)

# Print how many characters fall into each frequency bucket, per split.
# The inner loop variable is deliberately NOT named `easy_hard`: at module
# level that would rebind the easy_hard() function to a string and make any
# later call to get_easy_hard_report_dict() fail.
for task, report in many_few_char_set_dict.items():
    for level, samples in report.items():
        print(f"({task}, {level}): {len(samples)}")
    print()

# Apart from train, the many/medium/few buckets are probably not meaningful.


(train, many): 257
(train, medium): 529
(train, few): 955

(eval, many): 37
(eval, medium): 329
(eval, few): 987

(test, many): 35
(test, medium): 338
(test, few): 984



In [20]:
def get_split_with_char_set(labels, char_set):
    """Partition samples by whether their label uses any character of char_set.

    Args:
        labels: Iterable of ``(img_path, label)`` pairs.
        char_set: Iterable of characters to look for (set, list, string, ...).

    Returns:
        A dict with two lists of ``(img_path, label)`` tuples, in input order:
        "used" holds samples whose label contains at least one character of
        ``char_set``; "unused" holds all other samples.
    """
    # Build the lookup set once instead of once per sample; this also keeps
    # the result correct when char_set is a one-shot iterator.
    chars = set(char_set)
    report = {"used": [], "unused": []}

    for img_path, label in labels:
        key = "unused" if chars.isdisjoint(label) else "used"
        report[key].append((img_path, label))
    return report


def get_task_level_sample_dict(many_few_char_set_dict, char_set_dict, labels_dict, criterion="train"):
    """Split every non-criterion task's samples into difficulty levels.

    Levels are assigned in a cascade; each sample lands in the first level
    whose condition it meets:

    1. "unseen": label contains a character that is absent from the criterion
       task's character set.
    2. "hard":   label contains a character from the criterion's "few" bucket.
    3. "normal": label contains a character from the criterion's "medium" bucket.
    4. "easy":   everything that is left.

    Args:
        many_few_char_set_dict: Mapping of task -> {bucket name: [chars]}.
        char_set_dict: Mapping of task -> set of characters used by that task.
        labels_dict: Mapping of task -> iterable of ``(img_path, label)`` pairs.
        criterion: Task whose character statistics define the levels.

    Returns:
        Mapping of task -> {"unseen", "hard", "normal", "easy"} -> list of
        ``(img_path, label)`` tuples. The criterion task itself is skipped.
    """
    criterion_buckets = many_few_char_set_dict[criterion]
    # A bucket key is missing when no character fell into it; treat that as an
    # empty bucket instead of failing with KeyError.
    medium = criterion_buckets.get("medium", [])
    few = criterion_buckets.get("few", [])

    report = dict()
    for task, char_set in char_set_dict.items():
        if task == criterion:
            continue

        unseen = char_set - char_set_dict[criterion]

        # Each stage only re-examines the samples the previous stage rejected.
        by_unseen = get_split_with_char_set(labels_dict[task], unseen)
        by_few = get_split_with_char_set(by_unseen["unused"], few)
        by_medium = get_split_with_char_set(by_few["unused"], medium)

        report[task] = {
            "unseen": by_unseen["used"],
            "hard": by_few["used"],
            "normal": by_medium["used"],
            "easy": by_medium["unused"],
        }

    return report

# Samples grouped per task and, within each task, per difficulty level
task_level_sample_dict = get_task_level_sample_dict(many_few_char_set_dict, char_set_dict, labels_dict)

In [21]:
# Print the number of samples per (task, level).
# The inner loop variable is named `level` rather than `easy_hard` so that the
# module-level easy_hard() function is not overwritten by a string.
for task, report in task_level_sample_dict.items():
    for level, samples in report.items():
        print(f"({task}, {level}): {len(samples)}")
    print()

(eval, unseen): 27
(eval, hard): 2139
(eval, normal): 24586
(eval, easy): 36831

(test, unseen): 22
(test, hard): 2211
(test, normal): 24278
(test, easy): 37073



In [22]:

        
from pathlib import Path

dir_path = Path("/home/test_dataset")
dir_path.mkdir(exist_ok=True, parents=True)

# For every task/level pair write two files under <dir_path>/<task>/<level>/:
#   label.txt - one "<img_path>\t<label>" line per sample
#   infer.txt - one "<img_path>" line per sample
for task, level_sample_dict in task_level_sample_dict.items():
    for level, samples in level_sample_dict.items():
        level_dir = dir_path / task / level
        level_dir.mkdir(exist_ok=True, parents=True)

        # Explicit UTF-8: labels may contain non-ASCII characters, so the
        # output must not depend on the platform's default encoding.
        with open(level_dir / "label.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{img_path}\t{label}\n" for img_path, label in samples)

        with open(level_dir / "infer.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{img_path}\n" for img_path, _ in samples)

In [37]:
# NOTE(review): eval_labels, train_medium and train_few are not defined in any
# visible cell (presumably leftovers from removed cells). They are rebuilt here
# from the dictionaries computed earlier - confirm this matches the intent.
eval_labels = labels_dict["eval"]
train_medium = many_few_char_set_dict["train"]["medium"]
train_few = many_few_char_set_dict["train"]["few"]

# Split eval samples by whether they contain any medium- or few-frequency
# train character, then print the size of each side.
eval_report = get_split_with_char_set(eval_labels, train_medium + train_few)
for k, v in eval_report.items():
    print(k, len(v))

used 26736
unused 36847


In [None]:
# Bare expression: shows the full used/unused eval split as the cell output.
eval_report