# label filtering

In [80]:
from PIL import Image
from pathlib import Path
from collections import Counter
import pandas as pd

def load_label(label_file_path):
    """Read a tab-separated label file into a list of field lists.

    Each line is stripped of surrounding whitespace and split on tabs.
    A line that yields a single field (an image path with no label, or a
    blank line) gets an empty string appended so that it has two fields.

    Args:
        label_file_path: Path of the label file to read.

    Returns:
        A list with one list of string fields per line of the file.
    """
    # The labels are Korean text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default locale encoding.
    with open(label_file_path, encoding="utf-8") as f:
        lines = [line.strip().split("\t") for line in f]
    for line in lines:
        if len(line) == 1:  # no label present (e.g. inferred as blank)
            line.append("")
    return lines
        
def text_check(text):
    """Return True when `text` is a usable transcription, False otherwise.

    A text is rejected when it exactly equals one of the placeholder
    transcriptions, or when it contains any of the banned fragments
    (mask markers, standalone Hangul jamo, ASCII letters, digits and the
    listed punctuation). An empty string contains none of them and is
    therefore accepted.
    """
    # Placeholder transcriptions that disqualify a sample on an exact match.
    banned_texts = ["(한자)", "((한자))", "(((한자)))", "(일본어)", "((일본어))", "(((일본어)))", "(외국어)","((외국어))","(((외국어)))",  "(영어)", "((영어))", "(((영어)))", "xx", "xxx", "xxxx", "xxxxx", "XX", "XXX", "XXXX", "XXXXX"]
    # Fragments that disqualify a sample when they occur anywhere in the text.
    banned_fragments = ["xx", "xxx", "xxxx", "xxxxx", "XX", "XXX", "XXXX", "XXXXX"]+["ㄱ","ㄴ","ㄷ","ㄹ","ㄺ","ㅁ","ㅂ","ㅅ","ㅆ","ㅇ","ㅈ","ㅊ","ㅋ","ㅌ","ㅍ","ㅎ","ㅏ","ㅑ","ㅓ","ㅕ","ㅗ","ㅛ","ㅜ","ㅠ","ㅡ","ㅣ","ㅐ","ㅒ","ㅔ","ㅖ","ㅘ","ㅙ","ㅚ","ㅝ","ㅞ","ㅟ","ㅢ"]+["!",'"',"#","$","%","&","'","(",")","*","+","-","/","0","1","2","3","4","5","6","7","8","9",":",";","<","=",">","?","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","[","\\","]","^","_","`","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","{","|","}","~","ㄱ","ㄴ","ㄷ","ㄹ","ㄺ","ㅁ","ㅅ","ㅆ","ㅇ","ㅈ","ㅊ","ㅋ","ㅌ","ㅍ","ㅎ"]+[",", ".", "º"]

    # Exact match against a placeholder: reject immediately.
    if text in banned_texts:
        return False
    # Otherwise the text is usable only if no banned fragment occurs in it.
    return not any(fragment in text for fragment in banned_fragments)




def filter_label_with_mask(labels):
    """Split (image, label) pairs by whether the label passes `text_check`.

    Args:
        labels: Iterable of two-element (image, label) entries.

    Returns:
        A tuple `(filtered, removed)` of lists of `[image, label]` pairs:
        the entries whose label passed the check, and those that did not.
    """
    kept, dropped = [], []
    for image, label in labels:
        # Route each pair into exactly one of the two buckets.
        bucket = kept if text_check(label) else dropped
        bucket.append([image, label])
    return kept, dropped

def filter_by_length(labels, max_len):
    """Split (image, label) pairs by label length.

    Args:
        labels: Iterable of two-element (image, label) entries.
        max_len: Largest label length (inclusive) that is kept.

    Returns:
        A tuple `(filtered, removed)` of lists of `[image, label]` pairs:
        labels no longer than `max_len`, and labels longer than it.
    """
    kept, dropped = [], []
    for image, label in labels:
        # Labels of exactly max_len characters are still kept.
        bucket = dropped if len(label) > max_len else kept
        bucket.append([image, label])
    return kept, dropped

def get_char_num_report(labels):
    """Count how often each character occurs across all labels.

    Args:
        labels: Iterable of two-element (image, label) entries.

    Returns:
        A dict mapping character -> occurrence count, ordered by ascending
        count (ties keep the order in which characters were first seen).
    """
    tally = Counter(ch for _, label in labels for ch in label)
    # sorted() is stable, so equal counts stay in first-seen order.
    by_count = sorted(tally.items(), key=lambda pair: pair[1])
    return dict(by_count)
    

def get_length_report(labels):
    """Count how many labels there are of each length.

    Args:
        labels: Iterable of two-element (image, text) entries.

    Returns:
        A dict mapping label length -> number of labels of that length,
        ordered by ascending length.
    """
    tally = Counter(len(text) for _, text in labels)
    # Keys are unique, so sorting the items orders them by length.
    return dict(sorted(tally.items()))

def get_df(image_text_pairs):
    """Build a DataFrame with `image` and `text` columns from pairs.

    Args:
        image_text_pairs: Iterable of two-element (image, text) entries.

    Returns:
        A pandas DataFrame with one row per pair.
    """
    records = [
        {"image": image, "text": text}
        for image, text in image_text_pairs
    ]
    return pd.DataFrame(records)


def make_char_set_file(char_num_report, file_path="/home/char_set.txt"):
    """Write the character set of a report to a file, one character per line.

    Args:
        char_num_report: Mapping whose keys are the characters to save
            (e.g. the result of `get_char_num_report`).
        file_path: Destination file; it is overwritten if it exists.

    Prints the number of characters written and the destination path.
    """
    char_list = sorted(char_num_report)

    # Write explicitly as UTF-8 so the (Korean) characters are stored the
    # same way regardless of the platform's default locale encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(f"{c}\n" for c in char_list)
    print(f"{len(char_list)} characters are saved in '{file_path}'")

# Input locations. data_dir is assigned here but not referenced again in this cell.
data_dir = "/home/datasets/aihub_rec"
label_file_path = "/home/datasets/aihub_rec/label.txt"
infer_file_path = "/home/datasets/aihub_rec/clean_infer_result.txt"

#################################################
# Step 1: load every label entry from the label file.
labels = load_label(label_file_path)
print(f"Totel label num: {len(labels)}")

#################################################
# Step 2: keep only the labels accepted by text_check.
labels, removed_labels = filter_label_with_mask(labels)
print(f"Label num after char_set filtering = {len(labels)}       ... {len(removed_labels)} samples are removed")

#################################################
# Step 3: drop labels longer than MAX_LENGTH characters.
# removed_labels is overwritten here, so the entries removed in step 2 are no longer held.
MAX_LENGTH = 20
labels, removed_labels = filter_by_length(labels, MAX_LENGTH)
print(f"Label num after max length filtering = {len(labels)}       ... {len(removed_labels)} samples are removed")

#################################################
# The intent was to distinguish horizontal from vertical text, but this is probably unnecessary.
# Join the filtered labels with the inference results on the image path;
# the two text columns get the suffixes "_label" and "_infer".
infers = load_label(infer_file_path)
label_df = get_df(labels)
infer_df = get_df(infers)
df = pd.merge(label_df, infer_df, on="image", suffixes=["_label", "_infer"])

#################################################
# Step 4: per-character and per-length statistics of the filtered labels.
char_num_report = get_char_num_report(labels)
length_report = get_length_report(labels)

#################################################
# Step 5: save the character set to make_char_set_file's default path.
make_char_set_file(char_num_report)



Totel label num: 884068
Label num after char_set filtering = 882993       ... 1075 samples are removed
Label num after max length filtering = 882981       ... 12 samples are removed


In [18]:
from pathlib import Path
from PIL import Image
import multiprocessing
from tqdm import tqdm

def chunk_list(data, num_chunks):
    """Split `data` into `num_chunks` contiguous slices of near-equal size.

    Chunk boundaries are computed with exact integer arithmetic
    (`i * len(data) // num_chunks`). Accumulating a float step instead can
    drift just below `len(data)` and yield an extra, tiny chunk.

    Args:
        data: A sliceable sequence.
        num_chunks: Number of chunks to produce; must be a positive integer.

    Returns:
        A list of `num_chunks` slices whose concatenation equals `data` and
        whose sizes differ by at most one. When `data` has fewer items than
        `num_chunks`, some slices are empty. Empty `data` yields `[]`.

    Raises:
        ValueError: If `num_chunks` is not positive.
    """
    if num_chunks <= 0:
        raise ValueError("num_chunks must be a positive integer")

    total = len(data)
    if total == 0:
        return []

    # bounds[i] is the start index of chunk i; bounds[-1] == total.
    bounds = [i * total // num_chunks for i in range(num_chunks + 1)]
    return [data[start:stop] for start, stop in zip(bounds, bounds[1:])]

def get_shape_from_size(image, label):
    """Classify an image by aspect ratio.

    Args:
        image: Object exposing a `size` attribute of `(width, height)`.
        label: Accepted but not used by this function.

    Returns:
        "horizontal" when the width is at least 1.5 times the height,
        otherwise "others".
    """
    width, height = image.size
    return "horizontal" if width >= height * 1.5 else "others"

def add_size(work_list, shared_list, data_dir):
    """Classify each image of a work list and append the result to a shared list.

    Args:
        work_list: Iterable of (image_path, label) entries to process.
        shared_list: List-like object that receives one
            `[image_path, label, shape]` entry per processed image.
        data_dir: Directory the image paths are relative to; it must
            support the `/` operator (e.g. a `pathlib.Path`).
    """
    for image_path, label in tqdm(work_list):
        # Only the image size is needed, so close the file right after
        # reading it instead of leaving one open handle per image.
        with Image.open(data_dir / image_path) as image:
            shape = get_shape_from_size(image, label)
        shared_list.append([image_path, label, shape])

def get_shape_reportf(data_dir, label_path, worker_num = 10):
    """Group the labelled images of a label file by shape, using worker processes.

    Lines of the label file are stripped and split on tabs; only lines with
    exactly two fields (image path, label) are used. The entries are split
    with `chunk_list` and every chunk is handled by its own process running
    `add_size`.

    Args:
        data_dir: Directory the image paths are relative to.
        label_path: Path of the tab-separated label file.
        worker_num: Number of chunks requested from `chunk_list`
            (one process is started per returned chunk).

    Returns:
        A dict mapping each shape name to a list of `[image_path, label]`
        pairs, in the order the workers appended them.
    """
    # Decode explicitly as UTF-8 (the labels are Korean text) and split
    # every line only once.
    lines = []
    with open(label_path, encoding="utf-8") as f:
        for raw_line in f:
            fields = raw_line.strip().split("\t")
            if len(fields) == 2:
                lines.append(fields)

    # The context manager shuts the manager process down once the results
    # have been copied out.
    with multiprocessing.Manager() as manager:
        shared_list = manager.list()

        processes = []
        for part in chunk_list(lines, worker_num):
            p = multiprocessing.Process(target=add_size, args=(part, shared_list, data_dir))
            processes.append(p)
            p.start()

        for p in processes:
            p.join()

        # Copy everything out of the proxy with a single slice request;
        # iterating the proxy would fetch the items one at a time.
        results = shared_list[:]

    shape_report = {}
    for image_path, label, shape in tqdm(results):
        shape_report.setdefault(shape, []).append([image_path, label])

    return shape_report


# Build the shape report for the cleaned label file with 50 requested chunks.
# data_dir is a Path because add_size joins it with image paths using "/".
data_dir = Path("/home/datasets/aihub_rec/")
label_path = Path("/home/datasets/aihub_rec/clean_label.txt")
shape_report = get_shape_reportf(data_dir, label_path, worker_num = 50)

100%|██████████| 1/1 [00:00<00:00,  2.77it/s]13it/s]] 
100%|██████████| 17660/17660 [00:27<00:00, 636.92it/s]
100%|██████████| 17660/17660 [00:27<00:00, 637.51it/s]
100%|██████████| 17660/17660 [00:27<00:00, 632.51it/s]
100%|██████████| 17659/17659 [00:28<00:00, 630.46it/s]
100%|██████████| 17660/17660 [00:28<00:00, 628.37it/s]
100%|██████████| 17659/17659 [00:28<00:00, 623.86it/s]
100%|██████████| 17660/17660 [00:28<00:00, 622.71it/s]
100%|██████████| 17659/17659 [00:28<00:00, 626.12it/s]
100%|██████████| 17660/17660 [00:28<00:00, 621.55it/s]
100%|██████████| 17660/17660 [00:28<00:00, 625.12it/s]
100%|██████████| 17660/17660 [00:28<00:00, 619.95it/s]
100%|██████████| 17660/17660 [00:28<00:00, 616.16it/s]
100%|██████████| 17660/17660 [00:28<00:00, 618.10it/s]
100%|██████████| 17660/17660 [00:28<00:00, 617.88it/s]
100%|██████████| 17660/17660 [00:28<00:00, 616.86it/s]
100%|██████████| 17660/17660 [00:26<00:00, 670.15it/s]]
100%|██████████| 17660/17660 [00:28<00:00, 615.36it/s]
100%|████

In [17]:
# Print the number of samples in each shape bucket.
for k, samples in shape_report.items():
    print(k, len(samples))

horizontal 515954
others 367038


In [20]:
# Print the number of samples in each shape bucket.
for k, samples in shape_report.items():
    print(k, len(samples))

horizontal 635833
others 247159


In [21]:

# for image_path, label in shape_report["vertical"]:
#     if len(label) < 4:
#         continue
#     Image.open(data_dir/image_path).resize((100, 50)).show()
#     if "s" == input():
#         break


In [23]:
# Show the contents of shape_report as the cell's output.
shape_report

{'horizontal': [['1/1.png', '마포탑안과의원'],
  ['1/2.png', '마포정대포'],
  ['1/3.png', '삼촌'],
  ['1/4.png', '조카'],
  ['1/6.png', '두피 탈모'],
  ['1/7.png', '동종합상사'],
  ['1/8.png', '찬솔약국'],
  ['1/9.png', '삼나무'],
  ['1/10.png', '양곡소매업'],
  ['18/17671.png', '수정옥돌'],
  ['18/17672.png', '소금구이'],
  ['18/17673.png', '홍대살롱'],
  ['18/17675.png', '스킨'],
  ['18/17676.png', '육회'],
  ['18/17677.png', '뒷고기'],
  ['18/17678.png', '두꺼비숙성횟집'],
  ['36/35332.png', '수미'],
  ['36/35333.png', '분식'],
  ['36/35334.png', '월드부동산'],
  ['36/35335.png', '사랑반찬'],
  ['36/35336.png', '기백이네'],
  ['36/35337.png', '삼성태권도'],
  ['1/11.png', '허가'],
  ['1/12.png', '세도국제'],
  ['1/13.png', '여행사'],
  ['1/14.png', '다미솔'],
  ['1/15.png', '돈떵이'],
  ['1/16.png', '구이'],
  ['1/17.png', '재영식품'],
  ['1/18.png', '김치'],
  ['1/19.png', '밑반찬'],
  ['1/20.png', '디자인 쌤'],
  ['1/21.png', '헤어'],
  ['1/22.png', '나쁜여자'],
  ['1/23.png', '준이반점'],
  ['1/24.png', '은희슈퍼'],
  ['1/25.png', '두루치기'],
  ['1/26.png', '용담어린이집'],
  ['1/27.png', '사회복지법인'],
  ['1/29.png', 

In [25]:
# Write one tab-separated label file per shape bucket: /home/<shape>_label.txt.
# NOTE(review): mode "a" appends, so re-running this cell adds the same
# entries to the files again — confirm that appending is intended.
for k, samples in shape_report.items():
    with open(f"/home/{k}_label.txt", "a") as f: 
        for image, label in samples:
            f.write(f"{image}\t{label}\n")