In [1]:
import argparse
import sys
import glob
import numpy as np
import io, os
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import collections

### `draw_single_char`
- `글자`, `폰트`, `canvas_size`를 받아서 `canvas_size` x `canvas_size` (이 노트북에서는 128x128) 크기의 흑백 이미지를 출력한다. 해당 폰트에 글자가 없으면 `None`을 반환한다.

In [2]:
def draw_single_char(ch, font, canvas_size):
    """Render one character, centred, on a white square greyscale canvas.

    Args:
        ch: The character (string) to draw.
        font: A loaded PIL font object used to render ``ch``.
        canvas_size: Width and height of the square canvas, in pixels.

    Returns:
        A mode ``'L'`` PIL image of size ``canvas_size x canvas_size`` with the
        glyph drawn in black, or ``None`` when the canvas is still entirely
        white after drawing (treated as "this font has no glyph for ``ch``").
    """
    image = Image.new('L', (canvas_size, canvas_size), color=255)
    drawing = ImageDraw.Draw(image)

    if hasattr(drawing, 'textsize'):
        w, h = drawing.textsize(ch, font=font)
    else:
        # ImageDraw.textsize was removed in Pillow 10. The bounding box is used
        # instead; width is the ink width and height runs from the drawing
        # origin to the bottom of the ink, which is intended to match the
        # legacy measurement -- NOTE(review): confirm centring on Pillow >= 10.
        left, _top, right, bottom = drawing.textbbox((0, 0), ch, font=font)
        w, h = right - left, bottom

    drawing.text(
        ((canvas_size - w) / 2, (canvas_size - h) / 2),
        ch,
        fill=0,
        font=font
    )

    # Return None if the font drew nothing. The check looks at the pixels
    # themselves, so it works for any canvas_size (not only 128).
    if np.all(np.array(image) == 255):
        return None

    return image

### `draw_example`
- `글자`, `원본폰트`, `타겟폰트`, `canvas_size(=128)`를 받아서 가로 256 x 세로 128 이미지(왼쪽: 타겟 폰트, 오른쪽: 원본 폰트)를 출력한다. 타겟 폰트에 글자가 없으면 `None`을 반환한다.

In [3]:
def draw_example(ch, src_font, dst_font, canvas_size):
    """Build a side-by-side training image: target glyph left, source glyph right.

    Args:
        ch: The character (string) to draw.
        src_font: Loaded PIL font used for the right-hand (source) half.
        dst_font: Loaded PIL font used for the left-hand (target) half.
        canvas_size: Side length in pixels of each square half.

    Returns:
        A mode ``'L'`` PIL image ``2 * canvas_size`` wide and ``canvas_size``
        high, or ``None`` when either font produced no glyph for ``ch``.
    """
    dst_img = draw_single_char(ch, dst_font, canvas_size)

    # Return None if the target font has no glyph for this character.
    if dst_img is None:
        return None

    src_img = draw_single_char(ch, src_font, canvas_size)

    # A missing source glyph would otherwise hand None to paste() below.
    if src_img is None:
        return None

    # Create the white greyscale canvas directly instead of building an RGB
    # image and converting it.
    example_img = Image.new('L', (canvas_size * 2, canvas_size), color=255)
    example_img.paste(dst_img, (0, 0))
    example_img.paste(src_img, (canvas_size, 0))
    return example_img

In [4]:
SRC_PATH = './fonts/source/'
TRG_PATH = './fonts/target/'
OUTPUT_PATH = './dataset-11172/'

# glob returns files in arbitrary order; sort so the same source font is picked
# on every run, and fail with a clear message if the folder has no .ttf file.
src_font_candidates = sorted(glob.glob(os.path.join(SRC_PATH, '*.ttf')))
if not src_font_candidates:
    raise FileNotFoundError('no .ttf source font found in ' + SRC_PATH)
src_font = src_font_candidates[0]
print('source font:', src_font)

# Sorted so that positions in this list are stable across runs.
trg_fonts = sorted(glob.glob(os.path.join(TRG_PATH, '*.ttf')))
print('target fonts:', len(trg_fonts), '개')

source font: ./fonts/source/source_font.ttf
target fonts: 46 개


- 타겟 폰트 46개는 데이터로 쓰기에 너무 많아서, 직접 골라 26개만 남겼다.

In [5]:
# 1-based positions, within the sorted target-font list, of the fonts to keep.
target_filter = [1, 2, 8, 10, 11, 13, 14, 16, 19, 21, 23, 26, 27, 29,
                 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44]

# Rebuild the full sorted list here instead of filtering `trg_fonts` in place,
# so re-running this cell does not index into an already-filtered list.
all_trg_fonts = sorted(glob.glob(os.path.join(TRG_PATH, '*.ttf')))
trg_fonts = [all_trg_fonts[i - 1] for i in target_filter]
print(len(trg_fonts))
trg_fonts

26


['./fonts/target/01.ttf',
 './fonts/target/02.ttf',
 './fonts/target/08.ttf',
 './fonts/target/10.ttf',
 './fonts/target/11.ttf',
 './fonts/target/13.ttf',
 './fonts/target/14.ttf',
 './fonts/target/16.ttf',
 './fonts/target/19.ttf',
 './fonts/target/21.ttf',
 './fonts/target/23.ttf',
 './fonts/target/26.ttf',
 './fonts/target/27.ttf',
 './fonts/target/29.ttf',
 './fonts/target/30.ttf',
 './fonts/target/33.ttf',
 './fonts/target/34.ttf',
 './fonts/target/35.ttf',
 './fonts/target/36.ttf',
 './fonts/target/37.ttf',
 './fonts/target/38.ttf',
 './fonts/target/39.ttf',
 './fonts/target/40.ttf',
 './fonts/target/41.ttf',
 './fonts/target/43.ttf',
 './fonts/target/44.ttf']

- 글자는 유니코드 한글 음절(U+AC00–U+D7A3) 전체로, 총 11,172개

In [6]:
# Every code point from 0xAC00 up to (but not including) 0xD7A4, as characters.
charset = list(map(chr, range(0xAC00, 0xD7A4)))
print(len(charset))

11172


- 데이터 생성 TEST

In [7]:
# Font sizes passed to ImageFont.truetype for the source and target fonts.
src_char_size, trg_char_size = 50, 55

- 한 글자 이미지 생성

In [8]:
# Smoke test: render one character with the third selected target font and
# display the result as the cell output.
canvas_size = 128
font = ImageFont.truetype(font=trg_fonts[2], size=trg_char_size)
dst_img = draw_single_char(ch=charset[500], font=font, canvas_size=canvas_size)
dst_img

- 두 글자 (target font, source font) 이미지 생성

In [9]:
# `src_font` starts out as a path string and is rebound to a loaded font here.
# On a re-run it is already a font object, so recover its file path first
# (FreeTypeFont keeps it in `.path` -- NOTE(review): confirm for the installed
# Pillow version) instead of passing the font object back into truetype().
src_font_path = getattr(src_font, 'path', src_font)
src_font = ImageFont.truetype(src_font_path, size=src_char_size)
e = draw_example(charset[4422], src_font, font, canvas_size)
e

In [10]:
count = 0
canvas_size = 128
src_char_size = 50
trg_char_size = 55
OUTPUT_PATH = './hangul-dataset-11172/'

# Image.save() does not create missing directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# `src_font` may be a path string (fresh kernel) or an already-loaded font
# (the earlier test cell was run). Normalise to a loaded font either way so this
# cell does not depend on which cells ran before it.
# NOTE(review): relies on FreeTypeFont exposing its file path as `.path`.
src_font = ImageFont.truetype(getattr(src_font, 'path', src_font), size=src_char_size)

# One integer label per target font; files are named "<label>_<index>.png",
# where <index> counts only the characters actually saved for that font.
for font_label, font_path in enumerate(trg_fonts):
    font = ImageFont.truetype(font_path, size=trg_char_size)
    character_count = 0
    for c in charset:
        e = draw_example(c, src_font, font, canvas_size)
        if e is None:
            # No image was produced for this character; skip it.
            continue
        e.save(os.path.join(OUTPUT_PATH, "%d_%04d.png" % (font_label, character_count)))
        character_count += 1
        count += 1
        if count % 10000 == 0:
            print("processed %d chars" % count)
print("processed %d chars, end" % count)

processed 10000 chars
processed 20000 chars
processed 30000 chars
processed 40000 chars
processed 50000 chars
processed 60000 chars
processed 70000 chars
processed 80000 chars
processed 90000 chars
processed 100000 chars
processed 110000 chars
processed 120000 chars
processed 130000 chars
processed 140000 chars
processed 150000 chars
processed 160000 chars
processed 170000 chars
processed 180000 chars
processed 190000 chars
processed 200000 chars
processed 210000 chars
processed 220000 chars
processed 230000 chars
processed 237831 chars, end


wow 237831 data!

### `package.py`로 obj 파일 생성

In [6]:
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import

import argparse
import glob
import os
import pickle as pickle
import random


def pickle_examples(from_dir, train_path, val_path, train_val_split=0.2, seed=None):
    """Pack every PNG in ``from_dir`` into two pickle streams (train / val).

    Each image becomes one ``(label, img_bytes)`` tuple dumped with
    ``pickle.dump``; ``label`` is the integer before the first ``_`` in the
    file name and ``img_bytes`` is the raw file content.

    Args:
        from_dir: Directory containing ``<label>_<index>.png`` files.
        train_path: Output file for the training examples.
        val_path: Output file for the validation examples.
        train_val_split: Probability that an example goes to the validation
            file; each example is assigned independently.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the global ``random`` module is used.

    Raises:
        ValueError: If a file name does not start with an integer label.
    """
    # glob order is filesystem-dependent; sort so a given seed always yields
    # the same split and the same record order.
    paths = sorted(glob.glob(os.path.join(from_dir, "*.png")))
    rng = random.Random(seed) if seed is not None else random

    # open(..., 'wb') does not create missing parent directories.
    for out_path in (train_path, val_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    with open(train_path, 'wb') as ft, open(val_path, 'wb') as fv:
        print('all data num:', len(paths))
        val_count = 0
        train_count = 0
        for p in paths:
            label = int(os.path.basename(p).split("_")[0])
            with open(p, 'rb') as f:
                example = (label, f.read())
            if rng.random() < train_val_split:
                pickle.dump(example, fv)
                val_count += 1
                if val_count % 10000 == 0:
                    print("%d imgs saved in val.obj" % val_count)
            else:
                pickle.dump(example, ft)
                train_count += 1
                if train_count % 10000 == 0:
                    print("%d imgs saved in train.obj" % train_count)
        print("%d imgs saved in val.obj, end" % val_count)
        print("%d imgs saved in train.obj, end" % train_count)

In [7]:
from_dir = './hangul-dataset-11172/'
save_dir = '../dataset/'

# The output files are opened for writing; make sure their directory exists.
os.makedirs(save_dir, exist_ok=True)

train_path = os.path.join(save_dir, "train.obj")
val_path = os.path.join(save_dir, "val.obj")

pickle_examples(from_dir, train_path=train_path, val_path=val_path)

all data num: 237831
10000 imgs saved in train.obj
20000 imgs saved in train.obj
30000 imgs saved in train.obj
10000 imgs saved in val.obj
40000 imgs saved in train.obj
50000 imgs saved in train.obj
60000 imgs saved in train.obj
70000 imgs saved in train.obj
20000 imgs saved in val.obj
80000 imgs saved in train.obj
90000 imgs saved in train.obj
100000 imgs saved in train.obj
110000 imgs saved in train.obj
30000 imgs saved in val.obj
120000 imgs saved in train.obj
130000 imgs saved in train.obj
140000 imgs saved in train.obj
150000 imgs saved in train.obj
40000 imgs saved in val.obj
160000 imgs saved in train.obj
170000 imgs saved in train.obj
180000 imgs saved in train.obj
190000 imgs saved in train.obj
47426 imgs saved in val.obj, end
190405 imgs saved in train.obj, end
