In [6]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont

In [7]:
def get_existing_chars(filename):
    """Return the characters that the font file at ``filename`` maps to glyphs.

    Every subtable of the font's ``cmap`` table is scanned and the union of
    their code points is returned, so a character counts as present if any
    subtable lists it.

    Args:
        filename: Path of a font file that ``TTFont`` can open.

    Returns:
        list: The distinct characters found, in no particular order.
    """
    ttfont = TTFont(filename)
    try:
        codepoints = set()
        for table in ttfont['cmap'].tables:
            codepoints.update(table.cmap.keys())
    finally:
        # A TTFont may keep the underlying file open until it is closed;
        # release it explicitly so a batch over many fonts does not
        # accumulate open handles.
        ttfont.close()
    return [chr(codepoint) for codepoint in codepoints]

def get_image(font, char, size=128):
    """Render ``char`` with ``font`` as a tightly cropped, square image.

    The text is drawn on a white 1000x1000 grayscale canvas at offset
    (200, 200), cropped to the bounding box of its ink, padded with white on
    the shorter axis until the crop is square, and finally resized to
    ``size`` x ``size`` with bilinear resampling.

    Args:
        font: Font object passed through to ``ImageDraw.text``.
        char: Text to draw (the notebook passes a single character).
        size: Edge length, in pixels, of the returned image.

    Returns:
        The resized PIL image.

    Raises:
        ValueError: If drawing ``char`` leaves no ink on the canvas.
    """
    canvas = Image.new('L', (1000, 1000), 255)
    ImageDraw.Draw(canvas).text((200, 200), char, font=font)

    # Invert so that ink is non-zero and the white background is zero.
    ink = 255 - np.array(canvas)
    ink_cols = ink.sum(0).nonzero()[0]
    ink_rows = ink.sum(1).nonzero()[0]
    if ink_cols.size == 0:
        raise ValueError("rendering %r produced no visible pixels" % (char,))

    wmin, wmax = ink_cols.min(), ink_cols.max()
    hmin, hmax = ink_rows.min(), ink_rows.max()

    # Crop to the ink's bounding box and flip back to black-on-white.
    glyph = 255 - ink[hmin:hmax + 1, wmin:wmax + 1]

    # Pad the shorter axis up to the longer one. The padding is split as
    # evenly as possible; when the difference is odd the extra pixel goes
    # after the glyph so the result is exactly square and the resize below
    # does not stretch it.
    height, width = glyph.shape
    diff = abs(width - height)
    before, after = diff // 2, diff - diff // 2
    if width >= height:
        pad_width = ((before, after), (0, 0))
    else:
        pad_width = ((0, 0), (before, after))
    glyph = np.pad(glyph, pad_width, 'constant', constant_values=255)

    return Image.fromarray(glyph).resize((size, size), resample=Image.BILINEAR)


In [8]:
# Precomposed Hangul syllables form one contiguous Unicode range.
def get_all_korean():
    """Return every precomposed Hangul syllable, in code point order.

    The syllables occupy the contiguous range U+AC00 ('가') through
    U+D7A3 ('힣'), so they are enumerated directly by code point rather than
    by stepping through raw UTF-8 byte values and discarding the invalid
    sequences in between.

    Returns:
        list: 11172 one-character strings, starting with '가' and ending
        with '힣'.
    """
    first_syllable = 0xAC00  # '가'
    last_syllable = 0xD7A3   # '힣'
    return [chr(codepoint) for codepoint in range(first_syllable, last_syllable + 1)]


In [12]:
# Character sets: Hangul syllables, Latin letters, Hangul jamo, digits/symbols.
ak = get_all_korean()
eng = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
kl = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅑㅓㅕㅗㅛㅜㅠㅡㅣㅐㅒㅔㅖㅘㅙㅚㅝㅞㅟㅢ"
sm = "0123456789!@#$%^&*()-_=+,.[]<>{}?/~♡♥"
all_letters = ak + [*eng] + [*kl] + [*sm]

# Directory holding the input fonts (relative to the notebook).
fd = "../data/raw/ttfs/"
# (font path, font name without extension) pairs.
# - Match on the actual file suffix rather than a substring, so names such as
#   "x.ttf.bak" are not picked up, and strip only the final extension.
# - Sort the listing: os.listdir returns entries in arbitrary order, and a
#   stable order makes progress comparable between runs.
ttfs = [
    (os.path.join(fd, f), os.path.splitext(f)[0])
    for f in sorted(os.listdir(fd))
    if f.endswith(".ttf")
]
# Directory that receives the rendered PNG files.
png_fd = "../data/raw/pngs/"

In [15]:
# Persist the index -> character table; the PNG file names written below
# embed a character's row index in this table.
chardf = pd.DataFrame(ak)
chardf.to_parquet("../data/raw/all_korean.parquet")

pbar = tqdm(range(len(ttfs)))
# Number of (font, character) pairs that produced no image, either because
# the font lacks the character or because rendering/saving failed.
c = 0
for i in pbar:
    filename, font_name = ttfs[i]
    try:
        font = ImageFont.truetype(filename, 255)
        # A set makes the per-character membership test below O(1).
        existing_chars = set(get_existing_chars(filename))
    except Exception as exc:
        # Best effort: report an unreadable font and move on to the next one.
        # Only Exception is caught so that interrupting the kernel still
        # stops the run.
        tqdm.write("skipping font %s: %r" % (filename, exc))
    else:
        for j, char in enumerate(ak):
            png_path = png_fd + '%s__%s.png' % (font_name, str(j))
            if os.path.exists(png_path):
                # Already present on disk: leave it, so the cell can be
                # re-run to resume an interrupted pass.
                continue
            if char not in existing_chars:
                c += 1
                continue
            try:
                img = get_image(font, char)
                with open(png_path, "wb") as f:
                    img.save(f, "PNG")
            except Exception:
                # A glyph that cannot be rendered or saved is counted, not fatal.
                c += 1
    pbar.set_postfix(passed=str(c), files=str(len(os.listdir(png_fd))))

 38%|███▊      | 143/377 [28:36<1:07:05, 17.20s/it, files=281420, passed=1293832]1 extra bytes in post.stringData array
 66%|██████▋   | 250/377 [54:27<25:03, 11.84s/it, files=524434, passed=2246222]  1 extra bytes in post.stringData array
 86%|████████▌ | 323/377 [1:13:12<20:31, 22.80s/it, files=684687, passed=2901525]1 extra bytes in post.stringData array
 92%|█████████▏| 346/377 [1:18:10<09:47, 18.94s/it, files=719587, passed=3123581]1 extra bytes in post.stringData array
100%|██████████| 377/377 [1:26:52<00:00, 13.83s/it, files=769432, passed=3420068]
