In [56]:
import random
import os
random.seed(17)
import pandas as pd
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from string import ascii_letters, digits
from multiprocessing import Pool
from itertools import product
import multiprocessing

# Number of images rendered per font (default `size` of generate_font_dataset).
ONE_FONT_DATASET_SIZE = 20000

# Font size bounds; both ends are passed to random.randint, so both are inclusive.
MIN_FONT_SIZE = 10
MAX_FONT_SIZE = 20

# TRAIN_SIZE is the fraction of the whole frame sampled for training;
# VAL_SIZE is the fraction of that training sample held out for validation.
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1

In [57]:
# List the font files installed on this machine so that the font names used
# for dataset generation can be picked from what is actually available.
import matplotlib.font_manager

# fontpaths is left at its default (None), i.e. the standard system locations.
matplotlib.font_manager.findSystemFonts(fontext="ttf")

['/usr/share/fonts/truetype/kacst/mry_KacstQurn.ttf',
 '/usr/share/fonts/truetype/tlwg/Sawasdee-BoldOblique.ttf',
 '/usr/share/fonts/truetype/ubuntu-font-family/Ubuntu-MI.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSans-BoldItalic.ttf',
 '/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf',
 '/usr/share/fonts/truetype/tlwg/Norasi-Italic.ttf',
 '/usr/share/fonts/truetype/kacst/KacstTitleL.ttf',
 '/usr/share/fonts/truetype/dejavu/DejaVuSans-ExtraLight.ttf',
 '/usr/share/fonts/truetype/tlwg/Purisa-Bold.ttf',
 '/usr/share/fonts/truetype/ttf-bitstream-vera/VeraSe.ttf',
 '/usr/share/fonts/truetype/kacst/KacstNaskh.ttf',
 '/usr/share/fonts/truetype/nanum/NanumBarunGothicBold.ttf',
 '/usr/share/fonts/truetype/ubuntu-font-family/UbuntuMono-R.ttf',
 '/usr/share/fonts/truetype/tlwg/Waree-Oblique.ttf',
 '/usr/share/fonts/opentype/stix/STIXGeneral-Regular.otf',
 '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf',
 '/usr/share/fonts/truetype/tlwg/Laksaman-Bold.ttf',
 '/usr/s

In [58]:
#prepare functions for image generation
def get_random_pos(occupied, max_pos):
    """Pick a random start offset for an item of size ``occupied`` inside ``max_pos``.

    Args:
        occupied: size taken by the item along one axis.
        max_pos: total size available along that axis.

    Returns:
        0 when the item leaves no free room (``max_pos - occupied <= 0``),
        otherwise a random integer in ``[0, max_pos - occupied]`` (inclusive).
    """
    slack = max_pos - occupied
    return random.randint(0, slack) if slack > 0 else 0

# we assume image size is always the same
def generate_text_image(text, f_name, fontsize=10, fontname="Loma.ttf", height=25, width=100):
    """Render ``text`` in black on a white 1-bit image and save it to ``f_name``.

    The text is drawn at a random offset chosen by ``get_random_pos`` so that
    it fits inside the canvas when there is room; when the text is larger
    than the canvas along an axis, the offset on that axis is 0.

    Args:
        text: string to render.
        f_name: output path passed to ``Image.save``.
        fontsize: font size passed to ``ImageFont.truetype``.
        fontname: font file name/path passed to ``ImageFont.truetype``.
        height: image height in pixels.
        width: image width in pixels.
    """
    color_text = "black"
    color_background = "white"
    font = ImageFont.truetype(fontname, fontsize)
    img = Image.new('1', (width, height), color_background)
    draw = ImageDraw.Draw(img)

    # FreeTypeFont.getsize() was deprecated in Pillow 9.2 and removed in 10.0.
    # Prefer getbbox() and take its right/bottom edges as the text extent
    # measured from the drawing origin; fall back to getsize() on old Pillow
    # versions that do not provide getbbox().
    if hasattr(font, "getbbox"):
        _, _, text_width, text_height = font.getbbox(text)
    else:
        text_width, text_height = font.getsize(text)

    pos_x = get_random_pos(text_width, width)
    pos_y = get_random_pos(text_height, height)
    draw.text((pos_x, pos_y), text, fill=color_text, font=font)
    img.save(f_name)

In [59]:
#we need to be prepared for any letter and not rely on sequences so string must be random
# Alphabet used for the generated text: ASCII letters (both cases) and digits.
P_LETTERS = ascii_letters+digits
def get_char():
    """Return one character drawn uniformly at random from ``P_LETTERS``."""
    return random.choice(P_LETTERS)

def get_random_word(max_word_len=30, min_word_len=1):
    """Return a random string of characters from ``P_LETTERS``.

    Args:
        max_word_len: maximum length of the word (inclusive).
        min_word_len: minimum length of the word (inclusive).

    Returns:
        A string whose length is drawn uniformly from
        ``[min_word_len, max_word_len]``.

    Raises:
        ValueError: if ``min_word_len`` is greater than ``max_word_len``
            (raised by ``random.randint``).
    """
    # randint is inclusive on both ends, so max_word_len is reachable and
    # min_word_len == max_word_len yields a fixed-length word instead of
    # failing on an empty range.
    word_len = random.randint(min_word_len, max_word_len)
    # join avoids rebuilding the string on every appended character.
    return "".join(get_char() for _ in range(word_len))

In [60]:
def generate_font_dataset(font_name,  folder_name, size=ONE_FONT_DATASET_SIZE):
    """Generate ``size`` random text images rendered with one font.

    Images are written to ``folder_name`` as ``<font stem>_<i>.png`` where the
    stem is the part of ``font_name`` before the first ".".

    Args:
        font_name: font file name passed on to ``generate_text_image``.
        folder_name: output directory; created if it does not exist.
        size: number of images to generate.
    """
    # exist_ok=True keeps the cell re-runnable: os.mkdir would raise
    # FileExistsError as soon as the folder is left over from a previous run.
    os.makedirs(folder_name, exist_ok=True)
    # The filename prefix does not depend on the loop index, compute it once.
    file_prefix = folder_name + "/" + font_name.split(".")[0] + "_"
    for i in range(size):
        font_size = random.randint(MIN_FONT_SIZE, MAX_FONT_SIZE)
        f_name = file_prefix + str(i) + ".png"
        text = get_random_word()
        generate_text_image(text, f_name, font_size, font_name)

In [70]:
%%time
fonts = ["NanumBarunGothicBold.ttf","Purisa-Bold.ttf","Kinnari-BoldItalic.ttf","DejaVuSans-Oblique.ttf",
         "Laksaman-Italic.ttf","FreeMono.ttf","NanumGothicBold.ttf",
         "DejaVuSans-BoldOblique.ttf", "LiberationMono-BoldItalic.ttf", "FreeSerifItalic.ttf"]
folders = [f.split(".")[0] for f in fonts]
mypool = Pool(processes=multiprocessing.cpu_count())
mypool.starmap(generate_font_dataset, zip(fonts, folders))

CPU times: user 316 ms, sys: 344 ms, total: 660 ms
Wall time: 1min 43s


In [71]:
#generate dataframe for pytorch dataset
# One entry per generated image: its relative path in `files` and the index
# of its font within `fonts` as the class label in `labels`.
files = []
labels = []
for i, f in enumerate(fonts):
    folder = f.split(".")[0]
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order - and therefore the seeded sampling done later - reproducible.
    font_files = sorted(os.listdir(folder))
    files.extend(folder + "/" + name for name in font_files)
    labels.extend([i] * len(font_files))

In [72]:
# Sanity check: every collected file path must have exactly one label.
len(labels) == len(files)

True

In [73]:
# Two-column frame: image path ("files") and font class index ("labels").
ds_frame = pd.DataFrame(
    {"files": files, "labels": labels},
    columns=["files", "labels"],
)

In [74]:
#split into test train now
# Draw the training portion first; everything that was not drawn is the test set.
# The test set must be derived BEFORE validation rows are removed from the
# training portion, otherwise the validation rows would also land in `test`.
train_full = ds_frame.sample(frac=TRAIN_SIZE, random_state=200)
test = ds_frame.drop(train_full.index)
# Hold out VAL_SIZE of the training portion for validation; seeded so the
# split is reproducible across runs.
validation = train_full.sample(frac=VAL_SIZE, random_state=200)
# Build `train` without mutating `train_full`, so re-running the cell is safe.
train = train_full.drop(validation.index)
# The three splits must partition the full frame.
assert len(train) + len(validation) + len(test) == len(ds_frame)

In [75]:
# Persist each split as a CSV file in the working directory
# (to_csv keeps its defaults, so the frame index is written as well).
for csv_name, split_frame in (
    ("train.csv", train),
    ("validation.csv", validation),
    ("test.csv", test),
):
    split_frame.to_csv(csv_name)