# Goal

Create a dataset from `labels.json` (produced by the [previous notebook](./1-create-label.ipynb)) and some font files. Generating a dataset from font files is a simple example of creating a dataset yourself for problems that don't have a pre-built dataset like `MNIST`. In the near future, I will extract data from the [ETL Character Database](http://etlcdb.db.aist.go.jp) for an image-based model and from [KanjiVG](https://github.com/KanjiVG/kanjivg) for a stroke-based model.

In [1]:
import os
import io
import time
import json
import traceback
import itertools
import inspect
import hashlib
import binascii

from tqdm import tqdm

In [2]:
from constants import *
from utils import *
from serializable import *

The label data was created from the previous notebook.

In [3]:
label_file = LABEL_FILENAME
label_file, os.path.exists(label_file)

('labels.json', True)

In [4]:
# Read the whole label file as text. The context manager closes the file
# handle as soon as the content has been read instead of leaving it open.
with open(label_file, mode='r', encoding='utf-8') as infile:
    content = infile.read()
print(content)

{
	"source": "hiragana.txt",
	"content": "あいうえおかきくけこさしすせそたちつてとなにぬねのまみむめもはひふへほやゆよらりるれろわん",
	"labels": [
		"あ",
		"い",
		"う",
		"え",
		"お",
		"か",
		"き",
		"く",
		"け",
		"こ",
		"さ",
		"し",
		"す",
		"せ",
		"そ",
		"た",
		"ち",
		"つ",
		"て",
		"と",
		"な",
		"に",
		"ぬ",
		"ね",
		"の",
		"ま",
		"み",
		"む",
		"め",
		"も",
		"は",
		"ひ",
		"ふ",
		"へ",
		"ほ",
		"や",
		"ゆ",
		"よ",
		"ら",
		"り",
		"る",
		"れ",
		"ろ",
		"わ",
		"ん"
	]
}


We created a method to parse the `json` data back.

In [5]:
print(inspect.getsource(LabelFile))

class LabelFile:
    """
    Label data with three fields: `source`, `content` and `labels`.
    """

    def __init__(self, source: str, content: str, labels: list):
        self.source = source
        self.content = content
        self.labels = labels

    def __repr__(self):
        # Show the instance as the repr of its attribute dictionary.
        return repr(vars(self))

    @staticmethod
    def parse_obj(obj: dict):
        """
        Build a `LabelFile` from a dict.

        Every attribute name of `LabelFile` must be present as a key in
        `obj`; keys that are not attribute names are ignored.

        Raises `Exception` if `obj` is not a dict or if a required key
        is missing.
        """
        if not isinstance(obj, dict):
            raise Exception(f'{obj} is not a dict!')

        # A throw-away instance tells us which keys are required.
        required_keys = LabelFile('', '', []).__dict__
        kwargs = {}

        for name in required_keys:
            if name in obj:
                kwargs[name] = obj[name]
                continue

            # Keep the error message short for big objects: show only
            # the first and the last 40 characters of the repr.
            shown = repr(obj)
            if len(shown) > 80:
                shown = shown[:40] + '...' + shown[-40:]

            raise Exception(f'{shown} does not contain key {repr(name)}!')

        return LabelFile(**kwargs)



In [6]:
with open(label_file, mode='r', encoding='utf-8') as infile:
    obj = json.load(infile)

obj

{'source': 'hiragana.txt',
 'content': 'あいうえおかきくけこさしすせそたちつてとなにぬねのまみむめもはひふへほやゆよらりるれろわん',
 'labels': ['あ',
  'い',
  'う',
  'え',
  'お',
  'か',
  'き',
  'く',
  'け',
  'こ',
  'さ',
  'し',
  'す',
  'せ',
  'そ',
  'た',
  'ち',
  'つ',
  'て',
  'と',
  'な',
  'に',
  'ぬ',
  'ね',
  'の',
  'ま',
  'み',
  'む',
  'め',
  'も',
  'は',
  'ひ',
  'ふ',
  'へ',
  'ほ',
  'や',
  'ゆ',
  'よ',
  'ら',
  'り',
  'る',
  'れ',
  'ろ',
  'わ',
  'ん']}

In [7]:
label_file = LabelFile.parse_obj(obj)
type(label_file), label_file

(serializable.LabelFile,
 {'source': 'hiragana.txt', 'content': 'あいうえおかきくけこさしすせそたちつてとなにぬねのまみむめもはひふへほやゆよらりるれろわん', 'labels': ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'ま', 'み', 'む', 'め', 'も', 'は', 'ひ', 'ふ', 'へ', 'ほ', 'や', 'ゆ', 'よ', 'ら', 'り', 'る', 'れ', 'ろ', 'わ', 'ん']})

We put all the font files in a directory. We currently support `.otf` and `.ttf` font files only.

In [8]:
fonts_dir = FONTS_DIR
fonts_dir, os.path.exists(fonts_dir)

('fonts', True)

Define the output image size and font size to draw the character.

In [9]:
font_size = 64
image_size = 64

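`fetch_font` loads a font file with Pillow at the requested size and uses `fonttools` to check which of the given characters appear in the font's Unicode `cmap` tables. The result is wrapped in a `Font` object.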

In [10]:
print(inspect.getsource(fetch_font))

@measure_exec_time
def fetch_font(font_file: str, font_size=64, characters=None):
    """
    Load a font file and find out which of `characters` it maps.

    - font_file: path of the font file, opened with both pillow and
    `fonttools`
    - font_size: size passed to pillow's `ImageFont.truetype`
    - characters: single-character strings to look up in the font;
    `None` (the default) means no characters

    Returns a `Font` whose name is the parts of pillow's `getname()`
    joined with `_` and whose `supported_chars` lists, in the order of
    `characters`, every character whose code point is present in at
    least one Unicode cmap table of the font.
    """
    if characters is None:
        characters = []

    pillow_font = ImageFont.truetype(font=font_file, size=font_size)
    font_name = '_'.join(pillow_font.getname())

    # TODO test if this code is working or not by rendering the
    # actual image (with some uncommon kanji)
    ft_font = TTFont(font_file)
    try:
        # Union the code points of all Unicode cmap tables first, so a
        # character mapped by several tables is reported only once.
        mapped_code_points = set()
        for cmap in ft_font['cmap'].tables:
            if cmap.isUnicode():
                mapped_code_points.update(cmap.cmap)
    finally:
        # Release the font file opened by fonttools.
        ft_font.close()

    supported_chars = [c for c in characters if ord(c) in mapped_code_points]

    return Font(font_name, pillow_font, font_size, font_file, supported_chars)



In [11]:
print(inspect.getsource(Font))

class Font:
    """
    Bundle of the data we keep about one loaded font.

    Attributes:

    - name: the font name (not the file name)
    - font: the pillow ImageFont, used with pillow's drawing API
    - size: the font size; pillow's ImageFont has to be created with a
    font size, so it is stored here as well
    - path: the font file path, used with `fonttools` to check whether
    the font supports a specific character (otherwise rendering may
    give the tofu shape image)
    - supported_chars: list of characters supplied by the caller
    """

    def __init__(
        self,
        name: str,
        font: ImageFont.FreeTypeFont,
        size: int,
        path: str,
        supported_chars: list,
    ):
        self.name = name
        self.font = font
        self.size = size
        self.path = path
        self.supported_chars = supported_chars

    def __repr__(self):
        # Only the identifying fields are shown; the pillow font object
        # and the character list are left out.
        summary = (self.name, self.size, self.path)
        return repr(summary)



In [12]:
# Build a Font for every `.ttf` / `.otf` file directly inside fonts_dir.
labels = label_file.labels
font_list = []
file_list = os.listdir(fonts_dir)
for filename in file_list:
    child_path = os.path.join(fonts_dir, filename)

    # Entries that are not regular files are reported and skipped.
    if not os.path.isfile(child_path):
        print(f'Skipping directory {child_path}!')
        continue

    # The extension is compared case-insensitively.
    file_ext = os.path.splitext(filename)[1].lower()
    if file_ext not in ('.ttf', '.otf'):
        print(f'Skipping unknown file type {child_path}!')
        continue

    font = fetch_font(
        font_file=child_path,
        font_size=font_size,
        characters=labels,
    )
    font_list.append(font)

font_list

Skipping unknown file type fonts\readme.txt!


[('07YsashisaGothicTegaki_Regular', 64, 'fonts\\07YsashisaGothicTegaki_Regular.otf'),
 ('851MkPOP_Regular', 64, 'fonts\\851MkPOP_Regular.ttf'),
 ('ArmedBanana_Regular', 64, 'fonts\\ArmedBanana_Regular.ttf'),
 ('Chihaya Jyun_Regular', 64, 'fonts\\Chihaya_Jyun_Regular.ttf'),
 ('darts font_Regular', 64, 'fonts\\darts_font_Regular.ttf'),
 ('Dining message_Regular', 64, 'fonts\\Dining_message_Regular.ttf'),
 ('ElmerFont_Regular', 64, 'fonts\\ElmerFont_Regular.ttf'),
 ('Ghatee_Regular', 64, 'fonts\\Ghatee_Regular.ttf'),
 ('Gyate-Luminescence_Regular', 64, 'fonts\\Gyate-Luminescence_Regular.ttf'),
 ('HGKyokashotai_Medium', 64, 'fonts\\HGKyokashotai_Medium.ttf'),
 ('HonyaJi-Re_Regular', 64, 'fonts\\HonyaJi-Re_Regular.ttf'),
 ('HuiFont_Regular', 64, 'fonts\\HuiFont_Regular.ttf'),
 ('IPAGothic_Regular', 64, 'fonts\\IPAGothic_Regular.ttf'),
 ('Jiyucho_Regular', 64, 'fonts\\Jiyucho_Regular.ttf'),
 ('KanjiStrokeOrders_Regular', 64, 'fonts\\KanjiStrokeOrders_Regular.ttf'),
 ('kouichi.sakurai font fe

In [13]:
font.supported_chars

['あ',
 'い',
 'う',
 'え',
 'お',
 'か',
 'き',
 'く',
 'け',
 'こ',
 'さ',
 'し',
 'す',
 'せ',
 'そ',
 'た',
 'ち',
 'つ',
 'て',
 'と',
 'な',
 'に',
 'ぬ',
 'ね',
 'の',
 'ま',
 'み',
 'む',
 'め',
 'も',
 'は',
 'ひ',
 'ふ',
 'へ',
 'ほ',
 'や',
 'ゆ',
 'よ',
 'ら',
 'り',
 'る',
 'れ',
 'ろ',
 'わ',
 'ん']

In [14]:
# For each font, collect the label characters missing from its
# supported_chars; fonts that cover every label are left out.
unsupported_char_and_font_combinations = []
for font in tqdm(font_list):
    ns_chars = [c for c in labels if c not in font.supported_chars]

    if ns_chars:
        unsupported_char_and_font_combinations.append((font.name, ns_chars))

unsupported_char_and_font_combinations

100%|███████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 15375.99it/s]


[]

In [15]:
render_tasks = list(itertools.product(labels, font_list))
len(render_tasks)

2070

In [16]:
out_dir = DATASETS_DIR
dataset_dirname = os.path.splitext(label_file.source)[0]
dataset_dir = os.path.join(out_dir, dataset_dirname)

out_dir, dataset_dirname, dataset_dir

('datasets', 'hiragana', 'datasets\\hiragana')

In [17]:
serialized_records_filepath = os.path.join(dataset_dir, f'datasets{XFormat.EXTENSION}')
metadata_filepath = os.path.join(dataset_dir, METADATA_FILENAME)

serialized_records_filepath, metadata_filepath

('datasets\\hiragana\\datasets.xformat', 'datasets\\hiragana\\metadata.json')

In [18]:
# An existing dataset directory is backed up before a new one is created.
# NOTE(review): os.makedirs below fails if dataset_dir still exists, so
# backup_file_by_modified_date presumably moves the directory away - confirm.
dataset_dir_exists = os.path.exists(dataset_dir)
if dataset_dir_exists:
    print(f'dataset_dir {repr(dataset_dir)} existed!')
    backup_filepath = backup_file_by_modified_date(dataset_dir)
    print(f'Backup it at {repr(backup_filepath)}')

os.makedirs(dataset_dir)

dataset_dir 'datasets\\hiragana' existed!
Backup it at 'E:\\hiragana-recognition\\datasets\\1592748139-hiragana'


In [19]:
# Dataset-level information; the loop below fills in its record list and
# the lists of skipped combinations.
dataset_metadata = DatasetMetadata(
    source=label_file.source,
    content=label_file.content,
    labels=label_file.labels,
)

# Render every (character, font) task and append it to the record file as
# three consecutive writes: datatype marker, serialized byte count, record.
with open(serialized_records_filepath, mode='wb') as out_stream:

    pbar = tqdm(render_tasks)
    for char_text, font in pbar:
        combination = {'char': char_text, 'font': font.name}

        # Characters missing from the font's supported_chars are only
        # noted in the metadata.
        if char_text not in font.supported_chars:
            dataset_metadata.unsupported_combinations.append(combination)
            continue

        # A None result from render_image is recorded as a blank combination.
        image = render_image(char_text, font, image_size)
        if image is None:
            dataset_metadata.blank_combinations.append(combination)
            continue

        # Encode the image as PNG in memory; the MD5 hex digest of those
        # bytes is stored as the record hash.
        buffer = io.BytesIO()
        image.save(buffer, format='PNG')
        encoded_image = buffer.getvalue()
        image_data_md5_hash = hashlib.md5(encoded_image).hexdigest()

        desc = f'char {char_text} grayscale image created with font {font.name} and font size {font.size}'

        record_dict = {
            'HASH': image_data_md5_hash,
            'CHARACTER': char_text,
            'WIDTH': image_size,
            'HEIGHT': image_size,
            'DEPTH': 1, # grayscale image
            'PNG_IMAGE': encoded_image,
            'FONT_SIZE': font.size,
            'FONT_NAME': font.name,
            'DESCRIPTION': desc,
        }
        record_datatype, serialized_record = XFormat.serialize_obj(record_dict)
        record_byte_count = len(serialized_record)

        # The stream positions before and after the writes are kept in the
        # metadata so that the record's byte range is known.
        record_seek_start = out_stream.tell()
        out_stream.write(record_datatype)
        out_stream.write(XFormat.serialize_int(record_byte_count))
        out_stream.write(serialized_record)
        record_seek_end = out_stream.tell()

        dataset_metadata.records.append({
            'hash': image_data_md5_hash,
            **combination,
            'seek_start': record_seek_start,
            'seek_end': record_seek_end,
        })

100%|█████████████████████████████████████████████████████████████████████████████| 2070/2070 [00:04<00:00, 499.64it/s]


In [20]:
dataset_metadata.records

[{'hash': 'cb61562d38233f686de18ca8e36c8caa',
  'char': 'あ',
  'font': '07YsashisaGothicTegaki_Regular',
  'seek_start': 0,
  'seek_end': 1455},
 {'hash': '639f35a260158f5d4cf0a024f9a0bdd9',
  'char': 'あ',
  'font': '851MkPOP_Regular',
  'seek_start': 1455,
  'seek_end': 2691},
 {'hash': '696ff5ab26ec49e6f025f659b3b03d09',
  'char': 'あ',
  'font': 'ArmedBanana_Regular',
  'seek_start': 2691,
  'seek_end': 3842},
 {'hash': '2662518249ec58166326c8bfe65456a1',
  'char': 'あ',
  'font': 'Chihaya Jyun_Regular',
  'seek_start': 3842,
  'seek_end': 4789},
 {'hash': '6f41e667ebc74a2052120abe7de4e0fa',
  'char': 'あ',
  'font': 'darts font_Regular',
  'seek_start': 4789,
  'seek_end': 6063},
 {'hash': 'd816c2da3a15a9055a29f5c71f5e3f9a',
  'char': 'あ',
  'font': 'Dining message_Regular',
  'seek_start': 6063,
  'seek_end': 7189},
 {'hash': 'e54421d494527ee7be2b20ec058e4b0b',
  'char': 'あ',
  'font': 'ElmerFont_Regular',
  'seek_start': 7189,
  'seek_end': 8230},
 {'hash': 'cd33d629b90ba767234a74f4

In [21]:
# Write the attribute dictionary of the metadata object to the metadata file.
with open(metadata_filepath, mode='w', encoding='utf-8') as outfile:
    universal_dump(vars(dataset_metadata), outfile)