# Goal

Because a classification model can only output an array of numbers, we need to generate a mapping between the indices and the characters. Otherwise, we may lose track of which indices belong to which characters. I will use the JSON format to save the mapping output.

In [1]:
import os
import re
import time
import argparse
import inspect

from constants import *
from logger import *
from utils import *
from serializable import *

Prepare a text file that contains all the characters.

In [2]:
# Text file expected to hold every character that should receive a label.
infile = 'hiragana.txt'
# Last expression of the cell: displays whether the input file is present.
os.path.exists(infile)

True

Create a list of characters from `infile`. Here we could apply all kinds of ordering to make the label ordering as relevant as possible. However, we don't know anything yet about how we should order the characters, so we will stick with the order of appearance in the text file.

In [3]:
# Read the file inside a context manager so the handle is closed as soon as
# the lines are loaded (a bare `open(...).readlines()` leaves it open until
# garbage collection).
with open(infile, mode='r', encoding='utf-8') as in_stream:
    lines = in_stream.readlines()

# Flatten the file into one string and drop the line breaks so that only the
# characters themselves remain.
content = ''.join(lines)
content = content.replace('\n', '')

# One list entry per character, in order of appearance in the file; this
# order is the label order.
characters = list(content)
characters

['あ',
 'い',
 'う',
 'え',
 'お',
 'か',
 'き',
 'く',
 'け',
 'こ',
 'さ',
 'し',
 'す',
 'せ',
 'そ',
 'た',
 'ち',
 'つ',
 'て',
 'と',
 'な',
 'に',
 'ぬ',
 'ね',
 'の',
 'ま',
 'み',
 'む',
 'め',
 'も',
 'は',
 'ひ',
 'ふ',
 'へ',
 'ほ',
 'や',
 'ゆ',
 'よ',
 'ら',
 'り',
 'る',
 'れ',
 'ろ',
 'わ',
 'ん']

When serializing, we should also save the source file name and the content of the file so that we can easily identify where the labels came from later.

In [4]:
# Print the definition of LabelFile for reference. The class is not defined in
# this notebook; it is presumably provided by one of the star imports above
# (likely `serializable` - confirm).
print(inspect.getsource(LabelFile))

class LabelFile:
    """Plain record that keeps a list of labels together with its origin.

    Attributes:
        source: identifier of where the labels came from (the notebook
            passes the base name of the input text file).
        content: the text the labels were derived from.
        labels: the labels themselves, in label order.
    """

    def __init__(self, source: str, content: str, labels: list):
        self.source = source
        self.content = content
        self.labels = labels

    def __repr__(self):
        # The instance dictionary doubles as the printable form.
        return repr(vars(self))

    @staticmethod
    def parse_obj(obj: dict):
        """Rebuild a ``LabelFile`` from a dict such as one loaded from JSON.

        Every attribute set by ``__init__`` must be present as a key in
        ``obj``; keys that are not attributes are ignored.

        Raises:
            Exception: if ``obj`` is not a dict, or if a required key is
                missing (the message shows a shortened repr of ``obj``).
        """
        if not isinstance(obj, dict):
            raise Exception(f'{obj} is not a dict!')

        # A throwaway instance tells us which attribute names are required.
        required_names = list(vars(LabelFile('', '', [])))
        kwargs = {}

        for name in required_names:
            if name in obj:
                kwargs[name] = obj[name]
                continue

            # Keep the error readable: show only the head and tail of a
            # long repr.
            shown = repr(obj)
            if len(shown) > 80:
                shown = shown[:40] + '...' + shown[-40:]

            raise Exception(f'{shown} does not contain key {repr(name)}!')

        return LabelFile(**kwargs)



The `staticmethod` is used to parse the `json` back. Yeah, I think it is a naive way of doing that but it is simple.

In [5]:
# Bundle the labels with their provenance so the saved file is self-describing.
label_file = LabelFile(
    # Keep only the file name, not any directory part of the path.
    source=os.path.basename(infile),
    # Text of the input file with the newlines already removed.
    content=content,
    # One label per character; the list order is the label order.
    labels=characters,
)

# Last expression of the cell: displayed through LabelFile.__repr__.
label_file

{'source': 'hiragana.txt', 'content': 'あいうえおかきくけこさしすせそたちつてとなにぬねのまみむめもはひふへほやゆよらりるれろわん', 'labels': ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ', 'た', 'ち', 'つ', 'て', 'と', 'な', 'に', 'ぬ', 'ね', 'の', 'ま', 'み', 'む', 'め', 'も', 'は', 'ひ', 'ふ', 'へ', 'ほ', 'や', 'ゆ', 'よ', 'ら', 'り', 'る', 'れ', 'ろ', 'わ', 'ん']}

In [6]:
# LABEL_FILENAME is not defined in this notebook; presumably it comes from the
# `constants` star import - confirm.
outfile = LABEL_FILENAME
# Display the target path together with whether an earlier output already exists.
outfile, os.path.exists(outfile)

('labels.json', True)

As we are going to prototype a lot, it is a good idea to back up generated samples on every iteration.

In [7]:
# Preserve the output of a previous run before this run writes a new one.
if os.path.exists(outfile):
    print(f'Output file {outfile} is already existed!')

    # NOTE(review): backup_file_by_modified_date is defined outside this
    # notebook. The recorded output shows `outfile` no longer exists after the
    # call, so it appears to move/rename the file rather than copy it, and the
    # returned path is the backup location - confirm against its definition.
    backup_path = backup_file_by_modified_date(outfile)
    print(f'It has been backed up at {os.path.basename(backup_path)}.')

# Display the target path and whether it still exists after the backup step.
outfile, os.path.exists(outfile)

Output file labels.json is already existed!
It has been backed up at 1589632615-labels.json.


('labels.json', False)

We are going to dump a lot of data to `json`, so it is a good idea to use the same format each time.

In [8]:
# Print the definition of universal_dump for reference. The function is not
# defined in this notebook; it is presumably provided by one of the star
# imports above (likely `utils` - confirm).
print(inspect.getsource(universal_dump))

def universal_dump(obj, fp):
    """Write ``obj`` as JSON to the writable text stream ``fp``.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes, and nested levels are indented with a single tab each.
    """
    dump_options = {
        'ensure_ascii': False,
        # use tab to reduce file size
        'indent': '\t',
    }
    json.dump(obj, fp, **dump_options)



In [9]:
# Write the label file as UTF-8 JSON; mode 'w' creates or overwrites `outfile`.
with open(outfile, mode='w', encoding='utf-8') as out_stream:
    # Serialize the instance's attribute dict (source, content, labels).
    universal_dump(label_file.__dict__, out_stream)