# Inspect the `ETL2` dataset

## Prerequisite

Get and unzip the `ETL2` dataset as shown below.

```
ETL2
├── ETL2INFO
├── ETL2_1
├── ETL2_2
├── ETL2_3
├── ETL2_4
└── ETL2_5
```

In [None]:
import os
import time
import math
import re
import struct
import traceback
import codecs

# external dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import bitstring

from PIL import Image
import IPython.display as display

In [None]:
# Paths of the five ETL2 data files (ETL2_1 .. ETL2_5), relative to the
# working directory.
etl2_filepaths = [f'ETL2/ETL2_{part}' for part in range(1, 6)]

etl2_filepaths

The dataset uses an uncommon encoding (`co59`) to encode the labels, so we need to prepare a map to convert them to Unicode.

The code below was taken from the official guide, together with the supplementary `co59-utf8.txt` file.

In [None]:
t56s = '0123456789[#@:>? ABCDEFGHI&.](<  JKLMNOPQR-$*);\'|/STUVWXYZ ,%="!'

def T56(c):
    return t56s[c]

# Build the CO-59 -> Unicode lookup table from the mapping file.
# The file is read as whitespace-separated entries of the form
# "<character>:<code0>,<code1>"; the two integer codes form the dict key.
# The built-in open() is used instead of the legacy codecs.open(), which is
# deprecated in recent Python versions.
with open('co59-utf8.txt', mode='r', encoding='utf-8') as co59f:
    co59t = co59f.read()

co59l = co59t.split()
CO59 = {}
for c in co59l:
    ch = c.split(':')       # ch[0] = character, ch[1] = "<code0>,<code1>"
    co = ch[1].split(',')
    CO59[(int(co[0]), int(co[1]))] = ch[0]

CO59

In [None]:
# Number of CO-59 codes that have a Unicode mapping.
len(CO59)

# Take a look at a sample file

In [None]:
# Use the first ETL2 file as the sample to inspect.
sample_filepath = etl2_filepaths[0]
sample_filepath

In [None]:
# Open the sample file as a bit stream; the record fields read from it below
# are described with bit-level widths.
file_stream = bitstring.ConstBitStream(filename=sample_filepath)

Each record has a size of 3660 6-bit bytes.

In [None]:
# Length of one record in bits: 3660 units of 6 bits each.
RECORD_LENGTH = 3660 * 6

In [None]:
# you can change the skip value to inspect a different record
skip = 0
# Move the read position to the start of record number `skip`
# (RECORD_LENGTH is expressed in bits).
file_stream.pos = skip * RECORD_LENGTH

Unpack the record as shown in the specification.

http://etlcdb.db.aist.go.jp/specification-of-etl-2

In [None]:
# Field layout of one record as a comma-separated bitstring format string.
# The bracketed index is the field's position in the list returned by
# readlist(); `pad` tokens are skipped and therefore have no index.
bitstring_unpack_str = (
    'int:36,'     # [0]     Serial Index
    'uint:6,'     # [1]     Source ('A': Mincho Newspaper, 'B': Gothic Newspaper, 'C': Mincho Patent, 'D': Gothic Patent)
    'pad:30,'     #         padding bits
    '6*uint:6,'   # [2:8]   Class ('KANJI': kanji, 'EIJI': roman alphabets, 'HRKANA': hiragana, 'KTKANA': katakana, 'KIGO': special characters, 'SUUJI': numbers)
    '6*uint:6,'   # [8:14]  Font ('MINCHO', 'GOTHIC')
    'pad:24,'     #         padding bits
    '2*uint:6,'   # [14:16] CO-59 Code
    'pad:180,'    #         padding bits
    'bytes:2700'  # [16]    6-bit-depth image of 60 x 60 = 3600 pixels
)

# Read one record starting at the current stream position.
record = file_stream.readlist(bitstring_unpack_str)

In [None]:
# Show the container type and the number of unpacked fields, then display
# every field except the last one (the raw image bytes).
print(type(record), len(record))
record[:-1]

In [None]:
# Print the record's metadata with every coded field decoded to text.
print(
    record[0],
    T56(record[1]),
    ''.join(T56(code) for code in record[2:8]),
    ''.join(T56(code) for code in record[8:14]),
    CO59[tuple(record[14:16])],
)

In [None]:
from typing import List, Dict

In [None]:
class ETL2Record:
    """Plain container for one decoded ETL2 record.

    Attributes:
        index: the Serial Index in the record.
        source: the source material that the record has been scanned from.
        character_type: one of 'KANJI', 'EIJI', 'HRKANA', 'KTKANA', 'KIGO',
            'SUUJI'.
        font: the font name, e.g. 'MINCHO' or 'GOTHIC'.
        unicode_char: the character as a Unicode string, e.g. あ.
        image: the PNG-encoded image.
    """

    def __init__(
        self,
        index: int,
        source: str,
        character_type: str,
        font: str,
        unicode_char: str,
        image: bytes,
    ):
        # Attributes are stored in declaration order, so the instance
        # dictionary always lists them in the same sequence.
        vars(self).update(
            index=index,
            source=source,
            character_type=character_type,
            font=font,
            unicode_char=unicode_char,
            image=image,
        )

    def __repr__(self):
        """Return the repr of the instance's attribute dictionary."""
        return repr(vars(self))

# All the images are 6-bit-depth, 60x60-pixel images

In [None]:
# Dimensions of each character image, in pixels.
IMG_WIDTH = 60
IMG_HEIGHT = 60

# record[16] holds the packed image bytes; decode them with Pillow's 'bit'
# raw decoder at 6 bits per pixel into a mode-'F' image.
pil_image = Image.frombytes('F', (IMG_WIDTH, IMG_HEIGHT), record[16], 'bit', 6)

np_img = np.array(pil_image)

# Plot the decoded pixel values with a colorbar showing their range.
plt.imshow(np_img)
plt.colorbar()

# Convert image to grayscale `[0-255]` range

In [None]:
# Convert the decoded image to Pillow mode 'L' and plot it again.
# NOTE(review): confirm from the colorbar whether the 6-bit pixel values are
# actually rescaled to the full 0-255 range by this conversion.
pil_image = pil_image.convert('L')
np_img = np.array(pil_image)

plt.imshow(np_img)
plt.colorbar()

In [None]:
import io

In [None]:
# Encode the image as PNG into an in-memory buffer and keep the encoded bytes.
buffer = io.BytesIO()
pil_image.save(buffer, format='PNG')
png_encoded_image = buffer.getvalue()

In [None]:
# Complete pipeline for the sample record:
# packed pixels -> mode 'L' image -> PNG bytes -> ETL2Record.
pil_image = Image.frombytes(
    'F', (IMG_WIDTH, IMG_HEIGHT), record[16], 'bit', 6,
).convert('L')

buffer = io.BytesIO()
pil_image.save(buffer, format='PNG')
png_encoded_image = buffer.getvalue()

etl2_record = ETL2Record(
    index=record[0],
    source=T56(record[1]),
    character_type=''.join(T56(code) for code in record[2:8]),
    font=''.join(T56(code) for code in record[8:14]),
    unicode_char=CO59[(record[14], record[15])],
    image=png_encoded_image,
)

etl2_record

# That's all for inspecting a single record in the dataset.

`XFormat` is my custom data serialization format. I created this format because I don't want to use `JSON` or `TFRecord` (`protobuf`) due to their various limitations.

In [None]:
class XFormat:
    """Small binary serialization format for int, str, bytes, list and dict.

    ``serialize_obj`` turns a value into ``(type tag, payload)`` where the
    type tag is a single byte holding one of the ``DATA_TYPE_*`` constants.
    Inside a container payload every entry is laid out as::

        <type tag: 1 byte> <payload length: INT_SIZE bytes> <payload>

    A list payload is the concatenation of its entries; a dict payload is the
    concatenation of alternating key and value entries.  The tag and length
    of the top-level value are NOT part of the returned payload; the caller
    stores them.
    """

    INT_SIZE = 4  # byte width of every serialized integer and length prefix
    BYTE_ORDER = 'little'
    EXTENSION = '.xformat'
    ENCODING = 'utf-8'

    DATA_TYPE_BYTES = 0
    DATA_TYPE_INT = 1
    DATA_TYPE_UTF8_STRING = 2
    DATA_TYPE_LIST = 3
    DATA_TYPE_DICT = 4

    @classmethod
    def serialize_string(cls, s: str) -> bytes:
        """Encode ``s`` with ``ENCODING``."""
        return s.encode(encoding=cls.ENCODING)

    @classmethod
    def deserialize_string(cls, bs: bytes) -> str:
        """Decode ``bs`` with ``ENCODING``."""
        return bs.decode(encoding=cls.ENCODING)

    @classmethod
    def serialize_int(cls, n: int) -> bytes:
        """Encode ``n`` as a signed integer of ``INT_SIZE`` bytes.

        Raises:
            OverflowError: if ``n`` does not fit into ``INT_SIZE`` bytes.
        """
        return n.to_bytes(
            length=cls.INT_SIZE,
            byteorder=cls.BYTE_ORDER,
            signed=True,
        )

    @classmethod
    def deserialize_int(cls, bs: bytes) -> int:
        """Decode a signed integer stored in ``BYTE_ORDER``."""
        return int.from_bytes(bs, byteorder=cls.BYTE_ORDER, signed=True)

    @classmethod
    def _pack_entry(cls, obj) -> bytes:
        """Return ``obj`` as one container entry: tag + length + payload."""
        datatype, payload = cls.serialize_obj(obj)
        return datatype + cls.serialize_int(len(payload)) + payload

    @classmethod
    def _unpack_entry(cls, bs: bytes, pos: int):
        """Read the container entry starting at ``bs[pos]``.

        Returns:
            ``(value, next_pos)`` where ``next_pos`` is the offset of the
            byte following the entry.

        Raises:
            ValueError: if the entry header or payload runs past the end of
                ``bs`` or the stored length is negative.
        """
        header_end = pos + 1 + cls.INT_SIZE
        if header_end > len(bs):
            raise ValueError('Broken serialized data!')

        datatype = bs[pos]
        byte_count = cls.deserialize_int(bs[pos + 1:header_end])
        payload_end = header_end + byte_count
        # A negative length would move the cursor backwards and make the
        # calling loop spin forever, so it is rejected as corrupt data.
        if byte_count < 0 or payload_end > len(bs):
            raise ValueError('Broken serialized data!')

        value = cls.deserialize_obj(bs[header_end:payload_end], datatype)
        return value, payload_end

    @classmethod
    def serialize_obj(cls, obj) -> 'tuple[bytes, bytes]':
        """Serialize ``obj`` and return ``(type tag, payload)``.

        Only exact ``int``, ``str``, ``bytes``, ``list`` and ``dict``
        instances are supported (subclasses such as ``bool`` are rejected);
        lists and dicts are serialized recursively.

        Raises:
            TypeError: if ``obj`` (or a nested value) has an unsupported type.
        """
        obj_type = type(obj)

        if obj_type is int:
            return bytes([cls.DATA_TYPE_INT]), cls.serialize_int(obj)
        if obj_type is str:
            return bytes([cls.DATA_TYPE_UTF8_STRING]), cls.serialize_string(obj)
        if obj_type is bytes:
            return bytes([cls.DATA_TYPE_BYTES]), obj
        if obj_type is list:
            payload = b''.join(cls._pack_entry(value) for value in obj)
            return bytes([cls.DATA_TYPE_LIST]), payload
        if obj_type is dict:
            payload = b''.join(
                cls._pack_entry(key) + cls._pack_entry(value)
                for key, value in obj.items()
            )
            return bytes([cls.DATA_TYPE_DICT]), payload

        raise TypeError(f'Unsupported type {obj_type}!')

    @classmethod
    def deserialize_obj(cls, bs: bytes, datatype: int):
        """Rebuild the value whose payload is ``bs`` and type tag ``datatype``.

        Args:
            bs: the payload produced by ``serialize_obj``.
            datatype: the integer value of the type tag.

        Raises:
            ValueError: if ``datatype`` is unknown or a container payload is
                truncated or otherwise malformed.
        """
        if datatype == cls.DATA_TYPE_BYTES:
            return bs
        if datatype == cls.DATA_TYPE_INT:
            return cls.deserialize_int(bs)
        if datatype == cls.DATA_TYPE_UTF8_STRING:
            return cls.deserialize_string(bs)
        if datatype == cls.DATA_TYPE_LIST:
            items = []
            pos = 0
            while pos < len(bs):
                value, pos = cls._unpack_entry(bs, pos)
                items.append(value)
            return items
        if datatype == cls.DATA_TYPE_DICT:
            mapping = {}
            pos = 0
            while pos < len(bs):
                key, pos = cls._unpack_entry(bs, pos)
                # A key without a following value means the data was cut off;
                # _unpack_entry reports that as broken data.
                value, pos = cls._unpack_entry(bs, pos)
                mapping[key] = value
            return mapping

        raise ValueError(f'Unsupported data type {datatype}!')

    @classmethod
    def deserialze_obj(cls, bs: bytes, datatype: int):
        """Misspelled legacy name of ``deserialize_obj``, kept for callers."""
        return cls.deserialize_obj(bs, datatype)

In [None]:
import json

In [None]:
from tqdm import tqdm

In [None]:
records_metadata = []
etl2_serialized_dataset_filepath = f'etl2{XFormat.EXTENSION}'

# Convert every record of every ETL2 file into an XFormat-serialized dict,
# append it to one output file, and remember the byte range each record
# occupies so that it can be located later.
with open(etl2_serialized_dataset_filepath, mode='wb') as out_stream:
    pbar = tqdm(etl2_filepaths)
    for filename in pbar:

        file_stream = bitstring.ConstBitStream(filename=filename)

        # Explicit end-of-file check: keep reading while unread bits remain.
        while file_stream.pos < file_stream.len:
            try:
                record = file_stream.readlist(bitstring_unpack_str)
            except bitstring.ReadError:
                # The remaining bits do not form a complete record. Only this
                # error is treated as "end of data"; anything else (including
                # KeyboardInterrupt) propagates instead of being swallowed.
                tqdm.write(f'{filename}: ignoring incomplete trailing record')
                break

            # Decode the packed 6-bit pixels and re-encode the image as PNG.
            pil_image = Image.frombytes('F', (IMG_WIDTH, IMG_HEIGHT), record[16], 'bit', 6)
            pil_image = pil_image.convert('L')

            buffer = io.BytesIO()
            pil_image.save(buffer, format='PNG')
            png_encoded_image = buffer.getvalue()

            etl2_record = ETL2Record(
                index=record[0],
                source=T56(record[1]),
                character_type=''.join(map(T56, record[2:8])),
                font=''.join(map(T56, record[8:14])),
                unicode_char=CO59[tuple(record[14:16])],
                image=png_encoded_image,
            )

            # Each stored record is: type tag, payload length, payload.
            record_datatype, serialized_record = XFormat.serialize_obj(vars(etl2_record))
            record_byte_count = len(serialized_record)
            record_seek_start = out_stream.tell()

            out_stream.write(record_datatype)
            out_stream.write(XFormat.serialize_int(record_byte_count))
            out_stream.write(serialized_record)

            record_seek_end = out_stream.tell()

            # Everything except the image, plus the record's byte offsets
            # in the output file.
            record_metadata = {
                'index': etl2_record.index,
                'source': etl2_record.source,
                'character_type': etl2_record.character_type,
                'font': etl2_record.font,
                'unicode_char': etl2_record.unicode_char,
                'seek_start': record_seek_start,
                'seek_end': record_seek_end,
            }

            records_metadata.append(record_metadata)

            pbar.set_description(f'{filename} - {etl2_record.index}')

In [None]:
# Write the per-record metadata (including byte offsets) as tab-indented JSON.
with open('etl2-metadata.json', mode='w', encoding='utf-8') as out_stream:
    json.dump(
        records_metadata,
        out_stream,
        ensure_ascii=False,  # write non-ASCII characters as-is, not as \u escapes
        indent='\t',
    )