In [None]:
import os

import bitstring

import numpy as np
import matplotlib.pyplot as plt
import cv2

from PIL import Image
import IPython.display as display

# local module
from JIS0208 import JIS0208

In [None]:
# Paths of the five ETL9B part files, relative to the notebook's working directory.
part_filepaths = [
    'ETL9B/ETL9B_1',
    'ETL9B/ETL9B_2',
    'ETL9B/ETL9B_3',
    'ETL9B/ETL9B_4',
    'ETL9B/ETL9B_5',
]

# Fail fast for "Restart and Run All": check every part file up front and
# report all missing ones at once instead of stopping at the first.
missing_filepaths = [
    part_filepath
    for part_filepath in part_filepaths
    if not os.path.exists(part_filepath)
]
if missing_filepaths:
    # FileNotFoundError is still an Exception, so existing handlers keep working;
    # a single pre-formatted message avoids the tuple-style repr of multi-arg Exception.
    raise FileNotFoundError(', '.join(missing_filepaths) + ' does not exist!')

In [None]:
# Use the first part file as the sample that the following cells open and inspect.
sample_filepath = part_filepaths[0]
print(sample_filepath)

In [None]:
# Glyph bitmap dimensions in pixels (constants by specification).
IMG_WIDTH, IMG_HEIGHT = 64, 63

# bitstring format for one record. The bracketed index is the position of the
# field(s) in the list returned by readlist; pad bits are skipped, not returned.
bitstring_unpack_str = (
    'int:16,'     # [0]   Serial Sheet Number
    'uint:16,'    # [1]   JIS Kanji Code (JIS X 0208)
    '4*uint:8,'   # [2:6] JIS Typical Reading (e.g. 'AI.M'), one uint per character
    'bytes:504,'  # [6]   image data: 504 bytes = 64 * 63 pixels at 1 bit per pixel
    'pad:512'     #       512 trailing bits that are skipped
)

# Record size implied by the format string above (note bytes:504 is 504 * 8 bits):
# RECORD_SIZE = 16 + 16 + 32 + 504 * 8 + 512  # = 4608 bits = 576 bytes

In [None]:
# Open the sample part file as a read-only bit stream; each read advances its position.
infile = bitstring.ConstBitStream(filename=sample_filepath)

**The first record of each file is a zero-padded dummy record.**

In [None]:
# Read one record from the current stream position using the field layout in
# bitstring_unpack_str; the result is a list of the decoded field values.
unpacked_data = infile.readlist(bitstring_unpack_str)

In [None]:
# Split the unpacked record into its fields.
# [0] Serial Sheet Number, [1] JIS Kanji Code (JIS X 0208), both integers.
sheet_number, jis_char_code = unpacked_data[0], unpacked_data[1]

# Map the JIS code to its Unicode character; codes missing from the table
# get a 'null_<code>' placeholder instead.
unicode_char = (
    JIS0208[jis_char_code]
    if jis_char_code in JIS0208
    else 'null_' + repr(jis_char_code)
)

# [2:6] JIS Typical Reading (e.g. 'AI.M'): four character codes joined into a str.
jis_reading = ''.join(chr(code) for code in unpacked_data[2:6])

# [6] packed bitmap bytes
raw_image_data = unpacked_data[6]

print(type(sheet_number), 'sheet_number:', sheet_number)
print(type(jis_char_code), 'jis_char_code:', jis_char_code)
print('unicode_char:', unicode_char)
print(type(jis_reading), 'jis_reading:', repr(jis_reading))

# Decode the packed bytes as a 1-bit ('1' mode) image, then convert to a uint8 array.
bitmap = Image.frombytes('1', (IMG_WIDTH, IMG_HEIGHT), raw_image_data, 'raw')
np_img = np.array(bitmap, dtype=np.uint8)

plt.imshow(np_img)
plt.colorbar()

In [None]:
# Walk every record of every part file and decode its fields.
for part_fpath in part_filepaths:

    infile = bitstring.ConstBitStream(filename=part_fpath)
    # skip the first record in each file (zero-padded dummy)
    infile.readlist(bitstring_unpack_str)

    while True:
        try:
            unpacked_data = infile.readlist(bitstring_unpack_str)
        except bitstring.ReadError:
            # Reading past the end of the stream: no complete record is left in
            # this file. Only this error ends the loop; anything else (including
            # KeyboardInterrupt) propagates instead of being silently swallowed.
            break

        # [0] Serial Sheet Number (int)
        sheet_number = unpacked_data[0]

        # [1] JIS Kanji Code (JIS X 0208), read as an unsigned integer
        jis_char_code = unpacked_data[1]

        # Map the JIS code to its Unicode character; codes missing from the
        # table get a 'null_<code>' placeholder instead.
        if jis_char_code in JIS0208:
            unicode_char = JIS0208[jis_char_code]
        else:
            unicode_char = 'null_' + repr(jis_char_code)

        # [2:6] JIS Typical Reading (e.g. 'AI.M'): four character codes -> str
        jis_reading = unpacked_data[2:6]
        jis_reading = ''.join(map(chr, jis_reading))

        # [6] packed bitmap bytes
        raw_image_data = unpacked_data[6]

        # Decode the packed bytes as a 1-bit ('1' mode) image, then to a uint8 array.
        np_img = np.array(
            object=Image.frombytes('1', (IMG_WIDTH, IMG_HEIGHT), raw_image_data, 'raw'),
            dtype=np.uint8,
        )