In [None]:
import os
import io
import json

import bitstring

from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import cv2

from PIL import Image
import IPython.display as display

# local module
from JIS0208 import JIS0208

In [None]:
# The five part files that make up the ETL9B dataset, relative to the notebook.
part_filepaths = [
    'ETL9B/ETL9B_1',
    'ETL9B/ETL9B_2',
    'ETL9B/ETL9B_3',
    'ETL9B/ETL9B_4',
    'ETL9B/ETL9B_5',
]

# Fail fast for "Restart and Run All" users (myself included): report every
# missing part file at once instead of stopping at the first one.
missing_filepaths = [
    part_filepath
    for part_filepath in part_filepaths
    if not os.path.exists(part_filepath)
]
if missing_filepaths:
    raise FileNotFoundError(
        'The following ETL9B part file(s) do not exist: '
        + ', '.join(missing_filepaths)
    )

In [None]:
# Use the first part file as the sample for exploring the record layout.
sample_filepath, *_ = part_filepaths
print(sample_filepath)

In [None]:
# constants by specification
IMG_WIDTH = 64
IMG_HEIGHT = 63

# The image is decoded as a 1-bit-per-pixel bitmap, so it occupies
# 64 * 63 / 8 = 504 bytes of each record.
IMG_NUM_BYTES = (IMG_WIDTH * IMG_HEIGHT) // 8

# Record layout as a bitstring format string. The trailing index in each comment
# is the position of the field in the list returned by `readlist`.
bitstring_unpack_str = ','.join([
    'int:16', # Serial Sheet Number - [0]
    'uint:16', # JIS Kanji Code (JIS X 0208) - [1]
    '4*uint:8', # JIS Typical Reading (e.g. 'AI.M') - [2:6]
    f'bytes:{IMG_NUM_BYTES}', # image data - [6]
    'pad:512', # trailing padding bits, skipped (no entry in the result list)
])

# I count this myself from the provided unpack string.
# Note: the `bytes` token is measured in bytes, every other token in bits.
# RECORD_SIZE = 16 + 16 + 32 + 504 * 8 + 512 # = 4608 bit(s) = 576 byte(s)

In [None]:
# Open the sample part file as a bit stream so records can be read sequentially.
infile = bitstring.ConstBitStream(filename=sample_filepath)

**The first record of each file is a zero-padded dummy record.**

In [None]:
# Read one record from the stream's current position, split into fields
# according to the layout in `bitstring_unpack_str`.
unpacked_data = infile.readlist(bitstring_unpack_str)

In [None]:
# Field positions inside `unpacked_data`:
#   [0]   Serial Sheet Number (int)
#   [1]   JIS Kanji Code (JIS X 0208)
#   [2:6] character codes of the JIS Typical Reading (e.g. 'AI.M')
#   [6]   raw image bytes
sheet_number, jis_char_code = unpacked_data[0], unpacked_data[1]

# Map the JIS code to its Unicode character; unknown codes get a 'null_' label.
unicode_char = (
    JIS0208[jis_char_code]
    if jis_char_code in JIS0208
    else 'null_' + repr(jis_char_code)
)

# Turn the four character codes of the reading into a string.
jis_reading = ''.join(chr(code) for code in unpacked_data[2:6])

raw_image_data = unpacked_data[6]

print(type(sheet_number), 'sheet_number:', sheet_number)
print(type(jis_char_code), 'jis_char_code:', jis_char_code)
print('unicode_char:', unicode_char)
print(type(jis_reading), 'jis_reading:', repr(jis_reading))

# Decode the raw bytes as a 1-bit image and preview it as a uint8 array.
image_size = (IMG_WIDTH, IMG_HEIGHT)
pil_image = Image.frombytes('1', image_size, raw_image_data, 'raw')
np_img = np.array(pil_image, dtype=np.uint8)

plt.imshow(np_img)
plt.colorbar()

# Save the image data as a PNG image

In [None]:
# Encode the sample image as PNG into an in-memory buffer and report its size.
buffer = io.BytesIO()
pil_image.save(buffer, 'PNG')

png_encoded_image = buffer.getvalue()
png_num_bytes = len(png_encoded_image)
print(png_num_bytes)

In [None]:
# Show the raw PNG-encoded bytes as the cell output.
png_encoded_image

In [None]:
# Round-trip check: decode the PNG bytes back into a PIL image.
png_stream = io.BytesIO(png_encoded_image)
reload_pil_image = Image.open(png_stream)
print(type(reload_pil_image))

In [None]:
# Convert the reloaded image to an array and inspect its type, dtype and shape.
# The image was saved in [0,1] format as intended.
np_image = np.array(reload_pil_image)
print(type(np_image), np_image.dtype, np_image.shape, sep='\n')

In [None]:
# Repack every ETL9B record into two files:
#   * images.bin    - all images, PNG-encoded and concatenated back to back
#   * metadata.json - one entry per record, including the byte range
#                     [seek_start, seek_end) of its PNG inside images.bin
record_metadata_list = []

save_dir = 'repacked_etl9b'
os.makedirs(save_dir, exist_ok=True)

record_metadata_fpath = os.path.join(save_dir, 'metadata.json')
png_image_packed_fpath = os.path.join(save_dir, 'images.bin')

with open(png_image_packed_fpath, mode='wb') as outfile:
    # I decided to use a local variable to track the current file seeking position
    # instead of call outfile.tell() every time we need to know the current seeking position.
    # I think this is fine in our case because we only write/append to the output file.
    cur_file_pos = 0

    pbar = tqdm(part_filepaths)
    for part_fpath in pbar:
        pbar.set_description(part_fpath)

        infile = bitstring.ConstBitStream(filename=part_fpath)
        # skip the first record in each file (zero-padded dummy record)
        infile.readlist(bitstring_unpack_str)

        # throttle tqdm: only refresh the description every 1000 records
        tqdm_idx = 0
        while True:
            tqdm_idx += 1

            try:
                unpacked_data = infile.readlist(bitstring_unpack_str)
            except bitstring.ReadError:
                # Reading past the end of the stream means this part file is done.
                # Any other error (decode bug, Ctrl-C, ...) propagates instead of
                # being mistaken for end-of-file.
                break

            # type: int - Serial Sheet Number
            sheet_number = unpacked_data[0]

            # type: int - JIS Kanji Code (JIS X 0208)
            jis_char_code = unpacked_data[1]

            # Unknown JIS codes are kept with a 'null_' label rather than dropped.
            if jis_char_code in JIS0208:
                unicode_char = JIS0208[jis_char_code]
            else:
                unicode_char = 'null_' + repr(jis_char_code)

            # type: ASCII - JIS Typical Reading (e.g. 'AI.M')
            jis_reading = ''.join(map(chr, unpacked_data[2:6]))

            # type: bytes - 1-bit image, re-encoded as PNG in memory
            raw_image_data = unpacked_data[6]
            pil_image = Image.frombytes('1', (IMG_WIDTH, IMG_HEIGHT), raw_image_data, 'raw')
            buffer = io.BytesIO()
            pil_image.save(buffer, format='PNG')
            png_encoded_image = buffer.getvalue()

            # Append the PNG and remember where it lives in the packed file.
            # seek_end is exclusive: the image occupies [seek_start, seek_end).
            img_data_seek_start = cur_file_pos
            outfile.write(png_encoded_image)

            cur_file_pos += len(png_encoded_image)
            img_data_seek_end = cur_file_pos

            record_metadata = {
                'char': unicode_char,
                'dataset_source': part_fpath,
                'width': IMG_WIDTH,
                'height': IMG_HEIGHT,
                'seek_start': img_data_seek_start,
                'seek_end': img_data_seek_end,
                'sheet_number': sheet_number,
                'jis_reading': jis_reading,
            }

            record_metadata_list.append(record_metadata)

            if (tqdm_idx % 1000) == 0:
                pbar.set_description(f'{part_fpath} - {sheet_number} - {unicode_char}')

# Written in binary mode as UTF-8 so the characters are stored unescaped and
# the output bytes do not depend on the platform's default encoding/newlines.
with open(record_metadata_fpath, mode='wb') as outfile:
    outfile.write(json.dumps(
        obj=record_metadata_list,
        ensure_ascii=False,
        indent='\t',
    ).encode('utf-8'))