In [None]:
import os
import time
import math
import re
import struct
import traceback
import codecs

import bitstring

import numpy as np
import matplotlib.pyplot as plt
import cv2

from PIL import Image
import IPython.display as display

# local module
from JIS0208 import JIS0208

In [None]:
# The five part files that make up the ETL9B dataset, relative to the notebook.
part_filepaths = [
    'ETL9B/ETL9B_1',
    'ETL9B/ETL9B_2',
    'ETL9B/ETL9B_3',
    'ETL9B/ETL9B_4',
    'ETL9B/ETL9B_5',
]

# Fail fast for "Restart & Run All" users (myself included): verify that every
# part file is present before any later cell tries to open it.
for part_filepath in part_filepaths:
    if not os.path.exists(part_filepath):
        # FileNotFoundError (an Exception subclass) gives a readable message
        # instead of a generic Exception wrapping a tuple.
        raise FileNotFoundError(part_filepath + ' does not exist!')

In [None]:
# Use the first part file as the sample that the following cells inspect by hand.
sample_filepath = part_filepaths[0]
print(sample_filepath)

In [None]:
# constants by specification
IMG_WIDTH = 64
IMG_HEIGHT = 63

# Record layout in bitstring's format-string syntax.
# NOTE: 'int', 'uint' and 'pad' lengths are given in BITS, while 'bytes'
# lengths are given in BYTES.
bitstring_unpack_str = ','.join([
    'int:16', # Serial Sheet Number - [0]
    'uint:16', # JIS Kanji Code (JIS X 0208) - [1]
    'uint:32', # JIS Typical Reading (e.g. 'AI.M') - [2]
    'bytes:504', # image data - [3]
    'pad:512',
])

# Record size, counted from the unpack string above. The image field is 504
# BYTES, so it contributes 504 * 8 bits:
#   16 + 16 + 32 + 504 * 8 + 512 = 4608 bits = 576 bytes
RECORD_SIZE_BITS = 16 + 16 + 32 + 504 * 8 + 512
RECORD_SIZE_BYTES = RECORD_SIZE_BITS // 8

# The image is stored at 1 bit per pixel, so it must fill the image field exactly.
assert IMG_WIDTH * IMG_HEIGHT == 504 * 8

In [None]:
# Open the sample part file as a read-only bit stream; records are read from it
# sequentially with readlist().
infile = bitstring.ConstBitStream(filename=sample_filepath)

**The first record of each file is a zero-padded dummy.**

In [None]:
# Read one record from the sample stream. At the start of a file this is the
# zero-padded dummy record, so the rendered image is expected to be blank.
unpacked_data = infile.readlist(bitstring_unpack_str)

sheet_number = unpacked_data[0]
jis_char_code = unpacked_data[1]
# Name the fields of the record that was just read (previously this cell
# indexed an undefined `record`, which fails on a fresh kernel).
record_data = {
    'sheet_number': unpacked_data[0], # Serial Sheet Number (read as int:16)
    'JIS_KANJI_CODE': unpacked_data[1], # JIS Kanji Code (JIS X 0208), read as uint:16
    'JIS_READING': unpacked_data[2], # JIS Typical Reading (e.g. 'AI.M'), read as uint:32
    'image_data': unpacked_data[3], # type: bytes
}

# Image dimensions come from the specification constants.
width = IMG_WIDTH
height = IMG_HEIGHT

# Decode the packed 1-bit-per-pixel image into an integer array for plotting.
np_img = np.array(Image.frombytes('1', (width, height), record_data['image_data'], 'raw')).astype(int)
plt.imshow(np_img)
plt.colorbar();

In [None]:
# Read the next record from the sample stream opened above. (`file_stream` is
# only created by a later cell, so using it here breaks on a fresh kernel.)
# NOTE(review): assumes the dummy record has already been consumed from
# `infile`, so this is the first real sample - confirm.
record = infile.readlist(bitstring_unpack_str)
record_dict = {
    'sheet_number': record[0], # Serial Sheet Number (read as int:16)
    'JIS_KANJI_CODE': record[1], # JIS Kanji Code (JIS X 0208), read as uint:16
    'JIS_READING': record[2], # JIS Typical Reading (e.g. 'AI.M'), read as uint:32
    'image_data': record[3], # type: bytes
}

# Image dimensions come from the specification constants.
width = IMG_WIDTH
height = IMG_HEIGHT

# Decode the packed 1-bit-per-pixel image into an integer array for plotting.
np_img = np.array(Image.frombytes('1', (width, height), record_dict['image_data'], 'raw')).astype(int)
plt.imshow(np_img)
plt.colorbar();

In [None]:
# Walk every part file and count how many samples exist per JIS character code.
total_samples = 0
record_count = {}  # hex(JIS code) -> number of samples seen

for record_filepath in part_filepaths:

    file_stream = bitstring.ConstBitStream(filename=record_filepath)
    # skip the first (zero-padded dummy) record in each file
    file_stream.readlist(bitstring_unpack_str)

    while True:
        try:
            # Same layout as everywhere else in the notebook, so the JIS code
            # is read as unsigned here too.
            record = file_stream.readlist(bitstring_unpack_str)
        except bitstring.ReadError:
            # Ran off the end of this part file: continue with the next one.
            # Any other error is a real problem and is allowed to propagate.
            break

        total_samples += 1
        record_dict = {
            'sheet_number': record[0], # Serial Sheet Number (read as int:16)
            'JIS_KANJI_CODE': record[1], # JIS Kanji Code (JIS X 0208), read as uint:16
            'JIS_READING': record[2], # JIS Typical Reading (e.g. 'AI.M'), read as uint:32
            'image_data': record[3], # type: bytes
        }

        name = hex(record_dict['JIS_KANJI_CODE'])
        record_count[name] = record_count.get(name, 0) + 1

In [None]:
# Number of records counted across all part files (the skipped first record of each file is not included).
total_samples

In [None]:
# Number of distinct JIS character codes encountered.
len(record_count)

In [None]:
# Per-character sample counts, keyed by the hex string of the JIS Kanji code.
record_count