In [None]:
import os
import time
import math
import re
import struct
import traceback

import numpy as np
import matplotlib.pyplot as plt
# import cv2
import pandas as pd

from PIL import Image
import IPython.display as display

In [None]:
# List the working directory to confirm the ETL7 data files are present.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# displayed output stable between runs and machines.
sorted(os.listdir('.'))

In [None]:
# Names of the data files to process, resolved relative to the working directory.
file_list = ['ETL7LC_1', 'ETL7LC_2', 'ETL7SC_1', 'ETL7SC_2']

In [None]:
# Use the first entry of file_list for the single-record cells that follow.
filename = file_list[0]
filename

In [None]:
# Size of one fixed-length record, in bytes: 2016 bytes of packed image data
# plus 36 bytes of header fields and padding.
RECORD_LENGTH = 2016 + 36

In [None]:
# Pull the first RECORD_LENGTH bytes (i.e. the first record) out of the chosen file.
with open(filename, mode='rb') as etl_file:
    record_string = etl_file.read(RECORD_LENGTH)

In [None]:
# Sanity check: the raw record should be a bytes object of RECORD_LENGTH bytes.
print(f'{type(record_string)} {len(record_string)}')

In [None]:
# struct format for one record, spelled out field by field. The adjacent string
# literals are concatenated by Python into the single format string.
UNPACK_STRING = (
    '>'      # big-endian, standard sizes, no alignment padding
    'H'      # index
    '2s'     # character name (2 raw bytes)
    'H'      # sheet index
    '6B'     # JIS X 0201 code, EBCDIC code, image quality, group quality, writer gender, writer age
    'I'      # serial data index
    '4H'     # industry code, occupation code, sheet gathering date, scan date
    '4B'     # x position, y position, minimum intensity level, maximum scanned level
    '4x'     # 4 bytes skipped
    '2016s'  # packed image data
    '4x'     # 4 bytes skipped
)

In [None]:
# Decode the raw bytes into a tuple of fields according to UNPACK_STRING.
record = struct.Struct(UNPACK_STRING).unpack(record_string)
print(f'{type(record)} {len(record)}')

In [None]:
# Give every unpacked field a name. The names are listed in the same order as
# the values appear in `record`, so position i of the tuple maps to record[i].
record_dict = {
    field_name: record[position]
    for position, field_name in enumerate((
        'index',                           # int
        'character_name',                  # bytes - ascii encoding (e.g. A, KA, 0, $)
        'sheet_index',                     # int
        'JIS_X_0201_CODE',                 # int (one byte) - https://en.wikipedia.org/wiki/JIS_X_0201
        'EBCDIC_CODE',                     # int (one byte) - https://en.wikipedia.org/wiki/EBCDIC
        'image_quality',                   # int - [0, 1, 2, 3], 0 is clean
        'group_quality',                   # int - [0, 1, 2], 0 is clean
        'writer_gender',                   # int - 1 is male, 2 is female. JIS X 0303 encoding?
        'writer_age',                      # int
        'serial_data_index',               # int
        'industry_classification_code',    # int - JIS X 0403
        'occupation_classification_code',  # int - JIS X 0404
        # NOTE(review): key is spelled 'data_...' (probably meant 'date_...'); kept unchanged.
        'data_of_sheet_gathering',         # int - date of sheet gathering (19)YYMM
        'date_of_scan',                    # int - date of scan (19)YYMM
        'x_pos',                           # int - x coordinate of scan position on sheet (>= 1)
        'y_pos',                           # int - y coordinate of scan position on sheet (>= 1)
        'minimum_intensity_level',         # int - value in range (0-255)
        'maximum_scanned_level',           # int - value in range (0-255)
        'image_data',                      # bytes
    ))
}

In [None]:
# Show the Python type each named field was unpacked to.
for field_name, field_value in record_dict.items():
    print(field_name, type(field_value))

In [None]:
# Raw, still-undecoded bytes of the character-name field.
record_dict['character_name']

In [None]:
# Character name as text: ASCII-decode the raw bytes, then drop surrounding whitespace.
str(record_dict['character_name'], 'ascii').strip()

In [None]:
# A 64 x 63 image at 4 bits per pixel occupies 64 * 63 * 4 / 8 = 2016 bytes,
# which matches the length of the 'image_data' field.
width, height = 64, 63
img = np.array(
    Image.frombytes(
        'F',                        # decode into Pillow's 32-bit float pixel mode
        (width, height),
        record_dict['image_data'],
        'bit',                      # Pillow's packed-bits decoder
        4,                          # bits per pixel in the packed data
    )
)

In [None]:
# Render the decoded character and attach a colorbar for its intensity scale.
plt.colorbar(plt.imshow(img))

In [None]:
# Record layout (same values as declared earlier in the notebook).
RECORD_LENGTH = 2052 # bytes
UNPACK_STRING = '>H2sH6BI4H4B4x2016s4x'


def count_character_names(filenames, record_length=RECORD_LENGTH, unpack_string=UNPACK_STRING):
    """Tally how many records each character name has across several data files.

    Every file is read as a sequence of fixed-length records. Reading of a
    file stops at the first read that returns fewer than ``record_length``
    bytes, so an incomplete trailing record is ignored.

    The character name is ASCII-decoded but NOT stripped (unlike the
    single-record walkthrough), so any padding spaces stay part of the key.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the data files to scan.
    record_length : int
        Size of one record in bytes; must equal ``struct.calcsize(unpack_string)``.
    unpack_string : str
        ``struct`` format describing one record; the character name must be
        the second unpacked field.

    Returns
    -------
    (int, dict)
        Total number of full records read, and a dict mapping each character
        name to its record count (keys in first-seen order).

    Raises
    ------
    OSError
        If a file cannot be opened or read.
    struct.error
        If ``record_length`` does not match the size implied by ``unpack_string``.
    UnicodeDecodeError
        If a character name contains non-ASCII bytes.
    """
    name_position = 1  # 'character_name' is the second field of the unpacked tuple
    record_struct = struct.Struct(unpack_string)  # parse the format string only once

    sample_total = 0
    name_counts = {}
    for path in filenames:
        # `with` guarantees the handle is closed even if unpacking or decoding fails.
        with open(path, 'rb') as data_file:
            while True:
                raw_record = data_file.read(record_length)
                if len(raw_record) < record_length:
                    break  # end of file, or an incomplete trailing record
                sample_total += 1
                fields = record_struct.unpack(raw_record)
                name = fields[name_position].decode('ascii')
                name_counts[name] = name_counts.get(name, 0) + 1
    return sample_total, name_counts


# Working inside a function keeps the loop's temporaries local, so the
# single-record variables from the earlier cells (filename, record_string,
# record, record_dict) are not overwritten by this pass over all files.
total_samples, record_count = count_character_names(file_list)

In [None]:
# Number of full-length records read across every file in file_list.
total_samples

In [None]:
# Number of distinct character names (classes) encountered.
len(record_count)

In [None]:
# Mapping from each decoded (unstripped) character name to its number of records.
record_count

In [None]:
csv_filename = 'classes.tsv'

# Save the per-class tallies as a tab-separated table: a header row followed
# by one "<class>\t<count>" row per entry of record_count, in dict order.
with open(csv_filename, mode='w', encoding='utf-8') as tsv_file:
    tsv_file.write('class\tnum_samples\n')
    tsv_file.writelines(
        f'{class_name}\t{num_samples}\n'
        for class_name, num_samples in record_count.items()
    )

# Optional follow-up, left disabled: re-read the table with pandas and rewrite
# it sorted by class.
# pd_df = pd.read_csv(csv_filename, encoding='utf-8', sep='\t')
# pd_df = pd_df.sort_values(['class'])
# pd_df.to_csv(csv_filename, encoding='utf-8', index=False, sep='\t')