In [None]:
import os
import re
import io
import time
import math
import struct
import traceback
import codecs
import json
from typing import List, Dict

# external dependencies
import numpy as np
import matplotlib.pyplot as plt
import bitstring
from tqdm import tqdm

from PIL import Image
import IPython.display as display

# local modules
import shared

```
.
└── ETL4
    ├── ETL4C
    └── ETL4INFO
```

In [None]:
# Raw ETL4 part files to convert, relative to the notebook's working directory.
part_filepaths = [
    'ETL4/ETL4C',
]

# Fail fast for "Restart and Run All" people (including me): stop here with a
# clear message instead of failing later in the middle of the conversion.
for part_filepath in part_filepaths:
    if not os.path.exists(part_filepath):
        # One formatted message; passing several arguments to the exception
        # would render them as a tuple in the traceback.
        raise FileNotFoundError(f'{part_filepath} does not exist!')

In [None]:
# Use the first listed part file for the single-record exploration that follows.
part_filepath = part_filepaths[0]
part_filepath

In [None]:
# Open the part file as a read-only bit stream so fields can be read at bit granularity.
infile = bitstring.ConstBitStream(filename=part_filepath)

In [None]:
# Field layout of one ETL4 record, written as a bitstring format string for readlist().
# The widths below add up to 23616 bits, the same value as 6 * 3936.
# The bracketed numbers are positions in the list returned by readlist();
# 'pad' tokens consume bits but contribute no entry to that list.
bitstring_unpack_str = ','.join([
    'uint:36', # serial data number - [0]
    'uint:36', # serial sheet number - [1]
    'uint:36', # JIS Code - [2]
    'uint:36', # EBCDIC Code - [3]
    '4*uint:6', # 4 Character Code, one 6-bit value per character - [4:8]
    'pad:12', # Spaces (skipped)
    'uint:36', # Evaluation of Individual Character Image (0=clean, 1, 2, 3) - [8]
    'uint:36', # Evaluation of Character Group (0=clean, 1, 2) - [9]
    'uint:36', # Sample Position Y on Sheet - [10]
    'uint:36', # Sample Position X on Sheet - [11]
    'uint:36', # Male-Female Code (1=male, 2=female) (JIS X 0303) - [12]
    'uint:36', # Age of Writer - [13]
    'uint:36', # Industry Classification Code (JIS X 0403) - [14]
    'uint:36', # Occupation Classification Code (JIS X 0404) - [15]
    'uint:36', # Sheet Gathering Date - [16]
    'uint:36', # Scanning Date - [17]
    'uint:36', # Number of X-Axis Sampling Points - [18]
    'uint:36', # Number of Y-Axis Sampling Points - [19]
    'uint:36', # Number of Levels of Pixel - [20]
    'uint:36', # Magnification of Scanning Lens - [21]
    'uint:36', # Serial Data Number (old) - [22]
    'pad:1008', # (undefined, skipped)
    'bytes:2736', # 16 Gray Level (4bit/pixel) Image Data 72(X-axis size) * 76(Y-axis size) = 5472 pixels = 2736 bytes - [23]
])

In [None]:
RECORD_LENGTH = 6 * 3936 # in bits (23616); equals the summed field widths of bitstring_unpack_str
IMG_WIDTH = 72 # image size along the X axis, in pixels
IMG_HEIGHT = 76 # image size along the Y axis, in pixels

In [None]:
# Number of whole records to skip before reading; 0 starts at the first record.
skip = 0
infile.pos = skip * RECORD_LENGTH # stream position is a bit offset; RECORD_LENGTH is in bits

In [None]:
# Unpack one record starting at the current stream position.
record = infile.readlist(bitstring_unpack_str)

print(type(record), len(record))

# List every field except the final one: the raw image bytes are too large to print.
for field_index, field_value in enumerate(record[:-1]):
    print(f'{field_index} - {field_value}')

In [None]:
# Decode the packed 4-bit-per-pixel image field (record[23]) into a float-mode ('F')
# PIL image and plot the pixel values unscaled.
pil_image = Image.frombytes('F', (IMG_WIDTH, IMG_HEIGHT), record[23], 'bit', 4)
np_image = np.array(pil_image)
plt.imshow(np_image, cmap='gray')
plt.colorbar()

In [None]:
# Same decode as before, but converted to uint8 and scaled so the result can be
# stored as an 8-bit grayscale image.
pil_image = Image.frombytes('F', (IMG_WIDTH, IMG_HEIGHT), record[23], 'bit', 4)
np_image = np.array(pil_image, dtype=np.uint8)
print(np_image.dtype, np_image.shape)

# Multiply by 16 to spread the 16 gray levels over the 8-bit range
# (presumably input values are 0-15, giving at most 240 - no uint8 overflow).
np_image = np_image * 16 # 256/16
print(np_image.dtype, np_image.shape)

plt.imshow(np_image, cmap='gray')
plt.colorbar()

In [None]:
def encode_image_as_png_bytes(data: bytes) -> bytes:
    """Convert one record's packed 4-bit grayscale image into PNG-encoded bytes.

    Args:
        data: Raw image field of a record, holding IMG_WIDTH * IMG_HEIGHT
            pixels at 4 bits per pixel.

    Returns:
        The image encoded as PNG, with pixel values multiplied by 16 so the
        4-bit gray levels are spread over the 8-bit range.
    """
    pil_image = Image.frombytes('F', (IMG_WIDTH, IMG_HEIGHT), data, 'bit', 4)
    np_image = np.array(pil_image, dtype=np.uint8)
    np_image = np_image * 16 # 256/16
    pil_image = Image.fromarray(np_image)

    # Encode in memory; nothing is written to disk here.
    buffer = io.BytesIO()
    pil_image.save(buffer, format='PNG')

    return buffer.getvalue()

In [None]:
# Sanity check: encode the current record's image, save it to a scratch file,
# and render that file inline.
image_filepath = 'tmp.png'
png_image = encode_image_as_png_bytes(record[23])

with open(image_filepath, 'wb') as outfile:
    outfile.write(png_image)

display.display(display.Image(image_filepath))

In [None]:
# Preview of the per-record dictionary for the record read above.
# NOTE(review): the 'JIS Code' entry is decoded from the 4-character code
# fields (record[4:8]) via shared.T56; the binary JIS code (record[2]) and the
# EBCDIC code (record[3]) are not included - confirm this is intended.
{
    'serial data number': record[0],
    'serial sheet number': record[1],
    'JIS Code': ''.join(map(shared.T56, record[4:8])),
    'Evaluation of Individual Character Image (0=clean, 1, 2, 3)': record[8],
    'Evaluation of Character Group (0=clean, 1, 2)': record[9],
    'Sample Position Y on Sheet': record[10],
    'Sample Position X on Sheet': record[11],
    'Male-Female Code (1=male, 2=female) (JIS X 0303)': record[12],
    'Age of Writer': record[13],
    'Industry Classification Code (JIS X 0403)': record[14],
    'Occupation Classification Code (JIS X 0404)': record[15],
    'Sheet Gathering Date': record[16],
    'Scanning Date': record[17],
    'Number of X-Axis Sampling Points': record[18],
    'Number of Y-Axis Sampling Points': record[19],
    'Number of Levels of Pixel': record[20],
    'Magnification of Scanning Lenz': record[21],
    'Serial Data Number (old)': record[22],
    'PNG_encoded_image': encode_image_as_png_bytes(record[23]),
}

In [None]:
def _record_fields(record) -> dict:
    """Map one unpacked ETL4 record to its named header fields.

    Args:
        record: List returned by ``readlist(bitstring_unpack_str)``.

    Returns:
        A dict of every field except the image, in the order used for both
        the serialized record and its metadata index entry.
    """
    return {
        'serial data number': record[0],
        'serial sheet number': record[1],
        # NOTE(review): decoded from the 4-character code fields, not from the
        # binary JIS code in record[2] - confirm the key name is intended.
        'JIS Code': ''.join(map(shared.T56, record[4:8])),
        'Evaluation of Individual Character Image (0=clean, 1, 2, 3)': record[8],
        'Evaluation of Character Group (0=clean, 1, 2)': record[9],
        'Sample Position Y on Sheet': record[10],
        'Sample Position X on Sheet': record[11],
        'Male-Female Code (1=male, 2=female) (JIS X 0303)': record[12],
        'Age of Writer': record[13],
        'Industry Classification Code (JIS X 0403)': record[14],
        'Occupation Classification Code (JIS X 0404)': record[15],
        'Sheet Gathering Date': record[16],
        'Scanning Date': record[17],
        'Number of X-Axis Sampling Points': record[18],
        'Number of Y-Axis Sampling Points': record[19],
        'Number of Levels of Pixel': record[20],
        'Magnification of Scanning Lenz': record[21],
        'Serial Data Number (old)': record[22],
    }


# Index entries (header fields + byte offsets), one per record written below.
records_metadata = []
serialized_dataset_filepath = f'etl4{shared.XFormat.EXTENSION}'

# Refuse to overwrite the result of an earlier conversion.
if os.path.exists(serialized_dataset_filepath):
    raise Exception(serialized_dataset_filepath + ' is already existed!')

with open(serialized_dataset_filepath, mode='wb') as outfile:
    pbar = tqdm(part_filepaths)

    for part_filepath in pbar:
        pbar.set_description(part_filepath)

        infile = bitstring.ConstBitStream(filename=part_filepath)
        infile.pos = 0

        # Read while a complete record remains. An explicit bound replaces the
        # former catch-all `except`, which treated any failure (including a
        # keyboard interrupt) as end of file and silently truncated the output.
        while infile.pos + RECORD_LENGTH <= infile.len:
            record = infile.readlist(bitstring_unpack_str)
            fields = _record_fields(record)

            record_data = {
                **fields,
                'PNG_encoded_image': encode_image_as_png_bytes(record[23]),
            }

            serialized_record_datatype, serialized_record = shared.XFormat.serialize_obj(record_data)
            record_byte_count = len(serialized_record)
            record_seek_start = outfile.tell()

            # On-disk layout of one entry: datatype tag, payload length, payload.
            outfile.write(serialized_record_datatype)
            outfile.write(shared.XFormat.serialize_int(record_byte_count))
            outfile.write(serialized_record)

            record_seek_end = outfile.tell()

            # Same header fields plus the byte range of the entry in the output file.
            metadata_record = {
                **fields,
                'seek_start': record_seek_start,
                'seek_end': record_seek_end,
            }

            records_metadata.append(metadata_record)

            pbar.set_description(part_filepath + ' - ' + str(metadata_record['serial data number']) + ' - ' + metadata_record['JIS Code'])

        # Report, rather than hide, data that does not form a whole record.
        if infile.pos != infile.len:
            tqdm.write(f'{part_filepath}: ignored {infile.len - infile.pos} trailing bits (incomplete record)')

In [None]:
# Write the per-record index collected in records_metadata as JSON.
metadata_filepath = 'etl4-metadata.json'
if os.path.exists(metadata_filepath):
    # Refuse to overwrite an earlier export. The leading space keeps the file
    # name and the message from running together.
    raise Exception(metadata_filepath + ' is already existed!')

with open(metadata_filepath, mode='w', encoding='utf-8') as out_stream:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(records_metadata, out_stream, ensure_ascii=False, indent='\t')

In [None]:
# Number of index entries collected (one is appended per converted record).
len(records_metadata)