In [None]:
import os
import time
import math
import re
import struct
import traceback
import codecs
import bitstring

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import cv2

from PIL import Image
import IPython.display as display

```
.
├── ETL3
│   ├── ETL3C_1
│   ├── ETL3C_2
│   └── ETL3INFO
├── README.md
├── classes.tsv
├── co59-utf8.txt
└── note.ipynb
```

In [None]:
# The ETL3 dataset ships as two part files, addressed relative to this notebook.
dataset_part_filepaths = ['ETL3/ETL3C_1', 'ETL3/ETL3C_2']

# Report, one line per part, whether the file can be found on disk.
for part_filepath in dataset_part_filepaths:
    print(os.path.exists(part_filepath), part_filepath)

In [None]:
# Lookup table for T56, the 6-bit character code used by the record header:
# the character at position i is the printable form of code i (64 entries).
t56s = '0123456789[#@:>? ABCDEFGHI&.](<  JKLMNOPQR-$*);\'|/STUVWXYZ ,%="!'


def T56(c: int) -> str:
    """Return the character that T56 code ``c`` stands for.

    ``c`` is used directly as an index into ``t56s``; a value of
    ``len(t56s)`` or more raises ``IndexError``.
    """
    return t56s[c]


# Each whitespace-separated entry of co59-utf8.txt has the form
# ``<character>:<n1>,<n2>``; CO59 maps the integer pair (n1, n2) to the character.
# The built-in open() with an explicit encoding replaces the legacy codecs.open().
with open('co59-utf8.txt', 'r', encoding='utf-8') as co59f:
    co59t = co59f.read()

co59l = co59t.split()
CO59 = {}
for c in co59l:
    ch = c.split(':')      # [character, 'n1,n2']
    co = ch[1].split(',')  # ['n1', 'n2']
    CO59[(int(co[0]), int(co[1]))] = ch[0]

In [None]:
# Number of entries loaded into the CO59 table.
len(CO59)

In [None]:
# Use the first part file for the single-record walkthrough below
part_filepath = dataset_part_filepaths[0]
part_filepath

Below is the record format shared by the `ETL3`, `ETL4`, and `ETL5` datasets (the "C-type" data format).

```
/---[form_c.htm 2001-09.04]-------------------------------------------------------------------


				C-Type Data Format	(ETL3, ETL4, ETL5)


1. File Format (Fixed Record Length without Control Words)

                   <----------> Logical record (2952 bytes) (1byte = 8bits)
     ------------------------------------------------------------------
     |  Sample 1  |  Sample 2  |  Sample 3  |    ....    |  Sample N  |  (No. of records = N)
     ------------------------------------------------------------------


2. Contents of Logical Record (3936 characters = 2952 bytes) (1character = 6bits)

 --------------------------------------------------------------------------------------------
|             |No. of|        |                                                              |
|   Character |Char- |  Type  |            Contents of Logical Record                        |
|   Position  |acters|        |                                                              |
|============================================================================================|
|    1 -    6 |    6 | Integer| Serial Data Number                                           |
|    7 -   12 |    6 | Integer| Serial Sheet Number                                          |
|   13 -   18 |    6 | Binary | JIS Code (Effective bits = Left 8 bits) (JIS X 0201)         |
|I  19 -   24 |    6 | Binary | EBCDIC Code (Effective bits = Left 8 bits)                   |
|D  25 -   28 |    4 | T56Code| 4 Character Code ( ex. "N  0", "A  A", "S  +", "K KA" )      |
|   29 -   30 |    2 | T56Code| Spaces                                                       |
|P  31 -   36 |    6 | Integer| Evaluation of Individual Character Image (0=clean, 1, 2, 3)  |
|a  37 -   42 |    6 | Integer| Evaluation of Character Group (0=clean, 1, 2)                |
|r  43 -   48 |    6 | Integer| Sample Position Y on Sheet                                   |
|t  49 -   54 |    6 | Integer| Sample Position X on Sheet                                   |
|   55 -   60 |    6 | Integer| Male-Female Code ( 1=male, 2=female ) (JIS X 0303)           |
|   61 -   66 |    6 | Integer| Age of Writer                                                |
|   67 -   72 |    6 | Integer| Industry Classification Code (JIS X 0403)                    |
|   73 -   78 |    6 | Integer| Occupation Classification Code (JIS X 0404)                  |
|   79 -   84 |    6 | Integer| Sheet Gatherring Date                                        |
|   85 -   90 |    6 | Integer| Scanning Date                                                |
|   91 -   96 |    6 | Integer| Number of X-Axis Sampling Points                             |
|   97 -  102 |    6 | Integer| Number of Y-Axis Sampling Points                             |
|  103 -  108 |    6 | Integer| Number of Levels of Pixel                                    |
|  109 -  114 |    6 | Integer| Magnification of Scanning Lenz                               |
|  115 -  120 |    6 | Integer| Serial Data Number (old)                                     |
|  121 -  288 |  168 |        | (undefined)                                                  |
|-------------|------|--------|--------------------------------------------------------------|
|  289 - 3936 | 3648 | Packed | 16 Gray Level (4bit/pixel) Image Data                        |
|             |      |        | 72(X-axis size) * 76(Y-axis size) = 5472 pixels              |
 --------------------------------------------------------------------------------------------

------------------------------------------------------------[form_c.htm]---------------------/
```

In [None]:
# Open the selected part file as a bit stream; the record fields are 6-bit and
# 36-bit wide, so they cannot be read with byte-aligned I/O
infile = bitstring.ConstBitStream(filename=part_filepath)

In [None]:
# Layout of one C-type logical record as (bitstring token, meaning) pairs, in file order.
# Only the tokens are joined into the format string. `pad` tokens produce no value,
# so the bracketed indices are positions in the list returned by readlist().
bitstring_unpack_str = ','.join(token for token, _meaning in (
    ('uint:36', 'Serial Data Number'),                                          # [0]
    ('uint:36', 'Serial Sheet Number'),                                         # [1]
    ('uint:36', 'JIS Code'),                                                    # [2]
    ('uint:36', 'EBCDIC Code'),                                                 # [3]
    ('4*uint:6', '4 Character Code (T56)'),                                     # [4:8]
    ('pad:12', 'Spaces'),                                                       # skipped
    ('uint:36', 'Evaluation of Individual Character Image (0=clean, 1, 2, 3)'), # [8]
    ('uint:36', 'Evaluation of Character Group (0=clean, 1, 2)'),               # [9]
    ('uint:36', 'Sample Position Y on Sheet'),                                  # [10]
    ('uint:36', 'Sample Position X on Sheet'),                                  # [11]
    ('uint:36', 'Male-Female Code (1=male, 2=female) (JIS X 0303)'),            # [12]
    ('uint:36', 'Age of Writer'),                                               # [13]
    ('uint:36', 'Industry Classification Code (JIS X 0403)'),                   # [14]
    ('uint:36', 'Occupation Classification Code (JIS X 0404)'),                 # [15]
    ('uint:36', 'Sheet Gathering Date'),                                        # [16]
    ('uint:36', 'Scanning Date'),                                               # [17]
    ('uint:36', 'Number of X-Axis Sampling Points'),                            # [18]
    ('uint:36', 'Number of Y-Axis Sampling Points'),                            # [19]
    ('uint:36', 'Number of Levels of Pixel'),                                   # [20]
    ('uint:36', 'Magnification of Scanning Lens'),                              # [21]
    ('uint:36', 'Serial Data Number (old)'),                                    # [22]
    ('pad:1008', '(undefined)'),                                                # skipped
    ('bytes:2736', '16 Gray Level (4bit/pixel) Image Data, 72 x 76 = 5472 pixels'),  # [23]
))

In [None]:
# Size of one logical record: 3936 characters of 6 bits each (= 2952 bytes)
RECORD_LENGTH = 6 * 3936 # in bits

In [None]:
# Number of records to skip before the one to inspect (0 = first record in the file)
skip = 0
# Seek to the start of that record; RECORD_LENGTH is in bits, as is the stream position
infile.pos = skip * RECORD_LENGTH
# Decode one record into a list of field values, following bitstring_unpack_str
record = infile.readlist(bitstring_unpack_str)

In [None]:
print(type(record), len(record))

# The last field is the packed image payload, which is too large to print,
# so only the header fields before it are listed.
header_fields = record[:-1]
for position, value in enumerate(header_fields):
    print(f'{position} - {value}')

In [None]:
# Decode the four T56 codes (fields [4:8]) into the record's 4-character code.
''.join(T56(code) for code in record[4:8])

In [None]:
# Keep only the two fields used in the following cells.
record_dict = dict(
    character=''.join(T56(code) for code in record[4:8]),  # decoded 4 Character Code, fields [4:8]
    image_data=record[23],  # packed 16-gray-level (4bit/pixel) image bytes, 72 x 76 = 5472 pixels, field [23]
)

In [None]:
# Display the decoded 4-character code of the sample record
record_dict['character']

In [None]:
# Image geometry from the record format: 72 (X-axis) by 76 (Y-axis) pixels.
width, height = 72, 76

# Decode the packed 4-bit-per-pixel payload into a float ('F' mode) PIL image,
# then convert it to a NumPy array for plotting.
decoded_image = Image.frombytes('F', (width, height), record_dict['image_data'], 'bit', 4)
np_img = np.array(decoded_image)

plt.imshow(np_img)
plt.colorbar()

In [None]:
# Count how many samples each 4-character class code has across all part files.
total_samples = 0
record_count = {}  # class code (4 T56 characters, padding kept) -> sample count

# Iterate the part files declared at the top of the notebook (`file_list` was never defined).
for filename in dataset_part_filepaths:

    file_stream = bitstring.ConstBitStream(filename=filename)

    # Records are fixed-length, so the number of complete records follows from
    # the stream size (both values are in bits). Looping over that count ends
    # at end-of-file without a catch-all `except`, so genuine read or decoding
    # errors are no longer silently swallowed.
    complete_records, trailing_bits = divmod(file_stream.len, RECORD_LENGTH)
    if trailing_bits:
        print(f'WARNING: {filename}: {trailing_bits} trailing bits do not form a full record; ignored')

    for _ in range(complete_records):
        # Same record layout as the single-record read earlier in the notebook.
        record = file_stream.readlist(bitstring_unpack_str)
        total_samples += 1

        # The code is kept exactly as decoded (no strip), so padding spaces
        # remain part of the class key.
        name = ''.join(map(T56, record[4:8]))  # 4 Character Code - [4:8]
        record_count[name] = record_count.get(name, 0) + 1

In [None]:
# Total number of records read across all part files
total_samples

In [None]:
# Number of distinct 4-character class codes seen.
len(record_count)

In [None]:
# Per-class sample counts, keyed by the 4-character code
record_count

In [None]:
csv_filename = 'classes.tsv'

# Build the table straight from the in-memory counts, sorted by class code, and
# write it once. The earlier write -> read_csv -> rewrite round trip is avoided
# because the CSV parser may reinterpret class codes (quote characters,
# NA-like tokens, numeric-looking values) before they are written back.
pd_df = pd.DataFrame(
    sorted(record_count.items()),
    columns=['class', 'num_samples'],
)
pd_df.to_csv(csv_filename, encoding='utf-8', index=False, sep='\t')