# Katakana classifier

- This notebook will demonstrate how to build a CNN model to recognize Japanese katakana characters.

## Setup

- Python packages: `numpy`, `opencv-python`, `matplotlib`, `pillow`, `pandas` and `tensorflow`

- Install all of them with the following command.

In [None]:
!pip install numpy opencv-python matplotlib pillow pandas tensorflow

- Then import all the necessary packages.

In [None]:
from __future__ import absolute_import, division, print_function

import os
import pathlib
import re
import struct

# set GPU to invisible for saving battery life
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image

import tensorflow as tf
from tensorflow import keras

# `tf.VERSION` only exists in TensorFlow 1.x, while `tf.__version__` is
# available in both 1.x and 2.x, so the banner works on either install.
print('TensorFlow version:', tf.__version__)
print('Keras version:', keras.__version__)

# Eager execution is deliberately left disabled: model.save() could not be
# used while it was enabled.
# tf.enable_eager_execution()

# Shorthand for the tf.data auto-tuning constant (not used by the cells visible here).
AUTOTUNE = tf.data.experimental.AUTOTUNE

## Prepare the datasets

- Requires the [ETL-1](http://etlcdb.db.aist.go.jp/) dataset to be downloaded and unzipped into the following layout:

```bash
├───ETL1
│───────ETL1C_01
│───────ETL1C_02
│───────ETL1C_03
│───────ETL1C_04
│───────ETL1C_05
│───────ETL1C_06
│───────ETL1C_07
│───────ETL1C_08
│───────ETL1C_09
│───────ETL1C_10
│───────ETL1C_11
│───────ETL1C_12
│───────ETL1C_13
│───────ETL1INFO
```

- The [`label_dict.csv`](label_dict.csv) file contains the `index`, `label`, and `kana` representation information.

- We are going to perform the following steps:

    1. Convert the ETL datasets to categorized images. (*one time only*)
    2. Convert the categorized images to `numpy` arrays, then save them as `.npy` files. (*one time only*)
    3. Load them into memory to perform training. (*every time the notebook is re-run*)
        - This step requires a lot of memory (~2GB), so if you can, convert them to a `tf.data.Dataset` before training.

### Convert ETL datasets to categorized images

- We will unpack the ETL datasets into images and save them in separate directories, one per category.

In [None]:
def unpack_etl_file(etl_file, unpack_dir):
    """Unpack one ETL data file into per-label PNG images.

    The file is read as a sequence of fixed-size 2052-byte records. Each
    record's image is written to ``<unpack_dir>/<label>/<record index>.png``;
    an image that already exists on disk is left untouched.

    Args:
        etl_file: Path of the ETL data file to read.
        unpack_dir: Root directory that receives one sub-directory per label.

    Returns:
        None. Progress and the end-of-file notice are printed to stdout.
    """
    record_format = struct.Struct('>H2sH6BI4H4B4x2016s4x')
    record_size = record_format.size  # 2052 bytes per record
    # 255 / 15 == 17: stretches the 4-bit pixel values (0..15) to 0..255.
    scale = 255 // 15

    print('Unpacking {}'.format(etl_file))

    # `with` guarantees the handle is closed even if a record fails to decode.
    with open(etl_file, 'rb') as f:
        idx = 0
        while True:
            idx += 1
            if idx % 100 == 0:
                print('.', end='')

            s = f.read(record_size)

            # A short (or empty) read means there is no complete record left.
            if len(s) != record_size:
                print()
                print('[{}] Reach EOF, remain {} bytes unread.'.format(etl_file, len(s)))
                break

            r = record_format.unpack(s)

            # the record index is at index 0. We are going to use it as file name
            r_idx = r[0]

            # label at index 1; it may be space-padded, so drop the spaces
            label = r[1].decode('ascii').replace(' ', '')

            # image at index 18: 2016 bytes holding a 64x63 picture at 4 bits per pixel
            iF = Image.frombytes('F', (64, 63), r[18], 'bit', 4)
            np_img = np.array(iF, dtype=np.uint8)  # np_img.shape = (63, 64)
            # np.float64 replaces the `np.float` alias that newer NumPy releases removed.
            final_img = np_img.astype(np.float64) * scale

            cat_dir = os.path.join(unpack_dir, label)
            os.makedirs(cat_dir, exist_ok=True)

            img_path = os.path.join(cat_dir, '{}.png'.format(r_idx))

            # Never overwrite an image that was unpacked earlier.
            if not os.path.exists(img_path):
                cv2.imwrite(img_path, final_img)

def etl_to_images(etl_root='ETL1', unpack_dir='etlcb_images'):
    """Unpack every wanted ETL data file under ``etl_root`` into images.

    Directory entries whose name contains ``ETL1C_`` followed by two digits
    are treated as data files and handed to ``unpack_etl_file``; files 01-06
    are skipped.

    Args:
        etl_root: Directory that holds the unzipped ETL files.
        unpack_dir: Root directory that receives the unpacked images.
    """
    # we only need the katakana characters
    skip_etls = {
        'ETL1C_01',
        'ETL1C_02',
        'ETL1C_03',
        'ETL1C_04',
        'ETL1C_05',
        'ETL1C_06',
    }

    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    etl_name = re.compile(r'ETL1C_\d\d')

    # Sorted so the files are always processed in the same order; existing
    # images are never overwritten, so the order decides which record wins
    # when two files produce the same image path.
    for etl in sorted(os.listdir(etl_root)):
        if etl_name.search(etl) is None or etl in skip_etls:
            continue

        unpack_etl_file(os.path.join(etl_root, etl), unpack_dir)

In [None]:
# One-time step: unpack the ETL files into per-label image directories
# (defaults: read from 'ETL1', write to 'etlcb_images').
etl_to_images()

### Convert the ETL images into `.npy` files

In [None]:
def convert_etl_images_to_numpy(etl_image_root='etlcb_images', etl_numpy_root='etlcb_numpy', input_shape=(64, 64, 1), csv_dict_path='label_dict.csv', img_filename='img_ds.npy', label_filename='label_ds.npy'):
    """Convert the per-label image directories into per-label ``.npy`` files.

    For every sub-directory of ``etl_image_root`` whose name is a label in the
    CSV dictionary, all images are read as grayscale, resized to
    ``input_shape``, scaled to the range [0, 1] and saved together with a
    matching array of integer class indices under
    ``<etl_numpy_root>/<label>/``. Directories whose name is not in the CSV
    are skipped.

    Args:
        etl_image_root: Directory holding one sub-directory of images per label.
        etl_numpy_root: Directory that receives one sub-directory per label.
        input_shape: Target shape of a single image, treated as
            (height, width, channels).
        csv_dict_path: CSV file with (at least) ``label`` and ``index`` columns.
        img_filename: File name of the saved image array inside each label
            directory.
        label_filename: File name of the saved label array inside each label
            directory.
    """
    # keep_default_na=False keeps every label a plain string; presumably
    # needed because a label such as 'NA' would otherwise be parsed as NaN.
    label_dict_df = pd.read_csv(csv_dict_path, keep_default_na=False)
    label_dict = label_dict_df.set_index('label').to_dict(orient='index')

    # cv2.resize expects dsize as (width, height), the reverse of input_shape's
    # leading (height, width).
    target_size = (input_shape[1], input_shape[0])

    for kana_dir in os.listdir(etl_image_root):
        # Directories without an entry in the label dictionary are not wanted.
        if kana_dir not in label_dict:
            continue
        dir_label = int(label_dict[kana_dir]['index'])

        kana_img_path = os.path.join(etl_image_root, kana_dir)
        img_arr = []
        for img_name in os.listdir(kana_img_path):
            img_path = os.path.join(kana_img_path, img_name)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None for files it cannot decode.
                print('Skipping unreadable image {}'.format(img_path))
                continue
            img = cv2.resize(img, target_size, interpolation=cv2.INTER_CUBIC)
            # np.float64 replaces the `np.float` alias that newer NumPy releases removed.
            img_arr.append(img.reshape(input_shape).astype(np.float64) / 255.0)

        # One class index per image that was actually loaded.
        label_arr = [dir_label] * len(img_arr)

        kana_np_path = os.path.join(etl_numpy_root, kana_dir)
        os.makedirs(kana_np_path, exist_ok=True)

        img_arr = np.array(img_arr)
        print('img_arr.shape', img_arr.shape)
        label_arr = np.array(label_arr)
        print('label_arr.shape', label_arr.shape)

        # Honour the caller-supplied file names instead of hard-coded ones.
        np.save(os.path.join(kana_np_path, img_filename), img_arr)
        np.save(os.path.join(kana_np_path, label_filename), label_arr)

In [None]:
# One-time step: turn the unpacked images into per-label .npy files
# (defaults: read from 'etlcb_images', write to 'etlcb_numpy').
convert_etl_images_to_numpy()

### Load the data for training

- The `load_numpy_etl()` function returns a `dict` which contains the following objects:
    - `train_set`: the training data
    - `train_label`: the training labels
    - `val_set`: the validation data
    - `val_label`: the validation labels
    - `label_dict`: a `dict` that maps each `label` from `label_dict.csv` to its `index` and `kana` values

In [None]:
def load_numpy_etl(etl_numpy_root='etlcb_numpy', csv_dict='label_dict.csv', val_ratio=0.7):
    """Load the per-label ``.npy`` files and split them into train/validation sets.

    Each label directory under ``etl_numpy_root`` is expected to contain
    ``img_ds.npy`` and ``label_ds.npy``. Every label is split separately (no
    shuffling) and the pieces are concatenated across labels. Entries of
    ``etl_numpy_root`` whose name is not a label in the CSV are skipped.

    Note:
        Despite its name, ``val_ratio`` is the fraction of each label's
        samples that goes to the *training* set (the leading part of the
        split); the remainder becomes the validation set. With the default
        of 0.7 this gives 70% training / 30% validation. The name is kept
        for backward compatibility.

    Args:
        etl_numpy_root: Directory holding one sub-directory of arrays per label.
        csv_dict: CSV file with a ``label`` column used as the dictionary key.
        val_ratio: Fraction of each label's samples placed in the training set.

    Returns:
        A dict with the keys ``label_dict``, ``train_set``, ``train_label``,
        ``val_set`` and ``val_label``. The four array entries are ``None``
        when no label directory was loaded.
    """
    def _concat_or_none(parts):
        # Mirrors the "nothing loaded" result: None rather than an error.
        return np.concatenate(parts, axis=0) if parts else None

    retval = dict()

    label_dict_df = pd.read_csv(csv_dict, keep_default_na=False)
    label_dict = label_dict_df.set_index('label').to_dict(orient='index')
    retval['label_dict'] = label_dict

    train_sets, train_labels = [], []
    val_sets, val_labels = [], []

    # Sorted so the sample order does not depend on the file system.
    for kana_dir in sorted(os.listdir(etl_numpy_root)):
        # Same policy as the image-to-numpy conversion: ignore unknown entries.
        if kana_dir not in label_dict:
            continue

        kana_np_path = os.path.join(etl_numpy_root, kana_dir)

        img_arr = np.load(os.path.join(kana_np_path, 'img_ds.npy'))
        print('img_arr.shape', img_arr.shape)

        label_arr = np.load(os.path.join(kana_np_path, 'label_ds.npy'))
        print('label_arr.shape', label_arr.shape)

        # Leading `val_ratio` share -> training, remainder -> validation.
        split_idx = int(len(img_arr) * val_ratio)

        _train_set, _val_set = np.split(img_arr, [split_idx], axis=0)
        _train_label, _val_label = np.split(label_arr, [split_idx], axis=0)

        train_sets.append(_train_set)
        train_labels.append(_train_label)
        val_sets.append(_val_set)
        val_labels.append(_val_label)

    # Concatenate once at the end instead of re-copying the growing arrays
    # on every loop iteration.
    retval['train_set'] = _concat_or_none(train_sets)
    retval['train_label'] = _concat_or_none(train_labels)

    retval['val_set'] = _concat_or_none(val_sets)
    retval['val_label'] = _concat_or_none(val_labels)

    return retval

In [None]:
# Load every per-label .npy file into memory and split it into
# training/validation arrays; the printed keys show what the dict holds.
ds_dict = load_numpy_etl()
print(ds_dict.keys())

In [None]:
# Sanity check: sample count and per-image shape of each split.
print(ds_dict['train_set'].shape)
print(ds_dict['val_set'].shape)

In [None]:
def kana_keras_model(input_shape=(64, 64, 1), num_classes=46):
    """Build the (uncompiled) CNN that classifies kana images.

    The network is three Conv2D -> Dropout -> MaxPool2D stages followed by a
    128-unit dense layer and a softmax output layer.

    Args:
        input_shape: Shape of a single input image in channels-last order.
            Defaults to (64, 64, 1), the shape the network was originally
            hard-coded for.
        num_classes: Number of units in the softmax output layer. Defaults
            to 46, the originally hard-coded value.

    Returns:
        A ``keras.Sequential`` model; compiling it is left to the caller.
    """
    model = keras.Sequential([
        # Stage 1: 32 filters of 5x5 on the raw image.
        keras.layers.Conv2D(
            filters=32,
            kernel_size=5,
            padding='same',
            activation=tf.nn.relu,
            input_shape=tuple(input_shape),
            data_format='channels_last',
        ),
        keras.layers.Dropout(0.5),
        keras.layers.MaxPool2D(pool_size=2),
        # Stage 2: 16 filters of 5x5.
        keras.layers.Conv2D(
            filters=16,
            kernel_size=5,
            padding='same',
            activation=tf.nn.relu,
        ),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPool2D(pool_size=(2, 2)),
        # Stage 3: 16 filters of 3x3.
        keras.layers.Conv2D(
            filters=16,
            kernel_size=3,
            padding='same',
            activation=tf.nn.relu,
        ),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPool2D(pool_size=2),
        # Classifier head.
        keras.layers.Flatten(),
        keras.layers.Dense(
            units=128,
            activation=tf.nn.relu,
        ),
        keras.layers.Dropout(0.15),
        keras.layers.Dense(
            units=num_classes,
            activation=tf.nn.softmax,
        ),
    ])
    return model

# Build the network and configure it for training, then print its layers.
model = kana_keras_model()
# The sparse loss is used with integer class labels (no one-hot encoding).
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 5 epochs in batches of 512 with shuffling enabled; the validation
# split is passed so Keras reports validation metrics during training.
model.fit(ds_dict['train_set'], ds_dict['train_label'], 
          epochs=5, 
          batch_size=512, 
          validation_data=(ds_dict['val_set'], ds_dict['val_label']),
          shuffle=True,
         )

In [None]:
# Persist the trained model to a single .h5 file in the working directory.
model.save('kana_classifier_model.h5')