In [None]:
import os
import zipfile
import concurrent.futures
import pandas as pd
import random

import etl_util

In [None]:
# --- Inputs -----------------------------------------------------------------
# Locations of the ETL7.zip and ETL8B.zip archives; adjust to your environment.
ETL7_ZIP_PATH = "./data/ETL7.zip"
ETL8B_ZIP_PATH = "./data/ETL8B.zip"

# Character code tables (JIS X 0201 / JIS X 0208); adjust to your environment.
JIS0201_PATH = "./data/jis0201.txt"
JIS0208_PATH = "./data/jis0208.txt"

# --- Output -----------------------------------------------------------------
# Directory that receives the extracted data and the resulting pickle files.
DEFAULT_DEST = "./output/"

In [None]:
def convert_etl(filename, etl_num = '7', destination = DEFAULT_DEST):
    """Convert an ETL zip archive into a pickled list of contour records.

    The archive is extracted under ``destination``, its binary records are
    converted by ``etl_util`` into a list of image entries, a bounding
    rectangle is computed for every entry in parallel, and the filtered
    result is pickled to ``<destination>/etl<etl_num>.pickle``.
    If that pickle already exists, nothing is done and its path is returned.

    Args:
        filename: Path of the ETL zip archive to convert.
        etl_num: Dataset identifier as a string. '1', '6' and '7' are
            converted with ``etl_util.convert_m_type`` (JIS0201 table),
            '8B' with ``etl_util.convert_b_type`` (JIS0208 table).
        destination: Output directory. A trailing separator is optional.

    Returns:
        Path of the pickle file (existing or newly written).

    Raises:
        ValueError: If ``etl_num`` is not one of the supported identifiers
            and no previously converted pickle exists.
    """
    m_type_nums = ('1', '6', '7')
    b_type_nums = ('8B',)
    debug_sample_size = 10

    pickle_name = 'etl' + etl_num + '.pickle'
    # os.path.join tolerates a destination with or without a trailing slash.
    dest_full_path = os.path.join(destination, pickle_name)

    # Skip the whole conversion when it has already been done.
    if os.path.exists(dest_full_path):
        return dest_full_path

    # Fail early and explicitly instead of extracting the archive and then
    # crashing later on an empty image list.
    if etl_num not in m_type_nums and etl_num not in b_type_nums:
        raise ValueError('Unsupported etl_num: ' + repr(etl_num))

    # Extract the zip archive.
    with zipfile.ZipFile(filename) as etl_zipped:
        etl_zipped.extractall(destination)

    # Convert the binary data; the converter depends on the binary format.
    # The trailing '/' is kept because etl_util receives this directory as a
    # plain string (NOTE(review): presumably it concatenates file names onto
    # it -- confirm in etl_util).
    binary_dir = os.path.join(destination, 'ETL' + etl_num) + '/'
    if etl_num in m_type_nums:
        JIS0201_char_dict = etl_util.jis0201_to_char(JIS0201_PATH)
        img_list = etl_util.convert_m_type(binary_dir, JIS0201_char_dict)
        print("img_list:", len(img_list))
    else:
        JIS0208_char_dict = etl_util.jis0208_to_char(JIS0208_PATH)
        img_list = etl_util.convert_b_type(binary_dir, JIS0208_char_dict)

    # Compute the bounding rectangle of every image. This is slow, so it is
    # run in a process pool; the context manager guarantees the pool is shut
    # down even when a worker raises.
    contour_list = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(etl_util.create_contour, img_path) for img_path in img_list]
        for future in concurrent.futures.as_completed(futures):
            contour_list.append(future.result())

    # Drop dummy entries, i.e. those whose rectangle is [0,0,0,0].
    contour_list = [contour for contour in contour_list if contour[1] != [0,0,0,0]]

    # For debugging: display a few random results. The sample size is capped
    # so that small (or empty) results no longer raise ValueError and abort
    # the function before the pickle is saved.
    sampling = random.sample(contour_list, min(debug_sample_size, len(contour_list)))
    for sample in sampling:
        etl_util.showRectangleImage(sample[0], [sample[1]], sample[2])

    # Save the result as a pickle.
    pd.to_pickle(contour_list, dest_full_path)

    return dest_full_path

In [None]:
# Main: run the conversion once per archive and report where each pickle is.

# Convert ETL7.zip.
pickle_etl7_path = convert_etl(ETL7_ZIP_PATH, etl_num='7')
print('{} convert complete:'.format('ETL7'), pickle_etl7_path)

# Convert ETL8B.zip.
pickle_etl8b_path = convert_etl(ETL8B_ZIP_PATH, etl_num='8B')
print('{} convert complete:'.format('ETL8B'), pickle_etl8b_path)