In [4]:
'''
create_lmdb_dataset.py에서 가져온 내용
수정사항
env = lmdb.open(outputPath, map_size=1099511627776/1024)
with open(gtFile, 'r', encoding='cp949') as data:
'''

# !pip install lmdb
# !pip install fire


import fire
import os
import lmdb
import cv2

import numpy as np

In [5]:
def checkImageIsValid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def writeCache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(k, v)


def createDataset(inputPath, gtFile, outputPath, checkValid=True):
    """
    Create LMDB dataset for training and evaluation.
    ARGS:
        inputPath  : input folder path where starts imagePath
        outputPath : LMDB output path
        gtFile     : list of image path and label
        checkValid : if true, check the validity of every image
    """
    os.makedirs(outputPath, exist_ok=True)
    env = lmdb.open(outputPath, map_size=1099511627776/32)
    cache = {}
    cnt = 1

    with open(gtFile, 'r', encoding='cp949') as data:
        datalist = data.readlines()

    nSamples = len(datalist)
    for i in range(nSamples):
        imagePath, label = datalist[i].strip('\n').split('\t')
        imagePath = os.path.join(inputPath, imagePath)

        # # only use alphanumeric data
        # if re.search('[^a-zA-Z0-9]', label):
        #     continue

        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            try:
                if not checkImageIsValid(imageBin):
                    print('%s is not a valid image' % imagePath)
                    continue
            except:
                print('error occured', i)
                with open(outputPath + '/error_image_log.txt', 'a') as log:
                    log.write('%s-th image data occured error\n' % str(i))
                continue

        imageKey = 'image-%09d'.encode() % cnt
        labelKey = 'label-%09d'.encode() % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label.encode()

        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt-1
    cache['num-samples'.encode()] = str(nSamples).encode()
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)

In [6]:
# train_lmdb 데이터 생성
createDataset('data/', 'data/gt_train.txt', 'data_lmdb/train') # inputpath, gtfile_path, output_path

  env = lmdb.open(outputPath, map_size=1099511627776/32)


Written 1000 / 726125
Written 2000 / 726125
Written 3000 / 726125
Written 4000 / 726125
Written 5000 / 726125
Written 6000 / 726125
Written 7000 / 726125
Written 8000 / 726125
Written 9000 / 726125
Written 10000 / 726125
Written 11000 / 726125
Written 12000 / 726125
Written 13000 / 726125
Written 14000 / 726125
Written 15000 / 726125
Written 16000 / 726125
Written 17000 / 726125
Written 18000 / 726125
Written 19000 / 726125
Written 20000 / 726125
Written 21000 / 726125
Written 22000 / 726125
Written 23000 / 726125
Written 24000 / 726125
Written 25000 / 726125
Written 26000 / 726125
Written 27000 / 726125
Written 28000 / 726125
Written 29000 / 726125
Written 30000 / 726125
Written 31000 / 726125
Written 32000 / 726125
Written 33000 / 726125
Written 34000 / 726125
Written 35000 / 726125
Written 36000 / 726125
Written 37000 / 726125
Written 38000 / 726125
Written 39000 / 726125
Written 40000 / 726125
Written 41000 / 726125
Written 42000 / 726125
Written 43000 / 726125
Written 44000 / 7261

Written 346000 / 726125
Written 347000 / 726125
Written 348000 / 726125
Written 349000 / 726125
Written 350000 / 726125
Written 351000 / 726125
Written 352000 / 726125
Written 353000 / 726125
Written 354000 / 726125
Written 355000 / 726125
Written 356000 / 726125
Written 357000 / 726125
Written 358000 / 726125
Written 359000 / 726125
Written 360000 / 726125
Written 361000 / 726125
Written 362000 / 726125
Written 363000 / 726125
Written 364000 / 726125
Written 365000 / 726125
Written 366000 / 726125
Written 367000 / 726125
Written 368000 / 726125
Written 369000 / 726125
Written 370000 / 726125
Written 371000 / 726125
Written 372000 / 726125
Written 373000 / 726125
Written 374000 / 726125
Written 375000 / 726125
Written 376000 / 726125
Written 377000 / 726125
Written 378000 / 726125
Written 379000 / 726125
Written 380000 / 726125
Written 381000 / 726125
Written 382000 / 726125
Written 383000 / 726125
Written 384000 / 726125
Written 385000 / 726125
Written 386000 / 726125
Written 387000 /

Written 687000 / 726125
Written 688000 / 726125
Written 689000 / 726125
Written 690000 / 726125
Written 691000 / 726125
Written 692000 / 726125
Written 693000 / 726125
Written 694000 / 726125
Written 695000 / 726125
Written 696000 / 726125
Written 697000 / 726125
Written 698000 / 726125
Written 699000 / 726125
Written 700000 / 726125
Written 701000 / 726125
Written 702000 / 726125
Written 703000 / 726125
Written 704000 / 726125
Written 705000 / 726125
Written 706000 / 726125
Written 707000 / 726125
Written 708000 / 726125
Written 709000 / 726125
Written 710000 / 726125
Written 711000 / 726125
Written 712000 / 726125
Written 713000 / 726125
Written 714000 / 726125
Written 715000 / 726125
Written 716000 / 726125
Written 717000 / 726125
Written 718000 / 726125
Written 719000 / 726125
Written 720000 / 726125
Written 721000 / 726125
Written 722000 / 726125
Written 723000 / 726125
Written 724000 / 726125
Written 725000 / 726125
Written 726000 / 726125
Created dataset with 726125 samples


In [7]:
def checkImageIsValid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def writeCache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(k, v)


def createDataset(inputPath, gtFile, outputPath, checkValid=True):
    """
    Create LMDB dataset for training and evaluation.
    ARGS:
        inputPath  : input folder path where starts imagePath
        outputPath : LMDB output path
        gtFile     : list of image path and label
        checkValid : if true, check the validity of every image
    """
    os.makedirs(outputPath, exist_ok=True)
    env = lmdb.open(outputPath, map_size=1099511627776/512)
    cache = {}
    cnt = 1

    with open(gtFile, 'r', encoding='cp949') as data:
        datalist = data.readlines()

    nSamples = len(datalist)
    for i in range(nSamples):
        imagePath, label = datalist[i].strip('\n').split('\t')
        imagePath = os.path.join(inputPath, imagePath)

        # # only use alphanumeric data
        # if re.search('[^a-zA-Z0-9]', label):
        #     continue

        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            try:
                if not checkImageIsValid(imageBin):
                    print('%s is not a valid image' % imagePath)
                    continue
            except:
                print('error occured', i)
                with open(outputPath + '/error_image_log.txt', 'a') as log:
                    log.write('%s-th image data occured error\n' % str(i))
                continue

        imageKey = 'image-%09d'.encode() % cnt
        labelKey = 'label-%09d'.encode() % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label.encode()

        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt-1
    cache['num-samples'.encode()] = str(nSamples).encode()
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)

In [8]:
# test lmdb데이터 생성
createDataset('data/', 'data/gt_test.txt', 'data_lmdb/test')

  env = lmdb.open(outputPath, map_size=1099511627776/512)


Written 1000 / 91568
Written 2000 / 91568
Written 3000 / 91568
Written 4000 / 91568
Written 5000 / 91568
Written 6000 / 91568
Written 7000 / 91568
Written 8000 / 91568
Written 9000 / 91568
Written 10000 / 91568
Written 11000 / 91568
Written 12000 / 91568
Written 13000 / 91568
Written 14000 / 91568
Written 15000 / 91568
Written 16000 / 91568
Written 17000 / 91568
Written 18000 / 91568
Written 19000 / 91568
Written 20000 / 91568
Written 21000 / 91568
Written 22000 / 91568
Written 23000 / 91568
Written 24000 / 91568
Written 25000 / 91568
Written 26000 / 91568
Written 27000 / 91568
Written 28000 / 91568
Written 29000 / 91568
Written 30000 / 91568
Written 31000 / 91568
Written 32000 / 91568
Written 33000 / 91568
Written 34000 / 91568
Written 35000 / 91568
Written 36000 / 91568
Written 37000 / 91568
Written 38000 / 91568
Written 39000 / 91568
Written 40000 / 91568
Written 41000 / 91568
Written 42000 / 91568
Written 43000 / 91568
Written 44000 / 91568
Written 45000 / 91568
Written 46000 / 915