In [5]:
import os
import numpy as np
import cv2

import manga109api
from natsort import natsorted

def crop_word_boxes(root_path, data_split, hardROI=False):
    """Crop every onomatopoeia region of one data split and write a ground-truth list.

    Book names are read from ``./books_{data_split}.txt`` (one name per line).
    For every page of every listed book, each entry under the page's
    ``"onomatopoeia"`` key is cropped to the axis-aligned bounding box of its
    ``@x*`` / ``@y*`` coordinates and saved as
    ``{root_path}/TRBA_data/{data_split}[_hardROI]/{book}/{page_index}-{n}.jpg``.
    One ``relative_image_path<TAB>label`` line per saved crop is written to
    ``{root_path}/TRBA_data/gt_{data_split}[_hardROI].txt``.

    Args:
        root_path: Directory handed to ``manga109api.Parser`` as ``root_dir``;
            ``TRBA_data`` is created underneath it.
        data_split: Split name used in the book-list file name and output paths.
        hardROI: If True, pixels outside the annotated polygon are zeroed in
            each crop and the ``_hardROI`` output locations are used.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    manga109_parser = manga109api.Parser(root_dir=root_path)
    annotation_type = "annotations"

    with open(f'./books_{data_split}.txt', 'r', encoding='utf-8-sig') as book_list_file:
        # Strip line endings once and drop blank lines so an empty trailing
        # line is not treated as a book name.
        book_names = [line.strip() for line in book_list_file if line.strip()]

    print("The number of mangas:", len(book_names))
    # Precomputing total image count
    total_image_count = 0
    for manga_name in book_names:
        annotation = manga109_parser.get_annotation(book=manga_name, annotation_type=annotation_type)
        total_image_count += len(annotation["page"])
    print("The number of scene images:", total_image_count)

    # The hardROI variant only differs by a suffix on the split directory and
    # on the ground-truth file name.
    suffix = "_hardROI" if hardROI else ""
    split_dir = f"TRBA_data/{data_split}{suffix}"
    word_gt_path = os.path.join(root_path, f"TRBA_data/gt_{data_split}{suffix}.txt")
    os.makedirs(os.path.join(root_path, "TRBA_data"), exist_ok=True)

    image_count = 0
    word_count = 0
    with open(word_gt_path, 'w', encoding='utf-8') as word_gt:
        for manga_name in natsorted(book_names):
            relative_dir = f"{split_dir}/{manga_name}"
            image_output_path = os.path.join(root_path, f"{relative_dir}/")
            os.makedirs(image_output_path, exist_ok=True)

            annotation = manga109_parser.get_annotation(book=manga_name, annotation_type=annotation_type)

            for page_index in range(len(annotation["page"])):

                image_count += 1
                if image_count % 200 == 0:
                    print(f'{image_count} images are cropped')

                try:
                    rois = annotation['page'][page_index]["onomatopoeia"]
                except (KeyError, IndexError, TypeError):
                    # No onomatopoeia entry on this page.
                    continue
                if isinstance(rois, dict):
                    rois = [rois]  # for one instance case.
                if not rois:
                    continue

                # Decode the page only once we know there is something to crop.
                image_path = manga109_parser.img_path(book=manga_name, index=page_index)
                image = cv2.imread(image_path)
                if image is None:
                    # cv2.imread signals an unreadable file by returning None.
                    print('corrupt image?', image_path)
                    continue

                cnt = 0
                for roi in rois:
                    x_list = [int(roi[attr]) for attr in roi if '@x' in attr]
                    y_list = [int(roi[attr]) for attr in roi if '@y' in attr]

                    # When the label only has the top-left and bottom-right
                    # points of a rectangle, expand them to four corners.
                    if len(x_list) == 2:
                        x_list = [x_list[0], x_list[1], x_list[1], x_list[0]]
                        y_list = [y_list[0], y_list[0], y_list[1], y_list[1]]

                    label = roi['#text']

                    try:
                        xmin, xmax = min(x_list), max(x_list)
                        ymin, ymax = min(y_list), max(y_list)
                        word_box = image[ymin:ymax, xmin:xmax]

                        if hardROI:
                            # Shift the polygon into crop coordinates, rasterise
                            # it as a 0/1 mask and zero everything outside it.
                            polygon = np.array(list(zip(x_list, y_list)), np.int32)
                            polygon -= np.array([xmin, ymin], np.int32)
                            polygon = polygon.reshape((-1, 1, 2))
                            M = np.zeros([ymax - ymin, xmax - xmin])
                            cv2.fillPoly(M, [polygon], 1)
                            word_box = word_box * M[:, :, np.newaxis]

                    except Exception:
                        # Best-effort: skip this region but keep Ctrl-C working
                        # (a bare ``except`` would also swallow KeyboardInterrupt).
                        print('corrupt image?', image_path)
                        continue

                    if word_box.size == 0:
                        # Zero-area crop: there is nothing to encode.
                        print('empty crop skipped:', image_path)
                        continue

                    word_box_name = str(page_index) + "-" + str(cnt)
                    word_box_path = os.path.join(image_output_path, word_box_name + ".jpg")
                    if not cv2.imwrite(word_box_path, word_box):
                        # Do not list an image in the ground truth that was not written.
                        print('failed to write', word_box_path)
                        continue

                    word_gt.write(f'{relative_dir}/{word_box_name}.jpg\t{label}\n')

                    cnt += 1
                    word_count += 1
                    if word_count % 50000 == 0:
                        print(f'{word_count} words are cropped')

In [7]:
# for create lmdb
import os
import random
import shutil

import cv2
import lmdb
import numpy as np
from tqdm import tqdm


def checkImageIsValid(imageBin):
    """Return True if ``imageBin`` decodes to an image with non-zero area.

    Args:
        imageBin: Encoded image file content as a bytes-like object, or None.

    Returns:
        False when ``imageBin`` is None or empty, when OpenCV cannot decode it,
        or when the decoded image has zero height or width; True otherwise.
    """
    if imageBin is None or len(imageBin) == 0:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    # cv2.imdecode returns None for data it cannot decode; without this check
    # the ``.shape`` access below would raise AttributeError.
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0


def writeCache(env, cache):
    """Store every key/value pair of ``cache`` in ``env`` within one write transaction."""
    with env.begin(write=True) as transaction:
        for key in cache:
            transaction.put(key, cache[key])


def createDataset(inputPath, gtFile, outputPath, checkValid=True):
    """a modified version of CRNN torch repository https://github.com/bgshih/crnn/blob/master/tool/create_dataset.py
    Create LMDB dataset for training and evaluation.

    Each line of ``gtFile`` is ``image_path<TAB>label``. For every usable line
    the keys ``image-%09d``, ``label-%09d`` and ``imagepath-%09d`` are stored
    (numbered from 1), and ``num-samples`` holds the number of stored samples.
    Missing or invalid images and lines without a tab are reported and skipped.

    ARGS:
        inputPath  : input folder path where starts imagePath
        gtFile     : list of image path and label
        outputPath : LMDB output path; anything already there is deleted first
        checkValid : if true, check the validity of every image
    """
    # CAUTION: if outputPath (lmdb) already exists, lmdb would add the dataset
    # into it, so remove the former one and re-create it. Done without a shell
    # so paths containing spaces or shell metacharacters are handled safely.
    if os.path.isdir(outputPath) and not os.path.islink(outputPath):
        shutil.rmtree(outputPath)
    elif os.path.lexists(outputPath):
        os.remove(outputPath)

    os.makedirs(outputPath, exist_ok=True)
    env = lmdb.open(outputPath, map_size=30 * 2 ** 30)
    try:
        cache = {}
        cnt = 1

        with open(gtFile, "r", encoding="utf-8-sig") as data:
            datalist = data.readlines()

        nSamples = len(datalist)
        for i, line in enumerate(tqdm(datalist, total=nSamples, position=0, leave=True)):
            line = line.strip("\n")
            if "\t" not in line:
                # Blank or malformed line: skip it instead of crashing on unpacking.
                if line:
                    print("line %d of %s has no tab separator, skipped" % (i + 1, gtFile))
                continue
            # Split on the first tab only, so the label keeps any later tabs.
            imagePath, label = line.split("\t", 1)
            imagePath = os.path.join(inputPath, imagePath)

            if not os.path.exists(imagePath):
                print("%s does not exist" % imagePath)
                continue
            with open(imagePath, "rb") as f:
                imageBin = f.read()
            if checkValid:
                try:
                    if not checkImageIsValid(imageBin):
                        print("%s is not a valid image" % imagePath)
                        continue
                except Exception:
                    # Best-effort skip; ``Exception`` (not a bare except) keeps
                    # KeyboardInterrupt able to stop a long run.
                    print("error occured", i)
                    with open(os.path.join(outputPath, "error_image_log.txt"), "a") as log:
                        log.write("%s-th image data occured error\n" % str(i))
                    continue

            imageKey = "image-%09d".encode() % cnt
            imagepathKey = "imagepath-%09d".encode() % cnt
            labelKey = "label-%09d".encode() % cnt
            cache[imageKey] = imageBin
            cache[labelKey] = label.encode()
            cache[imagepathKey] = imagePath.encode()

            # Flush in batches of 1000 samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
            cnt += 1

        nSamples = cnt - 1
        cache["num-samples".encode()] = str(nSamples).encode()
        writeCache(env, cache)
    finally:
        # Release the LMDB environment even if dataset creation fails.
        env.close()
    print("Created dataset with %d samples" % nSamples)

In [8]:
# Crop the plain (bounding-box) word images and build one LMDB per split.
data_split_list = ["train", "val", "test"]
hardROI = False

for data_split in data_split_list:
    root_path = '.'
    crop_word_boxes(root_path, data_split, hardROI=hardROI)

    inputPath = '.'
    # Pick the ground-truth file and LMDB directory matching the hardROI flag.
    gtFile, outputPath = (
        (f'{inputPath}/TRBA_data/gt_{data_split}_hardROI.txt', f'./TRBA_data/hardROI/{data_split}')
        if hardROI else
        (f'{inputPath}/TRBA_data/gt_{data_split}.txt', f'./TRBA_data/lmdb/{data_split}')
    )
    createDataset(inputPath, gtFile, outputPath)

The number of mangas: 89
The number of scene images: 8763
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped
1000 images are cropped
1200 images are cropped
1400 images are cropped
1600 images are cropped
1800 images are cropped
2000 images are cropped
2200 images are cropped
2400 images are cropped
2600 images are cropped
2800 images are cropped
3000 images are cropped
3200 images are cropped
3400 images are cropped
3600 images are cropped
3800 images are cropped
4000 images are cropped
4200 images are cropped
4400 images are cropped
4600 images are cropped
4800 images are cropped
5000 images are cropped
5200 images are cropped
5400 images are cropped
5600 images are cropped
5800 images are cropped
6000 images are cropped
6200 images are cropped
6400 images are cropped
6600 images are cropped
6800 images are cropped
7000 images are cropped
7200 images are cropped
7400 images are cropped
7600 images are cropped
7800 images are cropped
8000 image

100%|████████████████████████████████████████████████████████████████████████| 50064/50064 [01:47<00:00, 465.82it/s]


Created dataset with 50064 samples
The number of mangas: 10
The number of scene images: 890
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped


100%|██████████████████████████████████████████████████████████████████████████| 4636/4636 [00:10<00:00, 443.10it/s]


Created dataset with 4636 samples
The number of mangas: 10
The number of scene images: 949
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped


100%|██████████████████████████████████████████████████████████████████████████| 6765/6765 [00:12<00:00, 525.43it/s]

Created dataset with 6765 samples





In [9]:
# Same pipeline as the plain run, but with hardROI crops (pixels outside the
# annotated polygon zeroed) written to their own ground-truth file and LMDB.
data_split_list = ["train", "val", "test"]
hardROI = True

for data_split in data_split_list:
    root_path = '.'
    crop_word_boxes(root_path, data_split, hardROI=hardROI)

    inputPath = '.'
    gtFile = (
        f'{inputPath}/TRBA_data/gt_{data_split}_hardROI.txt'
        if hardROI
        else f'{inputPath}/TRBA_data/gt_{data_split}.txt'
    )
    outputPath = f'./TRBA_data/hardROI/{data_split}' if hardROI else f'./TRBA_data/lmdb/{data_split}'
    createDataset(inputPath, gtFile, outputPath)

The number of mangas: 89
The number of scene images: 8763
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped
1000 images are cropped
1200 images are cropped
1400 images are cropped
1600 images are cropped
1800 images are cropped
2000 images are cropped
2200 images are cropped
2400 images are cropped
2600 images are cropped
2800 images are cropped
3000 images are cropped
3200 images are cropped
3400 images are cropped
3600 images are cropped
3800 images are cropped
4000 images are cropped
4200 images are cropped
4400 images are cropped
4600 images are cropped
4800 images are cropped
5000 images are cropped
5200 images are cropped
5400 images are cropped
5600 images are cropped
5800 images are cropped
6000 images are cropped
6200 images are cropped
6400 images are cropped
6600 images are cropped
6800 images are cropped
7000 images are cropped
7200 images are cropped
7400 images are cropped
7600 images are cropped
7800 images are cropped
8000 image

100%|████████████████████████████████████████████████████████████████████████| 50064/50064 [01:39<00:00, 503.53it/s]


Created dataset with 50064 samples
The number of mangas: 10
The number of scene images: 890
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped


100%|██████████████████████████████████████████████████████████████████████████| 4636/4636 [00:12<00:00, 370.75it/s]


Created dataset with 4636 samples
The number of mangas: 10
The number of scene images: 949
200 images are cropped
400 images are cropped
600 images are cropped
800 images are cropped


100%|██████████████████████████████████████████████████████████████████████████| 6765/6765 [00:13<00:00, 509.96it/s]


Created dataset with 6765 samples
