In [1]:
# Add the parent directory to sys.path so that `lib` can be imported as if the notebook ran from the project root directory.
import sys
sys.path.append('..')

In [4]:
import numpy as np
import cv2
from lib.generator import generate_pages
from lib.data import load_book, load_emnist, load_kmnist
from pathlib import Path

In [5]:
# Load the book and the KMNIST/EMNIST data once, up front.
# generate_dataset() below reads `book`, `kmnist` and `emnist` as module-level names,
# so this cell has to run before any dataset is generated.
book = load_book()
kmnist = load_kmnist()
emnist = load_emnist()

In [9]:
def _write_pages(pages, directory):
    """Write every page in ``pages`` to ``directory`` as ``<index>.png``.

    Each page is multiplied by 255 before being handed to ``cv2.imwrite``.

    Raises:
        OSError: if ``cv2.imwrite`` reports that a page could not be written.
    """
    for index, page in enumerate(pages):
        page_path = str(directory / f'{index}.png')
        # cv2.imwrite signals failure through its return value rather than by
        # raising, so the result has to be checked explicitly.
        if not cv2.imwrite(page_path, page*255):
            raise OSError(f'could not write page image: {page_path}')


def generate_dataset(name, corruption_prob, salt_prob, rotation, max_scale, unique_characters):
    """Generate the EMNIST and KMNIST page images of one dataset and store them on disk.

    Resulting layout::

        <name>/text.txt        text returned by the EMNIST generate_pages call
        <name>/EMNIST/<i>.png  pages rendered from the module-level ``emnist`` data
        <name>/KMNIST/<i>.png  pages rendered from the module-level ``kmnist`` data

    If ``<name>`` already exists the function returns immediately without
    generating or writing anything.

    Args:
        name: Path of the dataset directory to create (missing parent
            directories are created as well).
        corruption_prob, salt_prob, rotation, max_scale, unique_characters:
            Forwarded unchanged to ``lib.generator.generate_pages``; see that
            function for their exact meaning.

    Raises:
        OSError: if a page image cannot be written.
    """
    dataset_dir = Path(name)
    if dataset_dir.exists():
        return

    # Generate everything *before* touching the file system: if generation
    # fails, no empty dataset directory is left behind that would make the
    # guard above skip this dataset on every subsequent run.
    np.random.seed(42)
    # NOTE(review): the generator is seeded only once, so the KMNIST call
    # continues the random stream left behind by the EMNIST call. Only the text
    # returned by the EMNIST call is saved; whether the KMNIST pages correspond
    # to exactly the same text depends on generate_pages -- confirm.
    emnist_pages, gen_text = generate_pages(book, emnist, corruption_prob, salt_prob, rotation, max_scale, unique_characters)
    kmnist_pages, _ = generate_pages(book, kmnist, corruption_prob, salt_prob, rotation, max_scale, unique_characters)

    emnist_dir = dataset_dir / 'EMNIST'
    kmnist_dir = dataset_dir / 'KMNIST'
    emnist_dir.mkdir(parents=True)  # also creates dataset_dir itself
    kmnist_dir.mkdir()

    # Explicit encoding so the saved text does not depend on the platform's
    # default locale (non-ASCII characters would otherwise fail or differ).
    (dataset_dir / 'text.txt').write_text(gen_text, encoding='utf-8')

    _write_pages(emnist_pages, emnist_dir)
    _write_pages(kmnist_pages, kmnist_dir)

In [6]:
# 'clean' dataset: corruption_prob, salt_prob and rotation are all 0, max_scale is 1.0
# and unique_characters is 1.
generate_dataset(
    name='clean',
    corruption_prob=0,
    salt_prob=0,
    rotation=0,
    max_scale=1.0,
    unique_characters=1
)

100%|██████████| 138142/138142 [00:14<00:00, 9288.63it/s] 
100%|██████████| 138142/138142 [00:14<00:00, 9230.66it/s] 


In [7]:
# 'noise_only' dataset: salt_prob=0.05 is the only non-zero probability;
# corruption_prob and rotation stay 0, max_scale stays 1.0, unique_characters is 1.
generate_dataset(
    name='noise_only',
    corruption_prob=0,
    salt_prob=0.05,
    rotation=0,
    max_scale=1.0,
    unique_characters=1
)

100%|██████████| 138142/138142 [00:15<00:00, 8893.58it/s] 
100%|██████████| 138142/138142 [00:15<00:00, 9001.45it/s] 


In [8]:
# 'max_distortions_1_char' dataset: corruption_prob=0.3, salt_prob=0.05, rotation=30
# and max_scale=1.15, with unique_characters=1.
# NOTE(review): rotation is presumably given in degrees -- confirm in lib.generator.
generate_dataset(
    name='max_distortions_1_char',
    corruption_prob=0.3,
    salt_prob=0.05,
    rotation=30,
    max_scale=1.15,
    unique_characters=1
)

100%|██████████| 138142/138142 [00:26<00:00, 5225.43it/s]
100%|██████████| 138142/138142 [00:26<00:00, 5178.82it/s]


In [11]:
# 'max_distortions_3_char' dataset: same distortion settings as 'max_distortions_1_char'
# (corruption_prob=0.3, salt_prob=0.05, rotation=30, max_scale=1.15) but unique_characters=3.
generate_dataset(
    name='max_distortions_3_char',
    corruption_prob=0.3,
    salt_prob=0.05,
    rotation=30,
    max_scale=1.15,
    unique_characters=3
)

100%|██████████| 138142/138142 [00:26<00:00, 5203.49it/s]
100%|██████████| 138142/138142 [00:26<00:00, 5145.87it/s]


In [12]:
# 'max_distortions_5_char' dataset: same distortion settings as 'max_distortions_1_char'
# (corruption_prob=0.3, salt_prob=0.05, rotation=30, max_scale=1.15) but unique_characters=5.
generate_dataset(
    name='max_distortions_5_char',
    corruption_prob=0.3,
    salt_prob=0.05,
    rotation=30,
    max_scale=1.15,
    unique_characters=5
)

100%|██████████| 138142/138142 [00:26<00:00, 5197.68it/s]
100%|██████████| 138142/138142 [00:26<00:00, 5179.83it/s]
