### Database of images divided into a 4×4 grid

shapes : square, triangle, circle
colors : light red, light blue, light yellow, light green

dataset size : 1,000,000

probability of having a symbol at each grid position : 0.5


In [1]:
# Root path of the image database; later cells pass it to load_db and
# gen_img_and_save_db.
db_root_path = "data/db0.1.1/"

In [2]:
from dbimg import load_db, generate_uuid

# Load the database found at db_root_path. The returned object is used like a
# mapping further down (entries are stored with `db[imgid] = {...}`);
# presumably it is dict-like — confirm in dbimg.
db = load_db(db_root_path)

In [3]:
# --- Generation parameters ---

# How many images to generate (one million)
NBGEN = 10 ** 6

# Number of grid cells along each axis of an image
X_DIVISIONS = 4
Y_DIVISIONS = 4

# Image dimensions in pixels
img_size = (700, 700)

# Chance that any given grid cell receives a geometrical shape
SHAPE_PROB = 0.5

# Shapes and fill colors a symbol can be drawn with
SHAPES = ['circle', 'square', 'triangle']
COLORS = [
    "#F86C62",  # light red
    "#7AB0CD",  # light blue
    "#F4D67B",  # light yellow
    "#87C09C",  # light green
]

In [4]:
import numpy as np
from dbimg import generate_uuid
import os
import tqdm

# Generate NBGEN random image descriptions and register each one in `db`
# under a fresh id.
# NOTE(review): NumPy's global RNG is not seeded in any visible cell, so every
# run draws different content — confirm whether that is intended.
for genidx in tqdm.tqdm(range(NBGEN)):
    symbols = []
    for grid_x in range(X_DIVISIONS):
        for grid_y in range(Y_DIVISIONS):
            # Leave this grid cell empty unless the draw falls below SHAPE_PROB.
            if np.random.random() >= SHAPE_PROB:
                continue
            # Shape is drawn before color, one RNG call each.
            shape = np.random.choice(SHAPES)
            color = np.random.choice(COLORS)
            symbols.append({"shape": shape, "pos": (grid_x, grid_y), "color": color})

    imgid = generate_uuid()
    # The stored path is just "<imgid>.png": os.path.join with a single
    # argument returns that argument unchanged.
    db[imgid] = dict(
        path=os.path.join(imgid + ".png"),
        division=(X_DIVISIONS, Y_DIVISIONS),
        size=img_size,
        content=symbols,
    )

100%|██████████| 1000000/1000000 [01:39<00:00, 10060.00it/s]


In [5]:
from genimg import gen_img_and_save_db
# Hands the in-memory `db` and the database root path to genimg; judging by
# the function name this renders the images and saves the database — confirm
# in genimg.
# NOTE(review): n_jobs=36 is hard-coded; presumably a worker count sized for
# the machine this was first run on — adjust for the current host.
gen_img_and_save_db(db, db_root_path, overwrite=False, n_jobs=36)

100%|██████████| 1000000/1000000 [18:58<00:00, 878.28it/s]


### Create dataset with blue diagonal rule

In [9]:
# Second argument of extract_sample_from_dataset in a later cell.
sample_img_path = "data/db0.1.1/bluediagonal"
# Dataset CSV path shared by create_dataset_based_on_rule and
# extract_sample_from_dataset in later cells.
csv_path = "data/db0.1.1/bluediagonal.csv"

def blue_diagonal_rule(img_content, color="#7AB0CD"):
    """Tell whether an image satisfies the "blue diagonal" rule.

    The rule holds when at least one symbol sits on the main diagonal of the
    grid (its two ``pos`` coordinates are equal) and every symbol on that
    diagonal has the expected color. Symbols off the diagonal are ignored.

    Parameters
    ----------
    img_content : iterable of dict
        Symbol descriptions; each must provide ``"pos"`` (an indexable pair of
        grid coordinates) and ``"color"``.
    color : str, optional
        Color every diagonal symbol must have. Defaults to ``"#7AB0CD"``, the
        blue entry of the palette, so existing one-argument calls behave as
        before.

    Returns
    -------
    bool
        True if the diagonal holds at least one symbol and all of its symbols
        have ``color``; False otherwise (including for an empty image).
    """
    has_diagonal_symbol = False
    for symbol in img_content:
        pos = symbol["pos"]
        if pos[0] != pos[1]:
            # Off the diagonal: irrelevant to the rule.
            continue
        if symbol["color"] != color:
            # A single wrongly colored diagonal symbol breaks the rule.
            return False
        has_diagonal_symbol = True
    return has_diagonal_symbol

In [10]:
from gendataset import create_dataset_based_on_rule

create_dataset_based_on_rule(db_path, csv_path, 10000, 10000, blue_diagonal_rule)

100%|██████████| 1000000/1000000 [00:00<00:00, 1053406.66it/s]


In [11]:
from gendataset import extract_sample_from_dataset
extract_sample_from_dataset(csv_path, sample_img_path, 1000, 1000)

20000it [00:00, 252098.66it/s]
