### Database with 4*4 images

shapes : square, triangle, circle
colors : light red, light blue, light yellow, light green

dataset size : 2,000,000

probability to have a symbol at each position : 0.5

Same as v0.1.2, but with 2M images and with a check during generation that ensures the DB does not contain any duplicates.



In [1]:
import os

# Root directory of this version of the database, located under the directory
# given by the DATA environment variable (a KeyError is raised if it is unset).
# os.path.join is used rather than string concatenation so that the path is
# correct whether or not $DATA ends with a path separator; the empty last
# component keeps the trailing separator that the concatenated form had.
db_dir = os.path.join(os.environ["DATA"], "PatImgXAI_data", "db0.1.3", "")
os.makedirs(db_dir, exist_ok=True)

In [2]:
from xaipatimg.datagen.dbimg import load_db

# Load the image database stored under db_dir. The cells below use `db` as a
# dict-like mapping from an image id to the description of that image.
# NOTE(review): presumably an empty mapping is returned when nothing has been
# saved in db_dir yet - confirm in load_db.
db = load_db(db_dir)

In [3]:
# Target number of images in the database
NBGEN = 2_000_000

# Each image is split into a grid of X_DIVISIONS by Y_DIVISIONS cells
X_DIVISIONS = Y_DIVISIONS = 4

# (width, height) of the rendered images, in pixels
img_size = (700, 700)

# Chance, for every cell of the grid, that a symbol is drawn in it
SHAPE_PROB = 0.5

# Symbols that can be drawn, and the colors they can take
# (in order: light red, light blue, light yellow, light green)
SHAPES = ['circle', 'square', 'triangle']
COLORS = [
    "#F86C62",
    "#7AB0CD",
    "#F4D67B",
    "#87C09C",
]

In [None]:
import numpy as np
from xaipatimg.datagen.dbimg import generate_uuid
import os


def _content_key(content):
    """Build a hashable key identifying the content of an image.

    Every value is normalised to a plain Python type, so the key does not depend
    on how the content is represented (numpy strings vs. str, positions given as
    tuples vs. lists, e.g. if a reloaded database stores positions as lists).
    """
    return tuple(
        (str(symbol["shape"]), tuple(int(p) for p in symbol["pos"]), str(symbol["color"]))
        for symbol in content
    )


# Start from the contents that are already in `db`, so that running this cell on
# a database that was loaded from disk (or running it twice) neither re-creates
# an existing image nor grows the database beyond NBGEN entries.
unique_content_generated = {_content_key(entry["content"]) for entry in db.values()}
to_generate = max(0, NBGEN - len(db))
duplicate_count = 0

while to_generate > 0:
    # Visit every cell of the grid and put a symbol in it with probability SHAPE_PROB
    content = []
    for i in range(X_DIVISIONS):
        for j in range(Y_DIVISIONS):
            if np.random.random() < SHAPE_PROB:
                content.append({
                    # np.random.choice returns numpy strings: store plain str values
                    "shape": str(np.random.choice(SHAPES)),
                    "pos": (i, j),
                    "color": str(np.random.choice(COLORS))
                })

    # Reject the draw if an image with the same content already exists
    key = _content_key(content)
    if key in unique_content_generated:
        duplicate_count += 1
        continue

    imgid = generate_uuid()
    db[imgid] = {
        "path": os.path.join("img", imgid + ".png"),
        "division" : (X_DIVISIONS, Y_DIVISIONS),
        "size": img_size,
        "content": content
    }

    unique_content_generated.add(key)
    to_generate -= 1

print("unique generated in DB : " + str(len(db)))
print("duplicates avoided : " + str(duplicate_count))

## Validating no duplicate is found

In [None]:
import tqdm

# Independent check of the database: count the entries whose content (compared
# through its string form) was already met earlier in the iteration.
content_dict = {}
nb_duplicates = 0

for imgid, entry in tqdm.tqdm(db.items()):
    content_key = str(entry["content"])
    if content_key not in content_dict:
        content_dict[content_key] = True
        continue
    nb_duplicates += 1

print(nb_duplicates)

In [None]:
from xaipatimg.datagen.genimg import gen_img_and_save_db
# NOTE(review): judging by its name, this generates the image files described in
# `db` and saves the database under db_dir - confirm in genimg.
# NOTE(review): n_jobs=190 is hard-coded and presumably sized for one specific
# machine - adjust to the number of cores available.
gen_img_and_save_db(db, db_dir, overwrite=False, n_jobs=190)

### Create dataset with blue diagonal rule

In [None]:
def blue_diagonal_rule(img_content):
    """Tell whether the diagonal of an image holds blue symbols only.

    :param img_content: list of symbols, each one a dict with (at least) a "pos"
        pair and a "color" string.
    :return: True if at least one symbol lies on the diagonal (both coordinates
        of "pos" equal) and all the symbols lying on it have color "#7AB0CD".
    """
    diagonal_colors = [
        symbol["color"] for symbol in img_content if symbol["pos"][0] == symbol["pos"][1]
    ]
    # An empty diagonal does not satisfy the rule
    return len(diagonal_colors) >= 1 and all(color == "#7AB0CD" for color in diagonal_colors)

In [None]:
from xaipatimg.datagen.gendataset import create_dataset_based_on_rule
import os
# Names of the CSV files of the train / test / validation sets of the blue diagonal rule
csv_name_train = "bluediagonal_train.csv"
csv_name_test = "bluediagonal_test.csv"
csv_name_valid = "bluediagonal_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "bluediagonal_train")
# Create the dataset from the database in db_dir, using blue_diagonal_rule as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=blue_diagonal_rule)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract a sample of the training set into sample_img_path. Keyword arguments are
# used, as in every other call of this notebook, so that the meaning of the two
# sample sizes is explicit and does not depend on the order of the parameters.
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

### Create dataset with exactly four squares rule

In [None]:
def four_squares_total(img_content):
    """Tell whether an image contains exactly four squares.

    :param img_content: list of symbols, each one a dict with a "shape" entry.
    :return: True if exactly four symbols have the shape "square", else False.
    """
    return sum(1 for symbol in img_content if symbol["shape"] == "square") == 4

In [None]:
from xaipatimg.datagen.gendataset import create_dataset_based_on_rule
import os
# Names of the CSV files of the train / test / validation sets of the four squares rule
csv_name_train = "foursquares_train.csv"
csv_name_test = "foursquares_test.csv"
csv_name_valid = "foursquares_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "foursquares_train")

# Create the dataset from the database in db_dir, using four_squares_total as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=four_squares_total)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive and 1000 negative samples of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

### Create dataset with exactly three squares rule


In [None]:
def three_squares_total(img_content):
    """Tell whether an image contains exactly three squares.

    :param img_content: list of symbols, each one a dict with a "shape" entry.
    :return: True if exactly three symbols have the shape "square", else False.
    """
    shapes = [symbol["shape"] for symbol in img_content]
    return shapes.count("square") == 3

In [None]:
# Names of the CSV files of the train / test / validation sets of the three squares rule
# (os and create_dataset_based_on_rule are the ones imported by the cells above)
csv_name_train = "threesquares_train.csv"
csv_name_test = "threesquares_test.csv"
csv_name_valid = "threesquares_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "threesquares_train")

# Create the dataset from the database in db_dir, using three_squares_total as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=three_squares_total)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive and 1000 negative samples of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

### Create dataset with any image rule (to show sample of the full database)

In [None]:
def any_image(img_content):
    """Rule that accepts every image: returns True whatever img_content is."""
    return True

from xaipatimg.datagen.gendataset import create_dataset_based_on_rule
import os
# Names of the CSV files of the train / test / validation sets of the "any image" rule
csv_name_train = "any_train.csv"
csv_name_test = "any_test.csv"
csv_name_valid = "any_valid.csv"
# Directory of the image sample (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "any_sample")

# any_image is true for every image, hence no negative sample is requested
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=0, rule_fun=any_image)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive samples (and no negative one) of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=0)

### Create dataset "twice as many red symbols as blue symbols"

In [None]:
def twice_as_many_red_as_blue(img_content):
    """Tell whether an image has exactly twice as many red symbols as blue ones.

    :param img_content: list of symbols, each one a dict with a "color" entry.
    :return: True if the image has at least one blue ("#7AB0CD") symbol and the
        number of red ("#F86C62") symbols is twice the number of blue symbols.
    """
    palette = [symbol["color"] for symbol in img_content]
    red_count = palette.count("#F86C62")
    blue_count = palette.count("#7AB0CD")

    # Without any blue symbol the rule is not satisfied, even with no red symbol
    if blue_count == 0:
        return False
    return red_count == 2 * blue_count

In [None]:
import os
from xaipatimg.datagen.gendataset import create_dataset_based_on_rule

# Names of the CSV files of the train / test / validation sets of the red / blue rule
csv_name_train = "twiceasmanyredasblue_train.csv"
csv_name_test = "twiceasmanyredasblue_test.csv"
csv_name_valid = "twiceasmanyredasblue_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "twiceasmanyredasblue_train")

# Create the dataset from the database in db_dir, using twice_as_many_red_as_blue as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=twice_as_many_red_as_blue)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive and 1000 negative samples of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

### Create dataset "twice as many triangles as circles"


In [None]:
def twice_as_many_triangles_as_circles(img_content):
    """Tell whether an image has exactly twice as many triangles as circles.

    :param img_content: list of symbols, each one a dict with a "shape" entry.
    :return: True if the image has at least one circle and the number of
        triangles is twice the number of circles.
    """
    nb_triangles = sum(1 for symbol in img_content if symbol["shape"] == "triangle")
    nb_circles = sum(1 for symbol in img_content if symbol["shape"] == "circle")

    # An image without any circle never satisfies the rule
    return nb_circles > 0 and nb_triangles == 2 * nb_circles

In [None]:
import os
from xaipatimg.datagen.gendataset import create_dataset_based_on_rule

# Names of the CSV files of the train / test / validation sets of the triangles / circles rule
csv_name_train = "twiceasmanytrianglesascircles_train.csv"
csv_name_test = "twiceasmanytrianglesascircles_test.csv"
csv_name_valid = "twiceasmanytrianglesascircles_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "twiceasmanytrianglesascircles_train")

# Create the dataset from the database in db_dir, using twice_as_many_triangles_as_circles as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=twice_as_many_triangles_as_circles)

In [None]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive and 1000 negative samples of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

### Create dataset "As many shapes of color c2 on top as shapes of color c1 on bottom"

Top part : first twelve cells (the first three rows). Bottom part : last four cells (the last row).
c1 : the most frequent color in the bottom part.
c2 : the second most frequent color in the bottom part.


In [4]:
import numpy as np
def as_many_c2_c1_top_bottom(img_content, colors=None, bottom_row=3):
    """Tell whether the top part has as many c2 symbols as the bottom part has c1 symbols.

    The bottom part is the row whose index is ``bottom_row`` and the top part is
    made of the rows above it. c1 is the most frequent color of the bottom part
    and c2 the second most frequent one. The rule can only hold when c1 and c2
    are unambiguous, i.e. when the three highest color counts of the bottom part
    are not tied (first with second, or second with third).

    :param img_content: list of symbols, each one a dict with a "pos" pair (the
        second coordinate being the row) and a "color" string. Symbols whose
        color is not part of the palette are ignored.
    :param colors: palette to consider (at least three colors). Defaults to the
        notebook-wide COLORS list.
    :param bottom_row: index of the row forming the bottom part (3, the last row
        of a four rows grid, by default).
    :return: a plain Python bool (the original returned a (1, 1) numpy array
        when the rule was evaluated and a bool only on the early exit).
    """
    # The global palette is only looked up when no palette is given explicitly
    palette = list(COLORS if colors is None else colors)
    color_index = {color: idx for idx, color in enumerate(palette)}

    # Count the symbols of each color on the bottom row and above it, in one pass
    bottom_counts = np.zeros(len(palette), dtype=int)
    top_counts = np.zeros(len(palette), dtype=int)
    for symbol in img_content:
        row = symbol["pos"][1]
        idx = color_index.get(symbol["color"])
        if idx is None:
            continue
        if row == bottom_row:
            bottom_counts[idx] += 1
        elif row < bottom_row:
            top_counts[idx] += 1

    # Indices of the colors, from the most to the least frequent on the bottom row
    ranking = np.argsort(bottom_counts)[::-1]
    first, second, third = bottom_counts[ranking[:3]]

    # c1 and c2 must be uniquely defined: no tie among the three highest counts
    if first == second or second == third:
        return False

    # ranking[1] is the index of c2, `first` is the number of c1 symbols on the bottom row
    return bool(top_counts[ranking[1]] == first)

In [5]:
import os
from xaipatimg.datagen.gendataset import create_dataset_based_on_rule

# Names of the CSV files of the train / test / validation sets of the c1 / c2 rule
csv_name_train = "asmanyc1c2bottomtop_train.csv"
csv_name_test = "asmanyc1c2bottomtop_test.csv"
csv_name_valid = "asmanyc1c2bottomtop_valid.csv"
# Directory of the image sample of the training set (used by the extraction cell that follows)
sample_img_path = os.path.join(db_dir, "datasets", "asmanyc1c2bottomtop_train")

# Create the dataset from the database in db_dir, using as_many_c2_c1_top_bottom as the rule function
create_dataset_based_on_rule(db_dir, csv_name_train, csv_name_test, csv_name_valid, test_size=4000, valid_size=4000,
                             dataset_pos_samples_nb=12000, dataset_neg_samples_nb=12000, rule_fun=as_many_c2_c1_top_bottom)

100%|██████████| 2000000/2000000 [01:19<00:00, 25001.01it/s]


Total number of positive instances found in database : 79572
Total number of negative instances found in database : 1920428


In [6]:
from xaipatimg.datagen.gendataset import extract_sample_from_dataset
# Extract 1000 positive and 1000 negative samples of the training set into sample_img_path
extract_sample_from_dataset(db_dir, csv_name_train, output_dir_path=sample_img_path, pos_samples_nb=1000, neg_samples_nb=1000)

16001it [00:12, 1299.27it/s]
