In [1]:
import os

# Root directory of the image database, located under the directory given by the DATA
# environment variable (a KeyError is raised if DATA is not set).
# os.path.join is used instead of string concatenation so the path is correct whether or
# not DATA ends with a path separator; the final "" component keeps the trailing separator
# that the previous concatenated value had.
db_dir = os.path.join(os.environ["DATA"], "PatImgXAI_data", "db2.0.0", "")
os.makedirs(db_dir, exist_ok=True)

# Parameters forwarded to create_dataset_generic_rule_extract_sample in the
# dataset-creation cells of this notebook.
test_datasets_sizes = 1000           # passed as test_size
valid_datasets_sizes = 1000          # passed as valid_size
full_datasets_pos_samples_nb = 5000  # passed as dataset_pos_samples_nb
full_datasets_neg_samples_nb = 5000  # passed as dataset_neg_samples_nb
sample_nb_per_class = 100            # passed as sample_nb_per_class

In [2]:
# How many new images the generation cell adds to the database
NBGEN = 1_000_000

# Each image is split into an X_DIVISIONS x Y_DIVISIONS grid of cells
X_DIVISIONS = 6
Y_DIVISIONS = 6

# Image size in pixels (stored in every database entry under "size")
img_size = (700, 700)

# Per-cell probability of placing a shape
SHAPE_PROB = 0.5

# Shapes and colors a cell can be filled with (one of each is drawn at random)
SHAPES = [
    "circle",
    "square",
    "triangle",
]
COLORS = [
    "#A33E9A",  # purple
    "#E0B000",  # yellow
    "#0C90C0",  # blue
]

In [3]:
from xaipatimg.datagen.dbimg import load_db

# Load the image database through load_db(db_dir). The cells below assign entries with
# db[imgid] = {...}, iterate with db.items() and call len(db), i.e. `db` is used as a
# dict-like mapping (exact return type of load_db is not visible here — TODO confirm).
db = load_db(db_dir)

In [None]:
import numpy as np
from xaipatimg.datagen.dbimg import generate_uuid
import os

def _content_key(content):
    """Return a hashable key identifying an image content list.

    The key only depends on the (shape, position, color) values of each item, not on how
    they are represented: NumPy string scalars and plain ``str`` give the same key, and so
    do tuple and list positions. This makes freshly generated content comparable with
    content that was loaded back from storage (e.g. positions decoded as lists).
    Assumes every item has the "shape", "pos" and "color" keys written by this cell.
    """
    return tuple(
        (str(item["shape"]), tuple(item["pos"]), str(item["color"]))
        for item in content
    )


to_generate = NBGEN

# Start the duplicate filter from what is already in the database: `db` comes from
# load_db and may already contain images, so an empty filter would allow re-running this
# cell to insert content that duplicates existing entries.
unique_content_generated = {_content_key(entry["content"]) for _, entry in db.items()}
duplicate_count = 0

while to_generate > 0:
    # Visit every grid cell and place a random shape with probability SHAPE_PROB.
    content = []
    for i in range(X_DIVISIONS):
        for j in range(Y_DIVISIONS):
            if np.random.random() < SHAPE_PROB:
                content.append({
                    # np.random.choice returns NumPy scalars; store plain strings.
                    "shape": str(np.random.choice(SHAPES)),
                    "pos": (i, j),
                    "color": str(np.random.choice(COLORS))
                })

    key = _content_key(content)
    if key in unique_content_generated:
        # Same content already present: draw again without consuming the budget.
        duplicate_count += 1
        continue

    imgid = generate_uuid()
    db[imgid] = {
        "path": os.path.join("img", imgid + ".png"),
        "division" : (X_DIVISIONS, Y_DIVISIONS),
        "size": img_size,
        "content": content
    }

    unique_content_generated.add(key)
    to_generate -= 1

print("unique generated in DB : " + str(len(db)))
print("duplicates avoided : " + str(duplicate_count))

In [6]:
import tqdm

# Sanity check: count the database entries whose content (compared through its string
# representation) was already seen on an earlier entry.
content_dict = {}
nb_duplicates = 0

for _, entry in tqdm.tqdm(db.items()):
    signature = str(entry["content"])
    if signature not in content_dict:
        # First occurrence of this content: remember it.
        content_dict[signature] = True
        continue
    nb_duplicates += 1

print(nb_duplicates)

100%|██████████| 1000000/1000000 [00:38<00:00, 25679.41it/s]

0





In [7]:
from xaipatimg.datagen.genimg import gen_img_and_save_db
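# The call below is left commented out: the recorded progress bar under this cell shows
# about 12 minutes for 1,000,000 items. Uncomment it to (re)generate the images and save the db.
# gen_img_and_save_db(db, db_dir, overwrite=True, n_jobs=20)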

100%|██████████| 1000000/1000000 [12:23<00:00, 1345.24it/s]


## Interface prototype v2

In [None]:
# Directory of the "prototype v2" datasets: <db_dir>/datasets/01_protov2. It is passed as
# datasets_dir_path and used to build sample_path in the dataset-creation loop below.
datasets_path = os.path.join(db_dir, "datasets", "01_protov2")

In [None]:
from xaipatimg.datagen.gendataset import generic_rule_exist_row_with_only_shape, generic_rule_N_times_color_exactly, \
    generic_rule_shape_color_plus_shape_equals_N, generic_rule_exist_row_with_only_color_and_col_with_only_shape, \
    generic_rule_shape_in_every_row

# Rule definitions for the prototype v2 datasets. Each active (uncommented) entry is a dict:
#   - "name": used by the dataset-creation loop to build the CSV file names and the
#     sample folder name;
#   - "gen_fun" / "gen_kwargs": rule function and keyword arguments forwarded to
#     create_dataset_generic_rule_extract_sample by the dataset-creation loop;
#   - "question", "target_acc", "samples_interface", "pos_llm_scaffold",
#     "neg_llm_scaffold": not read anywhere in this notebook — presumably consumed by the
#     interface / LLM explanation tooling (TODO confirm).
# Commented-out entries are simply not part of the list and are therefore skipped.
rules_data = [
    # {"name": "disc_1_triangle_all", "gen_fun": generic_rule_shape_in_every_row, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS}, "question": "In the image, is there a triangle in every row (1, ..., 6)?", "target_acc" : 1.0, "samples_interface": 5, "pos_llm_scaffold": "The AI predicts |YES| because every row contains at least one triangle : \n - Row 1 : XX, XX, XX\n- Row 2 : XX, XX, XX\n- Row 3 : XX, XX, XX\n- Row 4 : XX, XX, XX\n- Row 5 : XX, XX, XX\n- Row 5 : XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because the rows X and X do not contain any triangle."},

    {"name": "easy_1_6_blue", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#0C90C0", "N": 6, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, is there exactly 6 blue symbols?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 6 blue symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X blue symbols instead of 6. They are located at : \n- XX\n- XX\n- XX\n- XX\n- XX."},

    {"name": "easy_2_row_circle", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "circle", "y_division": Y_DIVISIONS},
     "question": "In the image, is there at least one row (1, ..., 6) containing only circles?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only circles : \nRow X contains only circles which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only circles :\nRow 1 contains a non-circle symbol at XX\nRow 2 contains non-circle symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},

    {"name": "easy_3_7_purple", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#A33E9A", "N": 7, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, is there exactly 7 purple symbols?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 7 purple symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X purple symbols instead of 7. They are located at : \n- XX\n- XX\n- XX\n- XX\n- XX\n- XX."},

    {"name": "easy_4_row_triangle", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS},
     "question": "In the image, is there at least one row (1, ..., 6) containing only triangles?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only triangles : \nRow X contains only triangles which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only triangles :\nRow 1 contains a non-triangle symbol at XX\nRow 2 contains non-triangle symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},

    # {"name": "easy_5_5_yellow", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#E0B000", "N": 5, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, is there exactly 5 yellow symbols?", "target_acc": 0.8, "samples_interface": 10,  "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 5 yellow symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X yellow symbols instead of 5, which are located at : \n- XX\n- XX\n- XX\n- XX\n- XX\n- XX."},

    {"name": "easy_6_row_square", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "square", "y_division": Y_DIVISIONS},
     "question": "In the image, is there at least one row (1, ..., 6) containing only squares?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only squares : \nRow X contains only squares which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only squares :\nRow 1 contains a non-square symbol at XX\nRow 2 contains non-square symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},



    {"name": "hard_1_blue_square_plus_circle_8", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#0C90C0", "shape1": "square", "shape2": "circle", "N": 8, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS,},
     "question": "In the image, does the number of blue squares plus (+) the number of circles equal to 8?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X blue squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = 8", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X blue squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 8"},

    # {"name": "hard_2_row_purple_col_triangle", "gen_fun": generic_rule_exist_row_with_only_color_and_col_with_only_shape, "gen_kwargs": {"color": "#A33E9A", "shape": "triangle" ,"x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only purple symbols, and one column (A, ..., F) containing only triangles?", "target_acc": 0.8, "samples_interface": 10},

    {"name": "hard_3_yellow_circle_plus_triangle_9", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#E0B000", "shape1": "circle", "shape2": "triangle", "N": 9, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
     "question": "In the image, does the number of yellow circles plus (+) the number of triangles equal to 9?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X yellow circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = 9", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X yellow circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 9"},

    # {"name": "hard_4_row_yellow_col_circle", "gen_fun": generic_rule_exist_row_with_only_color_and_col_with_only_shape, "gen_kwargs": {"color": "#E0B000", "shape": "circle" ,"x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only yellow symbols, and one column (A, ..., F) containing only circles?", "target_acc": 0.8, "samples_interface": 10},

    {"name": "hard_5_purple_triangle_plus_square_7", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#A33E9A", "shape1": "triangle", "shape2": "square", "N": 7, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
     "question": "In the image, does the number of purple triangles plus (+) the number of squares equal to 7?", "target_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X purple triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = 7", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X purple triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 7"},

    # {"name": "hard_6_row_blue_col_square", "gen_fun": generic_rule_exist_row_with_only_color_and_col_with_only_shape, "gen_kwargs": {"color": "#0C90C0", "shape": "square" ,"x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only blue symbols, and one column (A, ..., F) containing only squares?", "target_acc": 0.8, "samples_interface": 10},
]

In [5]:
from xaipatimg.datagen.gendataset import create_dataset_generic_rule_extract_sample
import tqdm

# Call create_dataset_generic_rule_extract_sample once per active rule; the CSV file
# names and the sample folder are all derived from the rule name.
for rule_line in tqdm.tqdm(rules_data):
    name = rule_line["name"]
    sample_path = os.path.join(datasets_path, f"{name}_train")
    create_dataset_generic_rule_extract_sample(
        db_dir,
        datasets_dir_path=datasets_path,
        csv_name_train=f"{name}_train.csv",
        csv_name_test=f"{name}_test.csv",
        csv_name_valid=f"{name}_valid.csv",
        test_size=test_datasets_sizes,
        valid_size=valid_datasets_sizes,
        dataset_pos_samples_nb=full_datasets_pos_samples_nb,
        dataset_neg_samples_nb=full_datasets_neg_samples_nb,
        sample_path=sample_path,
        sample_nb_per_class=sample_nb_per_class,
        generic_rule_fun=rule_line["gen_fun"],
        **rule_line["gen_kwargs"]
    )

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1000000 [00:00<?, ?it/s][A
  1%|          | 5884/1000000 [00:00<00:16, 58827.43it/s][A
  1%|          | 11904/1000000 [00:00<00:16, 59628.35it/s][A
  2%|▏         | 17903/1000000 [00:00<00:16, 59792.71it/s][A
  2%|▏         | 23883/1000000 [00:00<00:16, 58801.60it/s][A
  3%|▎         | 29828/1000000 [00:00<00:16, 59030.72it/s][A
  4%|▎         | 35742/1000000 [00:00<00:16, 59066.49it/s][A
  4%|▍         | 41742/1000000 [00:00<00:16, 59368.93it/s][A
  5%|▍         | 47730/1000000 [00:00<00:15, 59530.12it/s][A
  5%|▌         | 53684/1000000 [00:00<00:16, 58783.63it/s][A
  6%|▌         | 59568/1000000 [00:01<00:15, 58798.81it/s][A
  7%|▋         | 65478/1000000 [00:01<00:15, 58888.34it/s][A
  7%|▋         | 71377/1000000 [00:01<00:15, 58915.64it/s][A
  8%|▊         | 77285/1000000 [00:01<00:15, 58963.49it/s][A
  8%|▊         | 83182/1000000 [00:01<00:15, 58208.28it/s][A
  9%|▉         | 89042/1000000 [00:01<00:15, 583

Total number of positive instances found in database : 86934
Total number of negative instances found in database : 913066



0it [00:00, ?it/s][A
13it [00:00, 129.80it/s][A
26it [00:00, 121.41it/s][A
45it [00:00, 150.45it/s][A
62it [00:00, 156.70it/s][A
78it [00:00, 151.82it/s][A
96it [00:00, 159.10it/s][A
112it [00:00, 153.87it/s][A
128it [00:00, 155.70it/s][A
145it [00:00, 159.18it/s][A
163it [00:01, 164.20it/s][A
180it [00:01, 156.49it/s][A
8001it [00:01, 6063.58it/s]A
100%|██████████| 1/1 [00:58<00:00, 58.33s/it]


## Interface prototype v3

In [4]:
# Directory of the "prototype v3" datasets: <db_dir>/datasets/02_protov3. It is passed as
# datasets_dir_path and used to build sample_path in the dataset-creation loop below.
datasets_path = os.path.join(db_dir, "datasets", "02_protov3")

In [5]:
from xaipatimg.datagen.gendataset import generic_rule_exist_row_with_only_shape, generic_rule_N_times_color_exactly, \
    generic_rule_shape_color_plus_shape_equals_N, generic_rule_shape_in_every_row, generic_rule_shape_color_times_2_shape_equals_shape

# Rule definitions for the prototype v3 datasets. Each active (uncommented) entry is a dict:
#   - "name": used by the dataset-creation loop to build the CSV file names and the
#     sample folder name;
#   - "gen_fun" / "gen_kwargs": rule function and keyword arguments forwarded to
#     create_dataset_generic_rule_extract_sample by the dataset-creation loop;
#   - "question", "question_llm", "target_acc", "shown_acc", "samples_interface",
#     "pos_llm_scaffold", "neg_llm_scaffold": not read anywhere in this notebook —
#     presumably consumed by the interface / LLM explanation tooling (TODO confirm).
# Commented-out entries are not part of the list, so only the uncommented rule(s) are
# processed when the dataset-creation loop runs.
rules_data = [
    # {"name": "disc_1_triangle_all", "gen_fun": generic_rule_shape_in_every_row, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS}, "question": "In the image, is there a triangle in every row (1, ..., 6)?", "target_acc" : 1.0, "shown_acc" : 1.0, "samples_interface": 5, "pos_llm_scaffold": "The AI predicts |YES| because every row contains at least one triangle : \n - Row 1 : XX, XX, XX\n- Row 2 : XX, XX, XX\n- Row 3 : XX, XX, XX\n- Row 4 : XX, XX, XX\n- Row 5 : XX, XX, XX\n- Row 5 : XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because the rows X and X do not contain any triangle."},
    #
    # {"name": "disc_1_triangle_all_2", "gen_fun": generic_rule_shape_in_every_row, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS}, "question": "In the image, is there a triangle in every row (1, ..., 6)?", "target_acc" : 1.0, "shown_acc" : 1.0, "samples_interface": 5, "pos_llm_scaffold": "The AI predicts |YES| because every row contains at least one triangle : \n - Row 1 : XX, XX, XX\n- Row 2 : XX, XX, XX\n- Row 3 : XX, XX, XX\n- Row 4 : XX, XX, XX\n- Row 5 : XX, XX, XX\n- Row 5 : XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because the rows X and X do not contain any triangle."},
    #
    # {"name": "disc_1_triangle_all_3", "gen_fun": generic_rule_shape_in_every_row, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS}, "question": "In the image, is there a triangle in every row (1, ..., 6)?", "target_acc" : 1.0, "shown_acc" : 1.0, "samples_interface": 5, "pos_llm_scaffold": "The AI predicts |YES| because every row contains at least one triangle : \n - Row 1 : XX, XX, XX\n- Row 2 : XX, XX, XX\n- Row 3 : XX, XX, XX\n- Row 4 : XX, XX, XX\n- Row 5 : XX, XX, XX\n- Row 5 : XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because the rows X and X do not contain any triangle."},
    #
    #
    # {"name": "easy_1_6_blue", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#0C90C0", "N": 6, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, are there exactly 6 blue symbols?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 6 blue symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X blue symbols instead of 6. They are located at : \n- XX\n- XX\n- XX\n- XX\n- XX."},
    #
    # {"name": "easy_2_row_circle", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "circle", "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only circles?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only circles : \nRow X contains only circles which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only circles :\nRow 1 contains a non-circle symbol at XX\nRow 2 contains non-circle symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},
    #
    # {"name": "easy_3_7_purple", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#A33E9A", "N": 7, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, are there exactly 7 purple symbols?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 7 purple symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X purple symbols instead of 7. They are located at : \n- XX\n- XX\n- XX\n- XX\n- XX\n- XX."},
    #
    # {"name": "easy_4_row_triangle", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "triangle", "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only triangles?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only triangles : \nRow X contains only triangles which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only triangles :\nRow 1 contains a non-triangle symbol at XX\nRow 2 contains non-triangle symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},
    #
    # {"name": "easy_5_5_yellow", "gen_fun": generic_rule_N_times_color_exactly, "gen_kwargs": {"color": "#E0B000", "N": 5, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS}, "question": "In the image, are there exactly 5 yellow symbols?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10,  "pos_llm_scaffold": "The AI predicts |YES| because there is exactly 5 yellow symbols, which are located at :\n- XX\n- XX\n- XX\n- XX\n- XX", "neg_llm_scaffold": "The AI predicts |NO| because there is X yellow symbols instead of 5, which are located at : \n- XX\n- XX\n- XX\n- XX\n- XX\n- XX."},
    #
    # {"name": "easy_6_row_square", "gen_fun": generic_rule_exist_row_with_only_shape, "gen_kwargs": {"shape": "square", "y_division": Y_DIVISIONS},
    #  "question": "In the image, is there at least one row (1, ..., 6) containing only squares?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because there is at least one row which contains only squares : \nRow X contains only squares which are located at XX, XX, XX", "neg_llm_scaffold": "The AI predicts |NO| because there is not a single row containing only squares :\nRow 1 contains a non-square symbol at XX\nRow 2 contains non-square symbols at XX, XX, XX.\nRow 3 does not contain any symbol at all\n ..."},
    #
    #
    # {"name": "hard_1_blue_square_plus_circle_8", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#0C90C0", "shape1": "square", "shape2": "circle", "N": 8, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS,},
    #  "question": "In the image, does the number of blue squares plus (+) the number of circles equal to 8?", "question_llm": "In the image, does the number of blue squares plus (+) the number of circles of any color equal to 8", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X blue squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X + X = 8", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X blue squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 8"},
    #
    # {"name": "hard_2_yellow_triangles_times2_squares", "gen_fun": generic_rule_shape_color_times_2_shape_equals_shape, "gen_kwargs": {"color1": "#E0B000", "shape1": "triangle", "shape2": "square", "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS,},
    #  "question": "In the image, does the number of yellow triangles multiplied by 2 (×2) equal to the number of squares?", "question_llm": "In the image, does the number of yellow triangles multiplied by 2 (×2) equal to the number of squares of any color ?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X yellow triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X", "neg_llm_scaffold": "There is a total of X yellow triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X ≠ X"},
    #
    # {"name": "hard_3_yellow_circle_plus_triangle_9", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#E0B000", "shape1": "circle", "shape2": "triangle", "N": 9, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
    #  "question": "In the image, does the number of yellow circles plus (+) the number of triangles equal to 9?",
    #  "question_llm": "In the image, does the number of yellow circles plus (+) the number of triangles of any color equal to 9?","target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X yellow circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X + X = 9", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X yellow circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 9"},
    # #
    # {"name": "hard_4_purple_squares_times2_circles", "gen_fun": generic_rule_shape_color_times_2_shape_equals_shape, "gen_kwargs": {"color1": "#A33E9A", "shape1": "square", "shape2": "circle", "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS,},
    #  "question": "In the image, does the number of purple squares multiplied by 2 (×2) equal to the number of circles?",
    #  "question_llm": "In the image, does the number of purple squares multiplied by 2 (×2) equal to the number of circles of any color?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X purple squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X", "neg_llm_scaffold": "There is a total of X purple squares at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X circles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X ≠ X"},
    #
    # {"name": "hard_5_purple_triangle_plus_square_7", "gen_fun": generic_rule_shape_color_plus_shape_equals_N, "gen_kwargs": {"color1": "#A33E9A", "shape1": "triangle", "shape2": "square", "N": 7, "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS},
    #  "question": "In the image, does the number of purple triangles plus (+) the number of squares equal to 7?",
    #  "question_llm": "In the image, does the number of purple triangles plus (+) the number of squares of any color equal to 7?", "target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X purple triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n\n X + X = 7", "neg_llm_scaffold": "The AI predicts |NO| because \n\n There is a total of X purple triangles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X squares at positions : \n- XX\n- XX\n- XX\n- XX\n X + X = X ≠ 7"},

    # NOTE(review): the "neg_llm_scaffold" of the entry below starts directly with
    # "There is a total of ...", whereas its "pos_llm_scaffold" starts with
    # "The AI predicts |YES| because". The "The AI predicts |NO| because" lead-in may be
    # missing — confirm whether this is intentional.
    {"name": "hard_6_blue_circles_times2_triangles", "gen_fun": generic_rule_shape_color_times_2_shape_equals_shape, "gen_kwargs": {"color1": "#0C90C0", "shape1": "circle", "shape2": "triangle", "x_division": X_DIVISIONS, "y_division": Y_DIVISIONS,},
     "question": "In the image, does the number of blue circles multiplied by 2 (×2) equal to the number of triangles?", "question_llm": "In the image, does the number of blue circles multiplied by 2 (×2) equal to the number of triangles of any color?","target_acc": 0.9, "shown_acc": 0.8, "samples_interface": 10, "pos_llm_scaffold": "The AI predicts |YES| because \n\n There is a total of X blue circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X", "neg_llm_scaffold": "There is a total of X blue circles at positions : \n- XX\n- XX\n- XX\n- XX\n \nThere is a total of X triangles at positions : \n- XX\n- XX\n- XX\n- XX\n\n X × 2 = X ≠ X"},
]


In [6]:
from xaipatimg.datagen.gendataset import create_dataset_generic_rule_extract_sample
import tqdm

# Call create_dataset_generic_rule_extract_sample once per active rule; the CSV file
# names and the sample folder are all derived from the rule name.
for rule_line in tqdm.tqdm(rules_data):
    name = rule_line["name"]
    sample_path = os.path.join(datasets_path, f"{name}_train")
    create_dataset_generic_rule_extract_sample(
        db_dir,
        datasets_dir_path=datasets_path,
        csv_name_train=f"{name}_train.csv",
        csv_name_test=f"{name}_test.csv",
        csv_name_valid=f"{name}_valid.csv",
        test_size=test_datasets_sizes,
        valid_size=valid_datasets_sizes,
        dataset_pos_samples_nb=full_datasets_pos_samples_nb,
        dataset_neg_samples_nb=full_datasets_neg_samples_nb,
        sample_path=sample_path,
        sample_nb_per_class=sample_nb_per_class,
        generic_rule_fun=rule_line["gen_fun"],
        **rule_line["gen_kwargs"]
    )

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1000000 [00:00<?, ?it/s][A
  0%|          | 1223/1000000 [00:00<01:21, 12227.59it/s][A
  0%|          | 2446/1000000 [00:00<01:22, 12127.97it/s][A
  0%|          | 3659/1000000 [00:00<01:22, 12039.01it/s][A
  0%|          | 4872/1000000 [00:00<01:22, 12071.14it/s][A
  1%|          | 6080/1000000 [00:00<01:22, 12034.86it/s][A
  1%|          | 7284/1000000 [00:00<01:22, 12017.77it/s][A
  1%|          | 8486/1000000 [00:00<01:22, 11995.61it/s][A
  1%|          | 9691/1000000 [00:00<01:22, 12009.88it/s][A
  1%|          | 10897/1000000 [00:00<01:22, 12022.58it/s][A
  1%|          | 12100/1000000 [00:01<01:22, 12009.20it/s][A
  1%|▏         | 13301/1000000 [00:01<01:22, 11982.45it/s][A
  1%|▏         | 14500/1000000 [00:01<01:22, 11965.30it/s][A
  2%|▏         | 15700/1000000 [00:01<01:22, 11975.63it/s][A
  2%|▏         | 16903/1000000 [00:01<01:21, 11991.19it/s][A
  2%|▏         | 18112/1000000 [00:01<01:21, 12017.96it

Total number of positive instances found in database : 86920
Total number of negative instances found in database : 913080



0it [00:00, ?it/s][A
22it [00:00, 211.06it/s][A
59it [00:00, 302.92it/s][A
94it [00:00, 322.79it/s][A
133it [00:00, 348.04it/s][A
8001it [00:00, 13305.78it/s]
100%|██████████| 1/1 [02:01<00:00, 121.85s/it]
