In [2]:
 # imports 
import random
import math
import statistics
import os

In [34]:
# Generate a random, uniformly distributed synthetic database

# Upper bounds on the rejection-sampling loops of step 3, so that unlucky or
# infeasible parameter combinations raise ValueError instead of looping forever.
_MAX_ASSIGNMENT_ATTEMPTS = 1000
_MAX_SEQUENCE_ATTEMPTS = 1000


def _check_parameters(n_objects, n_binary_attrs, bin_density_percent, n_numeric_attrs,
                      distinct_value_percent, n_items, occ_min_percent, occ_max_percent,
                      min_itemsets, max_itemsets):
    """Raise ValueError if any generation parameter is outside its allowed range."""
    if n_objects <= 0 or n_binary_attrs <= 0 or n_numeric_attrs <= 0 or n_items <= 0:
        raise ValueError("n_objects, n_binary_attrs, n_numeric_attrs and n_items must be > 0")

    if not (0 <= bin_density_percent <= 100):
        raise ValueError("bin_density_percent must be in [0, 100]")

    if not (0 <= distinct_value_percent <= 100):
        raise ValueError("distinct_value_percent must be in [0, 100]")

    if not (0 <= occ_min_percent <= occ_max_percent <= 100):
        raise ValueError("occurrence percentages must be in [0, 100] and min <= max")

    if min_itemsets <= 0 or max_itemsets < min_itemsets:
        raise ValueError("min_itemsets must be > 0 and max_itemsets >= min_itemsets")


def _build_binary_matrix(n_objects, n_binary_attrs, bin_density_percent):
    """Return (matrix, target_ones): a 0/1 matrix holding exactly target_ones ones."""
    total_cells = n_objects * n_binary_attrs
    target_ones = int(total_cells * bin_density_percent / 100.0)
    if target_ones > total_cells:
        raise ValueError("Requested number of ones is larger than the number of binary cells")

    matrix = [[0] * n_binary_attrs for _ in range(n_objects)]
    # sample() only needs len() and indexing, so the range can be sampled
    # directly without first copying every cell index into a list.
    for pos in random.sample(range(total_cells), target_ones):
        row, col = divmod(pos, n_binary_attrs)
        matrix[row][col] = 1

    # Sanity check: sampled positions are distinct, so the count must match.
    if sum(map(sum, matrix)) != target_ones:
        raise ValueError("binary ones < > target_ones")
    return matrix, target_ones


def _build_numeric_matrix(n_objects, n_numeric_attrs, distinct_value_percent):
    """Return (matrix, target_distinct): a matrix using every value in 1..target_distinct."""
    total_cells = n_objects * n_numeric_attrs
    target_distinct = int(total_cells * distinct_value_percent / 100.0)
    if target_distinct <= 0:
        raise ValueError("distinct value less than 1")
    if target_distinct > total_cells:
        raise ValueError("distinct value larger than total entries of the matrix")

    matrix = [[None] * n_numeric_attrs for _ in range(n_objects)]
    positions = list(range(total_cells))
    random.shuffle(positions)

    # First pass: place each distinct value once, on cells taken from the end
    # of the shuffled list, so that every value is guaranteed to appear.
    for value in range(1, target_distinct + 1):
        row, col = divmod(positions.pop(), n_numeric_attrs)
        matrix[row][col] = value

    # Second pass: fill the remaining cells with random values from the same range.
    for pos in positions:
        row, col = divmod(pos, n_numeric_attrs)
        matrix[row][col] = random.randint(1, target_distinct)

    distinct_used = {value for row in matrix for value in row}
    if (len(distinct_used) != target_distinct
            or min(distinct_used) != 1
            or max(distinct_used) != target_distinct):
        raise ValueError("Step 2 failed: numeric distinct values < > target_distinct")
    return matrix, target_distinct


def _assign_items(n_objects, n_items, occ_min_percent, occ_max_percent):
    """Return (items_per_object, objects_per_item) with every object owning >= 1 item.

    Each item is given to a random number of objects whose share is drawn
    uniformly from [occ_min_percent, occ_max_percent]. The whole assignment is
    redrawn until no object is left without an item.
    """
    # If even the largest possible occurrence counts cannot cover all objects,
    # retrying can never succeed; fail immediately instead of spinning.
    max_objects_per_item = int(n_objects * occ_max_percent / 100.0)
    if n_items * max_objects_per_item < n_objects:
        raise ValueError(
            "Step 3 failed: n_items and occ_max_percent are too small to give "
            "every instance at least one item"
        )

    for _attempt in range(_MAX_ASSIGNMENT_ATTEMPTS):
        items_per_object = [set() for _ in range(n_objects)]
        objects_per_item = {}

        for item in range(1, n_items + 1):
            percent = random.uniform(occ_min_percent, occ_max_percent)
            count = int(n_objects * percent / 100.0)
            if count > 0:
                for obj in random.sample(range(n_objects), count):
                    items_per_object[obj].add(item)
            objects_per_item[item] = count

        # An empty set is falsy, so this is true only if every object has an item.
        if all(items_per_object):
            return items_per_object, objects_per_item

    raise ValueError(
        "Step 3 failed: could not give every instance at least one item after "
        "{} attempts; increase n_items or the occurrence percentages".format(_MAX_ASSIGNMENT_ATTEMPTS)
    )


def _build_sequence(items_for_obj, min_itemsets, max_itemsets, obj_index):
    """Return a list of sorted itemsets that together use every item in items_for_obj.

    Candidates are drawn at random and rejected until one covers all items;
    ValueError is raised after _MAX_SEQUENCE_ATTEMPTS rejected candidates.
    """
    required = set(items_for_obj)
    for _attempt in range(_MAX_SEQUENCE_ATTEMPTS):
        n_itemsets = random.randint(min_itemsets, max_itemsets)
        itemsets = []
        for _ in range(n_itemsets):
            size = random.randint(1, len(items_for_obj))
            itemsets.append(sorted(random.sample(items_for_obj, size)))

        if set().union(*itemsets) == required:
            return itemsets

    raise ValueError("Step 3 failed: could not build a valid sequence for instance {}".format(obj_index))


def random_uniform_synthetic_data_base(
    n_objects: int,
    n_binary_attrs: int,
    bin_density_percent: float,
    n_numeric_attrs: int,
    distinct_value_percent: float,
    n_items: int,
    occ_min_percent: float,
    occ_max_percent: float,
    min_itemsets: int,
    max_itemsets: int,
    output_path: str,
    seed: int = 42,
) -> dict:
    """Generate a random synthetic database and write it to a text file.

    Every object (row) gets three parts:

    1. binary attributes: exactly ``int(n_objects * n_binary_attrs *
       bin_density_percent / 100)`` cells of the whole binary matrix are 1;
    2. numeric attributes: integers in ``1..D`` where ``D = int(n_objects *
       n_numeric_attrs * distinct_value_percent / 100)``, each value used at
       least once;
    3. a sequence of between ``min_itemsets`` and ``max_itemsets`` itemsets
       over items ``1..n_items``. Each item occurs in a share of the objects
       drawn uniformly from ``[occ_min_percent, occ_max_percent]``, and every
       object owns at least one item.

    File format: the first line is a header of 16 space-separated values
    (n_objects, n_binary_attrs, bin density, number of ones, n_numeric_attrs,
    distinct percent, number of distinct values, n_items, occ min, occ max,
    min_itemsets, max_itemsets, max/mean/std of sequence length, seed). Each
    following line is ``<binary values> -1 <numeric values> -2 <itemsets> -3``
    where every itemset is followed by ``-4``.

    Args:
        n_objects: number of rows; must be > 0.
        n_binary_attrs: number of binary columns; must be > 0.
        bin_density_percent: percentage of binary cells set to 1, in [0, 100].
        n_numeric_attrs: number of numeric columns; must be > 0.
        distinct_value_percent: number of distinct numeric values as a
            percentage of the numeric cell count, in [0, 100]; must yield >= 1.
        n_items: number of distinct sequence items; must be > 0.
        occ_min_percent: lower bound of the per-item occurrence percentage.
        occ_max_percent: upper bound of the per-item occurrence percentage.
        min_itemsets: minimum number of itemsets per sequence; must be > 0.
        max_itemsets: maximum number of itemsets per sequence.
        output_path: destination file; missing parent directories are created.
        seed: seed passed to ``random.seed``. Note that this reseeds the
            module-level generator of ``random``.

    Returns:
        ``{"output_path": ..., "summary": {...}}`` where the summary repeats
        the parameters together with the derived counts and the sequence
        length statistics.

    Raises:
        ValueError: if a parameter is out of range, if the occurrence
            percentages cannot give every object an item, or if random
            generation does not produce a valid result within the attempt
            limits.
    """
    # Step 0: basic checks and seed
    _check_parameters(n_objects, n_binary_attrs, bin_density_percent, n_numeric_attrs,
                      distinct_value_percent, n_items, occ_min_percent, occ_max_percent,
                      min_itemsets, max_itemsets)
    random.seed(seed)

    # Step 1: binary matrix
    binary_matrix, target_ones = _build_binary_matrix(n_objects, n_binary_attrs, bin_density_percent)
    print("Step 1 binary matrix done")

    # Step 2: numeric matrix
    numeric_matrix, target_distinct = _build_numeric_matrix(n_objects, n_numeric_attrs, distinct_value_percent)
    print("Step 2 numeric matrix done")

    # Step 3: sequence database
    assigned_items, target_instances_per_item = _assign_items(
        n_objects, n_items, occ_min_percent, occ_max_percent
    )
    sequences = [
        _build_sequence(sorted(assigned_items[i]), min_itemsets, max_itemsets, i)
        for i in range(n_objects)
    ]
    seq_lengths = [sum(len(itemset) for itemset in sequence) for sequence in sequences]

    # Check that the generated sequences respect the target occurrences.
    occ_instances = {item: 0 for item in range(1, n_items + 1)}
    for sequence in sequences:
        for item in set().union(*sequence):
            occ_instances[item] += 1

    for it, target in target_instances_per_item.items():
        if occ_instances[it] != target:
            raise ValueError(f" step 3 item {it} occurrence in instances ({occ_instances[it]}) " f"does not match target ({target})")

    if not all(min_itemsets <= len(sequence) <= max_itemsets for sequence in sequences):
        raise ValueError("Step 3  number of itemsets per sequence is out of bounds")

    print("Step 3 sequence is done")

    # Metadata
    if not seq_lengths:
        raise ValueError("no sequences generated")
    max_seq_len = max(seq_lengths)
    mean_seq_len = statistics.mean(seq_lengths)
    std_seq_len = statistics.pstdev(seq_lengths)

    # dirname is "" for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    header_values = [
        n_objects, n_binary_attrs, round(bin_density_percent, 3), int(target_ones),
        n_numeric_attrs, round(distinct_value_percent, 3), int(target_distinct),
        n_items, round(occ_min_percent, 3), round(occ_max_percent, 3),
        min_itemsets, max_itemsets,
        int(max_seq_len), round(mean_seq_len, 3), round(std_seq_len, 3), seed,
    ]
    header_str = " ".join(str(v) for v in header_values)

    # Final output
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header_str + "\n")
        for bin_row, num_row, sequence in zip(binary_matrix, numeric_matrix, sequences):
            bin_str = " ".join(map(str, bin_row))
            num_str = " ".join(map(str, num_row))
            # Every itemset is terminated by -4 (an empty one would be just "-4").
            seq_str = " ".join(
                " ".join([*map(str, itemset), "-4"]) for itemset in sequence
            )
            f.write(f"{bin_str} -1 {num_str} -2 {seq_str} -3\n")

    return {
        "output_path": output_path,
        "summary": {
            "n_objects": n_objects,
            "n_binary_attrs": n_binary_attrs,
            "bin_density_percent": bin_density_percent,
            "bin_ones": target_ones,
            "n_numeric_attrs": n_numeric_attrs,
            "distinct_value_percent": distinct_value_percent,
            "distinct_values": target_distinct,
            "n_items": n_items,
            "occ_min_percent": occ_min_percent,
            "occ_max_percent": occ_max_percent,
            "min_itemsets": min_itemsets,
            "max_itemsets": max_itemsets,
            "max_seq_length": max_seq_len,
            "mean_seq_length": mean_seq_len,
            "std_seq_length": std_seq_len,
            "seed": seed,
        },
    }



In [135]:
if __name__ == "__main__":
    # Generate one benchmark file with fixed parameters and a fixed seed.
    # NOTE(review): output_path is an absolute, machine-specific Windows path;
    # adjust it before running this cell on another machine.
    res = random_uniform_synthetic_data_base(
        n_objects=10000,
        n_binary_attrs=13,
        bin_density_percent=80.0,
        n_numeric_attrs=18,
        distinct_value_percent=10,
        n_items=15,
        occ_min_percent=40.0,
        occ_max_percent=90.0,
        min_itemsets=6,
        max_itemsets=9,
        output_path="C:\\Stage\\forge\\these_rayane_lachache\\code\\IDA\\brenchmark\\test1.txt",
        seed=208,
    )

    # Report where the file went and the statistics of the generated data.
    print("File written:", res["output_path"])
    print("Summary:", res["summary"])

Step 1 binary matrix done
Step 2 numeric matrix done
Step 3 sequence is done
File written: C:\Stage\forge\these_rayane_lachache\code\IDA\brenchmark\test1.txt
Summary: {'n_objects': 10000, 'n_binary_attrs': 13, 'bin_density_percent': 80.0, 'bin_ones': 104000, 'n_numeric_attrs': 18, 'distinct_value_percent': 10, 'distinct_values': 18000, 'n_items': 15, 'occ_min_percent': 40.0, 'occ_max_percent': 90.0, 'min_itemsets': 6, 'max_itemsets': 9, 'max_seq_length': 103, 'mean_seq_length': 40.3279, 'std_seq_length': 11.655633041152248, 'seed': 208}
