In [1]:
import itertools
import ast
import pandas as pd
import os
from tqdm import tqdm
import multiprocessing
import re
from gplearn_memetic.simplification import count_expression_length, simplify_expression, SimplificationError, CONSTANT, OPERATORS, FUNCTIONS

# Function to generate all valid tree structures
def generate_valid_trees(k):
    """Enumerate every valid tree shape with exactly ``k`` nodes.

    A shape is a sequence of node arities in prefix order: ``0`` is a
    nullary node (leaf), ``1`` a unary node and ``2`` a binary node.  A
    sequence is valid when, reading left to right, there is always at least
    one open operand slot before each node and no slot is left open at the
    end.

    Args:
        k: Number of nodes in the tree.

    Returns:
        A list of shapes.  ``k < 1`` yields an empty list, ``k == 1`` yields
        ``[[0]]``, and larger ``k`` yields tuples in the lexicographic order
        produced by ``itertools.product``.
    """
    if k < 1:
        # No tree has fewer than one node; without this guard k == 0 would
        # index into an empty tuple and negative k would make product() raise.
        return []
    if k == 1:
        return [[0]]  # Only one valid tree with a single node

    valid_trees = []
    for tree in itertools.product((0, 1, 2), repeat=k):
        # Cheap rejection: with more than one node the root cannot be a leaf,
        # and the last node in prefix order must be a leaf.
        if tree[0] == 0 or tree[-1] != 0:
            continue

        open_slots = 1  # operand slots still waiting to be filled
        for arity in tree:
            if open_slots < 1:
                break  # a node appears after the tree is already complete
            # A node fills one slot and opens `arity` new ones:
            # leaf -> -1, unary -> 0, binary -> +1.
            open_slots += arity - 1
        else:
            # Reached only when the scan did not break.
            if open_slots == 0:
                valid_trees.append(tree)

    return valid_trees

def _render_tree(tree, leaves, unaries, binaries):
    """Fold one symbol assignment over ``tree`` and return the operand stack.

    The tree is scanned from its last node to its first.  Symbols are taken
    from ``leaves``, ``unaries`` and ``binaries`` in the order the matching
    node kinds are met during that reversed scan.  Returns ``None`` as soon
    as an operator finds too few operands on the stack.
    """
    next_leaf = iter(leaves)
    next_unary = iter(unaries)
    next_binary = iter(binaries)
    operands = []

    for arity in reversed(tree):
        if arity == 0:
            # Leaf: push the next variable/constant symbol.
            operands.append(next(next_leaf))
        elif arity == 1:
            if not operands:
                return None
            inner = operands.pop()
            operands.append(f"{next(next_unary)}({inner})")
        elif arity == 2:
            if len(operands) < 2:
                return None
            # The value popped first is written on the right-hand side.
            rhs = operands.pop()
            lhs = operands.pop()
            operands.append(f"({lhs} {next(next_binary)} {rhs})")

    return operands


def build_expressions_from_tree(tree):
    """Return every expression string obtainable from one tree shape.

    Leaves are filled from ``VARIABLES + [CONSTANT]``, unary nodes from
    ``FUNCTIONS`` and binary nodes from ``OPERATORS``; all combinations are
    enumerated with the leaf assignment varying slowest and the binary
    assignment varying fastest.

    Args:
        tree: Sequence of node arities (0, 1 or 2) in prefix order.

    Returns:
        A list of expression strings, one per assignment that reduces the
        tree to exactly one expression.
    """
    terminals = VARIABLES + [CONSTANT]
    leaf_count = tree.count(0)
    unary_count = tree.count(1)
    binary_count = tree.count(2)

    expressions = []
    for leaves in itertools.product(terminals, repeat=leaf_count):
        for unaries in itertools.product(FUNCTIONS, repeat=unary_count):
            for binaries in itertools.product(OPERATORS, repeat=binary_count):
                operands = _render_tree(tree, leaves, unaries, binaries)
                # Keep the result only if exactly one expression remains.
                if operands is not None and len(operands) == 1:
                    expressions.append(operands[0])

    return expressions

def generate_expressions(max_length, save_interval, num_features):
    """Generate, de-duplicate and store expressions for lengths 1..max_length.

    For each length the valid tree shapes are expanded in parallel by
    ``process_tree``.  Results are collected in a set and appended to
    ``skeleton_data/len{length}_features{num_features}.csv`` whenever the set
    reaches ``save_interval`` entries.  Once a length is finished the CSV is
    cleaned and converted to Feather by ``clean_and_sort_csv``.  A length
    whose Feather file already exists is skipped, so an interrupted run can
    be resumed.

    Args:
        max_length: Largest expression length (node count) to generate.
        save_interval: Number of buffered expressions that triggers a flush
            to the CSV file.
        num_features: Feature count, used only to name the output files.
    """
    # The output directory is not created anywhere else; without it the first
    # CSV write would fail only after the expensive generation had run.
    os.makedirs("skeleton_data", exist_ok=True)

    for length in tqdm(range(1, max_length + 1), desc="Generating Expressions"):
        csv_filename = f"skeleton_data/len{length}_features{num_features}.csv"
        feather_filename = f"skeleton_data/len{length}_features{num_features}.feather"

        # The Feather file is written last, so its presence marks the length
        # as fully completed.
        if os.path.exists(feather_filename):
            print(f"Skipping length {length} (already completed).")
            continue

        valid_trees = generate_valid_trees(length)

        # Leave one core free and never start more workers than there are
        # trees; clamp to 1 because Pool() rejects a size of 0.
        available = multiprocessing.cpu_count() - 1
        num_workers = max(1, min(available, len(valid_trees)))
        batch = set()

        with multiprocessing.Pool(num_workers) as pool:
            tasks = [(tree, length) for tree in valid_trees]
            for expressions in pool.imap_unordered(process_tree, tasks):
                batch.update(expressions)

                # Flush periodically to bound memory use.  Uniqueness is only
                # guaranteed within one batch; duplicates across batches are
                # removed by the cleaning step below.
                if len(batch) >= save_interval:
                    append_to_csv(csv_filename, batch)
                    batch.clear()

        # Save any remaining expressions
        if batch:
            append_to_csv(csv_filename, batch)

        clean_and_sort_csv(csv_filename, feather_filename)

def process_tree(args):
    """Expand one tree shape into its set of simplified expressions.

    Takes a single tuple argument so it can be mapped over a pool.

    Args:
        args: ``(tree, length)`` where ``tree`` is a sequence of node
            arities and ``length`` is the expression length being generated.

    Returns:
        A set of simplified expression strings.  An expression is kept when
        its simplified length equals ``length`` or when the simplified text
        contains the character ``'1'`` or ``'0'``.
    """
    tree, length = args
    kept = set()

    for raw in build_expressions_from_tree(tree):
        # NOTE(review): simplify_expression is called unguarded; the
        # SimplificationError imported at the top of the file is never
        # caught here -- confirm it cannot be raised for generated input.
        simplified = simplify_expression(raw)

        # NOTE(review): these are plain substring tests, so they also match
        # digits inside variable names such as "X0" and "X1" -- confirm that
        # this is intended rather than a test for literal 0/1 constants.
        if (
            count_expression_length(simplified) != length
            and '1' not in simplified
            and '0' not in simplified
        ):
            continue

        kept.add(simplified)  # the set de-duplicates within this worker

    return kept

def append_to_csv(filename, expressions):
    """Append ``expressions`` to ``filename`` as rows of an 'Expression' column.

    Nothing is written when ``expressions`` is empty.  The header row is
    written only when the file does not exist yet, so repeated calls build
    up one well-formed CSV.

    Args:
        filename: Path of the CSV file to create or extend.
        expressions: Collection of expression strings to write.
    """
    if expressions:
        # Decide on the header before the file is opened for appending.
        write_header = not os.path.exists(filename)
        frame = pd.DataFrame({'Expression': list(expressions)})
        frame.to_csv(filename, mode='a', index=False, header=write_header)

def clean_and_sort_csv(csv_filename, feather_filename):
    """Finalize one length's CSV and export it to Feather.

    Rows whose expression already appears in a CSV for a smaller length and
    the same feature count (in the same directory) are dropped, remaining
    duplicates are removed, and the rows are sorted by expression.  The
    cleaned table overwrites ``csv_filename`` and is then written to
    ``feather_filename``.

    Args:
        csv_filename: Path to the CSV to clean; its name must contain
            ``len<N>_features<M>``.
        feather_filename: Destination path for the Feather export.

    Raises:
        ValueError: If ``csv_filename`` does not contain the
            ``len<N>_features<M>`` pattern.
    """
    if not os.path.exists(csv_filename):
        print(f"Skipping cleaning for {csv_filename} (file not found).")
        return

    df = pd.read_csv(csv_filename)

    # The length and feature count are encoded in the file name.
    match = re.search(r'len(\d+)_features(\d+)', csv_filename)
    if not match:
        raise ValueError(f"Filename format not recognized: {csv_filename}")

    current_len = int(match.group(1))
    feature_count = match.group(2)
    # dirname() is '' for a bare filename and os.listdir('') raises, so fall
    # back to the current directory.
    directory = os.path.dirname(csv_filename) or os.curdir

    # Compiled once: matches CSVs with the same feature count at any length.
    sibling_pattern = re.compile(
        r'len(\d+)_features' + re.escape(feature_count) + r'\.csv$'
    )

    # Collect expressions already stored for smaller lengths.
    existing_expressions = set()
    for entry in os.listdir(directory):
        sibling = sibling_pattern.search(entry)
        if sibling and int(sibling.group(1)) < current_len:
            small_df = pd.read_csv(os.path.join(directory, entry))
            existing_expressions.update(small_df['Expression'].dropna().unique())

    # Chained, non-inplace operations avoid mutating a filtered slice in place.
    df = (
        df[~df['Expression'].isin(existing_expressions)]
        .drop_duplicates(subset=['Expression'])
        .sort_values(by=['Expression'])
        .reset_index(drop=True)
    )

    # Save back to CSV (cleaned version), then to Feather for faster access.
    df.to_csv(csv_filename, index=False)
    df.to_feather(feather_filename)

    print(f"Cleaned and converted {csv_filename} → {feather_filename}")

# Entry-point guard: without it, importing this module (which worker
# processes do under the spawn/forkserver start methods) would start the
# whole generation run again.
if __name__ == "__main__":
    save_interval = 100000

    for num_features in range(2, 8):
        # build_expressions_from_tree reads this module-level name.
        # NOTE(review): worker processes only see it if they inherit the
        # parent's globals (fork start method) -- confirm on other platforms.
        VARIABLES = [f"X{i}" for i in range(num_features)]

        # Longer expressions are generated only for the two smallest
        # feature counts.
        max_length = 11 if num_features in (2, 3) else 9

        # Generate expressions with batch processing and multiprocessing
        generate_expressions(max_length, save_interval, num_features=num_features)

Generating Expressions:   0%|                            | 0/11 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features2.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features2.csv (file not found).
Skipping length 5 (already completed).


Generating Expressions:  55%|██████████▉         | 6/11 [00:00<00:00, 58.79it/s]

Skipping cleaning for skeleton_data/len6_features2.csv (file not found).
Skipping length 7 (already completed).
Skipping cleaning for skeleton_data/len8_features2.csv (file not found).
Skipping length 9 (already completed).


Generating Expressions: 100%|███████████████████| 11/11 [00:00<00:00, 13.99it/s]


Skipping cleaning for skeleton_data/len10_features2.csv (file not found).
Skipping length 11 (already completed).


Generating Expressions:   0%|                            | 0/11 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features3.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features3.csv (file not found).
Skipping length 5 (already completed).


Generating Expressions:  55%|██████████▉         | 6/11 [00:00<00:00, 58.35it/s]

Skipping cleaning for skeleton_data/len6_features3.csv (file not found).
Skipping length 7 (already completed).
Skipping cleaning for skeleton_data/len8_features3.csv (file not found).
Skipping length 9 (already completed).


Generating Expressions: 100%|███████████████████| 11/11 [00:00<00:00, 14.98it/s]


Skipping cleaning for skeleton_data/len10_features3.csv (file not found).
Skipping length 11 (already completed).


Generating Expressions:   0%|                             | 0/9 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features4.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features4.csv (file not found).
Skipping length 5 (already completed).
Skipping cleaning for skeleton_data/len6_features4.csv (file not found).
Skipping length 7 (already completed).


Generating Expressions: 100%|█████████████████████| 9/9 [00:00<00:00, 21.40it/s]


Skipping cleaning for skeleton_data/len8_features4.csv (file not found).
Skipping length 9 (already completed).


Generating Expressions:   0%|                             | 0/9 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features5.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features5.csv (file not found).
Skipping length 5 (already completed).
Skipping cleaning for skeleton_data/len6_features5.csv (file not found).
Skipping length 7 (already completed).


Generating Expressions: 100%|█████████████████████| 9/9 [00:00<00:00, 21.45it/s]


Skipping cleaning for skeleton_data/len8_features5.csv (file not found).
Skipping length 9 (already completed).


Generating Expressions:   0%|                             | 0/9 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features6.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features6.csv (file not found).
Skipping length 5 (already completed).
Skipping cleaning for skeleton_data/len6_features6.csv (file not found).
Skipping length 7 (already completed).


Generating Expressions: 100%|█████████████████████| 9/9 [00:00<00:00, 21.70it/s]


Skipping cleaning for skeleton_data/len8_features6.csv (file not found).
Skipping length 9 (already completed).


Generating Expressions:   0%|                             | 0/9 [00:00<?, ?it/s]

Skipping length 1 (already completed).
Skipping cleaning for skeleton_data/len2_features7.csv (file not found).
Skipping length 3 (already completed).
Skipping cleaning for skeleton_data/len4_features7.csv (file not found).
Skipping length 5 (already completed).
Skipping cleaning for skeleton_data/len6_features7.csv (file not found).
Skipping length 7 (already completed).


Generating Expressions: 100%|█████████████████████| 9/9 [00:00<00:00, 21.34it/s]

Skipping cleaning for skeleton_data/len8_features7.csv (file not found).
Skipping length 9 (already completed).



