In [1]:
import numpy as np
from sympy import symbols, Eq, solve, sympify
import os
import re

In [6]:
def extract_target_polynomial(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    target_polynomial = ""
    found_target = False

    for line in lines:
        # Check if we've reached the "Target Polynomial" section
        if "Target Polynomial:" in line:
            found_target = True
            continue  # Skip the line containing "Target Polynomial:"

        # If we're in the target polynomial section, append the line
        if found_target:
            target_polynomial += line.strip()  # Remove leading/trailing whitespace

    return target_polynomial

In [3]:
def extract_variables(equation):
    # Use regex to find all variable-like patterns
    # This matches sequences of letters and numbers, but excludes standalone numbers
    variables = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', equation)
    
    # Remove duplicates by converting to a set, then back to a list
    return list(set(variables))

def get_variables_from_equations(equations):
    # Apply extract_variables to each equation in the list
    return [extract_variables(eq) for eq in equations]

In [55]:
import numpy as np
import sympy as sp
from sympy import symbols, sympify, Eq, diff

def generate_dataset_no_constants(equation, variables, num_points=1000, seed=42, tol=1e-6, max_iter=100):
    """
    Generate a dataset where the equation holds true for Gaussian-distributed variables.

    Parameters:
        equation (str): The equation as a string (e.g., "x + y * z - w**2").
        variables (list): List of variable names (e.g., ['x', 'y', 'z', 'w']).
        num_points (int): Number of data points to generate (default: 1000).
        seed (int): Random seed for reproducibility (default: 42).
        tol (float): Tolerance for the Newton-Raphson method (default: 1e-6).
        max_iter (int): Maximum number of iterations for the Newton-Raphson method (default: 100).

    Returns:
        dict: A dictionary where keys are variable names and values are numpy arrays of data.
    """
    
    # Set random seed for reproducibility
    np.random.seed(seed)

    # Generate Gaussian data for all but one variable
    data = {var: np.random.normal(loc=0, scale=1, size=num_points) for var in variables[:-1]}

    # Define symbols for the variables
    sym_vars = symbols(' '.join(variables))

    # Parse the equation into a symbolic expression
    eq_expr = sympify(equation)

    # Define the function and its derivative
    last_var = variables[-1]
    last_var_sym = sym_vars[-1]
    f = eq_expr
    f_prime = diff(f, last_var_sym)

    # Generate data for the last variable
    last_var_data = np.zeros(num_points)
    valid_indices = []  # To track valid (real) data points

    for i in range(num_points):
        # Substitute the values of the other variables into the function
        subs_dict = {sym_vars[j]: data[variables[j]][i] for j in range(len(variables) - 1)}
        f_subs = f.subs(subs_dict)
        f_prime_subs = f_prime.subs(subs_dict)

        # Initial guess for the Newton-Raphson method
        x0 = 0.0  # You can choose a different initial guess if needed

        # Newton-Raphson iteration
        x = x0
        for _ in range(max_iter):
            fx = f_subs.subs(last_var_sym, x)
            fpx = f_prime_subs.subs(last_var_sym, x)
            if fpx == 0:
                break  # Avoid division by zero
            x_new = x - fx / fpx
            if abs(x_new - x) < tol:
                x = x_new
                break
            x = x_new

        # Check if the solution is real
        if abs(f_subs.subs(last_var_sym, x)) < tol and x.is_real:
            last_var_data[i] = float(x)
            valid_indices.append(i)

    # Filter out invalid (complex) data points
    valid_indices = np.array(valid_indices)
    if len(valid_indices) < num_points:
        print(f"Warning: {num_points - len(valid_indices)} data points were discarded due to complex solutions.")
        for var in variables[:-1]:
            data[var] = data[var][valid_indices]
        last_var_data = last_var_data[valid_indices]

    # Add the last variable's data to the dataset
    data[last_var] = last_var_data

    return data

In [67]:
n = 70
num_points = 1000
seed = 27

for i in range(n):

    file_path = f'target_polynomial_benchmark/axioms_and_target_polynomials/system{i}.txt'
    target = extract_target_polynomial(file_path)
    variables = extract_variables(target)

    print("Target:", target)
    print("Variables:", variables)

    dataset = generate_dataset_no_constants(target,variables,num_points,seed)

    data_dir = 'target_polynomial_benchmark/target_polynomial_data/no_constants/'

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # Write the data to a file
    file_name = f'data_{i}.dat'
    file_path = os.path.join(data_dir, file_name)

    with open(file_path, 'w') as file:
        # Write the equation
        file.write(f'Equation: {target}\n')

        # Write the variables
        file.write(f'Variables: {", ".join(variables)}\n')

        # Write the data points
        for j in range(len(dataset[variables[0]])):
            values = [f'{var}={dataset[var][j]}' for var in variables]
            file.write(f'{", ".join(values)}\n')

    print(f"Data written to {file_path}")

    


Target: m1^2*d2*d2xdt2^2*Fg*Fc-2*m1^2*d2*d2xdt2^2*Fc^2-2*m1*d2*d2xdt2*Fg^2*Fc+6*m1*d2*d2xdt2*Fg*Fc^2-4*m1*d2*d2xdt2*Fc^3-m1*p^2*d2xdt2^2*Fg+d2*Fg^3*Fc-4*d2*Fg^2*Fc^2+5*d2*Fg*Fc^3-2*d2*Fc^4
Variables: ['d2', 'Fc', 'Fg', 'p', 'm1', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_59.dat
Target: d2*d1*Fg^2+2*m2^2*G*Fg-4*m2^2*G*Fc
Variables: ['m2', 'd2', 'Fc', 'Fg', 'G', 'd1']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_60.dat
Target: d2*Fg*p+d2*m2^2*d2xdt2*c-2*W*m2*c+2*W*p+m2*p*c^2
Variables: ['m2', 'd2', 'Fg', 'c', 'p', 'W', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_61.dat
Target: 2*d2xdt2*m1*c+2*d2xdt2*p-Fc*c
Variables: ['Fc', 'c', 'p', 'm1', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_62.dat
Target: m1*d2xdt2^2*m2*W*d2*c^2-m1*d2xdt2^2*m2*d2^2*c^2*Fg+3*m1*d2xdt2^2*W*d2^2*Fg-m1*d2xdt2^2*W*d2^2*Fc-3*m