In [9]:
import numpy as np
from sympy import symbols, Eq, solve, sympify
import os
import re


In [6]:
def extract_target_polynomial(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    target_polynomial = ""
    found_target = False

    for line in lines:
        # Check if we've reached the "Target Polynomial" section
        if "Target Polynomial:" in line:
            found_target = True
            continue  # Skip the line containing "Target Polynomial:"

        # If we're in the target polynomial section, append the line
        if found_target:
            target_polynomial += line.strip()  # Remove leading/trailing whitespace

    return target_polynomial

In [3]:
def extract_variables(equation):
    # Use regex to find all variable-like patterns
    # This matches sequences of letters and numbers, but excludes standalone numbers
    variables = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', equation)
    
    # Remove duplicates by converting to a set, then back to a list
    return list(set(variables))

def get_variables_from_equations(equations):
    # Apply extract_variables to each equation in the list
    return [extract_variables(eq) for eq in equations]

In [55]:
import numpy as np
import sympy as sp
from sympy import symbols, sympify, Eq, diff

def generate_dataset_no_constants(equation, variables, num_points=1000, seed=42, tol=1e-6, max_iter=100):
    """
    Generate a dataset where the equation holds true for Gaussian-distributed variables.

    Parameters:
        equation (str): The equation as a string (e.g., "x + y * z - w**2").
        variables (list): List of variable names (e.g., ['x', 'y', 'z', 'w']).
        num_points (int): Number of data points to generate (default: 1000).
        seed (int): Random seed for reproducibility (default: 42).
        tol (float): Tolerance for the Newton-Raphson method (default: 1e-6).
        max_iter (int): Maximum number of iterations for the Newton-Raphson method (default: 100).

    Returns:
        dict: A dictionary where keys are variable names and values are numpy arrays of data.
    """
    
    # Set random seed for reproducibility
    np.random.seed(seed)

    # Generate Gaussian data for all but one variable
    data = {var: np.random.normal(loc=0, scale=1, size=num_points) for var in variables[:-1]}

    # Define symbols for the variables
    sym_vars = symbols(' '.join(variables))

    # Parse the equation into a symbolic expression
    eq_expr = sympify(equation)

    # Define the function and its derivative
    last_var = variables[-1]
    last_var_sym = sym_vars[-1]
    f = eq_expr
    f_prime = diff(f, last_var_sym)

    # Generate data for the last variable
    last_var_data = np.zeros(num_points)
    valid_indices = []  # To track valid (real) data points

    for i in range(num_points):
        # Substitute the values of the other variables into the function
        subs_dict = {sym_vars[j]: data[variables[j]][i] for j in range(len(variables) - 1)}
        f_subs = f.subs(subs_dict)
        f_prime_subs = f_prime.subs(subs_dict)

        # Initial guess for the Newton-Raphson method
        x0 = 0.0  # You can choose a different initial guess if needed

        # Newton-Raphson iteration
        x = x0
        for _ in range(max_iter):
            fx = f_subs.subs(last_var_sym, x)
            fpx = f_prime_subs.subs(last_var_sym, x)
            if fpx == 0:
                break  # Avoid division by zero
            x_new = x - fx / fpx
            if abs(x_new - x) < tol:
                x = x_new
                break
            x = x_new

        # Check if the solution is real
        if abs(f_subs.subs(last_var_sym, x)) < tol and x.is_real:
            last_var_data[i] = float(x)
            valid_indices.append(i)

    # Filter out invalid (complex) data points
    valid_indices = np.array(valid_indices)
    if len(valid_indices) < num_points:
        print(f"Warning: {num_points - len(valid_indices)} data points were discarded due to complex solutions.")
        for var in variables[:-1]:
            data[var] = data[var][valid_indices]
        last_var_data = last_var_data[valid_indices]

    # Add the last variable's data to the dataset
    data[last_var] = last_var_data

    return data

In [67]:
n = 70
num_points = 1000
seed = 27

for i in range(n):

    file_path = f'target_polynomial_benchmark/axioms_and_target_polynomials/system{i}.txt'
    target = extract_target_polynomial(file_path)
    variables = extract_variables(target)

    print("Target:", target)
    print("Variables:", variables)

    dataset = generate_dataset_no_constants(target,variables,num_points,seed)

    data_dir = 'target_polynomial_benchmark/target_polynomial_data/no_constants/'

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # Write the data to a file
    file_name = f'data_{i}.dat'
    file_path = os.path.join(data_dir, file_name)

    with open(file_path, 'w') as file:
        # Write the equation
        file.write(f'Equation: {target}\n')

        # Write the variables
        file.write(f'Variables: {", ".join(variables)}\n')

        # Write the data points
        for j in range(len(dataset[variables[0]])):
            values = [f'{var}={dataset[var][j]}' for var in variables]
            file.write(f'{", ".join(values)}\n')

    print(f"Data written to {file_path}")

    


Target: m1^2*d2*d2xdt2^2*Fg*Fc-2*m1^2*d2*d2xdt2^2*Fc^2-2*m1*d2*d2xdt2*Fg^2*Fc+6*m1*d2*d2xdt2*Fg*Fc^2-4*m1*d2*d2xdt2*Fc^3-m1*p^2*d2xdt2^2*Fg+d2*Fg^3*Fc-4*d2*Fg^2*Fc^2+5*d2*Fg*Fc^3-2*d2*Fc^4
Variables: ['d2', 'Fc', 'Fg', 'p', 'm1', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_59.dat
Target: d2*d1*Fg^2+2*m2^2*G*Fg-4*m2^2*G*Fc
Variables: ['m2', 'd2', 'Fc', 'Fg', 'G', 'd1']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_60.dat
Target: d2*Fg*p+d2*m2^2*d2xdt2*c-2*W*m2*c+2*W*p+m2*p*c^2
Variables: ['m2', 'd2', 'Fg', 'c', 'p', 'W', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_61.dat
Target: 2*d2xdt2*m1*c+2*d2xdt2*p-Fc*c
Variables: ['Fc', 'c', 'p', 'm1', 'd2xdt2']
Data written to target_polynomial_benchmark/target_polynomial_data/no_constants/data_62.dat
Target: m1*d2xdt2^2*m2*W*d2*c^2-m1*d2xdt2^2*m2*d2^2*c^2*Fg+3*m1*d2xdt2^2*W*d2^2*Fg-m1*d2xdt2^2*W*d2^2*Fc-3*m

In [1]:
def generate_polynomial_dataset(polynomial_str, variables, num_samples=1000, mean=0, std_dev=1, seed=42):
    """
    Generates a dataset for a given polynomial and variables.

    Parameters:
        polynomial_str (str): The polynomial as a string.
        variables (list): List of variable names.
        num_samples (int): Number of data points to generate.
        mean (float): Mean of the Gaussian distribution.
        std_dev (float): Standard deviation of the Gaussian distribution.
        seed (int): Random seed for reproducibility.

    Returns:
        np.ndarray: Dataset with shape (num_samples, len(variables) + 1).
                   The last column is the polynomial evaluation.
    """
    # Set random seed for reproducibility
    np.random.seed(seed)

    # Create symbolic variables
    sym_vars = sp.symbols(' '.join(variables))
    expr = sp.sympify(polynomial_str)

    # Generate Gaussian data for variables
    data = np.random.normal(loc=mean, scale=std_dev, size=(num_samples, len(variables)))

    # Evaluate the polynomial at each data point
    output = np.zeros(num_samples)
    for i in range(num_samples):
        subs_dict = {var: val for var, val in zip(sym_vars, data[i])}
        output[i] = expr.subs(subs_dict).evalf()

    # Combine variables and output into a single array
    dataset = np.hstack((data, output.reshape(-1, 1)))

    return dataset

def save_dataset_to_file(dataset, filename):
    """
    Saves the dataset to a .dat file.

    Parameters:
        dataset (np.ndarray): Dataset to save.
        variables (list): List of variable names.
        filename (str): Name of the output file.
    """
    np.savetxt(filename, dataset, delimiter=" ", fmt="%.6f", header=header, comments="")
    print(f"Dataset saved to {filename}")


In [17]:
n = 70
num_points = 1000
seed = 27

for i in range(n):
    dir_path = 'target_polynomial_benchmark/target_polynomial_data/no_constants/'
    file_name = f'data_{i}.dat'
    file_path = os.path.join(dir_path, file_name)

    if not os.path.exists(file_path):
            print(f"File {file_name} does not exist. Skipping...")
            continue
    


    # Input and output file paths
    input_file = file_path
    output_file = f'target_polynomial_benchmark/target_polynomial_data/no_constants/numeric_data_{i}.dat'

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        # Skip the first two lines (Equation and Variables)
        next(infile)
        next(infile)
        
        for line in infile:
            if '=' in line:
                # Split the line by commas to get each variable=value pair
                pairs = line.strip().split(', ')
                
                # Extract just the values after the equals sign
                values = []
                for pair in pairs:
                    if '=' in pair:
                        value = pair.split('=')[1]
                        values.append(value)
                
                # Append a 0 to the values
                values.append('0.0')
                
                # Write the values separated by spaces
                outfile.write(' '.join(values) + '\n')

File data_6.dat does not exist. Skipping...
File data_7.dat does not exist. Skipping...
File data_20.dat does not exist. Skipping...
File data_27.dat does not exist. Skipping...
File data_42.dat does not exist. Skipping...
File data_49.dat does not exist. Skipping...
File data_51.dat does not exist. Skipping...
File data_53.dat does not exist. Skipping...
File data_54.dat does not exist. Skipping...
File data_56.dat does not exist. Skipping...
File data_57.dat does not exist. Skipping...
File data_58.dat does not exist. Skipping...


In [None]:
import os
import re
import numpy as np
from sympy import symbols, sympify, Poly
from collections import defaultdict

def evaluate_polynomial(target_polynomial, dataset, observed_constants, observed_derivatives, measured_variables):

    evaluated_values = []
    
    # Combine all variable names (constants, derivatives, and measured variables)
    all_variables = observed_constants + observed_derivatives + measured_variables
    
    for i in range(dataset.shape[0]):
        # Substitute values into the polynomial
        polynomial = target_polynomial
        for j, var in enumerate(all_variables):
            polynomial = polynomial.replace(var, str(dataset[i, j]))
        
        polynomial = polynomial.replace('^', '**')

        print(polynomial)
        # Evaluate the polynomial
        try:
            value = eval(polynomial)
            evaluated_values.append(value)
        except:
            evaluated_values.append(np.nan)  # Skip if evaluation fails
    
    return np.array(evaluated_values)


# Function to extract target polynomial, measured variables, observed constants, and observed derivatives
def extract_info_from_file(filepath):
    with open(filepath, 'r') as file:
        content = file.readlines()
    
    # Extract target polynomial
    target_polynomial_line = next(line for line in content if line.startswith('Target Polynomial:'))
    target_polynomial = target_polynomial_line.split('Target Polynomial:')[1].strip()
    
    # Extract measured variables
    measured_variables_line = next(line for line in content if line.startswith('Measured Variables:'))
    measured_variables = eval(measured_variables_line.split('Measured Variables:')[1].strip())
    
    # Extract observed constants
    observed_constants_line = next(line for line in content if line.startswith('Observed Constants:'))
    observed_constants = eval(observed_constants_line.split('Observed Constants:')[1].strip())
    
    # Extract observed derivatives
    observed_derivatives_line = next(line for line in content if line.startswith('Observed Derivatives:'))
    observed_derivatives = eval(observed_derivatives_line.split('Observed Derivatives:')[1].strip())
    
    return target_polynomial, measured_variables, observed_constants, observed_derivatives

def generate_dataset(target_polynomial, measured_variables, observed_constants, observed_derivatives, constant_data=True, derivative_data=True):
    # Initialize dataset as a 2D array with 1000 rows and 0 columns
    dataset = np.zeros((1000, 0))  # Start with an empty 2D array

    # Step 2a: Generate data for constants
    if constant_data and observed_constants:  # Only proceed if constants are provided
        constant_values = {const: np.random.uniform(0, 1) for const in observed_constants}  # Sample once
        constant_data_matrix = np.array([[constant_values[const]] * 1000 for const in observed_constants]).T  # Repeat 1000 times
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data
    elif not constant_data:
        measured_variables.extend(observed_constants)  # Add constants to measured variables if not generating data

    # Step 2b: Generate data for derivatives
    if derivative_data and 'd2xdt2' in observed_derivatives:  # Only proceed if derivatives are provided
        if 'd1' in measured_variables:
            d1_data = np.random.normal(0, 1, 1000)
            d2xdt2_data = np.diff(d1_data, append=d1_data[0])  # Differences with loop-around
        elif 'd2' in measured_variables:
            d2_data = np.random.normal(0, 1, 1000)
            d2xdt2_data = np.diff(d2_data, append=d2_data[0])  # Differences with loop-around
        else:
            d2xdt2_data = np.random.normal(0, 1, 1000)
        
        dataset = np.hstack((dataset, d2xdt2_data.reshape(-1, 1)))  # Append derivative data
    
    # Step 2c: Sort measured variables by degree in target polynomial
    degree_dict = defaultdict(int)
    for var in measured_variables:
        matches = re.findall(rf'{var}\^(\d+)', target_polynomial)
        if matches:
            degree_dict[var] = sum(int(exp) for exp in matches)
        else:
            degree_dict[var] = 1
    sorted_variables = sorted(measured_variables, key=lambda x: degree_dict[x], reverse=True)
    
    # Generate data for all but the last variable
    for var in sorted_variables[:-1]:
        var_data = np.random.normal(0, 1, 1000)
        dataset = np.hstack((dataset, var_data.reshape(-1, 1)))
    
    # Step 2d: Generate data for the last variable
    last_var = sorted_variables[-1]
    last_var_symbol = symbols(last_var)  # Convert to symbolic variable

    roots_column = np.zeros((dataset.shape[0], 1)) * np.nan  # Initialize with NaN

    complex_count = 0
    evaluation_fail_count = 0

    for i in range(1000):
        # Substitute all known values into the polynomial at once
        polynomial = target_polynomial
        for j, var in enumerate(observed_constants + observed_derivatives + sorted_variables[:-1]):
            # Replace variables with placeholders to avoid invalid syntax
            polynomial = re.sub(rf'\b{var}\b', f'{var}_value', polynomial)
        
        # Replace '^' with '**' for exponentiation
        polynomial = polynomial.replace('^', '**')

        # Convert the polynomial into a symbolic expression
        try:
            expr = sympify(polynomial)
            
            # Substitute numeric values into the expression
            substitutions = {}
            for j, var in enumerate(observed_constants + observed_derivatives + sorted_variables[:-1]):
                substitutions[symbols(f'{var}_value')] = dataset[i, j]
            
            expr = expr.subs(substitutions)
            
            # Extract coefficients of the polynomial in the last variable
            poly = Poly(expr, last_var_symbol)
            coefficients = poly.all_coeffs()
            
            # Solve for the roots
            roots = np.roots(coefficients)
            
            # Filter real roots
            real_roots = roots[np.isreal(roots)].real
            
            if len(real_roots) > 0:
                roots_column[i] = real_roots[0]  # Use the first real root
            else:
                complex_count += 1
        except Exception as e:
            evaluation_fail_count += 1
            print(f"An exception occurred: {e}")  # Skip if polynomial evaluation fails   

    # Append the roots column to the dataset
    dataset = np.hstack((dataset, roots_column))

    # Remove rows with NaN values
    dataset = dataset[~np.isnan(dataset).any(axis=1)] 
    print("Complex roots thrown out: ", complex_count)
    print("Number of failed evaluations: ", evaluation_fail_count)  
    
    return dataset

# Main script
output_directory = 'target_polynomial_benchmark/axioms_and_target_polynomials/constant_data/'
if not os.path.exists(output_directory):
        os.makedirs(output_directory)

input_directory = 'target_polynomial_benchmark/axioms_and_target_polynomials/'
n = 70
for i in range(n):
    filename = f'system{i}.txt'
    filepath = os.path.join(input_directory, filename)
    
    # Check if the file exists
    if not os.path.exists(filepath):
        print(f"File {filename} does not exist. Skipping iteration {i}.")
    
    # Extract info from file
    target_polynomial, measured_variables, observed_constants, observed_derivatives = extract_info_from_file(filepath)
    print(target_polynomial,measured_variables,observed_constants,observed_derivatives)
    # Generate dataset
    dataset = generate_dataset(target_polynomial, measured_variables, observed_constants, observed_derivatives)
    if len(dataset) == 0:
        print("No real roots found for system ",  i ,". Skipping")
        continue
    # Save dataset to file
    np.savetxt(os.path.join(output_directory, f'numeric_data_{i}.dat'), dataset, delimiter=',')
    print(f"Generated dataset for {filename} with shape {dataset.shape}")


3*W*G+d2xdt2*c^2*d1^2+d2xdt2*c^2*d2^2-c^4*d2 ['d1', 'd2', 'W'] ['G', 'c'] ['d2xdt2']
Complex roots thrown found:  0
Number of failed evaluations:  0
Generated dataset for system0.txt with shape (1000, 6)
3*c^2*Fg+6*c*d2xdt2*p-2*d1*d2xdt2*Fg-6*d2xdt2*W ['W', 'd1', 'Fg', 'p'] ['c'] ['d2xdt2']
Complex roots thrown found:  0
Number of failed evaluations:  0
Generated dataset for system1.txt with shape (1000, 6)
d2xdt2*m2+Fg-Fc ['Fg', 'Fc', 'm2'] [] ['d2xdt2']
Complex roots thrown found:  0
Number of failed evaluations:  0
Generated dataset for system2.txt with shape (1000, 4)
2*p^2*W^2*Fc*d1*m1*d2xdt2^2-p^2*Fc^2*d1^2*d2*m1^2*d2xdt2^3-p^2*Fc*d1^3*Fg*m1^2*d2xdt2^3-4*W^4*Fc^2+16*W^4*Fc*m1*d2xdt2-16*W^4*m1^2*d2xdt2^2+4*W^2*Fc^3*d1*d2*m1*d2xdt2+8*W^2*Fc^2*d1^2*Fg*m1*d2xdt2-16*W^2*Fc^2*d1*d2*m1^2*d2xdt2^2-24*W^2*Fc*d1^2*Fg*m1^2*d2xdt2^2+16*W^2*Fc*d1*d2*m1^3*d2xdt2^3+16*W^2*d1^2*Fg*m1^3*d2xdt2^3-Fc^4*d1^2*d2^2*m1^2*d2xdt2^2-4*Fc^3*d1^3*Fg*d2*m1^2*d2xdt2^2+4*Fc^3*d1^2*d2^2*m1^3*d2xdt2^3-4*Fc^2*d1^