In [78]:
import numpy as np
import sympy as sp
from sympy import symbols, sympify, Poly
import os
from collections import defaultdict
from m2_functions import *
import re
import ast

In [124]:
def get_variables_from_equations(equations):
    """
    Extract the variable names of every equation.

    Parameters:
        equations (list): Equations as strings.

    Returns:
        list: One list of variable names per equation, in input order
              (each entry is whatever extract_variables returns for it).
    """
    return list(map(extract_variables, equations))

def extract_variables(equation):
    """
    Return the distinct identifier-like names that occur in an equation string.

    A name is a maximal run matching ``[a-zA-Z_][a-zA-Z0-9_]*`` delimited by
    word boundaries, so pure numbers such as the exponent in ``x**2`` are not
    reported.

    Parameters:
        equation (str): Equation text.

    Returns:
        list: Unique names in order of first appearance.
    """
    names = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', equation)
    # De-duplicate while keeping first-occurrence order. A plain set() would
    # return the names in hash order, which differs between interpreter runs
    # for strings and would make everything derived from this order
    # (variable ordering, which unknown gets solved for) non-reproducible.
    return list(dict.fromkeys(names))

def get_base_variable(derivative):
    """
    Map a derivative name to the name ``d<index>`` built from its index.

    Two spellings are recognised:
      * names starting with ``dx`` that contain ``dt``   (e.g. ``dx1dt``)
      * names starting with ``d2x`` that contain ``dt2`` (e.g. ``d2x1dt2``)

    The index is the text between the first ``x`` and the following ``dt``.
    A missing index is treated as ``1``.

    Parameters:
        derivative (str): Derivative name.

    Returns:
        str or None: ``'d<index>'`` for a recognised name, otherwise None.
    """
    is_first_order = derivative.startswith('dx') and 'dt' in derivative
    is_second_order = derivative.startswith('d2x') and 'dt2' in derivative
    if not (is_first_order or is_second_order):
        return None

    index_part = derivative.split('x')[1].split('dt')[0]
    if not index_part:
        return 'd1'

    # Use the whole leading digit run so that multi-digit indices survive
    # ('dx12dt' -> 'd12'); taking only the first character would collapse
    # x10, x11, ... onto d1. A non-numeric index keeps the old
    # first-character behaviour.
    leading_digits = re.match(r'[0-9]+', index_part)
    index = leading_digits.group(0) if leading_digits else index_part[0]
    return f'd{index}'

def collect_variables_in_sequence_and_reverse(equations):
    """
    Gather the distinct variables of all equations and return them reversed.

    Variables are recorded the first time they are met while walking the
    equations in order; the resulting list is then reversed, so variables
    first seen in equation 1 end up at the right-hand end, those first seen
    in equation 2 just to their left, and so on.

    Parameters:
        equations (list): List of equations as strings.

    Returns:
        list: Distinct variable names, in reverse order of first appearance.
    """
    # A dict serves as an insertion-ordered set of names.
    first_seen = {}
    for names_in_equation in get_variables_from_equations(equations):
        for name in names_in_equation:
            first_seen.setdefault(name, None)

    return list(first_seen)[::-1]

def generate_dataset_from_grobner_basis(grobner_basis, variables, observed_constants, observed_derivatives,
                                        constant_data=True, derivative_data=True, region=(1, 5),
                                        num_points=10000, seed=42, tol=1e-6):
    """
    Sample a dataset whose rows satisfy the polynomials of a Gröbner basis.

    Constants and derivatives are sampled first. The basis polynomials are
    then processed from fewest to most variables: for each polynomial all
    still-unassigned variables but the last are sampled uniformly, and the
    last one is obtained per data point as a real root of the polynomial.
    Points for which no real root exists are removed from every column.

    Parameters:
        grobner_basis (list): Polynomials as strings, each understood as "== 0".
        variables (list): Names of all variables; names used by the basis
                          must be listed here.
        observed_constants (list): Names sampled as constants.
        observed_derivatives (list): Names sampled as derivatives.
        constant_data (bool): If True each constant gets one value repeated for
                              every point, otherwise one value per point.
        derivative_data (bool): Accepted for interface compatibility; this
                                function does not consult it.
        region (sequence): ``(low, high)`` bounds for uniform sampling.
        num_points (int): Number of points to start from.
        seed (int): Seed passed to ``np.random.seed`` (global NumPy RNG state).
        tol (float): A root counts as real when the absolute value of its
                     imaginary part is at most this tolerance.

    Returns:
        dict or None: ``{name: numpy_array}`` with equally long columns, or
                      None when some polynomial has no real root at any point.

    Raises:
        ValueError: If the basis uses a name that is missing from ``variables``
                    (or if a polynomial string cannot be parsed by sympy).
    """
    np.random.seed(seed)
    low, high = region[0], region[1]
    data = {}

    # One Symbol per declared name. Building the mapping explicitly (instead of
    # indexing the result of symbols('a b c')) also works for a single variable,
    # where symbols() returns a bare Symbol rather than a sequence.
    symbol_for = {name: sp.Symbol(name) for name in variables}

    def symbol_of(name):
        # Preserve the ValueError that a failed list.index() lookup used to raise.
        try:
            return symbol_for[name]
        except KeyError:
            raise ValueError(
                f"Variable '{name}' is used in the Gröbner basis but is not listed in variables"
            ) from None

    sorted_basis = sorted(grobner_basis, key=lambda eq: len(extract_variables(eq)))
    num_valid_points = num_points

    # Generate data for constants
    for const in observed_constants:
        if constant_data:
            data[const] = np.full(num_valid_points, np.random.uniform(low, high))
        else:
            data[const] = np.random.uniform(low, high, num_valid_points)

    # Generate data for derivatives
    for deriv in observed_derivatives:
        base_var = get_base_variable(deriv)
        if base_var in variables and base_var in data:
            # Finite difference of the base column, wrapping around to its first value.
            base_data = data[base_var]
            data[deriv] = np.diff(base_data, append=base_data[0])
        else:
            data[deriv] = np.random.uniform(low, high, num_valid_points)

    # Process Gröbner basis equations
    for eq in sorted_basis:
        # Passing the declared names as locals keeps sympy from interpreting
        # names such as E, I, N or S as its own constants/functions.
        eq_expr = sympify(eq, locals=dict(symbol_for))
        eq_variables = extract_variables(eq)
        unknown_vars = [var for var in eq_variables if var not in data]

        if not unknown_vars:
            continue

        # Generate data for all but one unknown variable
        for var in unknown_vars[:-1]:
            data[var] = np.random.uniform(low, high, num_valid_points)

        last_var = unknown_vars[-1]
        last_var_sym = symbol_of(last_var)

        # Prepare polynomial coefficients (highest degree first)
        try:
            coefficients = Poly(eq_expr, last_var_sym).all_coeffs()
        except (sp.PolynomialError, ValueError):
            print(f"Equation {eq} is not a valid polynomial in {last_var}. Skipping.")
            continue

        # Columns to substitute into the coefficients; fixed for this equation.
        known = [(symbol_of(var), data[var])
                 for var in eq_variables if var != last_var and var in data]

        last_var_data = np.full(num_valid_points, np.nan)
        valid_indices = []
        skipped = 0

        for i in range(num_valid_points):
            subs_dict = {sym: values[i] for sym, values in known}
            try:
                coeffs = np.array([complex(c.subs(subs_dict)) for c in coefficients])
                # Real coefficients are handed over as reals so that complex
                # roots come out as exact conjugate pairs.
                if not coeffs.imag.any():
                    coeffs = coeffs.real
                roots = np.roots(coeffs)
            except (TypeError, ValueError, ArithmeticError, np.linalg.LinAlgError):
                # Best effort: a point whose coefficients cannot be evaluated
                # numerically is dropped, but counted and reported below.
                skipped += 1
                continue

            # Tolerance-based test: an exact "imaginary part == 0" check rejects
            # real roots that carry floating-point noise.
            real_roots = roots[np.abs(roots.imag) <= tol].real
            if real_roots.size > 0:
                last_var_data[i] = real_roots[0]
                valid_indices.append(i)

        if skipped:
            print(f"Skipped {skipped} points for {last_var} in equation {eq}: "
                  f"coefficients could not be evaluated numerically.")

        if not valid_indices:
            print(f"No valid solutions for {last_var} in equation {eq}")
            return None

        # Keep only the points that produced a real root, in every column.
        valid_indices = np.array(valid_indices)
        for var in data:
            data[var] = data[var][valid_indices]
        data[last_var] = last_var_data[valid_indices]
        num_valid_points = len(valid_indices)

    # Generate remaining variables
    for var in variables:
        if var not in data:
            data[var] = np.random.uniform(low, high, num_valid_points)

    return data

def generate_data_for_system(equations, observed_constants, observed_derivatives, num_points=1000, seed=42,
                             constant_data=True, derivative_data=True, region=(1, 5)):
    """
    Compute a Gröbner basis for a system of equations and sample data from it.

    The variable list is derived from the equations, `projection` is asked to
    write its report to 'temp_grobner.txt' in the current directory, the
    polynomials listed under the heading "Polynomials of the Gröbner basis of
    the eliminated ideal:" are read back ('^' rewritten to '**'), and the
    result is passed to generate_dataset_from_grobner_basis.

    NOTE(review): `projection` is not defined in this file; presumably it
    comes from the wildcard import of m2_functions -- confirm.

    Parameters:
        equations (list): Equations as strings.
        observed_constants (list): Names treated as constants.
        observed_derivatives (list): Names treated as derivatives.
        num_points (int): Number of points to start from.
        seed (int): Random seed forwarded to the dataset generator.
        constant_data (bool): Forwarded to the dataset generator.
        derivative_data (bool): Forwarded to the dataset generator.
        region (sequence): ``(low, high)`` sampling bounds, forwarded as well.

    Returns:
        dict or None: Whatever generate_dataset_from_grobner_basis returns.

    Raises:
        ValueError: If no Gröbner basis section is found in the report.
    """
    variables = collect_variables_in_sequence_and_reverse(equations)
    output_path = 'temp_grobner.txt'

    try:
        projection(variables, equations, variables, [], filename=output_path)
        with open(output_path, 'r') as file:
            content = file.read()
    finally:
        # Remove the scratch file even when projection or the read fails;
        # the existence check avoids masking the original error when the
        # file was never created.
        if os.path.exists(output_path):
            os.remove(output_path)

    # Match everything after "Polynomials..." line until next section or end
    grobner_match = re.search(
        r'Polynomials of the Gröbner basis of the eliminated ideal:\s*\n(.*?)(?=\n\S+:|$)',
        content,
        re.DOTALL
    )

    if grobner_match:
        # Split captured group into lines and clean
        grobner_basis = [
            line.strip().replace('^', '**')
            for line in grobner_match.group(1).split('\n')
            if line.strip()
        ]
    else:
        grobner_basis = []

    if not grobner_basis:
        raise ValueError("No Gröbner basis found in the projection output.")

    return generate_dataset_from_grobner_basis(grobner_basis, variables, observed_constants, observed_derivatives,
                                              constant_data, derivative_data, region, num_points, seed)

In [125]:

def parse_data(file_path):
    """
    Parse a system description file with variables, constants, derivatives, and equations.

    Recognised pieces of the file:
      * ``System number <int>``
      * ``Variables: [...]``, ``Constants: [...]``, ``Derivatives: [...]``
        -- a Python list literal on the same line as the label
      * ``Equations:`` followed by one equation per line, up to the next
        line of the form ``<word>:`` or the end of the file

    Equations are normalised by rewriting '^' to '**' and removing spaces.
    Blank lines and lines starting with 'Units' are not equations.

    Parameters:
        file_path (str): Path of the description file.

    Returns:
        dict: Keys 'system_number' (int or None), 'variables', 'constants',
              'derivatives' and 'equations' (lists; empty when a section is missing).
    """
    with open(file_path, 'r') as file:
        content = file.read()

    def list_field(label):
        # The list is parsed with ast.literal_eval, so only literals are accepted.
        match = re.search(label + r':\s*(\[.*?\])', content)
        return ast.literal_eval(match.group(1)) if match else []

    system_data = {}

    # System number
    system_number = re.search(r'System number (\d+)', content)
    system_data['system_number'] = int(system_number.group(1)) if system_number else None

    system_data['variables'] = list_field('Variables')
    system_data['constants'] = list_field('Constants')
    system_data['derivatives'] = list_field('Derivatives')

    # Equations
    equations = []
    equations_match = re.search(r'Equations:\n(.*?)(?=\n\w+:|$)', content, re.DOTALL)
    if equations_match:
        for raw_line in equations_match.group(1).strip().split('\n'):
            line = raw_line.strip()
            # Test the stripped line: an indented "Units ..." line must be
            # filtered out just like an unindented one.
            if not line or line.startswith('Units'):
                continue
            equations.append(line.replace('^', '**').replace(' ', ''))

    system_data['equations'] = equations

    return system_data

def save_dataset(dataset, filename, delimiter='\t'):
    """
    Write a dataset to a text file: one header line, then one row per point.

    The header holds the variable names joined by the delimiter; values are
    written in scientific notation with eight decimals ('%.8e').

    Parameters:
        dataset (dict): Dictionary of {variable: numpy_array}
        filename (str): Output file path
        delimiter (str): Column separator (default: tab)

    Raises:
        ValueError: If the dataset is empty or its arrays differ in length.
    """
    if not dataset:
        raise ValueError("Dataset is empty")

    # All columns must have the same number of rows.
    distinct_lengths = {len(column) for column in dataset.values()}
    if len(distinct_lengths) > 1:
        raise ValueError("Inconsistent array lengths in dataset")

    column_names = list(dataset)
    table = np.column_stack([dataset[name] for name in column_names])

    with open(filename, 'w') as out:
        out.write(delimiter.join(column_names) + '\n')
        np.savetxt(out, table, delimiter=delimiter, fmt='%.8e')

In [132]:
# For every (system j, replacement i) pair: parse the description file,
# generate a dataset from its equations and save it next to the description.
for j in range(5, 6):
    for i in range(1, 6):
        print(j, i)
        parsed_system = parse_data(f"axiom_correction_benchmark/system_{j}/replaced_system_{i}.txt")
        print(parsed_system)
        print(parsed_system['variables'])

        # Then generate data using the parsed components
        try:
            dataset = generate_data_for_system(
                equations=parsed_system['equations'],
                observed_constants=parsed_system['constants'],
                observed_derivatives=parsed_system['derivatives'],
                num_points=1000,
                seed=42,
                constant_data=True,
                derivative_data=True,
                region=[1, 5]
            )
            # The generator returns None when an equation has no real
            # solution at any point; without this check dataset.items()
            # would raise an AttributeError that the handler below does
            # not catch, aborting the remaining systems.
            if dataset is None:
                print("Error generating dataset: no valid solutions were found")
                continue
            print("Successfully generated dataset with shape:", {k: v.shape for k, v in dataset.items()})
            # After generating and validating data
            save_dataset(
                dataset=dataset,
                filename=f'axiom_correction_benchmark/system_{j}/replaced_system_{i}_data.dat',
                delimiter=' '  # Use '\t' for tab, ',' for CSV
            )
        except ValueError as e:
            print(f"Error generating dataset: {e}")

5 1
{'system_number': 5, 'variables': ['d1', 'd2', 'm1', 'm2', 'w', 'Fg'], 'constants': ['G'], 'derivatives': ['dx1dt', 'd2x1dt2', 'dx2dt', 'd2x2dt2'], 'equations': ['-dx1dt**2*m1**2+Fg*d1*m2', '-dx1dt*dx2dt*m1+2*dx1dt*d2*m2*w+Fg*d2-d1**2*m1*w**2', 'd2x1dt2**2*Fg-d2x2dt2**2*d1*m1*w**2+dx2dt**3*m1*w**3', '-G*d2x2dt2*m2+d2x1dt2*d2x2dt2*d1*d2+d2x1dt2*dx1dt*dx2dt*d1+d2x2dt2*d1**2*d2*w**2']}
['d1', 'd2', 'm1', 'm2', 'w', 'Fg']
Output from Macaulay2:
Ring defined: R
Axioms defined: {matrix {{-4*dx1dt*m1+m2*Fg*d1}}, matrix {{-dx2dt*dx1dt*m1+2*w*d2*dx1dt*m2-4*w*m1*d1+d2*Fg}}, matrix {{2*d2x1dt2*Fg-4*d2x2dt2*w*m1*d1+9*dx2dt*w*m1}}, matrix {{-G*d2x2dt2*m2+d2x1dt2*d2x2dt2*d2*d1+d2x1dt2*dx2dt*dx1dt*d1+4*d2x2dt2*w*d2*d1}}}
Measured variables defined: {G, d2x1dt2, d2x2dt2, dx2dt, w, d2, dx1dt, m2, m1, Fg, d1}
Non-measured variables defined: {}
Ideal defined: ideal(-4*dx1dt*m1+m2*Fg*d1,-dx2dt*dx1dt*m1+2*w*d2*dx1dt*m2-4*w*m1*d1+d2*Fg,2*d2x1dt2*Fg-4*d2x2dt2*w*m1*d1+9*dx2dt*w*m1,-G*d2x2dt2*m2+d2x1dt2*d2

In [133]:
def evaluate_polynomials(dataset, equations, variable_order, tolerance=1e-5):
    """
    Evaluate how well the dataset satisfies the given polynomial equations.

    Each equation is parsed with sympy ('^' is accepted for exponentiation),
    compiled to a NumPy function of the variables in `variable_order`, and
    evaluated on every data point. A point is valid when the absolute value
    of every equation at that point is at most `tolerance`; for complex
    results the error is |real part| + |imaginary part|. NaN results count
    as invalid.

    Parameters:
        dataset (dict): Dictionary of {variable: numpy_array}
        equations (list or str): Equation(s) as a list of strings or a single equation string.
                                 Each equation should be written as an expression equal to 0.
        variable_order (list): List of variable names in the expected order.
        tolerance (float): Maximum allowed absolute error.

    Returns:
        dict: 'total_points', 'valid_points', 'pass_rate' (percent; NaN when
              there are no points), 'max_absolute_error' (largest non-NaN
              error seen) and 'valid_indices' (list of valid row indices).

    Raises:
        ValueError: If an equation cannot be parsed or compiled.
        KeyError: If a name in `variable_order` is missing from the dataset.
    """
    # Ensure equations is a list, even if a single string is provided.
    if isinstance(equations, str):
        equations = [equations]

    # One Symbol per name; an explicit list also covers the single-variable case.
    sym_vars = [sp.Symbol(name) for name in variable_order]
    name_to_symbol = dict(zip(variable_order, sym_vars))

    # Build lambdified functions for each equation.
    eval_fns = []
    for eq in equations:
        try:
            # Passing the variables as locals prevents sympy from reading names
            # such as E, I, N or S as its own constants/functions, which would
            # silently disconnect the expression from the data column.
            expr = sp.sympify(eq.replace('^', '**'), locals=dict(name_to_symbol))
            fn = sp.lambdify(sym_vars, expr, modules="numpy")
        except Exception as e:
            raise ValueError(f"Could not parse equation '{eq}': {str(e)}") from e
        eval_fns.append(fn)

    # Construct the data matrix using the provided variable order.
    try:
        data_matrix = np.column_stack([dataset[var] for var in variable_order])
    except KeyError as e:
        raise KeyError(f"Variable {e} is missing from the dataset. Check your variable_order list.") from e

    total_points = data_matrix.shape[0]
    valid_mask = np.ones(total_points, dtype=bool)
    max_error = 0.0

    # Evaluate each equation over all data points.
    for fn in eval_fns:
        try:
            # Pass each column as separate arguments.
            results = np.array(fn(*data_matrix.T))
            if np.iscomplexobj(results):
                error = np.abs(np.real(results)) + np.abs(np.imag(results))
            else:
                error = np.abs(results)
            # nanmax raises on an empty array; with zero points there is
            # simply no error to record.
            if error.size:
                max_error = max(max_error, np.nanmax(error))
            valid_mask &= (error <= tolerance)
        except Exception as e:
            # Best effort: an equation that cannot be evaluated invalidates
            # every point but does not abort the evaluation.
            print(f"Error during evaluation of an equation: {e}")
            valid_mask &= False

    valid_points = int(np.sum(valid_mask))
    # The pass rate is undefined for an empty dataset.
    pass_rate = valid_points / total_points * 100 if total_points else float('nan')

    return {
        'total_points': total_points,
        'valid_points': valid_points,
        'pass_rate': pass_rate,
        'max_absolute_error': max_error,
        'valid_indices': np.where(valid_mask)[0].tolist()
    }
