In [1]:
import os
import re
import numpy as np
from sympy import symbols, sympify, Poly
from collections import defaultdict

def evaluate_polynomial(target_polynomial, dataset, observed_constants, observed_derivatives, measured_variables):
    evaluated_values = []
    
    # Combine all variable names (constants, derivatives, and measured variables)
    all_variables = observed_constants + observed_derivatives + measured_variables
    
    for i in range(dataset.shape[0]):
        # Substitute values into the polynomial
        polynomial = target_polynomial
        for j, var in enumerate(all_variables):
            # Replace variables with placeholders to avoid invalid syntax
            polynomial = re.sub(rf'\b{var}\b', f'{var}_value', polynomial)
        
        # Replace '^' with '**' for exponentiation
        polynomial = polynomial.replace('^', '**')

        # Evaluate the polynomial
        try:
            # Parse the polynomial string into a symbolic expression
            expr = sympify(polynomial)
            
            # Substitute numeric values into the expression
            substitutions = {}
            for j, var in enumerate(all_variables):
                substitutions[symbols(f'{var}_value')] = dataset[i, j]
            
            expr = expr.subs(substitutions)
            
            # Evaluate the polynomial
            value = float(expr.evalf())
            evaluated_values.append(value)
        except Exception as e:
            evaluated_values.append(np.nan)  # Skip if evaluation fails
            print(f"Exception: {e}")  # Debugging: Print the exception
    
    return np.array(evaluated_values)


# Function to extract target polynomial, measured variables, observed constants, and observed derivatives
def extract_info_from_file(filepath):
    with open(filepath, 'r') as file:
        content = file.readlines()
    
    # Extract target polynomial
    target_polynomial_line = next(line for line in content if line.startswith('Target Polynomial:'))
    target_polynomial = target_polynomial_line.split('Target Polynomial:')[1].strip()
    
    # Extract measured variables
    measured_variables_line = next(line for line in content if line.startswith('Measured Variables:'))
    measured_variables = eval(measured_variables_line.split('Measured Variables:')[1].strip())
    
    # Extract observed constants
    observed_constants_line = next(line for line in content if line.startswith('Observed Constants:'))
    observed_constants = eval(observed_constants_line.split('Observed Constants:')[1].strip())
    
    # Extract observed derivatives
    observed_derivatives_line = next(line for line in content if line.startswith('Observed Derivatives:'))
    observed_derivatives = eval(observed_derivatives_line.split('Observed Derivatives:')[1].strip())
    
    return target_polynomial, measured_variables, observed_constants, observed_derivatives

def generate_dataset(target_polynomial, measured_variables, observed_constants, observed_derivatives, constant_data=True, derivative_data=True):
    # Initialize dataset as a 2D array with 1000 rows and 0 columns
    dataset = np.zeros((1000, 0))  # Start with an empty 2D array

    # Step 2a: Generate data for constants
    if constant_data and observed_constants:  # Only proceed if constants are provided
        constant_values = {const: np.random.uniform(0, 10) for const in observed_constants}  # Sample once
        if 'c' in observed_constants:
            constant_values['c'] = 2.99792458e8
        if 'G' in observed_constants:
            constant_values['G'] = 6.6743e-11
        constant_data_matrix = np.array([[constant_values[const]] * 1000 for const in observed_constants]).T  # Repeat 1000 times
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data
    elif not constant_data and observed_constants:
        constant_values = {const: np.random.normal(5, 3, 1000) for const in observed_constants}  # Generate 1000 samples per constant
        constant_data_matrix = np.column_stack([constant_values[const] for const in observed_constants])  # Stack properly
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data


    derivative_dependent_variable = []
    # Step 2b: Generate data for derivatives
    if derivative_data and 'd2xdt2' in observed_derivatives:  # Only proceed if derivatives are provided
        if 'd1' in measured_variables:
            # Generate data for d1 and its derivative d2xdt2
            d1_data = np.random.normal(5, 3, 1000)
            d2xdt2_data = np.diff(d1_data, append=d1_data[0])  # Differences with loop-around
            
            # Append d1 and d2xdt2 to the dataset
            dataset = np.hstack((dataset, d1_data.reshape(-1, 1), d2xdt2_data.reshape(-1, 1)))
            
            # Remove d1 from measured_variables to avoid double-generation
            measured_variables.remove('d1')
            derivative_dependent_variable.append('d1')
        elif 'd2' in measured_variables:
            # Generate data for d2 and its derivative d2xdt2
            d2_data = np.random.normal(5, 3, 1000)
            d2xdt2_data = np.diff(d2_data, append=d2_data[0])  # Differences with loop-around
            
            # Append d2 and d2xdt2 to the dataset
            dataset = np.hstack((dataset, d2_data.reshape(-1, 1), d2xdt2_data.reshape(-1, 1)))
            
            # Remove d2 from measured_variables to avoid double-generation
            measured_variables.remove('d2')
            derivative_dependent_variable.append('d2')
        else:
            # If neither d1 nor d2 is present, generate data for d2xdt2 directly
            d2xdt2_data = np.random.normal(0, 5, 1000)
            dataset = np.hstack((dataset, d2xdt2_data.reshape(-1, 1)))
    
    # Step 2c: Sort measured variables by degree in target polynomial
    degree_dict = defaultdict(int)
    for var in measured_variables:
        matches = re.findall(rf'{var}\^(\d+)', target_polynomial)
        if matches:
            degree_dict[var] = sum(int(exp) for exp in matches)
        else:
            degree_dict[var] = 1
    sorted_variables = sorted(measured_variables, key=lambda x: degree_dict[x], reverse=True)
    
    # Generate data for all but the last variable
    for var in sorted_variables[:-1]:
        var_data = np.random.normal(5, 3, 1000)
        dataset = np.hstack((dataset, var_data.reshape(-1, 1)))
    
    # Step 2d: Generate data for the last variable
    last_var = sorted_variables[-1]
    last_var_symbol = symbols(last_var)  # Convert to symbolic variable

    roots_column = np.zeros((dataset.shape[0], 1)) * np.nan  # Initialize with NaN

    complex_count = 0
    evaluation_fail_count = 0

    for i in range(1000):
        # Substitute all known values into the polynomial at once
        polynomial = target_polynomial
        for j, var in enumerate(observed_constants + observed_derivatives + derivative_dependent_variable + sorted_variables[:-1]):
            # Replace variables with placeholders to avoid invalid syntax
            polynomial = re.sub(rf'\b{var}\b', f'{var}_value', polynomial)
        
        # Replace '^' with '**' for exponentiation
        polynomial = polynomial.replace('^', '**')

        # Convert the polynomial into a symbolic expression
        try:
            expr = sympify(polynomial)
            
            # Substitute numeric values into the expression
            substitutions = {}
            for j, var in enumerate(observed_constants + observed_derivatives + derivative_dependent_variable + sorted_variables[:-1]):
                substitutions[symbols(f'{var}_value')] = dataset[i, j]
            
            expr = expr.subs(substitutions)
            
            # Extract coefficients of the polynomial in the last variable
            poly = Poly(expr, last_var_symbol)
            coefficients = poly.all_coeffs()
            
            # Solve for the roots
            roots = np.roots(coefficients)
            
            # Filter real roots
            real_roots = roots[np.isreal(roots)].real
            
            if len(real_roots) > 0:
                roots_column[i] = real_roots[0]  # Use the first real root
            else:
                complex_count += 1
        except Exception as e:
            evaluation_fail_count += 1
            print(f"An exception occurred: {e}")  # Skip if polynomial evaluation fails   

    # Append the roots column to the dataset
    dataset = np.hstack((dataset, roots_column))

    # Remove rows with NaN values
    dataset = dataset[~np.isnan(dataset).any(axis=1)] 
    print("Complex roots thrown out: ", complex_count)
    print("Number of failed evaluations: ", evaluation_fail_count)  
    
    return dataset, derivative_dependent_variable



In [36]:
def generate_dataset(target_polynomial, measured_variables, observed_constants, observed_derivatives, constant_data=True, derivative_data=True, region=[1,5]):
    # Initialize dataset as a 2D array with 1000 rows and 0 columns
    dataset = np.zeros((1000, 0))  # Start with an empty 2D array

    # Step 2a: Generate data for constants
    if constant_data and observed_constants:  # Only proceed if constants are provided
        constant_values = {const: np.random.uniform(0, 10) for const in observed_constants}  # Sample once
        if 'c' in observed_constants:
            constant_values['c'] = 2.99792458e8
        if 'G' in observed_constants:
            constant_values['G'] = 6.6743e-11
        constant_data_matrix = np.array([[constant_values[const]] * 1000 for const in observed_constants]).T  # Repeat 1000 times
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data
    elif not constant_data and observed_constants:
        constant_values = {const: np.random.uniform(region[0], region[1], 1000) for const in observed_constants}  # Generate 1000 samples per constant
        constant_data_matrix = np.column_stack([constant_values[const] for const in observed_constants])  # Stack properly
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data

    derivative_dependent_variable = []
    derivative_data_matrix = []

    
    
    # Step 2b: Generate data for derivatives only if needed
    if derivative_data:
        if 'd1' in measured_variables and ('dx1dt' in observed_derivatives or 'd2x1dt2' in observed_derivatives):
            d1_data = np.random.uniform(region[0], region[1], 1000)
            dx1dt_data = np.diff(d1_data, append=d1_data[0]) if 'dx1dt' in observed_derivatives else None
            if 'dx1dt' in observed_derivatives:
                d2x1dt2_data = np.diff(dx1dt_data, append=dx1dt_data[0]) if 'd2x1dt2' in observed_derivatives else None
            else: 
                d2x1dt2_data = np.random.uniform(region[0],region[1], 1000) if 'd2x1dt2' in observed_derivatives else None
            
            if dx1dt_data is not None:
                derivative_data_matrix.append(dx1dt_data)
            if d2x1dt2_data is not None:
                derivative_data_matrix.append(d2x1dt2_data)

        elif 'd1' not in measured_variables and ('dx1dt' in observed_derivatives or 'd2x1dt2' in observed_derivatives):
            dx1dt_data = np.random.uniform(region[0],region[1], 1000)
            if 'dx1dt' in observed_derivatives:
                d2x1dt2_data = np.diff(dx1dt_data, append=dx1dt_data[0]) if 'd2x1dt2' in observed_derivatives else None
            else: 
                d2x1dt2_data = np.random.uniform(region[0],region[1], 1000) if 'd2x1dt2' in observed_derivatives else None
            if dx1dt_data is not None:
                derivative_data_matrix.append(dx1dt_data)
            if d2x1dt2_data is not None:
                derivative_data_matrix.append(d2x1dt2_data)
        
        if 'd2' in measured_variables and ('dx2dt' in observed_derivatives or 'd2x2dt2' in observed_derivatives):
            d2_data = np.random.uniform(region[0], region[1], 1000)
            dx2dt_data = np.diff(d2_data, append=d2_data[0]) if 'dx2dt' in observed_derivatives else None
            if 'dx2dt' in observed_derivatives:
                d2x2dt2_data = np.diff(dx2dt_data, append=dx2dt_data[0]) if 'd2x2dt2' in observed_derivatives else None
            else: 
                d2x2dt2_data = np.random.uniform(region[0],region[1], 1000) if 'd2x2dt2' in observed_derivatives else None
            
            if dx2dt_data is not None:
                derivative_data_matrix.append(dx2dt_data)
            if d2x2dt2_data is not None:
                derivative_data_matrix.append(d2x2dt2_data)

        elif 'd2' not in measured_variables and ('dx2dt' in observed_derivatives or 'd2x2dt2' in observed_derivatives):
            dx2dt_data = np.random.uniform(region[0],region[1], 1000)
            if 'dx2dt' in observed_derivatives:
                d2x2dt2_data = np.diff(dx2dt_data, append=dx2dt_data[0]) if 'd2x2dt2' in observed_derivatives else None
            else: 
                d2x2dt2_data = np.random.uniform(region[0],region[1], 1000) if 'd2x2dt2' in observed_derivatives else None

            if dx2dt_data is not None:
                derivative_data_matrix.append(dx2dt_data)
            if d2x2dt2_data is not None:
                derivative_data_matrix.append(d2x2dt2_data)

        if 'd1' in measured_variables and ('dx1dt' in observed_derivatives or 'd2x1dt2' in observed_derivatives):
            derivative_data_matrix.append(d1_data)
            measured_variables.remove('d1')
            derivative_dependent_variable.append('d1')
        if 'd2' in measured_variables and ('dx2dt' in observed_derivatives or 'd2x2dt2' in observed_derivatives):
            derivative_data_matrix.append(d2_data)
            measured_variables.remove('d2')
            derivative_dependent_variable.append('d2')

    
    if derivative_data_matrix:
        dataset = np.hstack((dataset, np.column_stack(derivative_data_matrix)))
    
    # Step 2c: Sort measured variables by degree in target polynomial
    degree_dict = defaultdict(int)
    for var in measured_variables:
        matches = re.findall(rf'{var}\^(\d+)', target_polynomial)
        if matches:
            degree_dict[var] = sum(int(exp) for exp in matches)
        else:
            degree_dict[var] = 1
    sorted_variables = sorted(measured_variables, key=lambda x: degree_dict[x], reverse=True)

    # Generate data for all but the last variable
    for var in sorted_variables[:-1]:
        var_data = np.random.uniform(region[0], region[1], 1000)
        dataset = np.hstack((dataset, var_data.reshape(-1, 1)))

    # Step 2d: Generate data for the last variable
    last_var = sorted_variables[-1]
    last_var_symbol = symbols(last_var)  # Convert to symbolic variable

    roots_column = np.zeros((dataset.shape[0], 1)) * np.nan  # Initialize with NaN

    complex_count = 0
    evaluation_fail_count = 0

    for i in range(1000):
        # Substitute all known values into the polynomial at once
        polynomial = target_polynomial
        for j, var in enumerate(observed_constants + observed_derivatives + derivative_dependent_variable + sorted_variables[:-1]):
            # Replace variables with placeholders to avoid invalid syntax
            polynomial = re.sub(rf'\b{var}\b', f'{var}_value', polynomial)

        # Replace '^' with '**' for exponentiation
        polynomial = polynomial.replace('^', '**')

        # Convert the polynomial into a symbolic expression
        try:
            expr = sympify(polynomial)
            
            # Substitute numeric values into the expression
            substitutions = {}
            for j, var in enumerate(observed_constants + observed_derivatives + derivative_dependent_variable + sorted_variables[:-1]):
                substitutions[symbols(f'{var}_value')] = dataset[i, j]
            
            expr = expr.subs(substitutions)
            
            # Extract coefficients of the polynomial in the last variable
            poly = Poly(expr, last_var_symbol)
            coefficients = poly.all_coeffs()

            # Solve for the roots
            roots = np.roots(coefficients)
            
            # Filter real roots
            real_roots = roots[np.isreal(roots)].real
            
            if len(real_roots) > 0:
                roots_column[i] = real_roots[0]  # Use the first real root
            else:
                complex_count += 1
        except Exception as e:
            evaluation_fail_count += 1
            print(f"An exception occurred: {e}")  # Skip if polynomial evaluation fails   

    # Append the roots column to the dataset
    dataset = np.hstack((dataset, roots_column))

    # Remove rows with NaN values
    dataset = dataset[~np.isnan(dataset).any(axis=1)] 
    print("Complex roots thrown out: ", complex_count)
    print("Number of failed evaluations: ", evaluation_fail_count)  
    
    return dataset

def generate_dataset_no_der(target_polynomial, measured_variables, observed_constants, constant_data=True, region=[1,5]):
    # Initialize dataset as a 2D array with 1000 rows and 0 columns
    dataset = np.zeros((1000, 0))  # Start with an empty 2D array

    # Step 2a: Generate data for constants
    if constant_data and observed_constants:  # Only proceed if constants are provided
        constant_values = {const: np.random.uniform(0, 10) for const in observed_constants}  # Sample once
        if 'c' in observed_constants:
            constant_values['c'] = 2.99792458e8
        if 'G' in observed_constants:
            constant_values['G'] = 6.6743e-11
        constant_data_matrix = np.array([[constant_values[const]] * 1000 for const in observed_constants]).T  # Repeat 1000 times
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data
    elif not constant_data and observed_constants:
        constant_values = {const: np.random.uniform(region[0], region[1], 1000) for const in observed_constants}  # Generate 1000 samples per constant
        constant_data_matrix = np.column_stack([constant_values[const] for const in observed_constants])  # Stack properly
        dataset = np.hstack((dataset, constant_data_matrix))  # Append constant data

    
    # Step 2c: Sort measured variables by degree in target polynomial
    degree_dict = defaultdict(int)
    for var in measured_variables:
        matches = re.findall(rf'{var}\^(\d+)', target_polynomial)
        if matches:
            degree_dict[var] = sum(int(exp) for exp in matches)
        else:
            degree_dict[var] = 1
    sorted_variables = sorted(measured_variables, key=lambda x: degree_dict[x], reverse=True)

    # Generate data for all but the last variable
    for var in sorted_variables[:-1]:
        var_data = np.random.uniform(region[0], region[1], 1000)
        dataset = np.hstack((dataset, var_data.reshape(-1, 1)))

    # Step 2d: Generate data for the last variable
    last_var = sorted_variables[-1]
    last_var_symbol = symbols(last_var)  # Convert to symbolic variable

    roots_column = np.zeros((dataset.shape[0], 1)) * np.nan  # Initialize with NaN

    complex_count = 0
    evaluation_fail_count = 0

    for i in range(1000):
        # Substitute all known values into the polynomial at once
        polynomial = target_polynomial
        for j, var in enumerate(observed_constants + sorted_variables[:-1]):
            # Replace variables with placeholders to avoid invalid syntax
            polynomial = re.sub(rf'\b{var}\b', f'{var}_value', polynomial)

        # Replace '^' with '**' for exponentiation
        polynomial = polynomial.replace('^', '**')

        # Convert the polynomial into a symbolic expression
        try:
            expr = sympify(polynomial)
            
            # Substitute numeric values into the expression
            substitutions = {}
            for j, var in enumerate(observed_constants + sorted_variables[:-1]):
                substitutions[symbols(f'{var}_value')] = dataset[i, j]
            
            expr = expr.subs(substitutions)
            
            # Extract coefficients of the polynomial in the last variable
            poly = Poly(expr, last_var_symbol)
            coefficients = poly.all_coeffs()

            # Solve for the roots
            roots = np.roots(coefficients)
            
            # Filter real roots
            real_roots = roots[np.isreal(roots)].real
            
            if len(real_roots) > 0:
                roots_column[i] = real_roots[0]  # Use the first real root
            else:
                complex_count += 1
        except Exception as e:
            evaluation_fail_count += 1
            print(f"An exception occurred: {e}")  # Skip if polynomial evaluation fails   

    # Append the roots column to the dataset
    dataset = np.hstack((dataset, roots_column))

    # Remove rows with NaN values
    dataset = dataset[~np.isnan(dataset).any(axis=1)] 
    print("Complex roots thrown out: ", complex_count)
    print("Number of failed evaluations: ", evaluation_fail_count)  
    
    return dataset



In [37]:
# Main script
output_directory = 'benchmarks/consequence_discovery_benchmark/fixed_constant_data'
if not os.path.exists(output_directory):
        os.makedirs(output_directory)

input_directory = 'benchmarks/consequence_discovery_benchmark/'

n = 110
np.random.seed(42)

for i in range(1,n):
    filename = f'system{i}.txt'
    filepath = os.path.join(input_directory, filename)
    
    # Check if the file exists
    if not os.path.exists(filepath):
        print(f"File {filename} does not exist. Skipping iteration {i}.")
        continue
    
    # Extract info from file
    target_polynomial, measured_variables, observed_constants, observed_derivatives = extract_info_from_file(filepath)
    print(target_polynomial,measured_variables,observed_constants,observed_derivatives)
    if observed_derivatives == []:
         observed_derivatives = ['']
    # Generate dataset
    r1 = np.random.randint(0, 15)
    region = [r1,r1+5]
    if '' not in observed_derivatives:
        dataset = generate_dataset(target_polynomial, measured_variables.copy(), observed_constants, observed_derivatives, True, True, region)
    else:
        dataset = generate_dataset_no_der(target_polynomial, measured_variables.copy(), observed_constants, True, region)

    if len(dataset) == 0:
        print("No real roots found for system ",  i ,". Skipping")
        continue
    # Save dataset to file
    np.savetxt(os.path.join(output_directory, f'numeric_data_{i}.dat'), dataset, delimiter=' ')
    print(measured_variables)
    print("Evaluation: ", sum(evaluate_polynomial(target_polynomial,dataset[0:5],observed_constants,observed_derivatives,measured_variables)))
    print(f"Generated dataset for {filename} with shape {dataset.shape}")


3*c^2*Fg+6*c*d2x1dt2*p-2*d1*d2x1dt2*Fg-6*d2x1dt2*W ['d1', 'W', 'Fg', 'p'] ['c'] ['d2x1dt2']
Complex roots thrown out:  0
Number of failed evaluations:  0
['d1', 'W', 'Fg', 'p']
Evaluation:  0.0
Generated dataset for system1.txt with shape (1000, 6)
d2x1dt2*m2+Fg-Fc ['Fg', 'Fc', 'm2'] [] ['d2x1dt2']
Complex roots thrown out:  0
Number of failed evaluations:  0
['Fg', 'Fc', 'm2']
Evaluation:  197.45829112177742
Generated dataset for system2.txt with shape (1000, 5)
2*p^2*W^2*Fc*d1*m1*d2x1dt2^2-p^2*Fc^2*d1^2*d2*m1^2*d2x1dt2^3-p^2*Fc*d1^3*Fg*m1^2*d2x1dt2^3-4*W^4*Fc^2+16*W^4*Fc*m1*d2x1dt2-16*W^4*m1^2*d2x1dt2^2+4*W^2*Fc^3*d1*d2*m1*d2x1dt2+8*W^2*Fc^2*d1^2*Fg*m1*d2x1dt2-16*W^2*Fc^2*d1*d2*m1^2*d2x1dt2^2-24*W^2*Fc*d1^2*Fg*m1^2*d2x1dt2^2+16*W^2*Fc*d1*d2*m1^3*d2x1dt2^3+16*W^2*d1^2*Fg*m1^3*d2x1dt2^3-Fc^4*d1^2*d2^2*m1^2*d2x1dt2^2-4*Fc^3*d1^3*Fg*d2*m1^2*d2x1dt2^2+4*Fc^3*d1^2*d2^2*m1^3*d2x1dt2^3-4*Fc^2*d1^4*Fg^2*m1^2*d2x1dt2^2+12*Fc^2*d1^3*Fg*d2*m1^3*d2x1dt2^3-4*Fc^2*d1^2*d2^2*m1^4*d2x1dt2^4+8*Fc*d1^4

In [42]:
# Run this cell to add noise to the data

import numpy as np

def add_gaussian_noise(input_file, output_file, epsilon):
    # Load data
    data = np.loadtxt(input_file)
        
    # Generate Gaussian noise with standard deviation as 5% of the column's mean
    noise = np.random.normal(0, np.abs(np.mean(data[:,-1])) * epsilon, len(data[:,-1]))
    
    # Apply noise only to non-constant columns
    
    data[:, -1] += noise
    
    # Save to output file
    np.savetxt(output_file, data, fmt='%.18e')

if __name__ == "__main__":
    epsilon = 5e-1
    input_directory = 'benchmarks/consequence_discovery_benchmark/fixed_constant_data'
    n = 110
    for i in range(1,n):

        filename = f'numeric_data_{i}.dat'
        input_file = os.path.join(input_directory, filename)
        if not os.path.exists(input_file):
            print(f"File {filename} does not exist. Skipping iteration {i}.")
            continue
        output_directory = f"benchmarks/consequence_discovery_benchmark/fixed_constant_data_noisy/{epsilon}/"
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        
        output_file = os.path.join(output_directory, filename)
        add_gaussian_noise(input_file, output_file,epsilon)


File numeric_data_3.dat does not exist. Skipping iteration 3.
File numeric_data_9.dat does not exist. Skipping iteration 9.
File numeric_data_12.dat does not exist. Skipping iteration 12.
File numeric_data_19.dat does not exist. Skipping iteration 19.
File numeric_data_30.dat does not exist. Skipping iteration 30.
File numeric_data_31.dat does not exist. Skipping iteration 31.
File numeric_data_41.dat does not exist. Skipping iteration 41.
File numeric_data_48.dat does not exist. Skipping iteration 48.
File numeric_data_55.dat does not exist. Skipping iteration 55.
File numeric_data_59.dat does not exist. Skipping iteration 59.
File numeric_data_73.dat does not exist. Skipping iteration 73.
File numeric_data_80.dat does not exist. Skipping iteration 80.
File numeric_data_85.dat does not exist. Skipping iteration 85.
File numeric_data_86.dat does not exist. Skipping iteration 86.
File numeric_data_91.dat does not exist. Skipping iteration 91.
File numeric_data_92.dat does not exist. Ski

In [46]:
# You can run this cell if you want to reduce the amount of data you have
import sys
import os
from pathlib import Path

def shorten_data(input_file, output_file=None):

    try:
        # Convert input path to Path object
        input_path = Path(input_file)
        
        # Generate output filename if not provided
        if output_file is None:
            output_file = input_path.stem + '_converted' + input_path.suffix
        
        # Check if input file exists
        if not input_path.exists():
            raise FileNotFoundError(f"Input file '{input_file}' not found")
        
        # Process the file line by line
        with open(input_path, 'r') as infile, open(output_file, 'w') as outfile:
            for i, line in enumerate(infile):
                # Replace commas with spaces and write to output file
                converted_line = line.strip()
                outfile.write(converted_line + '\n')
                if i ==100:
                    break
                
        print(f"Successfully converted '{input_file}' to '{output_file}'")
        return output_file
        
    except PermissionError:
        print(f"Error: Permission denied while accessing files")
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        raise

if __name__ == "__main__":


    input_directory = 'target_polynomial_benchmark/data_noisy/0.0001'
    n = 110
    for i in range(90,n):
        filename = f'numeric_data_{i}.dat'
        filepath = os.path.join(input_directory, filename)
        if not os.path.exists(filepath):
            print(f"File {filename} does not exist. Skipping iteration {i}.")
            continue
        output_directory = "target_polynomial_benchmark/data_shortened_noisy/0.0001/"
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
        output_filepath = os.path.join(output_directory, filename)
        
        shorten_data(filepath, output_filepath)

Successfully converted 'target_polynomial_benchmark/data_noisy/0.0001/numeric_data_90.dat' to 'target_polynomial_benchmark/data_shortened_noisy/0.0001/numeric_data_90.dat'
File numeric_data_91.dat does not exist. Skipping iteration 91.
File numeric_data_92.dat does not exist. Skipping iteration 92.
File numeric_data_93.dat does not exist. Skipping iteration 93.
Successfully converted 'target_polynomial_benchmark/data_noisy/0.0001/numeric_data_94.dat' to 'target_polynomial_benchmark/data_shortened_noisy/0.0001/numeric_data_94.dat'
File numeric_data_95.dat does not exist. Skipping iteration 95.
File numeric_data_96.dat does not exist. Skipping iteration 96.
File numeric_data_97.dat does not exist. Skipping iteration 97.
File numeric_data_98.dat does not exist. Skipping iteration 98.
File numeric_data_99.dat does not exist. Skipping iteration 99.
File numeric_data_100.dat does not exist. Skipping iteration 100.
Successfully converted 'target_polynomial_benchmark/data_noisy/0.0001/numeric_