In [None]:
import re
import csv
import os
import pandas as pd

# Function to preprocess equations
def preprocess_equation(equation):
    """Return *equation* in the compact form expected by the tokenizer.

    Python-style exponentiation (``**``) is rewritten as ``^`` and every run
    of whitespace is then removed, so the result contains no spaces at all.
    """
    # The replacement must happen before whitespace is stripped: a spaced-out
    # "* *" is not an exponent and must not be collapsed into one.
    with_caret = equation.replace("**", "^")
    return re.sub(r'\s+', '', with_caret)

# Function to tokenize equations
# The lookup tables and the compiled pattern are built once instead of on
# every call; sets give O(1) membership tests.
_FUNCTION_NAMES = frozenset({"sin", "cos", "exp", "sqrt", "ln", "arcsin"})
_CONSTANT_NAMES = frozenset({"pi", "h", "G", "c", "epsilon", "kB", "T", "m", "q"})
_PAREN_TAGS = {"(": "LPAREN", ")": "RPAREN"}
# Group 1: identifier, group 2: number, group 3: operator or parenthesis.
_TOKEN_PATTERN = re.compile(
    r'([a-zA-Z_][a-zA-Z0-9_]*)|([0-9]+\.?[0-9]*)|([\+\-\*/\^()])'
)


def tokenize_equation(equation):
    """Tokenize an equation string into a list of tagged tokens.

    Each lexeme is mapped to one tag:

    * known function names  -> ``FUNC_<NAME>`` (name upper-cased)
    * known constant names  -> ``CONST_<NAME>`` (name upper-cased)
    * any other identifier  -> ``VAR_<name>`` (name unchanged)
    * numeric literals      -> ``<NUM>``
    * ``+ - * / ^``         -> ``OP_<symbol>``
    * ``(`` and ``)``       -> ``LPAREN`` / ``RPAREN``

    Characters that match none of the patterns (for example ``=`` or ``,``)
    are skipped silently.

    Args:
        equation: The equation text, typically already normalised so that
            exponentiation is written as ``^``.

    Returns:
        A list of tag strings in the order the lexemes appear.
    """
    tokens = []
    for match in _TOKEN_PATTERN.finditer(equation):
        token = match.group()
        # lastindex tells us which alternative matched, so the token never
        # has to be re-examined with a second regex.
        group = match.lastindex
        if group == 1:
            if token in _FUNCTION_NAMES:
                tokens.append(f'FUNC_{token.upper()}')
            elif token in _CONSTANT_NAMES:
                tokens.append(f'CONST_{token.upper()}')
            else:
                tokens.append(f'VAR_{token}')
        elif group == 2:
            tokens.append('<NUM>')
        elif token in _PAREN_TAGS:
            # Parentheses are grouping symbols, not variables; previously they
            # fell through to the VAR_ branch and were emitted as "VAR_(".
            tokens.append(_PAREN_TAGS[token])
        else:
            tokens.append(f'OP_{token}')

    return tokens

# Load CSV file
file = '/content/FeynmanEquations.csv'
df = pd.read_csv(file)

# Ensure "Formula" column is processed
if "Formula" in df.columns:
    results = []
    # Missing formulas are dropped so only real values reach the tokenizer.
    for eq in df["Formula"].dropna():
        processed_eq = preprocess_equation(eq)
        tokens = tokenize_equation(processed_eq)
        tokenized_str = " | ".join(tokens)  # Clearly separate tokens

        print(f"Original: {eq}")
        print(f"Tokenized: {tokenized_str}\n")

        results.append({
            "original_equation": eq,
            "processed_equation": processed_eq,
            "tokens": tokenized_str
        })

    # Save results to CSV
    output_csv = "/mnt/data/tokenized_equations.csv"
    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    # Naming the columns explicitly keeps the header row in the output even
    # when no formulas were found and `results` is empty.
    df_output = pd.DataFrame(
        results,
        columns=["original_equation", "processed_equation", "tokens"],
    )
    df_output.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

    # google.colab is only importable inside a Colab runtime. The CSV has
    # already been written at this point, so a missing module should skip the
    # browser download rather than abort the cell.
    try:
        from google.colab import files
    except ImportError:
        print("google.colab is not available; skipping browser download.")
    else:
        # Reuse output_csv so the saved path and the downloaded path cannot drift apart.
        files.download(output_csv)
else:
    print("Error: 'Formula' column not found in the CSV file.")


Original: exp(-theta**2/2)/sqrt(2*pi)
Tokenized: FUNC_EXP | VAR_( | OP_- | VAR_theta | OP_^ | <NUM> | OP_/ | <NUM> | VAR_) | OP_/ | FUNC_SQRT | VAR_( | <NUM> | OP_* | CONST_PI | VAR_)

Original: exp(-(theta/sigma)**2/2)/(sqrt(2*pi)*sigma)
Tokenized: FUNC_EXP | VAR_( | OP_- | VAR_( | VAR_theta | OP_/ | VAR_sigma | VAR_) | OP_^ | <NUM> | OP_/ | <NUM> | VAR_) | OP_/ | VAR_( | FUNC_SQRT | VAR_( | <NUM> | OP_* | CONST_PI | VAR_) | OP_* | VAR_sigma | VAR_)

Original: exp(-((theta-theta1)/sigma)**2/2)/(sqrt(2*pi)*sigma)
Tokenized: FUNC_EXP | VAR_( | OP_- | VAR_( | VAR_( | VAR_theta | OP_- | VAR_theta1 | VAR_) | OP_/ | VAR_sigma | VAR_) | OP_^ | <NUM> | OP_/ | <NUM> | VAR_) | OP_/ | VAR_( | FUNC_SQRT | VAR_( | <NUM> | OP_* | CONST_PI | VAR_) | OP_* | VAR_sigma | VAR_)

Original: sqrt((x2-x1)**2+(y2-y1)**2)
Tokenized: FUNC_SQRT | VAR_( | VAR_( | VAR_x2 | OP_- | VAR_x1 | VAR_) | OP_^ | <NUM> | OP_+ | VAR_( | VAR_y2 | OP_- | VAR_y1 | VAR_) | OP_^ | <NUM> | VAR_)

Original: G*m1*m2/((x2-x1)**2+(y2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Shell escape: list the contents of /mnt/data, the directory the tokenized CSV is written to.
!ls /mnt/data

tokenized_equations.csv


In [None]:
# Trigger a browser download of the tokenized-equations CSV from the Colab runtime.
from google.colab import files

tokenized_csv_path = '/mnt/data/tokenized_equations.csv'
files.download(tokenized_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>