In [2]:
import pandas as pd
import numpy as np
import sympy as sp
from sympy import *
import matplotlib.pyplot as plt
import random
from IPython.display import Markdown
from lxml import etree
import json

In [None]:
class EquationGenerator:
    def __init__(self):
        self.latin = symbols('a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z')
        self.greek = symbols('α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω Α Β Γ Δ Ε Ζ Η Θ Ι Κ Λ Μ Ν Ξ Ο Π Ρ Σ Τ Υ Φ Χ Ψ Ω')
        self.vars = self.latin + self.greek 
        self.operators = ('+', '-', '*', '/', '**')
        self.functions = (sin, cos, tan, exp, log, sqrt)
        self.nums = tuple(range(1, 10))

        self.get_combined_vars(52) # 52 to roughly match the number of latin and greek letters for equal probability of choice
        self.vars += self.combined_vars # Add the combined variables to the list of variables

    def generate_expression(self):            
        complexity1 = random.randint(1, 2) # Length of expression
        expression = random.choice(self.functions)(random.choice(self.vars))  # Start with a function of a variable e.g. sin(a)
        for _ in range(complexity1):
            operator = random.choice(self.operators)
            complexity2 = random.randint(1, 3) # Complexity of the term
            if complexity2 == 1:
                term = random.choice((random.choice(self.vars), random.choice(self.nums))) # e.g. a, 1
            elif complexity2 == 2:
                term = random.choice(self.functions)(random.choice(self.vars)) # e.g. sin(a), log(b)
            elif complexity2 == 3:
                func = random.choice(self.functions)
                inner1 = random.choice(self.vars)
                inner2 = random.choice((random.choice(self.nums), random.choice(self.vars)))
                inner_operator = random.choice(self.operators)
                term = f"{func.__name__}({inner1} {inner_operator} {inner2})" # e.g. sin(a + b), log(c * d)   
            expression = f"{expression} {operator} {term}" # Concatenate the expression with the operator and term
        # print(expression)
        return sympify(expression)
    
    def get_combined_vars(self, num):
        combined_vars = []
        for _ in range(num): # Create num combined variables by joining two random variables with a "_"
            part1 = random.choice(self.vars)
            part2 = random.choice(self.vars + self.nums)
            combined_vars.append(f"{part1}_{part2}")
        self.combined_vars = symbols(" ".join(combined_vars)) # Convert the list of combined variables to a tuple of SymPy symbols
    
    def generate_equation(self):
        lhs = self.generate_expression()
        rhs = self.generate_expression()
        self.equation = Eq(lhs, rhs) # Create an attribute containing a SymPy equation in the form of lhs = rhs
    
    def to_python(self):
        self.py = sp.printing.python(self.equation) # Convert the SymPy expression to Python code
    
    def to_mathml(self):
        self.mml = sp.printing.mathml(self.equation, printer='presentation') # Convert the SymPy expression to MathML in the correct style
    
    def format_mathml(self):
        mml = self.mml
        mml = mml.replace("<mo>&InvisibleTimes;</mo>", "") # Remove invisible times operator to match scraped MathML
        mml = mml.replace("<mi>&ExponentialE;</mi>", "<mtext>exp</mtext>") # Replace exponential e with exp to match scraped MathML
        parser = etree.XMLParser(remove_blank_text=True) # Create an XML parser that removes blank text
        root = etree.fromstring(mml, parser) # Parse the MathML string into an XML element tree

        namespace = "http://www.w3.org/1998/Math/MathML" # Define the MathML namespace
        
        def add_namespace(elem): # Recursively add the namespace to all elements
            elem.tag = f"{{{namespace}}}{elem.tag}" # Add the namespace to the current tag
            for child in elem: # Do the same for all children
                add_namespace(child)                
    
        add_namespace(root) # Apply namespace

        mml =  etree.tostring(root, pretty_print=True, xml_declaration=False, encoding="UTF-8").decode("utf-8") # Convert the XML element tree back to a string
        mml = '\n'.join([line.lstrip() for line in mml.splitlines()]) # Remove leading whitespace from each line
        mml = mml.replace("ns0", "mml") # Replace the namespace prefix with "mml" to match scraped MathML
        mml = mml.replace('<mml:mrow xmlns:mml="http://www.w3.org/1998/Math/MathML">', "") # Remove first tag
        
        index = mml.rfind("</mml:mrow>") # Logic to remove final mrow tag
        if index != -1:
            mml = mml[:index] + mml[index + 11:] # 11 is len(tag)       
        
        self.mml = mml # Update attribute

    def print_latex(self):
        display(Markdown(f"$$ {latex(self.equation)} $$")) # Display the equation in LaTeX format

    def generate(self):
        self.generate_equation()
        self.to_python()
        self.to_mathml()
        self.format_mathml()
        return self.py, self.mml

def create_json(num, filepath):
    try: # Check to see if there is already data at the filepath
        with open(filepath, "r") as f:
            existing_data = json.load(f) # Load it if it exists
    except FileNotFoundError:
        existing_data = [] # Create an empty list otherwise

    new_data = []
    while len(new_data) < num: # Add num new equations to the list in dictionary format
        eg = EquationGenerator()
        try: # Skip errors
            py, mml = eg.generate()
            mml.replace("\n", "\\n")
            py.replace("\n", "\\n")
            new_data.append({"input": mml, "expected_output": py}) # Format here
        except:
            continue
        print(f"Number of new equations: {len(new_data)} / {num}")

    
    existing_data.extend(new_data)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)

def create_csv(num, filepath):
    try: # Check to see if there is already data at the filepath
        existing_data = pd.read_csv(filepath) # Load it if it exists
    except FileNotFoundError:
        existing_data = pd.DataFrame(columns=["input", "expected_output"]) # Create an empty df otherwise

    new_data = []
    while len(new_data) < num: # Add num new equations to the list in dictionary format
        eg = EquationGenerator()
        try: # Skip errors
            py, mml = eg.generate()
            new_data.append({"input": mml, "expected_output": py}) # Format here
        except:
            continue
        print(f"Number of new equations: {len(new_data)} / {num}")
    
    new_data = pd.DataFrame(new_data)
    existing_data = pd.concat([existing_data, new_data])
    existing_data.to_csv(filepath, index=False)

def test():
    eg = EquationGenerator() # Create an instance of the EquationGenerator class
    py, mml = eg.generate() # Generate a random python equation with corresponding MathML
    print(py)
    print(mml)
    eg.print_latex() # Display the equation in LaTeX format

def main():
    filepath = "gemini_training_1.csv"
    n_equations = 500
    if filepath.split(".")[-1] == "json":
        create_json(n_equations, filepath)
    elif filepath.split(".")[-1] == "csv":
        create_csv(n_equations, filepath)
    else:
        print("Invalid file format. Please use .json or .csv")

if __name__ == "__main__":
    main()



Number of new equations: 1 / 500
Number of new equations: 2 / 500
Number of new equations: 3 / 500
Number of new equations: 4 / 500
Number of new equations: 5 / 500
Number of new equations: 6 / 500
Number of new equations: 7 / 500
Number of new equations: 8 / 500
Number of new equations: 9 / 500
Number of new equations: 10 / 500
Number of new equations: 11 / 500
Number of new equations: 12 / 500
Number of new equations: 13 / 500
Number of new equations: 14 / 500
Number of new equations: 15 / 500
Number of new equations: 16 / 500
Number of new equations: 17 / 500
Number of new equations: 18 / 500
Number of new equations: 19 / 500
Number of new equations: 20 / 500
Number of new equations: 21 / 500
Number of new equations: 22 / 500
Number of new equations: 23 / 500
Number of new equations: 24 / 500
Number of new equations: 25 / 500
Number of new equations: 26 / 500
Number of new equations: 27 / 500
Number of new equations: 28 / 500
Number of new equations: 29 / 500
Number of new equations

In [10]:
mml = """<mml:mrow>
<mml:mi>η</mml:mi>
<mml:mo linebreak="badbreak">=</mml:mo>
<mml:msub>
<mml:mi>η</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>η</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>"""

mml = mml.replace("<", "&lt;")
mml = mml.replace(">", "&gt;")
mml = mml.replace("&", "&amp;")

print(mml)

&amp;lt;mml:mrow&amp;gt;
&amp;lt;mml:mi&amp;gt;η&amp;lt;/mml:mi&amp;gt;
&amp;lt;mml:mo linebreak="badbreak"&amp;gt;=&amp;lt;/mml:mo&amp;gt;
&amp;lt;mml:msub&amp;gt;
&amp;lt;mml:mi&amp;gt;η&amp;lt;/mml:mi&amp;gt;
&amp;lt;mml:mn&amp;gt;0&amp;lt;/mml:mn&amp;gt;
&amp;lt;/mml:msub&amp;gt;
&amp;lt;mml:mtext&amp;gt;exp&amp;lt;/mml:mtext&amp;gt;
&amp;lt;mml:mrow&amp;gt;
&amp;lt;mml:mo stretchy="false"&amp;gt;(&amp;lt;/mml:mo&amp;gt;
&amp;lt;mml:mfrac&amp;gt;
&amp;lt;mml:msub&amp;gt;
&amp;lt;mml:mi&amp;gt;Q&amp;lt;/mml:mi&amp;gt;
&amp;lt;mml:mi&amp;gt;η&amp;lt;/mml:mi&amp;gt;
&amp;lt;/mml:msub&amp;gt;
&amp;lt;mml:mrow&amp;gt;
&amp;lt;mml:mi&amp;gt;R&amp;lt;/mml:mi&amp;gt;
&amp;lt;mml:mi&amp;gt;T&amp;lt;/mml:mi&amp;gt;
&amp;lt;/mml:mrow&amp;gt;
&amp;lt;/mml:mfrac&amp;gt;
&amp;lt;mml:mo stretchy="false"&amp;gt;)&amp;lt;/mml:mo&amp;gt;
&amp;lt;/mml:mrow&amp;gt;
&amp;lt;/mml:mrow&amp;gt;
&amp;lt;/mml:math&amp;gt;
