<a href="https://colab.research.google.com/github/ldmrepo/latex-parser/blob/main/latex_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lark-parser

Collecting lark-parser
  Downloading lark_parser-0.12.0-py2.py3-none-any.whl (103 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 103.5/103.5 kB 1.0 MB/s eta 0:00:00
Installing collected packages: lark-parser
Successfully installed lark-parser-0.12.0


In [15]:
# Import the required libraries
from lark import Lark, Transformer, v_args

# EBNF grammar describing the supported LaTeX syntax.
#
# `number` wraps a single NUMBER terminal instead of the rule-level sequence
# DIGIT+ ("." DIGIT+)?.  Written as a rule, Lark yields one DIGIT token per
# digit and filters the anonymous "." out of the tree, so a consumer reading
# the first child only ever sees the first digit.  As a terminal, the whole
# literal (including the decimal point) arrives as one token.
latex_grammar = r'''
start: expression

expression: align_env | equation_env | sum_expr | prod_expr | int_expr | lim_expr | basic_expr

basic_expr: term (("+" | "-" | "\\pm" | "\\mp" | "\\cup" | "\\cap" | "\\setminus") term)*

term: factor (("*" | "/" | "\\cdot" | "\\times" | "\\div" | "\\land" | "\\lor" | "\\wedge" | "\\vee") factor)*

factor: base ("^" exponent)? ("_" subscript)?
base: number | variable | function | fraction | sqrt | nroot | matrix | binom | logical_op | "{" expression "}" | "(" expression ")" | overline | underline | text | accent | delimiter | font_style | color

exponent: factor
subscript: factor

number: NUMBER
variable: LETTER+ | greek_letter

function: function_name "(" expression ")" | function_name "{" expression "}"
function_name: "sin" | "cos" | "tan" | "cot" | "sec" | "csc" | "log" | "ln" | "exp" | "sqrt" | "max" | "min" | "sum" | "prod" | "lim" | "int" | "frac" | "binom" | "arcsin" | "arccos" | "arctan" | "sinh" | "cosh" | "tanh"

fraction: "\\frac" "{" expression "}" "{" expression "}"
sqrt: "\\sqrt" "{" expression "}"
nroot: "\\sqrt" "[" expression "]" "{" expression "}"
binom: "\\binom" "{" expression "}" "{" expression "}"

matrix: "\\begin{" matrix_type "}" matrix_rows "\\end{" matrix_type "}"
matrix_type: "matrix" | "pmatrix" | "bmatrix" | "Bmatrix" | "vmatrix" | "Vmatrix"
matrix_rows: matrix_row ("\\\\" matrix_row)*
matrix_row: expression ("&" expression)*

align_env: "\\begin{align}" align_rows "\\end{align}" | "\\begin{align*}" align_rows "\\end{align*}"
align_rows: align_row ("\\\\" align_row)*
align_row: expression ("&" expression)*

equation_env: "\\begin{equation}" expression "\\end{equation}" | "\\begin{equation*}" expression "\\end{equation*}"

sum_expr: "\\sum" "_" subscript "^" exponent "{" expression "}"
prod_expr: "\\prod" "_" subscript "^" exponent "{" expression "}"
int_expr: "\\int" "_" lower_limit "^" upper_limit "{" expression "}"
lim_expr: "\\lim" "_" subscript "{" expression "}"

lower_limit: expression
upper_limit: expression

overline: "\\overline" "{" expression "}"
underline: "\\underline" "{" expression "}"

text: "\\text" "{" text_content "}"
text_content: (LETTER | DIGIT | " " | symbol)*

accent: "\\hat" "{" expression "}" | "\\bar" "{" expression "}" | "\\vec" "{" expression "}" | "\\dot" "{" expression "}" | "\\ddot" "{" expression "}" | "\\tilde" "{" expression "}" | "\\breve" "{" expression "}" | "\\check" "{" expression "}" | "\\acute" "{" expression "}" | "\\grave" "{" expression "}"

operator: "+" | "-" | "*" | "/" | "^" | "=" | "<" | ">" | "\\leq" | "\\geq" | "\\neq" | "\\approx" | "\\equiv" | "\\sim" | "\\simeq" | "\\cong" | "\\propto" | "\\infty" | "\\partial" | "\\nabla" | "\\forall" | "\\exists" | "\\neg" | "\\to" | "\\implies" | "\\iff" | "\\int" | "\\sum" | "\\prod" | "\\cup" | "\\cap" | "\\setminus"

delimiter: "(" | ")" | "[" | "]" | "{" | "}" | "|" | "\\|" | "\\langle" | "\\rangle"

DIGIT: "0".."9"
NUMBER: DIGIT+ ("." DIGIT+)?
LETTER: "a".."z" | "A".."Z"
greek_letter: "alpha" | "beta" | "gamma" | "delta" | "epsilon" | "zeta" | "eta" | "theta" | "iota" | "kappa" | "lambda" | "mu" | "nu" | "xi" | "omicron" | "pi" | "rho" | "sigma" | "tau" | "upsilon" | "phi" | "chi" | "psi" | "omega" | "Alpha" | "Beta" | "Gamma" | "Delta" | "Epsilon" | "Zeta" | "Eta" | "Theta" | "Iota" | "Kappa" | "Lambda" | "Mu" | "Nu" | "Xi" | "Omicron" | "Pi" | "Rho" | "Sigma" | "Tau" | "Upsilon" | "Phi" | "Chi" | "Psi" | "Omega"

logical_op: "\\land" | "\\lor" | "\\neg" | "\\implies" | "\\iff" | "\\forall" | "\\exists" | "\\in" | "\\notin" | "\\subset" | "\\supset" | "\\subseteq" | "\\supseteq" | "\\setminus" | "\\emptyset"

symbol: "!" | "\\\\" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "[" | "\\\\" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"

font_style: "\\mathrm{" text_content "}" | "\\mathit{" text_content "}" | "\\mathbf{" text_content "}" | "\\mathsf{" text_content "}" | "\\mathtt{" text_content "}" | "\\mathcal{" text_content "}" | "\\mathbb{" text_content "}"

color: "\\color{" color_name "}" | "\\textcolor{" color_name "}{" text_content "}" | "\\colorbox{" color_name "}{" text_content "}"
color_name: "red" | "green" | "blue" | "cyan" | "magenta" | "yellow" | "black" | "white" | "gray" | "brown" | "lime" | "olive" | "orange" | "pink" | "purple" | "teal" | "violet"

size_adjustment: "\\scalebox{" factor "}{" expression "}" | "\\resizebox{" width "}{" height "}{" expression "}" | "\\rotatebox{" angle "}{" expression "}"
width: number unit
height: number unit
angle: number
unit: "pt" | "mm" | "cm" | "in" | "ex" | "em" | "bp" | "dd" | "pc"
'''

# Initialise the LALR parser from the grammar, entering at the `start` rule
parser = Lark(latex_grammar, start='start', parser='lalr')

# Sample LaTeX expression
latex_expression = r"\frac{a+b}{c+d}"

# Parse the expression into a Lark parse tree
tree = parser.parse(latex_expression)

# Transformer that post-processes the parse tree
class LatexTransformer(Transformer):
    """Turn ``number`` and ``variable`` nodes of the parse tree into Python values.

    ``expression`` nodes are replaced by the plain list of their already
    transformed children.  Rules without a method here are not handled by
    this class.
    """

    def number(self, items):
        """Return the numeric literal as a ``float``.

        Args:
            items: Child tokens of the ``number`` node.  This may be a single
                token holding the whole literal or one token per digit.

        Returns:
            The literal's value as a float.

        All children are used, not just the first, so multi-digit literals
        are not truncated to their leading digit.
        """
        pieces = []
        previous = None
        for token in items:
            # Lark filters anonymous literals such as "." out of the tree.  A
            # gap between one token's end column and the next token's start
            # column marks where that decimal point sat in the input.
            if previous is not None and (
                getattr(previous, "end_column", None) != getattr(token, "column", None)
            ):
                pieces.append(".")
            pieces.append(str(token))
            previous = token
        return float("".join(pieces))

    def variable(self, items):
        """Return the variable name as a string.

        Args:
            items: Children of the ``variable`` node, either one token per
                letter or a single ``greek_letter`` subtree.

        Returns:
            The string forms of all children concatenated, so a multi-letter
            name is kept whole rather than cut down to its first letter.
        """
        return "".join(str(part) for part in items)

    def expression(self, items):
        """Replace an ``expression`` node with the list of its children."""
        return items

# Apply the transformer to the parse tree
transformer = LatexTransformer()
result = transformer.transform(tree)

# Print the pretty-printed parse tree, followed by the transformed result
print(tree.pretty())
print(result)


start
  expression
    basic_expr
      term
        factor
          base
            fraction
              expression
                basic_expr
                  term
                    factor
                      base
                        variable	a
                  term
                    factor
                      base
                        variable	b
              expression
                basic_expr
                  term
                    factor
                      base
                        variable	c
                  term
                    factor
                      base
                        variable	d

Tree('start', [[Tree('basic_expr', [Tree('term', [Tree('factor', [Tree('base', [Tree('fraction', [[Tree('basic_expr', [Tree('term', [Tree('factor', [Tree('base', ['a'])])]), Tree('term', [Tree('factor', [Tree('base', ['b'])])])])], [Tree('basic_expr', [Tree('term', [Tree('factor', [Tree('base', ['c'])])]), Tree('term', [Tree('factor', [Tree('base', 