In [None]:
%load_ext autoreload
%autoreload 2

# (Math) Evaluation

> Elaborate answer extraction and correctness judgement (for mathematical evaluation).

In [None]:
# | hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
from dart_math.eval import *

# Shared evaluator instance: the example and test cells below call its methods
# (`extract_ans`, `eq`, `latex2matrix`, `norm_*`, ...).
math_evaluator = EvaluatorMathBatch()

## Elaborate Mathematical Evaluation Pipeline

In [None]:
# Render the API documentation of `EvaluatorMath` under a level-3 heading.
show_doc(EvaluatorMath, title_level=3)

`EvaluatorMath` implements an elaborate evaluation pipeline for mathematical reasoning tasks.

#### Accurately Extracting Answer Strings

`EvaluatorMath` can:

1. **extract** short answers from long responses **accurately**,
2. and **normalize** them into **mathematical** expressions.

In [None]:
# MATH-style boxed answer: the content of `\boxed{...}` is returned as the answer
math_evaluator.extract_ans("Therefore, $1+1=\\boxed{2}$.")

'2'

In [None]:
# Answer around "answer": this response uses \boxed{...} as problem notation and
# explicitly leaves the final answer unboxed, so the value stated next to the
# word "answer" ("$6$ is our answer") is the one returned, not a boxed number.
math_evaluator.extract_ans(
    "Both $1$ and $11$ divide $11,$ so $\\boxed{11}=2$, and since $1,$ $2,$ $4,$ $5,$ $10,$ and $20$ divide $20,$ then $\\boxed{20}=6$. The inner expression, $\\boxed{11}\\times\\boxed{20}=2\\times6=12$. Finally, $\\boxed{12}=6$ because $1,$ $2,$ $3,$ $4,$ $6,$ and $12$ divide $12.$\n\nTherefore, $6$ is our answer. Please note that we have not boxed the correct answer as we normally do, as that would be especially confusing for this problem."
)

'6'

In [None]:
# Use the last number by default: the response below contains no boxed answer,
# and the extracted value is the final number appearing in the text (360).
math_evaluator.extract_ans(
    'First, we need to count the total number of letters in the word "CIRCLE". There are 6 letters.\n\nNext, we need to count the number of distinct letters. There are 6 distinct letters in the word "CIRCLE": C, I, R, L, E, and G.\n\nNow, let\'s consider the arrangements of the distinct letters. The number of ways to arrange n distinct items is n factorial (n!). So, we have 6! = 6 × 5 × 4 × 3 × 2 × 1 = 720 ways to arrange the distinct letters.\n\nHowever, the word "CIRCLE" has one letter that repeats (the letter \'C\' repeats twice). We have over-counted the number of distinct arrangements by including arrangements that are just rotations of each other (for example, "CIRCLE" and "LCIRCE" are considered different arrangements here, but they are the same word when read).\n\nTo correct for this, we divide the total number of arrangements by the number of ways to arrange the repeated letters. The number of ways to arrange 2 identical items is 2! = 2 × 1 = 2. So, we divide the total number of arrangements by 2 to get the correct number of distinct arrangements.\n\nTherefore, the number of ways to arrange the letters of the word "CIRCLE" is 720 ÷ 2 = 360.'
)
# More cases ...

'360'

In [None]:
# Normalize fraction: a plain `a/b` is returned in `\frac{a}{b}` form
math_evaluator.extract_ans("The answer is 1/2")

'\\frac{1}{2}'

In [None]:
# Normalize pmatrix: the `pmatrix` environment is returned as an `array`
# environment with the surrounding whitespace removed
math_evaluator.extract_ans(
    "The answer is \\begin{pmatrix} 3 \\\\ \\frac{\\pi}{2} \\end{pmatrix}"
)
# More cases ...

'\\begin{array}3\\\\frac{\\pi}{2}\\end{array}'

#### Correctly Processing Various Mathematical Objects / Special Text

`EvaluatorMath`, based on regular expressions and [SymPy](https://www.sympy.org) symbolic calculation, is able to correctly process

- most **mathematical objects** such as matrices (vectors), intervals, symbols besides numbers,
- as well as some **special texts** such as boolean expressions, dates and times.

In [None]:
# Expression: the two sides differ only in operand order.
# The result is displayed directly instead of being compared with `== True`,
# matching the other `eq` examples in this section.
math_evaluator.eq("x+y", "y+x")

True

In [None]:
# LaTeX: a `\frac` fraction compared with its decimal value.
# The result is displayed directly instead of being compared with `== True`,
# matching the other `eq` examples in this section.
math_evaluator.eq("\\frac{1}{2}", "0.5")

True

In [None]:
# A LaTeX `array` with entries 1 and 2 is compared with the plain
# comma-separated list "1,2".
math_evaluator.eq(
    "\\begin{array}1\\\\2\\end{array}",
    "1,2",
)  # Matrix (Vector)

True

In [None]:
# Same elements in a different order, compared with `compare_sets=True`.
math_evaluator.eq("{1,2}", "{2,1}", compare_sets=True)  # Set

True

In [None]:
# Two different spellings of a negative boolean answer.
math_evaluator.eq("no", "false")  # Bool
# More mathematical objects and special texts ...

True

More test cases:

In [None]:
# |code-fold: true
# Trailing LaTeX residue after a number must not make it match an unrelated value
test_eq(math_evaluator.eq("251,7\\\\ \\noindent", "0"), False)
# Different notations of the same number
test_eq(math_evaluator.eq("3.54*10^{-7}", "3.54e-07"), True)
test_eq(math_evaluator.eq(r"\frac{1}{2}", "0.5"), True)
# Values differing by a factor of 100: "1" vs "100" is rejected in both argument
# orders, while "3.04" vs "0.0304" is accepted — presumably percentage handling;
# confirm against the implementation of `eq`.
test_eq(math_evaluator.eq("1", "100"), False)
test_eq(math_evaluator.eq("100", "1"), False)
# NOTE(review): the meaning of the third positional argument (`False`) is not
# visible in this notebook; passing it by keyword would make the intent explicit.
test_eq(math_evaluator.eq("3.04", "0.0304", False), True)
# The first argument may also be a list mixing strings and numbers
test_eq(math_evaluator.eq(["0.0304", 0.0304], "3.04"), True)
# Inequalities and interval unions
test_eq(math_evaluator.eq("x<-1", "x>3"), False)
test_eq(
    math_evaluator.eq("(-\\infty,0)\\cup(0,\\infty)", "(-\\infty,0)\\cup(0,\\infty)"),
    True,
)
# Comma-separated expressions
test_eq(math_evaluator.eq("1+2,2+1", "2+1,1+2"), True)
# Numeric (non-string) inputs; note that 0.1 + 0.2 is not exactly 0.3 in floating point
test_eq(math_evaluator.eq(5, 5), True)
test_eq(math_evaluator.eq(0.1 + 0.2, 0.3), True)
# Symbolic expressions and single letters
test_eq(math_evaluator.eq("x + y", "y + x"), True)
test_eq(math_evaluator.eq("C", "C"), True)
# Comma inside a number vs. comma separating two values
test_eq(math_evaluator.eq("1,234", "1234"), True)
test_eq(math_evaluator.eq("12,34", "(12,34)"), True)

# Currency symbols, explicit vs. implicit multiplication, `a/b` vs. `\frac`,
# and LaTeX digit grouping (`{,}`)
test_eq(math_evaluator.eq("\\$ 5", "5"), True)
test_eq(math_evaluator.eq("3 * \\sqrt{13}", "3\\sqrt{13}"), True)
test_eq(math_evaluator.eq("\\pi/2", "\\frac{\\pi}{2}"), True)
test_eq(math_evaluator.eq("(3,\\pi/2)", "(3,\\frac{\\pi}{2})"), True)
test_eq(math_evaluator.eq("23000", "\\$23{,}000"), True)
# With `compare_sets=True`, (1,2) and (2,1) are expected to be equal
test_eq(
    math_evaluator.eq(r"\left(1,2\right)", r"\left(2,1\right)", compare_sets=True), True
)
# Text answers differing only in letter case are expected to be equal
test_eq(math_evaluator.eq("White", "white"), True)

In [None]:
# Render the API documentation of `EvaluatorMathBatch` under a level-3 heading.
show_doc(EvaluatorMathBatch, title_level=3)

SymPy symbolic computation carries the risk of extremely long evaluation times on some inputs.

To address this, we implement `EvaluatorMathBatch`, which evaluates in batches with a **timeout** while remaining efficient (it is based on `asyncio` coroutines instead of the `multiprocessing` used in previous implementations).

```python
answers, corrects = math_evaluator.batch_eval(resp_samples)
```

## API Reference

In [None]:
# Render the API documentation of `EvaluatorBase` under a level-3 heading.
show_doc(EvaluatorBase, title_level=3)

In [None]:
# Render the API documentation of `EvaluatorBatchBase` under a level-3 heading.
show_doc(EvaluatorBatchBase, title_level=3)

### Parsing LaTeX

#### Interval

In [None]:
from dart_math.eval import latex2sympy_interval

In [None]:
# Open interval joined (`\cup`) with a one-element set.
# NOTE(review): the recorded output below contains only the open interval; the
# `\{-\sqrt{110}\}` element does not appear in it — confirm this is intended.
latex2sympy_interval("(-11,-10)\\cup\\{-\\sqrt{110}\\}")

Interval.open(-11, -10)

In [None]:
# Union of two open intervals with infinite endpoints (`\infty` is shown as `oo`).
latex2sympy_interval("(-\\infty, 0) \\cup (0, \\infty)")

Union(Interval.open(-oo, 0), Interval.open(0, oo))

In [None]:
# Symbolic endpoints; `(` ... `]` yields a left-open interval (`Interval.Lopen`).
latex2sympy_interval("(a+b,b]")

Interval.Lopen(a + b, b)

#### Matrix / Vector

In [None]:
# A comma-separated list of LaTeX expressions is parsed into a one-row matrix.
math_evaluator.latex2matrix(r"\sqrt{400\cos^2(9\pi/44)},\frac{\pi}{4}")

Matrix([[sqrt(400*cos((9*pi)/44)**2), pi/4]])

In [None]:
# A 3x3 `pmatrix`: `&` separates the entries of a row, `\\` separates rows.
math_evaluator.latex2matrix(
    r"\begin{pmatrix} \frac{1}{2} & 0 & -\frac{\sqrt{3}}{2} \\ 0 & 1 & 0 \\ \frac{\sqrt{3}}{2} & 0 & \frac{1}{2} \end{pmatrix}"
)

Matrix([
[      1/2, 0, -1*sqrt(3)/2],
[        0, 1,            0],
[sqrt(3)/2, 0,          1/2]])

In [None]:
# A single-column `pmatrix` (entries separated only by `\\`) is expected to come
# back as one row, i.e. as a flat vector.
test_eq(
    math_evaluator.latex2matrix("\\begin{pmatrix}-18\\\\-49\\\\96\\end{pmatrix}"),
    Matrix([[-18, -49, 96]]),
)
# A 2x2 `pmatrix` is expected to keep its row/column layout.
test_eq(
    math_evaluator.latex2matrix("\\begin{pmatrix} 2 & 3 \\\\ 0 & -2 \\end{pmatrix}"),
    Matrix([[2, 3], [0, -2]]),
)

### Normalization

In [None]:
# Trailing LaTeX residue (`\\ \noindent`) is expected to be stripped, leaving only "251,7".
test_eq(math_evaluator.norm_math_str("251,7\\\\ \\noindent"), "251,7")

In [None]:
# `3/4` is expected to become `\frac{3}{4}`; the parentheses and `\sqrt{3}` stay as they are.
test_eq(fix_a_slash_b("(3/4)\\sqrt{3}"), "(\\frac{3}{4})\\sqrt{3}")

In [None]:
# `\pm` is expected to expand into two comma-separated alternatives, minus first.
test_eq(math_evaluator.norm_pm("x\\pmy"), "x-y,x+y")
# `\mp` is expected to expand in the same order as `\pm`.
test_eq(math_evaluator.norm_pm("a\\mpb"), "a-b,a+b")
test_eq(math_evaluator.norm_pm("1\\pm\\sqrt{19}"), "1-\\sqrt{19},1+\\sqrt{19}")
# Inside a set: the surrounding `\{ \}` is dropped and the other element is kept.
test_eq(math_evaluator.norm_pm(r"\{1\pm\sqrt{5},-2\}"), "1-\\sqrt{5},1+\\sqrt{5},-2")
# Inside a fraction: the whole fraction is duplicated and the `\( \)` delimiters are dropped.
test_eq(
    math_evaluator.norm_pm("\\(\\frac{1\\pm\\sqrt{17}}{4}\\)"),
    "\\frac{1-\\sqrt{17}}{4},\\frac{1+\\sqrt{17}}{4}",
)
# Nested braces inside the numerator.
test_eq(
    math_evaluator.norm_pm(r"\frac{1\pm\sqrt{1-\frac{2}{\sqrt{3}}}}{1}"),
    "\\frac{1-\\sqrt{1-\\frac{2}{\\sqrt{3}}}}{1},\\frac{1+\\sqrt{1-\\frac{2}{\\sqrt{3}}}}{1}",
)

In [None]:
# A bare degree value is expected to simply lose its `^\circ` marker ...
test_eq(norm_deg(r"20^\circ"), r"20")
# ... while a degree argument of a trigonometric function is converted to radians.
test_eq(norm_deg(r"\sin 20^\circ"), r"\sin {20*\frac{\pi}{180}}")

In [None]:
# A function name written without backslash gets one, plus an explicit exponent of 1.
test_eq(math_evaluator.norm_basic_fn(r"sinx"), r"\sin^{1}x")
# An existing exponent is wrapped in braces.
test_eq(math_evaluator.norm_basic_fn(r"\sin^2x"), r"\sin^{2}x")

### Processing Sets

In [None]:
# The elements are expected as a list of strings in sorted order ("1" before "2").
test_eq(math_evaluator.extract_set("{2,1}"), ["1", "2"])

In [None]:
# Brace-delimited element list: a set.
test_eq(is_set("{2,1}"), True)
# Plain word: not a set.
test_eq(is_set("orange"), False)
# Two inequalities joined by "or" also count as a set.
test_eq(is_set("x<-1orx>3"), True)
# A parenthesised fraction times a root is not a set.
test_eq(is_set("(3/4)sqrt(3)"), False)

### Manipulating Strings

In [None]:
# The first `{ ... }` pair is expected to be removed, keeping its content.
test_eq(math_evaluator.remove_first_paren_pair("{white}", "{"), "white")