# MATH data processing


In [None]:
import os
import json
import random
from copy import deepcopy
import re

In [None]:
# Three sample problem records used to try out the helper functions in this
# notebook. Each record carries "problem", "level", "type" and "solution"
# keys, and each solution text contains a LaTeX \boxed{...} final answer.
test_data = [
    {
        "problem": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?",
        "level": "Level 3",
        "type": "Algebra",
        "solution": "The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\\boxed{2}$ vertical asymptotes."
    },
    {
        "problem": "What is the positive difference between $120\\%$ of 30 and $130\\%$ of 20?",
        "level": "Level 1",
        "type": "Algebra",
        "solution": "One hundred twenty percent of 30 is $120\\cdot30\\cdot\\frac{1}{100}=36$, and $130\\%$ of 20 is $ 130\\cdot 20\\cdot\\frac{1}{100}=26$.  The difference between 36 and 26 is $\\boxed{10}$."
    },
    {
        "problem": "The matrix for projecting onto a certain line $\\ell,$ which passes through the origin, is given by\n\\[\\begin{pmatrix} \\frac{1}{50} & \\frac{7}{50} \\\\ \\frac{7}{50} & \\frac{49}{50} \\end{pmatrix}.\\]Find the direction vector of line $\\ell.$  Enter your answer in the form $\\begin{pmatrix} a \\\\ b \\end{pmatrix},$ where $a,$ and $b$ are integers, $a > 0,$ and $\\gcd(|a|,|b|) = 1.$",
        "level": "Level 3",
        "type": "Precalculus",
        "solution": "Let $\\mathbf{P}$ denote the given matrix, so $\\mathbf{P} \\mathbf{v}$ is the projection of $\\mathbf{v}$ onto $\\ell.$  In particular, $\\mathbf{P} \\mathbf{v}$ lies on $\\ell$ for any vector $\\mathbf{v}.$  So, we can take $\\mathbf{v} = \\mathbf{i}.$  Then\n\\[\\mathbf{P} \\mathbf{i} = \\begin{pmatrix} \\frac{1}{50} \\\\ \\frac{7}{50} \\end{pmatrix} = \\frac{1}{50} \\begin{pmatrix} 1 \\\\ 7 \\end{pmatrix}.\\]Thus, the direction vector we seek is $\\boxed{\\begin{pmatrix} 1 \\\\ 7 \\end{pmatrix}}.$"
    }
]

In [None]:
def split_list(data_list: list, num_levels: int = 5) -> list:
    """Group problem records into per-difficulty buckets.

    The last character of each record's ``"level"`` value (e.g. the ``3`` in
    ``"Level 3"``) selects the bucket the record is appended to.

    Args:
        data_list: Problem dicts, each carrying a ``"level"`` string.
        num_levels: Number of difficulty levels, i.e. buckets (default 5).

    Returns:
        A list of ``num_levels`` lists; index ``k`` holds the records whose
        level digit is ``k + 1``, in their original order.

    Raises:
        ValueError: If a record's level is missing, does not end in a digit,
            or lies outside ``1..num_levels``.
    """
    level_lists = [[] for _ in range(num_levels)]
    for data in data_list:
        level_text = data.get("level")
        try:
            level = int(level_text[-1])
        except (TypeError, IndexError, ValueError):
            raise ValueError(
                f"cannot parse a difficulty level from {level_text!r}"
            ) from None
        # Reject out-of-range digits explicitly: a level of 0 would otherwise
        # wrap around to index -1 and silently land in the hardest bucket.
        if not 1 <= level <= num_levels:
            raise ValueError(
                f"difficulty level {level} is outside 1..{num_levels}"
            )
        level_lists[level - 1].append(data)
    return level_lists

In [None]:
def last_boxed_only_string(answer: str):
    r"""Return the last ``\boxed{...}`` expression found in *answer*.

    The search starts at the right-most ``\boxed`` (falling back to the
    right-most ``\fbox`` when no ``\boxed`` exists) and scans forward,
    tracking brace depth, until the brace that closes the expression.

    Args:
        answer: Text (typically a LaTeX solution) to search.

    Returns:
        The substring from the command up to and including its matching
        closing brace, or ``None`` when there is no such command or its
        braces are never balanced.
    """
    idx = answer.rfind("\\boxed")
    if idx < 0:
        idx = answer.rfind("\\fbox")
    if idx < 0:
        return None

    depth = 0
    for offset, char in enumerate(answer[idx:]):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            # Depth returning to zero marks the brace that closes the box.
            if depth == 0:
                return answer[idx:idx + offset + 1]
    return None

def get_answer(answer: str) -> str:
    r"""Extract the content of the last boxed expression in a solution.

    Uses ``last_boxed_only_string`` to locate the last ``\boxed{...}`` (or
    ``\fbox{...}``) expression and strips the command and its outer braces.

    Args:
        answer: Solution text to search.

    Returns:
        The text between the outer braces, or ``''`` when the solution has
        no boxed expression or the box is empty.
    """
    boxed_content = last_boxed_only_string(answer)
    if boxed_content is None:
        # No boxed answer: report "no answer" instead of failing on None.
        return ''

    # Cut at the first opening brace rather than at a fixed offset, so the
    # shorter ``\fbox{`` prefix does not swallow the first answer character.
    _, brace, inner = boxed_content.partition("{")
    if not brace:
        return ''
    # Drop the closing brace that terminates the boxed expression.
    return inner[:-1]


# Try the extractor on the third sample record (index 2 of test_data); in a
# notebook the returned string is displayed as the cell output.
get_answer(test_data[2]['solution'])

In [None]:
# Load every per-category JSON file from the working directory.
#   all_data      : category name -> per-level buckets (see split_list)
#   all_data_list : flat list of every loaded problem record
current_path = os.getcwd()

# Files written by later cells of this notebook. They live in the same
# directory, so they are excluded here; otherwise a re-run would read them
# back as if they were source categories.
generated_files = {'math_raw_1k.json', 'math_1k_demos.json'}

# os.listdir returns names in arbitrary order; sort for a stable category order.
file_names = sorted(
    file for file in os.listdir(current_path)
    if file.endswith(".json") and file not in generated_files
)
print(file_names)

all_data = {}
all_data_list = []
for file_name in file_names:
    file_path = os.path.join(current_path, file_name)
    # Category name is the file name without its ".json" extension.
    type_name = file_name[:-5]
    with open(file_path, "r", encoding="utf-8") as f:
        type_data_list = json.load(f)
    all_data[type_name] = split_list(type_data_list)
    print(type_name, len(type_data_list))
    all_data_list.extend(type_data_list)

In [None]:
# Category names: each data file name with its ".json" extension cut off.
type_list = [name[:-len(".json")] for name in file_names]

In [None]:
# Summary, one value per line: number of categories, total number of
# problem records, and the list of category names.
print(len(all_data), len(all_data_list), type_list, sep="\n")

In [None]:
# Difficulty distribution of the full data set, as a percentage per level.
level_dict = {
    "Level 1": 0,
    "Level 2": 1,
    "Level 3": 2,
    "Level 4": 3,
    "Level 5": 4
}

level_cnt = [0] * len(level_dict)
for data in all_data_list:
    level_cnt[level_dict[data['level']]] += 1

# Normalise by the actual number of records. The previous fixed divisor of 50
# only yields percentages when exactly 5000 records are loaded
# (NOTE(review): presumably the expected data-set size - confirm).
total_problems = len(all_data_list)
level_cnt = [
    100 * count / total_problems if total_problems else 0.0
    for count in level_cnt
]
print(level_cnt)

Random sample: draw 1,000 problems from the full data set

In [None]:
# Pick 1000 distinct problem records at random and store them, pretty-printed
# and with non-ASCII characters kept as-is, as the raw 1k subset.
data_1k = random.sample(all_data_list, k=1000)
serialized_1k = json.dumps(data_1k, ensure_ascii=False, indent=4)
with open('./math_raw_1k.json', mode='w', encoding='utf8') as out_file:
    out_file.write(serialized_1k)

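Make demos: attach three demonstration problems (from other categories, at the same level) to each sampled problem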

In [None]:
# Reload the sampled subset and report its difficulty distribution as a
# percentage per level.
with open('./math_raw_1k.json', 'r', encoding='utf8') as f:
    data_list = json.load(f)

level_dict = {
    "Level 1": 0,
    "Level 2": 1,
    "Level 3": 2,
    "Level 4": 3,
    "Level 5": 4
}

level_cnt = [0] * len(level_dict)
for data in data_list:
    level_cnt[level_dict[data['level']]] += 1

# Normalise by the actual sample size. The previous fixed divisor of 10 only
# yields percentages when the file holds exactly 1000 records.
sample_total = len(data_list)
level_cnt = [
    100 * count / sample_total if sample_total else 0.0
    for count in level_cnt
]
print(level_cnt)

In [None]:
# Maps the "type" value stored in a record to the category name used as a key
# of all_data / entry of type_list.
type_dict = {
    'Intermediate Algebra': 'intermediate_algebra',
    'Precalculus': 'precalculus',
    'Number Theory': 'number_theory',
    'Geometry': 'geometry',
    'Prealgebra': 'prealgebra',
    'Algebra': 'algebra',
    'Counting & Probability': 'counting_and_probability'
}

# For every sampled problem: choose three categories other than its own, take
# one random problem of the same level from each as a demonstration, and
# record the extracted boxed answer on the demonstrations and the problem.
for data in data_list:
    own_type = type_dict[data.get("type")]
    level_idx = int(data.get("level")[-1]) - 1
    demos = []

    # Work on a copy so the shared type_list keeps every category.
    other_types = deepcopy(type_list)
    other_types.remove(own_type)

    for picked_type in random.sample(other_types, 3):
        same_level_pool = all_data[picked_type][level_idx]
        demo = random.choice(same_level_pool)
        # The chosen record is annotated in place, so the entry held in
        # all_data gains the 'answer' key as well.
        demo['answer'] = get_answer(demo['solution'])
        demos.append(demo)

    data["demos"] = demos
    data["answer"] = get_answer(data["solution"])

with open('./math_1k_demos.json', 'w', encoding='utf8') as f:
    json.dump(data_list, f, ensure_ascii=False, indent=4)
    