In [None]:
%load_ext autoreload
%autoreload 2

# Analyze sampling

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from pathlib import Path
from collections import defaultdict
from tqdm.notebook import tqdm

In [None]:
from dart_math.utils import load_jsonl, PROJ_HOME

In [None]:
# Directory under the project home from which the OpenAI-model output JSONL files are read.
OAI_OUTPUT_HOME: Path = Path(PROJ_HOME) / "data" / "oai-outputs"

In [None]:
MATH_DSET: Dataset = load_dataset("hendrycks/competition_math", split="train")

# Levels assigned by hand to two specific problems instead of being parsed from the
# row's "level" field (the trailing comment names the source file of each problem).
# NOTE(review): presumably the raw level string of these rows does not end in a
# digit -- confirm against the dataset.
_MATH_LVL_OVERRIDES: dict[str, int] = {
    r"We have a triangle $\triangle ABC$ and a point $K$ on $BC$ such that $AK$ is an altitude of $\triangle ABC$. If $AC = 10,$ $BK = 7$, and $BC = 13,$ then what is the area of $\triangle ABC$?": 2,  # MATH/train/geometry/377.json
    r"One leg of a right triangle is 12 inches, and the measure of the angle opposite that leg is $30^\circ$. What is the number of inches in the hypotenuse of the triangle?": 1,  # MATH/train/geometry/471.json
}

# Stripped problem text -> difficulty level (last character of the "level" field).
MATH_QUERY2LVL: dict[str, int] = {}
for row in MATH_DSET:
    query: str = row["problem"].strip()
    # The raw "level" field is only read when no manual override exists.
    level: int = (
        _MATH_LVL_OVERRIDES[query]
        if query in _MATH_LVL_OVERRIDES
        else int(row["level"][-1])
    )
    MATH_QUERY2LVL[query] = level
print(f"{len(MATH_QUERY2LVL)=}")

len(MATH_QUERY2LVL)=7500


In [None]:
GSM8K_DSET: Dataset = load_dataset("hkust-nlp/gsm8k-fix", split="train")
# Stripped query text -> the "n_step" value stored in the row's query metadata.
GSM8K_QUERY2N_STEP: dict[str, int] = {
    row["query"].strip(): row["query_metadata"]["n_step"] for row in GSM8K_DSET
}
print(f"{len(GSM8K_QUERY2N_STEP)=}")

len(GSM8K_QUERY2N_STEP)=7473


In [None]:
def assign_sample_lvl(sample):
    """Normalise a sample's query and attach its MATH level, in place.

    The ``"query"`` entry is replaced by its whitespace-stripped form and a
    ``"level"`` entry is written: the value stored in ``MATH_QUERY2LVL`` for
    that query, or ``0`` when the query is not a key of the mapping.

    Returns the same (mutated) ``sample`` object, so the function can be
    called directly or passed as a ``Dataset.map`` callback.
    """
    stripped_query = sample["query"].strip()
    sample["query"] = stripped_query
    sample["level"] = MATH_QUERY2LVL.get(stripped_query, 0)
    return sample


def assign_lvl(samples: list[dict[str, str]]) -> None:
    """Run ``assign_sample_lvl`` on every element of ``samples``, mutating each in place."""
    for entry in samples:
        assign_sample_lvl(entry)


def calc_coverage(
    df: pd.DataFrame, queries: list[str], column_name: str = "query"
) -> float:
    """Return the fraction of distinct ``queries`` that occur in ``df[column_name]``.

    Args:
        df: Frame whose ``column_name`` column holds the observed query strings.
        queries: Reference queries. Any iterable of hashable values works
            (e.g. a ``dict.keys()`` view); duplicates are counted once.
        column_name: Column of ``df`` to look the queries up in.

    Returns:
        Coverage rate in [0, 1]. ``0.0`` when ``queries`` is empty.

    Raises:
        KeyError: If ``df`` has no column named ``column_name``.
    """
    # De-duplicate first so numerator and denominator count the same thing.
    wanted: set = set(queries)
    if not wanted:
        # Nothing to cover; avoid dividing by zero.
        return 0.0

    covered: set = wanted.intersection(df[column_name])
    return len(covered) / len(wanted)

In [None]:
# Results registry: sampling strategy name -> {level -> average number of samples per query}.
# The inner values are floats (counts divided by the number of queries at that level).
sampling2lvl_cnts: dict[str, dict[int, float]] = {}

# Number of queries per level, used as the denominator of the per-level averages.
# Level 0 is the fallback level given to queries that are not keys of MATH_QUERY2LVL;
# 7473 equals the GSM8K train size printed when GSM8K_QUERY2N_STEP is built.
# Levels 1-5 sum to 7500, the size printed for MATH_QUERY2LVL.
# NOTE(review): the values are hard-coded -- confirm they still match the loaded datasets.
QUERY_CNTS: dict[int, int] = {
    0: 7473,
    1: 565,
    2: 1349,
    3: 1592,
    4: 1690,
    5: 2304,
}

In [None]:
def calc_lvl_avg_cnt(
    df: pd.DataFrame, query_cnts: "dict[int, int] | None" = None
) -> dict[int, float]:
    """Return the average number of rows per query for each level.

    Args:
        df: Frame with a ``"level"`` column, one row per sample.
        query_cnts: Number of queries per level, used as the denominator.
            Defaults to the module-level ``QUERY_CNTS``.

    Returns:
        ``{level: rows_at_level / query_cnts[level]}`` ordered by ascending
        level. Levels that do not occur in ``df`` are omitted, and rows whose
        level is missing (NaN) are not counted.

    Raises:
        KeyError: If ``df`` contains a level that is not a key of ``query_cnts``.
    """
    if query_cnts is None:
        query_cnts = QUERY_CNTS
    # value_counts() orders by frequency; re-order by level for a stable, readable result.
    lvl2row_cnt = df["level"].value_counts().sort_index()
    return {lvl: row_cnt / query_cnts[lvl] for lvl, row_cnt in lvl2row_cnt.items()}

## ToRA

In [None]:
# Keep only the samples flagged as correct from the "t0.0_n1" output file.
tora_gpt_greedy_correct_samples: list[dict[str, str]] = []
for sample in load_jsonl(OAI_OUTPUT_HOME / "output_t0.0_n1.jsonl"):
    if sample["correct"]:
        tora_gpt_greedy_correct_samples.append(sample)

# Strip each query and attach its level before building the frame.
assign_lvl(tora_gpt_greedy_correct_samples)
tora_df: pd.DataFrame = pd.DataFrame(tora_gpt_greedy_correct_samples)
print(f"{len(tora_df)=}")

len(tora_df)=12782


In [None]:
# Preview the correct greedy samples together with the level assigned to each query.
tora_df

Unnamed: 0,dataset,ref_ans,ans,correct,agent,prompt_template,query,resp,level
0,hendrycks/competition_math/train,\frac{3}{10},\frac{3}{10},True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,"A bowl contains 10 jellybeans (four red, one b...",To find the probability of picking exactly two...,5
1,hendrycks/competition_math/train,\frac{27}{64},\frac{27}{64},True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,The Boston weatherman says there is a 75 perce...,"To solve the problem, we need to determine the...",4
2,hendrycks/competition_math/train,\frac{34}{455},\frac{34}{455},True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,"A bag has 4 red marbles, 5 white marbles, and ...",To find the probability that all three marbles...,4
3,hendrycks/competition_math/train,300,300,True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,Compute $\dbinom{25}{2}$.,"To compute \(\dbinom{25}{2}\), we use the form...",1
4,hendrycks/competition_math/train,452,452,True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,How many three-digit whole numbers have at lea...,To find how many three-digit whole numbers hav...,3
...,...,...,...,...,...,...,...,...,...
12777,hendrycks/competition_math/train,\frac{1}{4},\frac{1}{4},True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,A circular garden is enlarged so that the new ...,Let the original diameter of the circular gard...,4
12778,hendrycks/competition_math/train,6,6,True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,A $4$-foot by $8$-foot rectangular piece of pl...,"To solve the problem, we start with a rectangu...",5
12779,hendrycks/competition_math/train,25,25,True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,Ten students are taking both algebra and draft...,"To solve the problem, we need to determine how...",3
12780,hendrycks/competition_math/train,\frac{1}{2},\frac{1}{2},True,gpt-4o-mini-2024-07-18,You are a helpful assistant. Solve the problem...,The expression $\frac{4k+8}{4}$ simplifies to ...,"To simplify the expression \(\frac{4k+8}{4}\),...",3


In [None]:
# Per-level average number of correct greedy samples per query.
# (No annotation on the subscript target: the stored values are floats, and the
# later cells assign into ``sampling2lvl_cnts`` without one.)
sampling2lvl_cnts["tora-gpt-greedy"] = calc_lvl_avg_cnt(tora_df)
print(f"{sampling2lvl_cnts['tora-gpt-greedy']=}")

sampling2lvl_cnts['tora-gpt-greedy']={0: 0.9304161648601632, 1: 0.9805309734513274, 2: 0.9481097108969607, 3: 0.8812814070351759, 4: 0.8041420118343195, 5: 0.5355902777777778}


In [None]:
# Load every sample of the "t0.6_n10" output file, then keep the ones flagged correct.
tora_gpt_sample_raw_samples: list[dict[str, str]] = load_jsonl(
    OAI_OUTPUT_HOME / "output_t0.6_n10.jsonl"
)
print(f"{len(tora_gpt_sample_raw_samples)=}")

tora_gpt_sample_samples: list[dict[str, str]] = []
for sample in tora_gpt_sample_raw_samples:
    if sample["correct"]:
        tora_gpt_sample_samples.append(sample)
print(f"{len(tora_gpt_sample_samples)=}")

# Strip each query and attach its level (mutates the kept samples in place).
assign_lvl(tora_gpt_sample_samples)

len(tora_gpt_sample_raw_samples)=21910
len(tora_gpt_sample_samples)=4510


In [None]:
# Pool the greedy and the sampled correct responses, keeping at most 4 rows per query.
# ``groupby(...).head`` keeps rows in their existing order, so greedy rows come first.
# NOTE: this rebinds ``tora_df``; re-running the cell on its own concatenates again.
tora_df = (
    pd.concat([tora_df, pd.DataFrame(tora_gpt_sample_samples)])
    .groupby("query")
    .head(4)
)
print(f"{len(tora_df)=}")
# No annotation on the subscript target: the stored averages are floats, and the
# later cells assign into ``sampling2lvl_cnts`` without one.
sampling2lvl_cnts["tora-gpt-sample"] = calc_lvl_avg_cnt(tora_df)
print(f"{sampling2lvl_cnts['tora-gpt-sample']=}")

len(tora_df)=15998
sampling2lvl_cnts['tora-gpt-sample']={0: 1.04656764351666, 1: 1.0123893805309734, 2: 1.0467012601927355, 3: 1.1080402010050252, 4: 1.1431952662721894, 5: 1.0837673611111112}


In [None]:
# Collect every sample from the "synth-vrt*" result directories into one list.
all_dsmath_samples: list[dict] = []
DSMATH_OUTPUT_HOME: Path = Path(PROJ_HOME, "data/res")
# Directory listings come back in no guaranteed order, and the later
# ``groupby("query").head(k)`` selections depend on row order. Sorting both
# levels of the traversal makes the analysis reproducible across machines and runs.
for synth_dpath in sorted(DSMATH_OUTPUT_HOME.glob("synth-vrt*")):
    synth_fpaths = sorted(synth_dpath.glob("synth-vrt-*.jsonl"))
    for synth_fpath in tqdm(synth_fpaths, desc=synth_dpath.name):
        all_dsmath_samples.extend(load_jsonl(synth_fpath))

synth-vrt-455183:   0%|          | 0/64 [00:00<?, ?it/s]

synth-vrt-455448:   0%|          | 0/64 [00:00<?, ?it/s]

synth-vrt:   0%|          | 0/64 [00:00<?, ?it/s]

In [None]:
# Rows without a level get level 0, the same fallback value ``assign_sample_lvl`` uses.
all_dsmath_df: pd.DataFrame = pd.DataFrame(all_dsmath_samples).assign(
    level=lambda frame: frame["level"].fillna(0)
)
# Number of rows per level.
all_dsmath_df["level"].value_counts()

level
0.0    956544
5.0    294912
4.0    216320
3.0    203776
2.0    172672
1.0     72320
Name: count, dtype: int64

In [None]:
# First 64 rows of each query, in the row order of ``all_dsmath_df``.
tora_dsmath_sample_df: pd.DataFrame = all_dsmath_df.groupby("query").head(64)
# A bare expression in the middle of a cell is evaluated and discarded, so the level
# distribution is printed explicitly to make it visible.
print(tora_dsmath_sample_df["level"].value_counts())
print(f"{len(tora_dsmath_sample_df)=}")
print(f"{len(tora_dsmath_sample_df[tora_dsmath_sample_df['correct']])=}")

len(tora_dsmath_sample_df)=958272
len(tora_dsmath_sample_df[tora_dsmath_sample_df['correct']])=733114


In [None]:
# Rows of ``all_dsmath_df`` whose index label was not selected into ``tora_dsmath_sample_df``.
already_sampled = all_dsmath_df.index.isin(tora_dsmath_sample_df.index)
tora_dsmath_sample_left_df: pd.DataFrame = all_dsmath_df[~already_sampled]
# Number of left-over rows per level.
tora_dsmath_sample_left_df["level"].value_counts()

level
0.0    478272
5.0    147456
4.0    108160
3.0    101888
2.0     86336
1.0     36160
Name: count, dtype: int64

In [None]:
# Number of incorrect rows per query among the initially selected rows.
# Queries with no incorrect row do not appear in the result.
is_failed = ~tora_dsmath_sample_df["correct"]
query2fail_cnt = tora_dsmath_sample_df[is_failed].groupby("query").size()

In [None]:
# Retry selection: for every query with failures in the initial selection, take as many
# rows from the left-over pool as that query had failures.
#
# Earlier loop-based version, kept for reference (one DataFrame filter per query):
# tora_dsmath_retry_df_list: list[pd.DataFrame] = []
# for query, fail_cnt in tqdm(list(query2fail_cnt.items()), desc="Retry"):
#     if fail_cnt > 0:
#         tora_dsmath_retry_df_list.append(
#             tora_dsmath_sample_left_df[
#                 tora_dsmath_sample_left_df["query"] == query
#             ].head(fail_cnt)
#         )
# tora_dsmath_retry_df: pd.DataFrame = pd.concat(tora_dsmath_retry_df_list)

# Wrap query2fail_cnt in a Series for vectorized operations.
# NOTE(review): query2fail_cnt is already the Series returned by groupby(...).size(),
# so this wrap looks redundant.
fail_cnt_series = pd.Series(query2fail_cnt)

# Keep only left-over rows whose query has fail_cnt > 0.
# NOTE(review): every entry is a group size of failing rows, so the "> 0" filter
# appears to be always true; the isin() on the index does the actual filtering.
mask = tora_dsmath_sample_left_df["query"].isin(
    fail_cnt_series[fail_cnt_series > 0].index
)
filtered_df = tora_dsmath_sample_left_df[mask]

# 1-based position of each row within its query group (cumcount() itself is 0-based).
cumulative_counts = filtered_df.groupby("query").cumcount() + 1

# A row is kept while its position does not exceed the failure count of its query.
rows_to_keep = cumulative_counts <= filtered_df["query"].map(fail_cnt_series)

# Apply the mask to get the final DataFrame (index is renumbered from 0).
tora_dsmath_retry_df: pd.DataFrame = filtered_df[rows_to_keep].reset_index(drop=True)
# NOTE(review): this value_counts() result is neither displayed nor stored because the
# expression is not the last line of the cell.
tora_dsmath_retry_df["level"].value_counts()
print(f"{len(tora_dsmath_retry_df)=}")

len(tora_dsmath_retry_df)=225158


In [None]:
# Initial selection followed by the retry rows; from the correct rows of that pool,
# keep at most 4 per query.
sampled_and_retried = pd.concat([tora_dsmath_sample_df, tora_dsmath_retry_df])
correct_rows = sampled_and_retried[sampled_and_retried["correct"]]
tora_dsmath_final_df: pd.DataFrame = correct_rows.groupby("query").head(4)
# Recording this intermediate stage was left disabled:
# sampling2lvl_cnts["tora-dsmath-final"] = calc_lvl_avg_cnt(tora_dsmath_final_df)
# print(f"{sampling2lvl_cnts['tora-dsmath-final']=}")

In [None]:
# Append the selected DSMath rows to the GPT rows collected so far.
# NOTE: this rebinds ``tora_df`` to a concatenation of itself, so re-running this cell
# on its own appends ``tora_dsmath_final_df`` again.
tora_df = pd.concat([tora_df, tora_dsmath_final_df])

In [None]:
# Final ToRA pool: fraction of MATH queries present in it, total number of rows,
# and the per-level average number of rows per query (recorded as "tora-final").
print(f"{calc_coverage(tora_df, MATH_QUERY2LVL.keys())=}")
print(f"{len(tora_df)=}")
sampling2lvl_cnts["tora-final"] = calc_lvl_avg_cnt(tora_df)
print(f"{sampling2lvl_cnts['tora-final']=}")

calc_coverage(tora_df, MATH_QUERY2LVL.keys())=0.934
len(tora_df)=71973
sampling2lvl_cnts['tora-final']={0.0: 5.0330523216914225, 1.0: 5.005309734513275, 2.0: 4.994810971089696, 3.0: 4.954773869346734, 4.0: 4.7745562130177515, 5.0: 3.8359375}


## MARIO

In [None]:
# Keep only the samples flagged as correct from the "t0.3_n2" output file.
mario_gpt_greedy_samples: list[dict[str, str]] = []
for sample in load_jsonl(OAI_OUTPUT_HOME / "output_t0.3_n2.jsonl"):
    if sample["correct"]:
        mario_gpt_greedy_samples.append(sample)

# Strip each query and attach its level before building the frame.
assign_lvl(mario_gpt_greedy_samples)
mario_df: pd.DataFrame = pd.DataFrame(mario_gpt_greedy_samples)
print(f"{len(mario_df)=}")

# Per-level averages, then coverage of the MATH and GSM8K query sets.
sampling2lvl_cnts["mario-gpt-greedy"] = calc_lvl_avg_cnt(mario_df)
print(f"{sampling2lvl_cnts['mario-gpt-greedy']=}")
print(f"{calc_coverage(mario_df, MATH_QUERY2LVL.keys())=}")
print(f"{calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=}")

len(mario_df)=25462
sampling2lvl_cnts['mario-gpt-greedy']={0: 1.858958918774254, 1: 1.9557522123893805, 2: 1.87175685693106, 3: 1.7575376884422111, 4: 1.5846153846153845, 5: 1.0694444444444444}
calc_coverage(mario_df, MATH_QUERY2LVL.keys())=0.8146666666666667
calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=0.9439314866854007


In [None]:
# Correct samples from the "patch_t0.6_n2" output file ...
gpt_sample_samples: list[dict[str, str]] = []
for sample in load_jsonl(OAI_OUTPUT_HOME / "output_patch_t0.6_n2.jsonl"):
    if sample["correct"]:
        gpt_sample_samples.append(sample)
assign_lvl(gpt_sample_samples)
# ... followed by the correct ToRA sampled responses (already levelled).
gpt_sample_samples += tora_gpt_sample_samples

# Only queries that have no row in ``mario_df`` yet are topped up, with at most 2 rows each.
mario_sample_df = pd.DataFrame(gpt_sample_samples)
already_covered = mario_sample_df["query"].isin(mario_df["query"])
mario_sample_df = mario_sample_df[~already_covered].groupby("query").head(2)

# NOTE: this rebinds ``mario_df``; re-running the cell on its own concatenates again.
mario_df: pd.DataFrame = pd.concat([mario_df, mario_sample_df])
print(f"{len(mario_df)=}")
sampling2lvl_cnts["mario-gpt-sample"] = calc_lvl_avg_cnt(mario_df)
print(f"{sampling2lvl_cnts['mario-gpt-sample']=}")
print(f"{calc_coverage(mario_df, MATH_QUERY2LVL.keys())=}")
print(f"{calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=}")

len(mario_df)=26461
sampling2lvl_cnts['mario-gpt-sample']={0: 1.8930817610062893, 1: 1.9592920353982302, 2: 1.9021497405485546, 3: 1.8190954773869348, 4: 1.6792899408284023, 5: 1.26171875}
calc_coverage(mario_df, MATH_QUERY2LVL.keys())=0.8744
calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=0.9645390070921985


In [None]:
# Queries still absent from ``mario_df`` (compared on whitespace-stripped text):
# look at their first 100 rows in ``all_dsmath_df`` ...
not_in_mario = ~all_dsmath_df["query"].str.strip().isin(mario_df["query"].str.strip())
mario_dsmath_raw_sample_df: pd.DataFrame = (
    all_dsmath_df[not_in_mario].groupby("query").head(100)
)
# ... and keep at most 4 correct rows per query out of those.
is_correct = mario_dsmath_raw_sample_df["correct"]
mario_dsmath_raw_sample_df = (
    mario_dsmath_raw_sample_df[is_correct].groupby("query").head(4)
)

# NOTE: this rebinds ``mario_df``; re-running the cell on its own concatenates again.
mario_df = pd.concat([mario_df, mario_dsmath_raw_sample_df])
print(f"{len(mario_df)=}")
sampling2lvl_cnts["mario-final"] = calc_lvl_avg_cnt(mario_df)
print(f"{sampling2lvl_cnts['mario-final']=}")
print(f"{calc_coverage(mario_df, MATH_QUERY2LVL.keys())=}")
print(f"{calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=}")

len(mario_df)=28809
sampling2lvl_cnts['mario-final']={0.0: 2.021142780677104, 1.0: 2.007079646017699, 2.0: 1.98295033358043, 3.0: 1.939070351758794, 4.0: 1.8875739644970415, 5.0: 1.5707465277777777}
calc_coverage(mario_df, MATH_QUERY2LVL.keys())=0.9305333333333333
calc_coverage(mario_df, GSM8K_QUERY2N_STEP.keys())=0.997591328783621


## DART-Math

In [None]:
# Per-level average number of responses per query for each released DART-Math dataset.
for dset_id in ["dart-math-uniform", "dart-math-hard"]:
    dset: Dataset = load_dataset(f"hkust-nlp/{dset_id}", split="train")
    # Strips each query and adds the "level" column (0 for queries that are not
    # keys of MATH_QUERY2LVL).
    dset = dset.map(assign_sample_lvl, num_proc=16)
    print(f"{dset=}")
    # Export through Arrow instead of ``pd.DataFrame(dset)``, which builds the frame by
    # iterating the dataset row by row in Python.
    df: pd.DataFrame = dset.to_pandas()
    sampling2lvl_cnts[dset_id] = calc_lvl_avg_cnt(df)
    print(f"{sampling2lvl_cnts[dset_id]=}")

dset=Dataset({
    features: ['query', 'response', 'level'],
    num_rows: 590705
})
sampling2lvl_cnts[dset_id]={0: 39.930951425130466, 1: 40.0, 2: 40.0, 3: 39.79899497487437, 4: 39.53905325443787, 5: 37.135416666666664}
dset=Dataset({
    features: ['query', 'response', 'level'],
    num_rows: 585392
})
sampling2lvl_cnts[dset_id]={0: 8.49136892814131, 1: 14.28495575221239, 2: 33.51964418087472, 3: 54.94409547738694, 4: 79.58875739644971, 5: 107.06206597222223}


## Summarize

In [None]:
# Dump the raw registry: sampling strategy -> {level -> average number of samples per query}.
print(f"{sampling2lvl_cnts=}")

sampling2lvl_cnts={'tora-gpt-greedy': {0: 0.9304161648601632, 1: 0.9805309734513274, 2: 0.9481097108969607, 3: 0.8812814070351759, 4: 0.8041420118343195, 5: 0.5355902777777778}, 'tora-gpt-sample': {0: 1.04656764351666, 1: 1.0123893805309734, 2: 1.0467012601927355, 3: 1.1080402010050252, 4: 1.1431952662721894, 5: 1.0837673611111112}, 'tora-final': {0.0: 5.0330523216914225, 1.0: 5.005309734513275, 2.0: 4.994810971089696, 3.0: 4.954773869346734, 4.0: 4.7745562130177515, 5.0: 3.8359375}, 'mario-gpt-greedy': {0: 1.858958918774254, 1: 1.9557522123893805, 2: 1.87175685693106, 3: 1.7575376884422111, 4: 1.5846153846153845, 5: 1.0694444444444444}, 'mario-gpt-sample': {0: 1.8930817610062893, 1: 1.9592920353982302, 2: 1.9021497405485546, 3: 1.8190954773869348, 4: 1.6792899408284023, 5: 1.26171875}, 'mario-final': {0.0: 2.021142780677104, 1.0: 2.007079646017699, 2.0: 1.98295033358043, 3.0: 1.939070351758794, 4.0: 1.8875739644970415, 5.0: 1.5707465277777777}, 'dart-math-uniform': {0: 39.930951425130

In [None]:
# ``sampling2lvl_cnts`` maps strategy -> {level -> average}, so the outer keys
# (strategies) become the columns of this frame and the levels become its index.
sampling_lvl_cnt_df: pd.DataFrame = pd.DataFrame(sampling2lvl_cnts)
# Display one row per strategy, every average formatted with two decimals.
sampling_lvl_cnt_df.transpose().style.format("{:.2f}")

Unnamed: 0,0,1,2,3,4,5
tora-gpt-greedy,0.93,0.98,0.95,0.88,0.8,0.54
tora-gpt-sample,1.05,1.01,1.05,1.11,1.14,1.08
tora-final,5.03,5.01,4.99,4.95,4.77,3.84
mario-gpt-greedy,1.86,1.96,1.87,1.76,1.58,1.07
mario-gpt-sample,1.89,1.96,1.9,1.82,1.68,1.26
mario-final,2.02,2.01,1.98,1.94,1.89,1.57
dart-math-uniform,39.93,40.0,40.0,39.8,39.54,37.14
dart-math-hard,8.49,14.28,33.52,54.94,79.59,107.06
