In [None]:
import os

# Expose only GPU index 7 to CUDA for this kernel. The assignment is placed
# before the transformers/vllm imports below -- presumably so that CUDA
# initialisation only ever sees that device; keep this ordering when editing.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM

# Tokenizer used later in the notebook to measure response length in tokens
# (presumably the tokenizer of the model that produced the rollouts -- confirm).
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
# Alternative checkpoints (swap the name above to analyse their rollouts):
# Qwen/Qwen3-4B
# deepseek-ai/DeepSeek-R1-Distill-Llama-8B

In [None]:
import json

# Load the model rollouts: one JSON object per line (JSON Lines format).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# machine's locale default.
data = []
with open('underthink_rollout.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. extra newlines at the end of the file);
        # json.loads would raise on them.
        if line.strip():
            data.append(json.loads(line))
        

In [24]:
def last_boxed_only_string(string):
    """Return the last ``\\boxed{...}`` command found in ``string``.

    The search starts at the right-most occurrence of ``\\boxed``; if there is
    none, the right-most ``\\fbox`` is used instead. From that position the
    braces are balanced and the substring running from the command name up to
    and including its matching closing brace is returned.

    Returns ``None`` when neither command occurs or when the braces following
    it never balance (e.g. an unterminated ``\\boxed{``).
    """
    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    depth = 0
    for offset, char in enumerate(string[start:]):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            # Only a closing brace that brings the depth back to exactly zero
            # terminates the command.
            if depth == 0:
                end = start + offset
                return string[start : end + 1]

    # Ran off the end of the string without closing the outermost brace.
    return None


def remove_boxed(s):
    """Strip the ``\\boxed{`` prefix and the trailing ``}`` from ``s``.

    Args:
        s: A string expected to look like ``\\boxed{<content>}``. ``None`` and
            other non-string values are accepted and yield ``None``.

    Returns:
        The text between ``\\boxed{`` and the final ``}``, or ``None`` when
        ``s`` is not a string of that exact shape.
    """
    left = "\\boxed{"
    # Explicit checks rather than `assert`: assertions are stripped when Python
    # runs with -O, which would let malformed input fall through to the slice
    # below and return a meaningless substring.
    if not isinstance(s, str):
        return None
    if not (s.startswith(left) and s.endswith("}")):
        return None
    return s[len(left) : -1]


def extract_boxed_answer(solution: str) -> str:
    """Return the content of the last LaTeX \\boxed{} command in ``solution``.

    Locates the last boxed command with ``last_boxed_only_string`` and unwraps
    it with ``remove_boxed``. The result is ``None`` (despite the annotation)
    whenever either step fails to find a well-formed ``\\boxed{...}``.
    """
    return remove_boxed(last_boxed_only_string(solution))

In [None]:
import json

# Gold entries for the benchmark. Later cells index this object positionally
# and read the 'uuid', 'puzzle' and 'metadata' keys of each entry.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# machine's locale default.
with open('scripts/data/optimalThinkingBench/underthink_bench.json', 'r', encoding='utf-8') as f:
    reference_dataset = json.load(f)



In [28]:
import reasoning_gym
from reasoning_gym.factory import create_dataset

In [29]:
# Keyword arguments forwarded to reasoning_gym.create_dataset for each puzzle
# type, keyed by puzzle name. An empty mapping means no extra arguments.
puzzles = dict(
    ab=dict(length=20),
    advanced_geometry=dict(min_coord=-20, max_coord=20),
    knight_swap=dict(),
    tsumego=dict(min_board_size=15, max_board_size=19, max_stones=20),
    fraction_simplification=dict(),
    propositional_logic=dict(),
    bitwise_arithmetic=dict(difficulty=4),
    letter_counting=dict(min_words=25, max_words=35),
    maze=dict(min_dist=15, max_dist=25, min_grid_size=15, max_grid_size=25),
    puzzle24=dict(min_value=8, max_value=10),
    quantum_lock=dict(difficulty=25),
)

In [None]:
from tqdm import tqdm

# Score every rollout against its gold entry and group the results by data
# source as (binary correctness, response length in tokens) tuples.

# Index the reference entries by uuid once instead of scanning the whole list
# for every rollout. setdefault keeps the first entry seen for a uuid, which
# matches what a first-match linear scan would pick.
reference_by_uuid = {}
for reference_entry in reference_dataset:
    reference_by_uuid.setdefault(reference_entry['uuid'], reference_entry)

# One reasoning_gym dataset per puzzle type, created on first use and reused
# for scoring. NOTE(review): assumes score_answer depends only on the dataset's
# configuration and the entry passed in, not on per-instance state -- confirm.
scorer_by_puzzle = {}

# uuids of rollouts that have no counterpart in the reference set.
unmatched_uuids = []

correctness_response_length_list = {}
for d in tqdm(data):
    generated_answer = extract_boxed_answer(d['response'])
    gold_entry = reference_by_uuid.get(d['uuid'])
    if gold_entry is None:
        # No gold entry: count the rollout as incorrect rather than reusing
        # the gold entry / score left over from the previous iteration.
        unmatched_uuids.append(d['uuid'])
        score = 0
    else:
        puzzle = gold_entry['puzzle']
        if puzzle not in scorer_by_puzzle:
            scorer_by_puzzle[puzzle] = reasoning_gym.create_dataset(puzzle, size=50, **puzzles[puzzle])
        score = scorer_by_puzzle[puzzle].score_answer(answer=generated_answer, entry=gold_entry)
        # Binarise: anything short of a full score counts as incorrect.
        if score < 1:
            score = 0
    response_length = len(tokenizer.encode(d['response']))
    correctness_response_length_list.setdefault(d['data_source'], []).append((score, response_length))

if unmatched_uuids:
    print(f"Warning: {len(unmatched_uuids)} rollout(s) had no matching reference entry and were scored 0")


        

In [31]:
def compute_aucoaa(data, tmax=None):
    """Compute the area under the overthinking-adjusted-accuracy curve.

    For a token threshold ``t`` the adjusted accuracy is

        OAA_t = (1 / n) * sum(correct_i for samples with length_i < t)

    and the returned value is the mean of ``OAA_t`` over ``t = 0 .. tmax``
    (``tmax + 1`` thresholds).

    Args:
        data: Sequence of ``(correctness, response_length)`` tuples, with
            integer response lengths.
        tmax: Largest threshold to evaluate. Defaults to the longest response
            length in ``data``. Pass a fixed budget to make values from
            different runs comparable.

    Returns:
        The AUC-OAA as a float; ``0.0`` when ``data`` is empty.

    Raises:
        ValueError: If an explicit ``tmax`` is negative.
    """
    n = len(data)
    if n == 0:
        # Nothing to average over; max() below would raise on an empty input.
        return 0.0
    if tmax is None:
        tmax = max(length for _, length in data)
    if tmax < 0:
        raise ValueError("tmax must be non-negative")

    # A sample of length L is counted for every threshold t with L < t <= tmax,
    # i.e. for (tmax - L) thresholds (none if L >= tmax). Summing that in closed
    # form avoids rescanning all samples once per threshold.
    credited = sum(correct * max(tmax - length, 0) for correct, length in data)
    # The denominator of OAA_t is always n; then average over tmax + 1 thresholds.
    return credited / n / (tmax + 1)


In [None]:
import numpy as np

# Print summary statistics for every data source: mean response length,
# accuracy, sample count and AUC-OAA.
for source_name, samples in correctness_response_length_list.items():
    token_lengths = [length for _, length in samples]
    mean_response_length = np.mean(token_lengths)
    correctness_flags = [correct for correct, _ in samples]
    accuracy = np.mean(correctness_flags)
    print("Data Source: ", source_name)
    print("Mean Response Length: ", mean_response_length)
    print("Accuracy: ", accuracy)
    print("Number of Samples: ", len(samples))
    print("AUC-OAA: ", compute_aucoaa(samples))
    print("--------------------------------")