In [None]:
'''
cd /workspace
pip install "huggingface_hub[hf_transfer]"
pip install hf_transfer
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir ./qwen3b/
'''

In [1]:
!pip install datasets
!pip install transformers
!pip install accelerate
!pip install numpy, pandas, polars
!pip install matplotlib
!pip install sentence_transformers
#!pip install huggingface_hub
!pip install evaluate
#!pip install mistralai
#!pip install flask

# For vLLM
!pip install -U vllm==0.6.5
#!pip install ray
#!pip install packaging
#!pip install typing

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_

# Process class

## vLLM.AsyncEngine + asyncio

In [None]:
# Use Model from local environment

import asyncio
from transformers import AutoTokenizer
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
import time, warnings

# Model identifier handed to both the vLLM engine and the tokenizer below.
# NOTE(review): looks like a local directory name rather than a hub id — confirm it exists.
model_name = "deepseek_r1_qwen14b"
# Tensor-parallel degree forwarded to AsyncEngineArgs.
tensor_parallel_size = 2

# Engine configuration; gpu_memory_utilization=0.95 is passed through to vLLM unchanged.
engine_args = AsyncEngineArgs(
    model = model_name,
    tensor_parallel_size = tensor_parallel_size,
    gpu_memory_utilization=0.95,
)
# Module-level engine; AllRequests.process_requests calls engine.generate on this global.
engine = AsyncLLMEngine.from_engine_args(engine_args)
# Tokenizer for the same model, created with left padding; get_num_tokens() reads this global.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')


class AllRequests:
    """Queue of prompt requests generated concurrently through the module-level async vLLM ``engine``.

    Each request is a dict with at least a ``"prompt"`` key. Finished requests get
    an ``"output"`` key and are appended to ``self.results``; progress is written
    to ``<save_dir>/results.json`` and ``<save_dir>/finished_ids.json`` after every
    completed request so an interrupted run can be resumed with ``restart=True``.
    """

    def __init__(self, max_request):
        """Create an empty queue.

        Args:
            max_request: number of worker coroutines started by ``process``.
        """
        self.max_request = max_request
        self.requests = []      # pending request dicts, consumed from the front
        self.request_ids = []   # ids of the pending requests, parallel to self.requests
        self.request_id = 0     # next id handed out by add()
        self.results = []       # finished request dicts (with "output")
        self.finished_ids = []  # ids of finished requests

    def add(self, request):
        """Append ``request`` to the pending queue and assign it the next sequential id."""
        self.requests.append(request)
        self.request_ids.append(self.request_id)
        self.request_id += 1

    async def process(self, model=model_name, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False):
        """Generate outputs for every pending request and return the list of results.

        Args:
            model: accepted for interface compatibility; generation always uses the
                module-level ``engine``, so this value is not read.
            max_tokens: generation length limit forwarded to ``SamplingParams``.
            temperature: sampling temperature forwarded to ``SamplingParams``.
            save_dir: directory holding the progress files.
            restart: when True and both progress files exist, previously finished
                requests are restored from disk and removed from the pending queue.

        Returns:
            ``self.results``: the finished request dicts, in completion order.
        """
        # Imported locally so the method works even if no earlier cell imported these.
        import json
        import os

        os.makedirs(save_dir, exist_ok=True)

        finished_path = os.path.join(save_dir, "finished_ids.json")
        results_path = os.path.join(save_dir, "results.json")

        if restart and os.path.exists(finished_path) and os.path.exists(results_path):
            with open(finished_path) as f:
                finished_ids = json.load(f)
            with open(results_path) as f:
                self.results = json.load(f)
            for finished_id in finished_ids:
                if finished_id not in self.finished_ids:
                    self.finished_ids.append(finished_id)
                # A persisted id may not be pending (e.g. process() was already
                # resumed once); skip it instead of failing with ValueError.
                if finished_id in self.request_ids:
                    position = self.request_ids.index(finished_id)
                    self.request_ids.pop(position)
                    self.requests.pop(position)

        # max_request workers drain the shared queue concurrently.
        await asyncio.gather(
            *[self.process_requests(temperature = temperature, max_tokens = max_tokens, restart = restart, save_dir=save_dir) for _ in range(self.max_request)]
        )

        return self.results


    async def process_requests(self, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False):
        """Worker loop: take pending requests one at a time until the queue is empty.

        ``restart`` is accepted for signature symmetry with ``process`` and is not
        read here. ``save_dir`` must already exist (``process`` creates it).
        """
        import json

        while self.requests:
            # No await between the emptiness check and the pops, so concurrent
            # workers never take the same request.
            request_dict = self.requests.pop(0)
            request_id = self.request_ids.pop(0)

            prompt = request_dict["prompt"]

            final_output = None
            # vLLM documents request_id as a string; the integer id is kept for bookkeeping.
            results_generator = engine.generate(prompt, SamplingParams(temperature=temperature, max_tokens=max_tokens), str(request_id))
            async for request_output in results_generator:
                # Each item is the cumulative output so far; keep only the last one.
                final_output = request_output

            output = final_output.outputs[0].text

            request_dict["output"] = output
            self.results.append(request_dict)
            self.finished_ids.append(request_id)

            # Persist progress after every finished request so a crash loses at most in-flight work.
            with open(f"{save_dir}/results.json", "w") as f:
                json.dump(self.results, f)
            with open(f"{save_dir}/finished_ids.json", "w") as f:
                json.dump(self.finished_ids, f)
    


## Mistral API

In [None]:
# Initial setup for Mistral API calls.
# Supply your API key before running this cell, and never leave a real key in the notebook where someone else could see it.

import os
from mistralai import Mistral

# Never store the key in the notebook: take it from the environment and, when it
# is not set, prompt for it without echoing. Any key that was previously
# committed in this cell must be treated as compromised and rotated.
if not os.environ.get("MISTRAL_API_KEY"):
    from getpass import getpass
    os.environ["MISTRAL_API_KEY"] = getpass("Mistral API key: ")

api_key = os.environ["MISTRAL_API_KEY"]
#mistral_model = "mistral-large-latest"
mistral_model = "ministral-8b-latest"
client = Mistral(api_key=api_key)

async def process_api_requests(requests, request_id, max_tokens = 2000, temperature=0.4, save_dir = "api_progress_log", restart = False, get_result = None, delete_save_file = True):
    """Send each request's prompt to the Mistral chat API and collect the outputs.

    Args:
        requests: list of dicts, each with at least a ``"prompt"`` key. Every dict
            is modified in place: an ``"output"`` key is added.
        request_id: identifier used for log messages and for the progress file name.
        max_tokens: maximum number of tokens to generate per request.
        temperature: sampling temperature for the API call.
        save_dir: directory holding the per-request_id progress file.
        restart: when True and a progress file exists, results stored in it are
            reused and the corresponding leading requests are skipped.
        get_result: optional ``callable(request_dict, save_dir)`` whose return value
            is stored instead of the request dict itself.
        delete_save_file: remove the progress file once every request has finished.

    Returns:
        List of results, one per request, in request order.

    NOTE(review): the API call below is synchronous, so although this is a
    coroutine it does not yield to other tasks while a request is in flight.
    """
    # requests: [{"prompt":, ...} ... ]

    # Imported locally so the function works even if no earlier cell imported these.
    import json
    import os
    import time

    results = []

    # Built from save_dir directly so it matches the directory that is checked and
    # created here (the former "./" prefix broke absolute directories).
    save_path = os.path.join(save_dir, f"{request_id}.json")
    os.makedirs(save_dir, exist_ok=True)
    if os.path.exists(save_path):
        if restart:
            with open(save_path) as f:
                log_dict = json.load(f)
            results = log_dict["results"]
        else:
            os.remove(save_path)

    print()
    print(f"request {request_id} started")

    for i, request_dict in enumerate(requests):

        # For the case restart = True: this request already has a stored result.
        if len(results) > i:
            continue

        prompt = request_dict["prompt"]

        start = time.time()

        # start the generation; max_tokens and temperature were previously accepted
        # by this function but never forwarded to the API.
        chat_response = client.chat.complete(
            model= mistral_model,
            messages = [
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            max_tokens = max_tokens,
            temperature = temperature,
        )

        output = chat_response.choices[0].message.content

        end = time.time()

        request_dict["output"] = output

        if get_result:
            result = get_result(request_dict, save_dir)
        else:
            result = request_dict

        results.append(result)

        print()
        print(f"request_id: {request_id}, {i+1}th request finished")
        print(f"num token  prompt:{get_num_tokens(prompt)}, output:{get_num_tokens(output)}")
        print(f"calculation time: {end-start}")
        #print(f"output text: {output}")
        print()

        # Persist progress after every request so an interrupted run can be resumed.
        with open(save_path, "w") as f:
            json.dump({"request_id":request_id, "requests":requests, "results":results}, f)

    print()
    print(f"request {request_id} all finished")

    if delete_save_file:
        if os.path.exists(save_path):
            os.remove(save_path)
            print(f"log file for request {request_id} is deleted")

    # results: [{"prompt":, "output":} ... ]
    return results

def get_num_tokens(text, add_special_tokens=False):
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``.

    ``add_special_tokens`` is forwarded unchanged to ``tokenizer.encode``.
    """
    return len(tokenizer.encode(text, add_special_tokens=add_special_tokens))

## vllm.entrypoints.LLM + release memory after process

In [None]:
# Use Model from local environment

import asyncio
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import time, warnings, copy

import gc, torch, contextlib
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment

#model_name = "Qwen/QwQ-32B-Preview"
#model_name = "MBMMurad/QwQ-32B-preview-AWQ-AIMO-earlysharing"
#model_name = "mistralai/Ministral-8B-Instruct-2410"
#model_name = "deepseek-ai/deepseek-math-7b-instruct"
#model_name = "Qwen/Qwen2.5-72B-Instruct-AWQ"
#model_name = "bartowski/Qwen2.5-Math-72B-Instruct-GGUF" *

#qwq_model = "./qwq_awq_model"
#mistral_model = "./mistral8b_model"
# Default value of the `model` parameter of AllRequests.process in this cell.
default_model_name = "./mistral8b_model"
# Module-level tensor-parallel degree that AllRequests.process passes to LLM(...).
tensor_parallel_size = 4

class AllRequests:
    """Batch of prompt requests generated in a single ``vllm.LLM.generate`` call.

    Each request is a dict with at least a ``"prompt"`` key. ``process`` adds an
    ``"output"`` key to every processed request dict and stores a progress file
    at ``<save_dir>/0.json``.
    """

    def __init__(self):
        """Create an empty batch."""
        self.requests = []

    def add(self, request):
        """Append ``request`` to the batch."""
        self.requests.append(request)

    def process(self, model=default_model_name, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False, get_result = None, delete_save_file = False, destroy_model = True):
        """Generate outputs for the batch and return the list of results.

        Args:
            model: model path or id passed to ``LLM``.
            max_tokens: generation length limit.
            temperature: sampling temperature (``top_p`` is fixed at 0.95).
            save_dir: directory holding the progress file ``0.json``.
            restart: when True and a progress file exists, its stored results are
                reused and only the remaining requests are generated.
            get_result: optional ``callable(request_dict, save_dir)`` whose return
                value is stored instead of the request dict.
            delete_save_file: remove the progress file after a successful run
                (previously accepted but ignored).
            destroy_model: tear down the model and distributed state afterwards;
                this now also happens when generation or post-processing raises.

        Returns:
            List of result dicts (deep copies), restored results first.
        """
        # Imported locally so the method works even if no earlier cell imported these.
        import json
        import os

        results = []
        save_path = os.path.join(save_dir, "0.json")
        os.makedirs(save_dir, exist_ok=True)
        if os.path.exists(save_path):
            if restart:
                with open(save_path) as f:
                    log_dict = json.load(f)
                results = log_dict["results"]
            else:
                os.remove(save_path)

        if restart:
            if len(results) == len(self.requests):
                # Everything was already generated in a previous run.
                return results
            processed_requests = self.requests[len(results):]
        else:
            processed_requests = self.requests

        prompts = [request_dict["prompt"] for request_dict in processed_requests]

        llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend="mp", disable_custom_all_reduce=True, trust_remote_code=True, enforce_eager=True)
        try:
            sampling_params = SamplingParams(max_tokens=max_tokens, temperature=temperature, top_p=0.95)

            outputs = llm.generate(prompts, sampling_params)

            for i, output in enumerate(outputs):
                result_dict = processed_requests[i]
                result_dict["output"] = output.outputs[0].text

                if get_result:
                    result_dict = get_result(result_dict, save_dir)

                # Deep copy so later mutation of a shared dict cannot alias stored results.
                results.append(copy.deepcopy(result_dict))

            with open(save_path, "w") as f:
                json.dump({"requests":self.requests, "results":results}, f)

            if delete_save_file and os.path.exists(save_path):
                os.remove(save_path)
                print(f"log file {save_path} is deleted")
        finally:
            # Release the model even when generation or post-processing failed.
            if destroy_model:
                destroy_model_parallel()
                destroy_distributed_environment()
                del llm.llm_engine.model_executor
                del llm
                gc.collect()
                torch.cuda.empty_cache()
                with contextlib.suppress(AssertionError):
                    torch.distributed.destroy_process_group()

        return results
        

### old

In [None]:
# Use Model from local dir using vllm.entrypoints.LLM class releasing memory for model at last

import asyncio
from transformers import AutoTokenizer
from vllm import LLM, AsyncLLMEngine, AsyncEngineArgs, SamplingParams
import time, warnings, copy
import torch

#from huggingface_hub import login
#login(token="<YOUR_HF_TOKEN>")  # placeholder: never commit a real token; rotate any token that was stored here

#default_model_name = "Qwen/QwQ-32B-Preview"
#default_model_name = "MBMMurad/QwQ-32B-preview-AWQ-AIMO-earlysharing"
#default_model_name = "mistralai/Ministral-8B-Instruct-2410"
#default_model_name = "deepseek-ai/deepseek-math-7b-instruct"
#default_model_name = "Qwen/Qwen2.5-72B-Instruct-AWQ"
#default_model_name = "bartowski/Qwen2.5-Math-72B-Instruct-GGUF" *

#default_model_name = "./qwq_awq_model"
default_model_name = "./mistral8b_model"

import gc, torch, contextlib
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment


def process_requests(requests, request_id, model=default_model_name, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False, get_result = None, delete_save_file = False, tensor_parallel_size = 2):
    """Generate outputs for ``requests`` with ``vllm.LLM`` and release the model afterwards.

    Args:
        requests: list of dicts, each with at least a ``"prompt"`` key. Processed
            dicts are modified in place: an ``"output"`` key is added.
        request_id: identifier used for the progress file name and log messages.
        model: model path or id passed to ``LLM``.
        max_tokens: generation length limit.
        temperature: sampling temperature (``top_p`` is fixed at 0.95).
        save_dir: directory holding the per-request_id progress file.
        restart: when True and a progress file exists, its stored results are
            reused and only the remaining requests are generated.
        get_result: optional ``callable(request_dict, save_dir)`` whose return
            value is stored instead of the request dict.
        delete_save_file: remove the progress file after it has been written.
        tensor_parallel_size: tensor-parallel degree passed to ``LLM``
            (defaults to the previously hard-coded 2).

    Returns:
        List of result dicts (deep copies), restored results first.
    """
    # Imported locally so the function works even if no earlier cell imported these.
    import json
    import os

    results = []

    # Built from save_dir directly so it matches the directory that is checked and
    # created here (the former "./" prefix broke absolute directories).
    save_path = os.path.join(save_dir, f"{request_id}.json")
    os.makedirs(save_dir, exist_ok=True)
    if os.path.exists(save_path):
        if restart:
            with open(save_path) as f:
                log_dict = json.load(f)
            results = log_dict["results"]
        else:
            os.remove(save_path)

    if restart:
        if len(results) == len(requests):
            # Everything was already generated in a previous run.
            return results
        processed_requests = requests[len(results):]
    else:
        processed_requests = requests

    prompts = [request_dict["prompt"] for request_dict in processed_requests]

    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend="mp", disable_custom_all_reduce=True,)
    try:
        sampling_params = SamplingParams(max_tokens=max_tokens, temperature=temperature, top_p=0.95)

        outputs = llm.generate(prompts, sampling_params)

        for i, output in enumerate(outputs):
            result_dict = processed_requests[i]
            result_dict["output"] = output.outputs[0].text

            if get_result:
                result_dict = get_result(result_dict, save_dir)

            # Deep copy so later mutation of a shared dict cannot alias stored results.
            results.append(copy.deepcopy(result_dict))

        with open(save_path, "w") as f:
            json.dump({"request_id":request_id, "requests":requests, "results":results}, f)

        if delete_save_file:
            if os.path.exists(save_path):
                os.remove(save_path)
                print(f"log file for request {request_id} is deleted")
    finally:
        # Release the model even when generation or post-processing failed.
        destroy_model_parallel()
        destroy_distributed_environment()
        del llm.llm_engine.model_executor  #.driver_worker
        del llm # Isn't necessary for releasing memory, but why not
        gc.collect()
        torch.cuda.empty_cache()
        with contextlib.suppress(AssertionError):
            torch.distributed.destroy_process_group()

        print(f"{torch.cuda.memory_allocated() / 1000000000} GB allocated")

    # results: [{"prompt":, "output":} ... ]
    return results


# Not used now. This is useful when someone wants requests in progress
def get_all_requests(dir = "./progress_log", delete_files = True):
    """Collect the ``"requests"`` list of every progress file in ``dir``.

    Returns a dict mapping ``str(request_id)`` to that file's requests. When
    ``delete_files`` is true, the directory is emptied afterwards through
    ``delete_all_files_in_directory``.
    """
    collected = {}
    for log_name in get_all_file_names(dir):
        with open(f"{dir}/{log_name}") as log_file:
            entry = json.load(log_file)
        logged_id = entry["request_id"]
        logged_requests = entry["requests"]
        collected[str(logged_id)] = logged_requests

    if delete_files:
        delete_all_files_in_directory(dir)

    return collected

def get_num_tokens(text, add_special_tokens=False):
    """Count the token ids the module-level ``tokenizer`` produces for ``text``.

    ``add_special_tokens`` is forwarded unchanged to ``tokenizer.encode``.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=add_special_tokens)
    token_count = len(token_ids)
    return token_count

# Collect Problems (problems, correct_answers, solutions)

## LiveCodeBench

In [None]:
from datasets import load_dataset
# Load LiveCodeBench's code-generation set; version_tag="release_v5" is forwarded to the dataset loader.
# NOTE(review): recent `datasets` releases may require trust_remote_code=True for script-based datasets — confirm against the installed version.
lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v5")

In [None]:
lcb_codegen

In [None]:
lcb_codegen["test"]['question_content'][0]

## BigCodeBench

### Download

In [None]:
import json
import random

import pandas as pd
from datasets import load_dataset

num_sample = 100
sample_seed = 0  # fixed seed so the saved subset is the same on every run
dataset_name = 'bigcode/bigcodebench'
save_path = "bigcodebench3.json"

# Load the dataset from Hugging Face
dataset = load_dataset(dataset_name)

# Convert the 'v0.1.0_hf' split to a pandas DataFrame, then to a list of record dicts
df = pd.DataFrame(dataset['v0.1.0_hf'])
data_list = df.to_dict(orient='records')

# Draw a reproducible random subset. A dedicated Random instance leaves the
# global RNG untouched, and the size is capped so asking for more records than
# exist returns them all instead of raising ValueError.
data_list = random.Random(sample_seed).sample(data_list, min(num_sample, len(data_list)))

# One list per field, all in the same (sampled) record order
task_id = [record["task_id"] for record in data_list]
complete_prompt = [record["complete_prompt"] for record in data_list]
instruct_prompt = [record["instruct_prompt"] for record in data_list]
canonical_solution = [record["canonical_solution"] for record in data_list]
code_prompt = [record["code_prompt"] for record in data_list]
test = [record["test"] for record in data_list]
doc_struct = [record["doc_struct"] for record in data_list]

with open(save_path, "w") as f:
    json.dump({"task_id":task_id, "complete_prompt":complete_prompt, "instruct_prompt":instruct_prompt, "canonical_solution":canonical_solution, "code_prompt":code_prompt, "test":test, "doc_struct":doc_struct,}, f)


In [None]:
#num_sample = 2000
import json
import random

import pandas as pd
from datasets import load_dataset

dataset_name = 'bigcode/bigcodebench'
save_path = "bigcodebench3.json"

# Fetch the dataset and turn the 'v0.1.3' split into a list of record dicts
dataset = load_dataset(dataset_name)
df = pd.DataFrame(dataset['v0.1.3'])
data_list = df.to_dict(orient='records')
#data_list = data_list[:num_sample]
#data_list = random.sample(data_list, num_sample)

# Gather each field into its own list (same record order in every list);
# the dict keeps the key order that is written to the JSON file.
columns = {
    field: [record[field] for record in data_list]
    for field in (
        "task_id",
        "complete_prompt",
        "instruct_prompt",
        "canonical_solution",
        "code_prompt",
        "test",
        "doc_struct",
    )
}

task_id = columns["task_id"]
complete_prompt = columns["complete_prompt"]
instruct_prompt = columns["instruct_prompt"]
canonical_solution = columns["canonical_solution"]
code_prompt = columns["code_prompt"]
test = columns["test"]
doc_struct = columns["doc_struct"]

with open(save_path, "w") as f:
    json.dump(columns, f)


### Load

In [None]:
import json
# Read back the per-field lists written by the download cell.
save_path = "bigcodebench3.json"
with open(save_path) as f:
    data_dict = json.load(f)

(
    task_id,
    complete_prompt,
    instruct_prompt,
    canonical_solution,
    code_prompt,
    test,
    doc_struct,
) = (
    data_dict[field]
    for field in (
        "task_id",
        "complete_prompt",
        "instruct_prompt",
        "canonical_solution",
        "code_prompt",
        "test",
        "doc_struct",
    )
)

# Every list has one entry per problem, so any of them gives the count.
num_problems = len(task_id)

### test

In [None]:
# Show the first problem's instruction, reference solution and tests, separated by blank lines.
print(instruct_prompt[0], canonical_solution[0], test[0], sep="\n\n")

In [None]:
import itertools
from random import shuffle
def task_func(numbers=list(range(1, 3))):
    """Average, over all permutations of ``numbers``, of the summed absolute
    differences between neighbouring elements after each permutation is shuffled.

    The default list is only read, never modified.
    """
    orderings = list(itertools.permutations(numbers))

    running_total = 0
    for ordering in orderings:
        shuffled = list(ordering)
        shuffle(shuffled)
        # Pair every element with its right-hand neighbour.
        running_total += sum(abs(left - right) for left, right in zip(shuffled, shuffled[1:]))

    return running_total / len(orderings)

import unittest
from unittest.mock import patch
from random import seed, shuffle
import itertools
# Test suite for task_func, kept exactly as printed from the benchmark's `test` field.
# NOTE(review): the three seeded tests patch `random.shuffle`, but both this cell and
# task_func use the name bound by `from random import shuffle`, so the patch appears
# not to intercept the calls task_func makes — confirm before relying on the seeds.
class TestCases(unittest.TestCase):
    def test_default_numbers(self):
        # Test with default number range (1 to 10) to check that the result is a positive float.
        # NOTE(review): the task_func defined in this cell defaults to list(range(1, 3)), not 1 to 10.
        result = task_func()
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_custom_list(self):
        # Test with a custom list of small positive integers to ensure proper handling and positive result.
        result = task_func([1, 2, 3])
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_negative_numbers(self):
        # Test with negative numbers to verify the function handles and returns a positive result.
        result = task_func([-3, -2, -1])
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_single_element(self):
        # Test with a single element list to confirm the return is zero since no pairs exist.
        result = task_func([5])
        self.assertIsInstance(result, float)
        self.assertEqual(result, 0)
    def test_empty_list(self):
        # Test with an empty list to ensure the function handles it gracefully and returns zero.
        result = task_func([])
        self.assertIsInstance(result, float)
        self.assertEqual(result, 0)
    def test_identical_elements(self):
        # Test with a list of identical elements to confirm that differences are zero and the average is zero.
        result = task_func([2, 2, 2])
        self.assertIsInstance(result, float)
        self.assertEqual(result, 0)
    def test_mixed_numbers(self):
        # Test with a list of mixed positive and negative numbers to check correct average of differences.
        result = task_func([-10, 10, -5])
        self.assertIsInstance(result, float)
        self.assertGreater(result, 0)
    def test_specific_value_with_seed(self):
        # Set seed for reproducibility and check the computed value
        with patch('random.shuffle', side_effect=lambda x: seed(42) or shuffle(x)):
            result = task_func([1, 2, 3])
            self.assertAlmostEqual(result, 2.5, delta=0.5)  # This expected value should be calculated beforehand
    def test_large_list_with_seed(self):
        # Set seed and test with a larger list for specific computed value
        # The argument has 10 elements, so task_func enumerates every permutation of them.
        with patch('random.shuffle', side_effect=lambda x: seed(99) or shuffle(x)):
            result = task_func(list(range(1, 11)))
            self.assertAlmostEqual(result, 33.0, delta=0.5)  # This expected value should be calculated beforehand
    def test_random_behavior(self):
        # Test to ensure different seeds produce different outputs, demonstrating randomness
        # NOTE(review): both patches seed with 1; the two calls differ in their input lists, not in the seed.
        with patch('random.shuffle', side_effect=lambda x: seed(1) or shuffle(x)):
            result1 = task_func([1, 2, 3])
        with patch('random.shuffle', side_effect=lambda x: seed(1) or shuffle(x)):
            result2 = task_func([1, 2, 4])
        self.assertNotEqual(result1, result2)



In [None]:
t = TestCases()
t.test_default_numbers()

In [None]:
import inspect
def check_code():
    instance = TestCases()
    methods = inspect.getmembers(cls, predicate=inspect.isfunction)
    try:
        for name, method in methods:
            if name.startswith('test_'):
                method(instance)
    except Exception as e:
        print("An error occurred:", e)
        return False
    return True

is_code_correct__ = check_code()

In [None]:
import itertools
from random import shuffle

def task_func(numbers=list(range(1, 11))):
    """Average the summed adjacent absolute differences over shuffled permutations.

    For every permutation of ``numbers`` a copy is shuffled in place with
    ``random.shuffle``, the absolute differences between consecutive elements
    of the shuffled copy are summed, and the mean of those sums is returned.

    Args:
        numbers: Sequence of numbers to permute. Defaults to the integers 1..10.
            The argument itself is never mutated.

    Returns:
        float: Mean of the per-permutation sums. An empty or single-element
        input gives 0.0 because there are no adjacent pairs.
    """
    total = 0
    count = 0

    # Consume the permutations lazily: n! tuples (3,628,800 for the default
    # input) never need to be held in memory at once, and neither do the sums.
    for perm in itertools.permutations(numbers):
        shuffled_perm = list(perm)
        shuffle(shuffled_perm)

        # Pair each element with its successor and accumulate |a - b|.
        total += sum(abs(a - b) for a, b in zip(shuffled_perm, shuffled_perm[1:]))
        count += 1

    # permutations() always yields at least one (possibly empty) tuple,
    # so count >= 1 and the division is safe.
    return total / count


## Kaggle Reference Problems

In [None]:
# Reference problems and their integer answers (parallel lists, same order).
# Raw string literals are required here: the statements contain LaTeX, and in a
# normal literal sequences such as \t (in \textit, \times), \f (in \frac) and
# \a (in \angle) would be turned into control characters.
problems = [
    r"Three airline companies operate flights from Dodola island. Each company has a different schedule of departures. The first company departs every 100 days, the second every 120 days and the third every 150 days. What is the greatest positive integer $d$ for which it is true that there will be $d$ consecutive days without a flight from Dodola island, regardless of the departure times of the various airlines?",
    r"Fred and George take part in a tennis tournament with $4046$ other players. In each round, the players are paired into $2024$ matches. How many ways are there to arrange the first round such that Fred and George do not have to play each other? (Two arrangements for the first round are \textit{different} if there is a player with a different opponent in the two arrangements.)",
    r"Triangle $ABC$ has side length $AB = 120$ and circumradius $R = 100$. Let $D$ be the foot of the perpendicular from $C$ to the line $AB$. What is the greatest possible length of segment $CD$?",
    r"Find the three-digit number $n$ such that writing any other three-digit number $10^{2024}$ times in a row and $10^{2024}+2$ times in a row results in two numbers divisible by $n$.",
    r"We call a sequence $a_1, a_2, \ldots$ of non-negative integers \textit{delightful} if there exists a positive integer $N$ such that for all $n > N$, $a_n = 0$, and for all $i \geq 1$, $a_i$ counts the number of multiples of $i$ in $a_1, a_2, \ldots, a_N$. How many delightful sequences of non-negative integers are there?",
    r"Let $ABC$ be a triangle with $BC=108$, $CA=126$, and $AB=39$. Point $X$ lies on segment $AC$ such that $BX$ bisects $\angle CBA$. Let $\omega$ be the circumcircle of triangle $ABX$. Let $Y$ be a point on $\omega$ different from $X$ such that $CX=CY$. Line $XY$ meets $BC$ at $E$. The length of the segment $BE$ can be written as $\frac{m}{n}$, where $m$ and $n$ are coprime positive integers. Find $m+n$.",
    r"For a positive integer $n$, let $S(n)$ denote the sum of the digits of $n$ in base 10. Compute $S(S(1)+S(2)+\cdots+S(N))$ with $N=10^{100}-2$.",
    r"""For positive integers $x_1,\ldots, x_n$ define $G(x_1, \ldots, x_n)$ to be the sum of their $\frac{n(n-1)}{2}$ pairwise greatest common divisors. We say that an integer $n \geq 2$ is \emph{artificial} if there exist $n$ different positive integers $a_1, ..., a_n$ such that 
\[a_1 + \cdots + a_n = G(a_1, \ldots, a_n) +1.\]
Find the sum of all artificial integers $m$ in the range $2 \leq m \leq 40$.""",
    r"The Fibonacci numbers are defined as follows: $F_0 = 0$, $F_1 = 1$, and $F_{n+1} = F_n + F_{n-1}$ for $n \geq 1$. There are $N$ positive integers $n$ strictly less than $10^{101}$ such that $n^2 + (n+1)^2$ is a multiple of 5 but $F_{n-1}^2 + F_n^2$ is not. How many prime factors does $N$ have, counted with multiplicity?",
    r"Alice writes all positive integers from $1$ to $n$ on the board for some positive integer $n \geq 11$. Bob then erases ten of them. The mean of the remaining numbers is $3000/37$. The sum of the numbers Bob erased is $S$. What is the remainder when $n \times S$ is divided by $997$?",
]

correct_answers = [79, 250, 180, 143, 3, 751, 891, 810, 201, 902]

# Worked reference solutions, in the same order as `problems`.
# Raw string literals are required: the text is LaTeX, and in a normal literal
# sequences such as \t (\times), \b (\bmod, \begin), \f (\frac), \r (\right)
# and \n (\neq) would become control characters, while the LaTeX line break
# "\\" would collapse to a single backslash.
solutions = [
    r"""The airlines are called A100, A120 and A150, labelled by the frequency of their departures. We first prove that there is a period of 99 days after an A100 departure during which no A150 plane takes off.\\
Consider a period of 301 days which starts with an A150 departure on Day 0, followed by a departure on Day 150 and on Day 300. Let the first A100 departure in this period be on Day $x$.

There are two possibilities: (i) $0 \leq x \leq 50$ or (ii) $51 \leq x \leq 99$. In case (i), there is a quiet period of 99 days after the first $A 100$ departure. In case (ii), the second A100 departure will be on Day $100+x$ where $151 \leq 100+x \leq 199$ so there will be a period of 99 consecutive days after the second A100 departure with no A150 departure.

We will now prove that there are 79 consecutive days on which no departure of any airline happens, including the A120 planes. We restart time and define Day 0 to be when an $A 100$ flight departs and there is no A150 flight before Day 100. This situation will repeat later because $300=3 \times 100=$ $2 \times 150$. The fourth A100 flight will take off on Day 300 and there will be no subsequent departure of an $A 150$ plane before Day 400.

Suppose the first departure of an A120 plane is on Day $y$. If $y \leq 20$ or $y \geq 80$, we have found the 79 consecutive days by looking after or before this A120 departure.\\
If $20 \leq y \leq 60$, then there will be an A120 departure on day $240+y$ where $260 \leq 240+y \leq 300$ so there will be no A120 departure strictly between Day 300 and Day 380 and so we will find the required 79 consecutive quiet days between those dates.

Finally, if $61 \leq y \leq 80$, there will be an A120 departure on Day $y+240$ where $301 \leq y+240 \leq 320$ and there will be no subsequent departures before Day 400 and again we find the required 79 consecutive quiet days.\\
We now show that this bound can be attained. Suppose that an A100 departs on Day 0, an A120 departs on Day 80 and an A150 departs on Day 120. The departure days are then:

$$
0,80,100,120,200 \& 200,270,300,320,400,420,440,500,560,570
$$

modulo 600 (i.e. it repeats every 600 days).\\
The longest run of consecutive days without flights is 79 days (and this is obtained three times in this 600 day cycle).
""",

    r"""Consider an tournament with $2 m$ players. The number of possible first round pairings can be calculated by labelling the matches 1 to $m$. Label the players in order 1 to $2 m$ (this can be done in $(2 m)$ ! ways), and assign players $(2 i-1)$ and $2 i$ to match number $i$.\\
How many times does any given particular first round pairing arise from the method specified above? Swapping the labels of any pair $2 i-1,2 i$ does not change the pairings, nor does permuting the order of the labels of the matches. Doing anything else will result in a different first round pairing, so $(2 m)!$ is overcounting the number of first round pairings by a factor of $2^{m}(m!)$.\\
Therefore the number of first round pairings is

$$
\frac{(2 m)!}{2^{m} m!}
$$

Now consider the odd and even factors in the product defining $(2 m)$ ! separately. We see that

$$
(2 m)!=[(2 m) \cdot 2(m-1) \cdots 2] \times[(2 m-1) \cdot(2 m-3) \cdots 3 \cdot 1] .
$$

The product of the odd numbers $(2 m-1) \cdot(2 m-3) \cdots 3 \cdot 1$ is often written as $(2 m-1)$ !!. Pulling factors of 2 out of the even factors we obtain

$$
(2 m)!=2^{m} \cdot m!\cdot(2 m-1)!!
$$

Therefore the number of different first round pairings is

$$
\frac{(2 m)!}{2^{m} \cdot m!}=(2 m-1)!!
$$

The number of first round pairings of the $4048=2 n+2$ players where Fred and George do not play each other is the total number of pairings minus the number of pairings in which they play each other. This is $(2 n+1)!!-(2 n-1)!!=2 n(2 n-1)!!=4046 \cdot 4045!!$. This number is clearly divisible by 125 because of the large number of factors of 5 in $4045!!$. We would like to know the remainder when $4046 \cdot 4045$ !! is divided by 8 , because then we could apply the Chinese Remainder Theorem and deduce the remainder on division by 1000 . Note that $4045 \times 4043 \times 4041 \equiv 5 \times 3 \times 1 \equiv-1 \mathrm{mod}$ 8. The sequence of odd positive integers has period $4 \operatorname{modulo} 8$, and $4039 \equiv 7 \bmod 8$. Notice that $1 \times 3 \times 5 \times 7 \equiv 1 \bmod 8$. Running the sequence of odd numbers from 1 to 4039 cycles through a whole number of periods modulo 8 and so $4039!!\equiv 1 \bmod 8$. Now $4046 \cdot 4045!!\equiv 6 \times(-1) \times 1 \equiv 2$ $\bmod 8$.\\
By the Chinese remainder theorem there is a unique integer $x$ in the range $0 \leq x \leq 999$ which satisfies our conditions modulo 8 and modulo 125, and by inspection that is 250 and so

$$
4046 \cdot 4045!!\equiv 250 \bmod 1000
$$

and we report 250 .""",

    r"""Let $O$ be the circumcentre of triangle $A B C$. Then

$$
\operatorname{dist}(O, A B)=\sqrt{R^{2}-(A B / 2)^{2}}=\sqrt{100^{2}-60^{2}}=80
$$

by Pythagoras.\\
\includegraphics[max width=\textwidth, center]{2024_12_04_73c4dcc43c6e7936620eg-04}

Since $C$ must be on the circle with centre $O$ and radius $O A$, the largest possible altitude $h_{c}$ is attained when $C$ is the mid-point of the larger $\operatorname{arc} A B$ of the circumcircle (i.e. on the perpendicular bisector of $A B$ ) in which case we have $h_{c}=\operatorname{dist}(O, A B)+R=80+100=180$.""",

    r"""Let $M=10^{1024}$. Let $a$ be any three-digit number. Writing $M$ copies of $a$ in a row results in a number $X$ where

$$
X=a \times 100100100 \ldots 1001001
$$

and there are $M$ copies of the digit one in the long number. If instead we wrote $M+2$ copies of $a$ in a row, the resulting number would be $10^{6} X+1001 a$. We use the notation $(u, v)$ to denote the greatest common divisor of two integers $u$ and $v$ which are not both 0 .\\
We apply Euclid's algorithm so

$$
\left(\left(10^{6} X+1001 a\right), X\right)=(1001 a, X)
$$

It is therefore a necessary condition that our three-digit number $n$ should divide $(1001 a, X)$ for all three-digit numbers $a$. By considering $a=100$ and $a=101$, we see that any candidate for $n$ must divide $1001 \times 101-1001 \times 100=1001$. Moreover, if $n$ is a divisor of 1001 , then $n$ will divide $X$ because 1001 divides $10010010010 \ldots 01001001$ which is

$$
1001 \times 10000010000010 \ldots 01000001
$$

The second factor involves $M / 2$ copies of the digit one. Such an $n$ will also divide $10^{6} X+1001 a$.\\
Thus it is a necessary and sufficient condition for $n$ to satisfy the conditions of the problem that $n$ be a three-digit divisor of $1001(=7 \times 11 \times 13)$. There is a unique such number: 143 .""",

    r"""We claim the only such sequences are $(1,0,0, \ldots),(2,1,0,0, \ldots)$ and $(2,2,0,0, \ldots)$. Note that $a_{1}=N$ because 1 divides each of the first $N$ terms. Hence if $N=1$, we necessarily have the sequence $(1,0, \ldots)$, and that obeys the conditions.\\
Observe that if $1 \leq i \leq N$, then $a_{i}$ is at most $N$ because $a_{i}$ counts the size of a subset of $\{1,2, \ldots, N\}$. For each $1 \leq i \leq N$, if (for contradiction) $a_{i}=0$, then $i$ divides $a_{i}$ and so $a_{i} \neq 0$, which is absurd. Therefore the first $N$ terms of the sequence are non-zero (including $a_{N}$ ), and all subsequent terms are 0 because if $i>N$, then $i$ cannot divide any of the first $N$ terms which are all non-zero and at most $N$. This means that a sequence satisfying the condition for some $N$ will only satisfy the condition for that particular $N$.\\
Next assume that $N>1$. The term $a_{N-1}$ is positive and so there is an index $j$ in the range $1 \leq j \leq N$ such that $N-1$ divides $a_{j}$. However, for $1 \leq i \leq N$ each $a_{i}$ is at $\operatorname{most} N$, so $N-1 \leq a_{j} \leq N$. In the case that $N=2$ this yields the second and third delightful sequences mentioned above, and proves that there are no others.\\
It remains to study the case $N>2$. In that case, $N-1$ does not divide $N$ so, taking $j$ as above, we must have $a_{j}=N-1 \quad(\neq 1)$. If (for contradiction) $a_{N} \geq 2$ there is an index $k$ with $1<k \leq N$ such that $a_{k}=N$, so $k$ divides all non-zero terms of the sequence, and in particular $k$ divides both $N-1$ and $N$ and so must be 1 , which is absurd. Finally suppose (for contradiction) that $a_{N}=1$. In that case, $j$ (recall that $a_{j}=N-1>1$ so $2 \leq j \leq N-1$ ) does not divide $a_{N}$ but $j$ does divide all previous terms of the sequence. Therefore $j$ divides $a_{1}$ which is $N$ and $a_{j}$ which is $N-1$ so $j=1$. Therefore, $a_{1}$ is both $N-1$ and $N$, which is absurd.

Thus, there are no delightful sequences with $N>2$. There are exactly the 3 delightful sequences mentioned above so we report the answer 3.""",

    r"""We have the key claim: $\omega$ is tangent to $B C$.\\
Proof of Claim: The angle bisector theorem gives $\frac{C X}{C A}=\frac{C B}{C B+B A}$ which rearranges to

$$
C X=\frac{A C \cdot B C}{A B+B C}=\frac{126 \times 108}{147}=\frac{6 \times 108}{7}=\frac{648}{7}
$$

Now calculate: $C B^{2}=108^{2}=\frac{648}{7} \times 126=C X \cdot C A$ and we establish the required tangency by the converse of power of point (the tangent-secant theorem) applied to $C$ and $\omega$.\\
\includegraphics[max width=\textwidth, center]{2024_12_04_73c4dcc43c6e7936620eg-03}

Having established this tangency, we now perform a short calculation.\\
Let $\Gamma$ denote the circle with centre $C$ and radius $C X$ which passes through $Y$. The point $E$ is on the radical axis $X Y$ of $\omega$ and $\Gamma$. It follows that $E$ has equal powers with respect to both circles $\omega$ and $\Gamma$.

Let $x=B E$ so $E C=108-x$. The power of $E$ with respect to $\omega$ is $x^{2}$ (because of the tangency at $B$ ) and the power of $E$ with respect to $\Gamma$ is $E C^{2}-X C^{2}$ because $C$ is the centre of $\Gamma$ and $X C$ is its radius.

We have

$$
x^{2}=(108-x)^{2}-\left(\frac{648^{2}}{7}\right)^{2}
$$

The quadratic terms cancel so $216 x=108^{2}-\frac{648^{2}}{49}=\frac{49 \times 108^{2}-648^{2}}{49}$ and this gives the solution $x=\frac{702}{49}$. Now, 702 and 49 are coprime so $m+n=751$ is the required answer.""",

    r"""For each integer $k$ in the range $0 \leq k \leq 10^{100}-1$ we have

$$
k+\left(10^{100}-k-1\right)=10^{100}-1
$$

which in decimal notation is a string of 100 nines. Deeming all numbers $k$ in the range to be decimal strings of 100 digits (including initial padding zeros when necessary), we see that for each $j$ in the range $1 \leq j \leq 100$, the $j$-th digit of $k$ and $j$-th digit of $10^{100}-1-k$ add up to 9 .\\
Therefore for each integer $k$ in the range $0 \leq k \leq 10^{100}-1$ we have $S(k)+S\left(10^{100}-1-k\right)=900$. Recall that $N=10^{100}-2$ and observe that $S(0)=0$. Then

$$
2\left(S(0)+S(1)+S(2)+\cdots+S\left(10^{100}-1\right)\right)=900 \times 10^{100}
$$

so

$$
S(1)+S(2)+\cdots+S(N)=450 \times\left(10^{100}-1\right)-S\left(10^{100}-1\right)=450 \times 10^{100}-900
$$

In decimal notation, $450 \times 10^{100}$ is the string 45 followed by 101 zeros. Subtracting 900 gives the string 44 followed by 98 nines and then the digits 100.

The digit sum of this number is $4+4+98 \times 9+1=99 \times 9=891$. We report 891 .""",

    r"""We will show that the smallest artificial number is 5 , and that if $a$ is an artificial number, then so too is $a+1$. This will solve the problem.

First, we eliminate small cases. If $n=2$ and the different positive integers are $a, b$ with $a<b$. Then, $\operatorname{gcd}(a, b) \leq a$ so $\operatorname{gcd}(a, b)+1=G(a, b)+1 \leq a+1 \leq b<a+b$.\\
Now suppose that $n=3$ and the different positive integers are $a<b<c$. Using round brackets to denote gcds, we have

$$
(a, b)+(b, c)+(c, a) \leq 2 a+b \leq a+b+c-2
$$

because $a \leq c-2$. Therefore $G(a, b, c)+1<a+b+c$.\\
Next we tackle the case $n=4$, and let the positive integers be $a<b<c<d$. Now

$$
\begin{aligned}
G(a, b, c, d)+1 & =[(a, b)+(b, c)+(c, d)]+[(a, c)+(a, d)]+(b, d)+1 \\
& =[(a, b-a)+(b, c-b)+(c, d-c)]+[(a, c)+(a, d)]+(b, d)+1 \\
& \leq(b-a+c-b+d-c)+2 a+b+1 \\
& =a+b+d+1 .
\end{aligned}
$$

Now $c \geq 3$ so

$$
G(a, b, c, d)+1 \leq a+b+c+d+(1-c) \leq a+b+c+d-2
$$

Having eliminated small cases, we show that 5 is artificial. Consider the numbers 1,2,3,4 and 6 . Now

$$
G(1,2,3,4,6)+1=1+1+1+1+1+2+2+1+3+2+1=16=(1+2+3+4+6)
$$

Therefore 5 is artificial.\\
Now suppose that $m$ is an artificial positive integer witnessed by different positive integers $a_{1}, a_{2}, \ldots a_{m}$. For $1 \leq i \leq m$, let $b_{i}=m a_{i}$ and let $b_{m+1}=1$. We will show that these $m+1$ different positive integers $b_{i}$ are witnesses to $m+1$ being artificial.

If $i, j \leq m$, then $\operatorname{gcd}\left(b_{i}, b_{j}\right)=m \cdot \operatorname{gcd}\left(a_{i}, a_{j}\right)$ and $\left(b_{i}, b_{m+1}\right)=1$ for $1 \leq i \leq m$. Therefore

$$
\begin{aligned}
G\left(b_{1}, b_{2}, \ldots b_{m+1}\right) & =G\left(b_{1}, b_{2}, \ldots, b_{m}\right)+m \\
& =m \cdot G\left(a_{1}, a_{2}, \ldots, a_{m}\right)+m \\
& =m\left(a_{1}+a_{2}+\cdots+a_{m}-1\right)+m \\
& =b_{1}+b_{2}+\cdots+b_{m} \\
& =\left(b_{1}+b_{2}+\cdots+b_{m+1}\right)-1 .
\end{aligned}
$$

Therefore $m$ is artificial for all $m \geq 5$ by induction. We must report

$$
5+6+\cdots+40=\frac{45 \times 36}{2}=810
$$""",

    r"""Checking modulo 5 , we find that $n^{2}+(n+1)^{2} \equiv 0 \bmod 5$ if, and only if, $n$ is 1 or 3 modulo 5. On the other hand, the Fibonacci numbers modulo 5 form a sequence of period 20 and their squares form a sequence of period 10 . By inspection $F_{n-1}^{2}+F_{n}^{2} \equiv 0 \bmod 5$ if, and only if, $n$ is $3 \bmod 5$. Therefore we are being asked to find the number of positive integers $m$ in the range $1 \leq m \leq 10^{101}$ such that $m$ is 1 modulo 5 . This is one fifth of the numbers in the range since $10^{101}$ is divisible by 5 , so

$$
N=10^{101} / 5=2 \times 10^{100}=2^{101} \cdot 5^{100}
$$

We therefore report $100+101=201$.""",

    r"""Let $T$ be the sum of the numbers, after Bob has erased 10 of them. We can bound $T$ by considering what happens if Bob happens to erase the smallest ten numbers, or the largest ten numbers.\\
The average of the set $\{1,2, \ldots, n-10\}$ is $\frac{n-9}{2}$ (the average of the first and last terms of a finite arithmetic progression). Similarly, average of the set $\{11,12, \ldots, n\}$ is $\frac{n+11}{2}$. The average of the remaining numbers, $\frac{3000}{37}=\frac{T}{n-10}$, must be bounded by these quantities, so

$$
\frac{n-9}{2} \leq \frac{T}{n-10}=\frac{3000}{37} \leq \frac{n+11}{2}
$$

It follows that $n-10$ must be a multiple of 37 so that it can cancel down to 37 . Another way to say that is $n \equiv 10 \bmod 37$. Multiplying the inequalities by 2 we obtain

$$
n-9 \leq \frac{6000}{37} \leq n+11
$$

or rather

$$
\frac{6000}{37}-11 \leq n \leq \frac{6000}{37}+9
$$

Now $6000 / 37=162+\frac{6}{37}$ so

$$
152 \leq n \leq 171
$$

The only integer in this range which is 10 modulo 37 is 158 , so $n=158$ and

$$
T=(n-10) \times \frac{3000}{37}=\frac{148 \times 3000}{37}=4 \times 3000=12000
$$

Let the sum of the numbers erased by Bob be $S$, so

$$
S=\left(\sum_{i=1}^{158} i\right)-12000=\frac{158 \times 159}{2}-12000=12561-12000=561
$$

Bob can achieve this by erasing $\{51,52, \ldots, 59,66\}$.\\
The number that we must report is $n \times S$ modulo 997 , which is $158 \times 561$ modulo 997 , which is 902 .""",
]

## AIME Dataset

### Download

In [None]:
import json

import pandas as pd
from datasets import load_dataset

# Download settings: Hub dataset to pull, maximum number of rows to keep,
# and the JSON file the extracted lists are written to.
dataset_name = 'Dahoas/aimo-validation-aime'
num_sample = 10000
save_path = "aime.json"

# Fetch the dataset, then flatten its 'train' split into a list of per-row
# dicts, truncated to at most `num_sample` rows.
dataset = load_dataset(dataset_name)
df = pd.DataFrame(dataset['train'])
data_list = df.to_dict(orient='records')[:num_sample]

# Three parallel lists, one entry per record.
problems = [record["problem"] for record in data_list]
correct_answers = [int(record["answer"]) for record in data_list]
# Only the text before the first "\n~" marker of each solution is kept.
solutions = [record["solution"].split("\n~")[0] for record in data_list]

# Persist the lists so later cells can reload them without hitting the Hub.
with open(save_path, "w") as f:
    json.dump({"problems":problems, "correct_answers":correct_answers, "solutions":solutions}, f)

### Load

In [None]:
import json

save_path = "aime.json"

# Reload the lists written by the download cell.
with open(save_path) as f:
    data = json.load(f)

# Unpack the three parallel lists in one step; num_problems is their length.
problems, correct_answers, solutions = (
    data[key] for key in ("problems", "correct_answers", "solutions")
)
num_problems = len(problems)

### test

In [None]:
# Preview one downloaded record: problem text, trimmed solution, raw answer,
# and the type the answer has after int() conversion.
# The index is deliberately not called `id`: a module-level `id` would shadow
# the builtin id() for the rest of the kernel session.
sample_idx = 3
record = data_list[sample_idx]
print(record["problem"])
print()
print(record["solution"].split("\n~")[0])
print()
print(record["answer"])
print(type(int(record["answer"])))

In [None]:
# Sanity check: number of records held in data_list.
print(len(data_list))

# Solve Problems

## SWEBench

In [1]:
from SEIMEI import SEIMEI
import asyncio

# Paths handed to the SEIMEI constructor below.
processed_path = "./processed"  # input path same as save_path you used in Preparation
database_path = "./database"
#se_restrictions = ["MetaSurvey2"]  # search engine only hits classes in this list normally (except when adding expert_restriction in kwargs)
# Expert classes to load: each entry names a location ("dir_path") and the
# class names to pick up from it.
expert_config = [
    {
        "dir_path" : "./Experts/SWE/", # can be either folder or file
        "class_names" : ["Answer", "CheckInf", "MetaSurvey", "CollectCodeFileToModify"]
    }
]

# Build the SEIMEI instance. The recorded output of this cell shows a vLLM
# engine being initialised and the four expert classes being registered.
# NOTE(review): the meaning of max_inference_time / max_request (units, what
# exactly is capped) is defined inside the SEIMEI package - confirm there.
seimei = SEIMEI(
    processed_path = processed_path,
    database_path = database_path, 
    expert_config = expert_config,
    max_inference_time = 1000,
    tensor_parallel_size = 1,
    max_request = 20,
)


INFO 03-30 12:29:16 config.py:478] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 03-30 12:29:16 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='/workspace/qwen3b', speculative_config=None, tokenizer='/workspace/qwen3b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/workspace/qwen3b, num_scheduler_

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-30 12:29:56 model_runner.py:1097] Loading model weights took 5.7915 GB
INFO 03-30 12:29:58 worker.py:241] Memory profiling takes 2.40 seconds
INFO 03-30 12:29:58 worker.py:241] the current vLLM instance can use total_gpu_memory (44.34GiB) x gpu_memory_utilization (0.90) = 39.91GiB
INFO 03-30 12:29:58 worker.py:241] model weights take 5.79GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 2.52GiB; the rest of the memory reserved for KV Cache is 31.45GiB.
INFO 03-30 12:29:59 gpu_executor.py:76] # GPU blocks: 57251, # CPU blocks: 7281
INFO 03-30 12:29:59 gpu_executor.py:80] Maximum concurrency for 32768 tokens per request: 27.95x
INFO 03-30 12:30:02 model_runner.py:1413] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-30 12:30:02 model_runner.py:1417] If out-of-memory error occurs during cudagraph captu

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]


SEIMEI.expert_classes:  [<class 'Answer.Answer'>, <class 'CheckInf.CheckInf'>, <class 'MetaSurvey.MetaSurvey'>, <class 'CollectCodeFileToModify.CollectCodeFileToModify'>]



In [None]:
# Single query: the question text plus the path of the code base it refers to.
original_question = "How to implement a new equilibrium state called Miller equilibrium into gyro-kinetic vlasov simulation?"
queries = [
    {
        "query":original_question,
        "doc_path":"/gkv-code",
    }
]

# Top-level await: relies on the notebook's running event loop.
final_answer = await seimei.get_answer(queries = queries) # return final answer

# Two blank lines separate the answer from the expert progress output above it.
print()
print()
print(final_answer)


Expert <class 'SEIMEI.Experts'> started


Expert <class 'SEIMEI.SpecificExperts'> started


Expert <class 'SEIMEI.Search'> started


Expert <class 'SEIMEI.PermanentExperts'> started


Expert <class 'MetaSurvey.MetaSurvey'> started

<class 'SEIMEI.Experts'>
{'query': 'How to implement a new equilibrium state called Miller equilibrium into gyro-kinetic vlasov simulation?', 'doc_path': '/gkv-code'}

Expert <class 'QuickSummary.QuickSummary'> started

QuickSummary prompt num token:  5166
QuickSummary prompt num token:  835
QuickSummary prompt num token:  4327
QuickSummary prompt num token:  4815
QuickSummary prompt num token:  2699
QuickSummary prompt num token:  185


Token indices sequence length is longer than the specified maximum sequence length for this model (150445 > 131072). Running this sequence through the model will result in indexing errors


QuickSummary prompt num token:  8625


Token indices sequence length is longer than the specified maximum sequence length for this model (187652 > 131072). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (187652 > 131072). Running this sequence through the model will result in indexing errors


QuickSummary prompt num token:  7799
QuickSummary prompt num token:  8977
QuickSummary prompt num token:  7707
QuickSummary prompt num token:  5280
QuickSummary prompt num token:  7341
QuickSummary prompt num token:  5509
QuickSummary prompt num token:  4273
QuickSummary prompt num token:  4608
QuickSummary prompt num token:  4708
QuickSummary prompt num token:  3369
QuickSummary prompt num token:  4927
QuickSummary prompt num token:  6132
QuickSummary prompt num token:  4330
QuickSummary prompt num token:  7788
QuickSummary prompt num token:  10237
QuickSummary prompt num token:  10238
QuickSummary prompt num token:  10239
QuickSummary prompt num token:  10238
QuickSummary prompt num token:  10237
QuickSummary prompt num token:  10239
QuickSummary prompt num token:  9087
QuickSummary prompt num token:  2196

Expert <class 'SEIMEI.PermanentExpert'> started


Expert <class 'CheckInf.CheckInf'> started



Expert <class 'CheckInf.CheckInf'> ended

result: None



Expert <class 'SEIMEI.Per

Traceback (most recent call last):
  File "/workspace/SEIMEI6-2/Experts/SWE/QuickSummary.py", line 115, in inference
    id = int(id_text)
ValueError: invalid literal for int() with base 10: 'file id: 0'
Traceback (most recent call last):
  File "/workspace/SEIMEI6-2/Experts/SWE/QuickSummary.py", line 115, in inference
    id = int(id_text)
ValueError: invalid literal for int() with base 10: 'file id: 1'
Traceback (most recent call last):
  File "/workspace/SEIMEI6-2/Experts/SWE/QuickSummary.py", line 115, in inference
    id = int(id_text)
ValueError: invalid literal for int() with base 10: 'file id: 2'
Traceback (most recent call last):
  File "/workspace/SEIMEI6-2/Experts/SWE/QuickSummary.py", line 115, in inference
    id = int(id_text)
ValueError: invalid literal for int() with base 10: 'file id: 3'
Traceback (most recent call last):
  File "/workspace/SEIMEI6-2/Experts/SWE/QuickSummary.py", line 117, in inference
    data["file_path"] = survey_paths_list[i][id]
KeyError: 1



Expert <class 'CollectCodeFileToModify.CollectCodeFileToModify'> started



Expert <class 'CollectCodeFileToModify.CollectCodeFileToModify'> ended

result: None



Expert <class 'CheckInf.CheckInf'> started



Expert <class 'CheckInf.CheckInf'> ended

result: None


INFO 03-30 12:34:56 metrics.py:467] Avg prompt throughput: 1249.8 tokens/s, Avg generation throughput: 72.8 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.7%, CPU KV cache usage: 0.0%.
INFO 03-30 12:35:01 metrics.py:467] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 66.9 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.7%, CPU KV cache usage: 0.0%.

Expert <class 'CheckInf.CheckInf'> started



Expert <class 'CheckInf.CheckInf'> ended

result: None


INFO 03-30 12:35:06 metrics.py:467] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 66.7 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.8

## Kaggle/AIMO2

In [None]:
# Run configuration for the AIME evaluation below.
explanation = "deepseek-r1-qwen14b/aime"          # free-text run label, stored in the log under "info"
progress_save_dir = "math-aime-progress_log3"     # passed as save_dir when the requests are processed
save_path = "aime_log1.json"
max_request = 40                                  # size argument given to AllRequests
num_sample = 5                                    # how many prompts are generated per problem
ids = list(range(num_problems))                   # evaluate every loaded problem
#ids = [0,1,4,5,6,7]

In [None]:
import json, re, os

# Prepare variables

# Per-problem log skeleton, keyed by str(problem_id); run-level metadata is
# stored next to the per-problem entries under "num_sample", "num_problems", "info".
# log : { str(problem_id): { "prompts": {str(sample_id): prompt, ...}, "outputs": {}, "final_answers": {}, "corrects": {} }, ... }
log = {str(id):{"prompts":{}, "outputs":{}, "final_answers":{}, "corrects":{}} for id in ids}
log["num_sample"] = num_sample
log["num_problems"] = num_problems
log["info"] = explanation

# The tokenizer is used here only to render chat messages into prompt strings.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("./deepseek_r1_qwen14b", padding_side='left')

# Earlier dict-based layout, kept for reference:
# all_requests = {str(request_id): requests}
# requests = [{"prompt": , ... }, ... ]
# Ex. all_requests = {"0":[{"prompt":, "problem_id":,}, ...] }
#all_requests = {str(i):[] for i in range(max_request)}
# NOTE(review): AllRequests is defined elsewhere in the notebook; how it
# distributes requests over max_request is not visible here.
all_requests = AllRequests(max_request)

# Prepare all_requests
#request_id = 0
# The sample loop is the OUTER loop on purpose: every problem is queued once
# before any problem is queued a second time, so rough results across all
# problems appear early.
# NOTE(review): the loop variable `id` shadows the builtin id() at module level.
for i in range(num_sample):  # To see the rough result quickly, it'd better process problems with different ids first. That's why loop for sample comes before one for ids.
    for id in ids:
        messages = [
            {"role": "user", "content": f"""Please answer to the problem, and put your final answer within \\boxed{{}}. If final answer is a number larger than 1000, take module 1000.

Problem: {problems[id]}"""}
        ]

        # Alternative prompt layout with the instruction in a system message (disabled):
        #messages = [
        #    {"role": "system", "content": f"""Please answer to the problem, and put your final answer within \\boxed{{}}. If final answer is a number larger than 1000, take module 1000."""},
        #    {"role": "user", "content": f"""Problem: {problems[id]}"""}
        #]
        
        # Render to a plain string (tokenize=False) ending with the generation prompt.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        
        # problem_id / sample_id travel with the prompt so results can be attributed later.
        request_dict = {"problem_id":id, "sample_id":i, "prompt":prompt}
        all_requests.add(request_dict)
        # Manual round-robin used with the dict-based layout (disabled):
        #all_requests[str(request_id)].append(request_dict)
        #request_id += 1
        #if request_id == max_request:
        #    request_id = 0

# Define get_result (Optional): If you want to evaluate the output somehow while running process_requests, you can define get_result function and pass it to process_requests method. This will let you read log files easily with some evaluation.
def get_result(request_dict, save_dir):
    """Score one finished request and update the running tally in `save_dir`.

    Args:
        request_dict: dict with at least "problem_id" and "output".
        save_dir: directory holding the progress log; the tally is kept in
            `{save_dir}/score.json` as {"num_answered": {...}, "num_correct": {...}},
            both keyed by str(problem_id).

    Returns:
        The same `request_dict`, with "final_answer" (int or None) and
        "is_correct" (bool) added.

    The first `\\boxed{<digits>}` in the output is taken as the answer and is
    compared with the module-level `correct_answers[problem_id]`.
    """
    problem_id = request_dict["problem_id"]
    output = request_dict["output"]
    score_key = str(problem_id)
    score_path = f"{save_dir}/score.json"

    # Resume from the existing tally file, or start an empty one on the first call.
    if os.path.exists(score_path):
        with open(score_path) as score_file:
            scores = json.load(score_file)
    else:
        scores = {"num_answered":{}, "num_correct":{}}

    if score_key in scores["num_answered"]:
        answered_so_far = scores["num_answered"][score_key]
        correct_so_far = scores["num_correct"][score_key]
    else:
        answered_so_far = 0
        correct_so_far = 0

    # Only non-negative integers inside \boxed{...} are recognised.
    boxed = re.findall(r'\\boxed{(\d+)}', output)
    final_answer = int(boxed[0]) if boxed else None
    is_correct = final_answer is not None and bool(correct_answers[problem_id] == final_answer)
    if is_correct:
        correct_so_far += 1

    request_dict["final_answer"] = final_answer
    request_dict["is_correct"] = is_correct

    scores["num_answered"][score_key] = answered_so_far + 1
    scores["num_correct"][score_key] = correct_so_far

    with open(score_path, "w") as score_file:
        json.dump(scores, score_file)

    return request_dict


# Process all_requests
# Set restart = True to resume a previous run that was interrupted, reusing what is already in save_dir.
# (Older call style, kept for reference:)
#all_results = await asyncio.gather(
#    *[process_requests(all_requests[request_id_str], int(request_id_str), max_tokens = 10000, save_dir = progress_save_dir, restart = False, get_result = get_result) for request_id_str in all_requests]
#)
all_results = await all_requests.process(model = "deepseek_r1_qwen14b", max_tokens = 15000, save_dir = progress_save_dir, restart = True, get_result = get_result)

# Check if the outputs are correct and file every result into `log`.
# This repeats the answer extraction done in get_result, so the final log does not depend on get_result having run.
for results_dict in all_results:

    problem_id = results_dict["problem_id"]
    sample_id = results_dict["sample_id"]
    prompt = results_dict["prompt"]
    output = results_dict["output"]

    is_correct = False
    pattern = r'\\boxed{(\d+)}'
    matches = re.findall(pattern, output)
    if matches == []:
        final_answer = None
    else:
        # NOTE(review): the FIRST boxed integer is used; if the model boxes intermediate values, the last one may be the intended answer — confirm.
        final_answer = int(matches[0])
        if correct_answers[problem_id] == final_answer:
            is_correct = True

    log[str(problem_id)]["prompts"][str(sample_id)] = prompt
    log[str(problem_id)]["outputs"][str(sample_id)] = output
    log[str(problem_id)]["final_answers"][str(sample_id)] = final_answer
    log[str(problem_id)]["corrects"][str(sample_id)] = is_correct


# Calculate Evaluation Scores
# mv_score counts the problems whose most frequent extracted answer (majority vote over samples) is correct.
from collections import Counter
mv_score = 0
for problem_id in ids:
    num_correct = 0
    problem_id_str = str(problem_id)
    for sample_id_str in log[problem_id_str]["corrects"]:
        if log[problem_id_str]["corrects"][sample_id_str]: num_correct += 1
    log[problem_id_str]["num_correct"] = num_correct

    # Samples without an extracted answer (None) do not take part in the vote.
    filtered_numbers = [log[problem_id_str]["final_answers"][sample_id_str] for sample_id_str in log[problem_id_str]["final_answers"] if log[problem_id_str]["final_answers"][sample_id_str] is not None]
    if filtered_numbers == []: continue
    counter = Counter(filtered_numbers)
    most_common_element, count = counter.most_common(1)[0]
    if correct_answers[int(problem_id_str)] == most_common_element:
        mv_score += 1

# mv_score is stored as a raw count of problems, not a ratio.
log["mv_score"] = mv_score
with open(save_path, "w") as json_file:
    json.dump(log, json_file)

print()
print("-- ALL FINISHED --")


## BigCodeBench

In [None]:
# Run configuration for the BigCodeBench evaluation.
# NOTE(review): `num_problems` is not assigned in this cell (both assignments are commented out),
# so it must already exist in the kernel from an earlier cell — this breaks Restart & Run All.
#ids = [i for i in range(10)]
#num_problems = 100
ids = [i for i in range(num_problems)]  # problem ids 0 .. num_problems-1
num_sample = 1  # samples generated per problem
max_request = 15  # passed to AllRequests(...) below
progress_save_dir = "progress_log_solve4"  # directory for the in-progress request logs
save_path = "code-log7.json"  # final JSON log of this run
explanation = "deepseek_r1_qwen14b/bigcodebench2 for code test with small number of samples"
#num_problems = len(ids)

In [None]:
import json, re, os

# Prepare Variables

# log : { str(problem_id): { "prompts": {str(sample_id): prompt, ...}, "outputs": {}, "final_answers": {}, "corrects": {},
#                            "output_code": {}, "errors": {}, "tracebacks": {} }, ... }
# plus the top-level run metadata keys "num_sample", "num_problems" and "info".
log = {str(id):{"prompts":{}, "outputs":{}, "final_answers":{}, "corrects":{}, "output_code":{}, "errors":{}, "tracebacks":{}} for id in ids}
log["num_sample"] = num_sample
log["num_problems"] = num_problems
log["info"] = explanation


def extract_text_inside_backticks(text, arbitrary_text):
    """Return the body of the first ``` fence in `text` that is opened with `arbitrary_text`.

    Args:
        text: string to search.
        arbitrary_text: tag expected directly after the opening backticks
            (e.g. "python"); it is matched literally, not as a regex.

    Returns:
        The fenced content with surrounding whitespace stripped, or None when
        no such fence exists. With an empty tag any fence matches, and a
        language tag on that fence is then returned as part of the content.
    """
    fence = r'```{}\s*([\s\S]*?)\s*```'.format(re.escape(arbitrary_text))
    found = re.search(fence, text)
    return found.group(1).strip() if found else None

'''
def check_output(output, test_code): # this didn't work because exec behaves differently from running it in jupyter notebook
    run_code = """
import inspect
def check_code():
    try:
        sub_obj = TestCases()
        for name, attribute in TestCases.__dict__.items():
            if not name.startswith('__') and not name.startswith('_') and callable(attribute):
                attribute(sub_obj)
    except Exception as e:
        return False
    return True

is_code_correct__ = check_code()"""

    output_code = extract_text_inside_backticks(output, "python")
    if not output_code: output_code = extract_text_inside_backticks(output, "")

    if not output_code: return False, test_code + run_code

    code = output_code + "\n\n\n" + test_code + run_code

    local_vars = {}
    global_vars = {}

    try:
        exec(test_code, global_vars, local_vars)
        is_code_correct = local_vars['is_code_correct__']
    except Exception as e:
        return False, code

    return is_code_correct, code


# Define get_result (Optional): If you want to evaluate the output somehow while running process_requests, you can define get_result function and pass it to process_requests method. This will let you read log files easily with some evaluation.
def get_result(request_dict, save_dir):
    # request_dict: {"prompt":, "output":}
    # save_dir: this is a directory path for progress log. put your evaluation file in here

    problem_id = request_dict["problem_id"]
    output = request_dict["output"]
    test_code = request_dict["test_code"]
    
    eval_file_path = f"{save_dir}/score.json"

    if os.path.exists(eval_file_path):
        with open(eval_file_path) as f:
            log = json.load(f)
        if str(problem_id) in log["num_answered"]:
            num_answered = log["num_answered"][str(problem_id)]
            num_correct = log["num_correct"][str(problem_id)]
        else:
            num_answered = 0
            num_correct = 0
    else:
        log = {"num_answered":{}, "num_correct":{}}
        num_answered = 0
        num_correct = 0

    is_code_correct, code = check_output(output, test_code)
    if is_code_correct: num_correct += 1

    request_dict["is_code_correct"] = is_code_correct

    num_answered += 1
    log["num_answered"][str(problem_id)] = num_answered
    log["num_correct"][str(problem_id)] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(log, f)

    return request_dict
'''

# Earlier layout, kept for reference (superseded by the AllRequests container):
# all_requests = {str(request_id): requests}
# requests = [{"prompt": , ... }, ... ]
# Ex. all_requests = {"0":[{"prompt":, "problem_id":,}, ...] }
#all_requests = {str(i):[] for i in range(max_request)}
all_requests = AllRequests(max_request)

# Prepare all_requests: one request per (sample, problem) pair.
#request_id = 0
for i in range(num_sample):  # The sample loop is the outer loop so that every problem gets one sample before any problem gets a second one; this gives a rough overall result early.
    for id in ids:  # NOTE(review): `id` shadows the builtin for the rest of the kernel session.
        messages = [
            {"role": "user", "content": f"""{instruct_prompt[id]}"""}
        ]
        # Render the chat turn to a plain prompt string ending with the generation prompt.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        test_code = test[id]

        # The test code rides along with the request so it is available next to the output when scoring.
        request_dict = {"problem_id":id, "sample_id":i, "prompt":prompt, "test_code":test_code}
        all_requests.add(request_dict)

# No get_result callback is passed here; scoring happens after all outputs are collected.
all_results = await all_requests.process(max_tokens = 10000, save_dir = progress_save_dir, restart = True)

# Collect the generated code per problem, plus the matching test code.
# `candidates[p]` and `test_cases[p]` must describe the SAME problem p, because the
# evaluator pairs predictions and references purely by list position.
candidates = [[] for _ in range(num_problems)]
test_cases_dict = {}
# Walk the results in (problem, sample) order rather than arrival order, so that the
# position of a candidate inside candidates[p] equals its sample_id.
for results_dict in sorted(all_results, key=lambda r: (r["problem_id"], r["sample_id"])):
    problem_id = results_dict["problem_id"]
    sample_id = results_dict["sample_id"]
    prompt = results_dict["prompt"]
    output = results_dict["output"]
    test_code = results_dict["test_code"]

    log[str(problem_id)]["prompts"][str(sample_id)] = prompt
    log[str(problem_id)]["outputs"][str(sample_id)] = output

    # Prefer a ```python fence, fall back to any fence, and finally to "" (no code found).
    output_code = extract_text_inside_backticks(output, "python")
    if not output_code: output_code = extract_text_inside_backticks(output, "")
    if not output_code: output_code = ""

    log[str(problem_id)]["output_code"][str(sample_id)] = output_code

    if problem_id not in test_cases_dict:
        test_cases_dict[problem_id] = test_code
    candidates[problem_id].append(output_code)

# Index test_cases by problem id, exactly like `candidates`. Previously this list was
# filled in result-arrival order, which pairs code with the wrong tests whenever the
# results do not arrive sorted by problem id. A problem without any result gets an
# empty placeholder; it has no candidates, so it is never executed.
test_cases = [test_cases_dict.get(problem_id, "") for problem_id in range(num_problems)]

import os, time
# Opt-in flag required by the Hugging Face code_eval metric before it will execute code.
# NOTE(review): this runs model-generated code on this machine — use a sandboxed environment.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
#from evaluate import load
# Load code evaluation metric
#code_eval_metric = load("code_eval")

# Locally modified code_eval that additionally returns the traceback of a failing test.
from code_eval.code_eval import CodeEval
code_eval_metric = CodeEval()
# Compute pass@k
k_values = [1]
print("Evaluating generated code...")
start = time.time()
# references[p] is the test code for problem p and predictions[p] its list of candidate programs.
pass_at_k, results = code_eval_metric._compute(
    references=test_cases,
    predictions=candidates,
    k=k_values,
    num_workers=18,  # Adjust based on your system
    timeout=100.0,   # Adjust the timeout as needed
)
end = time.time()
print("calculation time(s): ", end-start)

# Print the results
#for k in k_values:
#    print(f"Pass@{k}: {pass_at_k[f'pass@{k}'] * 100:.2f}%")
    #log[f"Pass@{k}"] = pass_at_k[f'pass@{k}']

# Turn the evaluator output into per-sample verdicts in `log` and an overall pass@1.
total_num_correct = 0
total_num_problem = 0  # counts evaluated SAMPLES (name kept for compatibility)
num_correct_dict = {}
no_code_message = "failed: there is no code included in the answer"
for problem_id in range(len(results)):
    num_correct = 0
    for sample_id in range(len(results[problem_id])):
        outcome = results[problem_id][sample_id][1]
        is_correct = outcome["passed"]
        # The evaluator reports "passed" for an empty program; an answer without code is incorrect.
        if candidates[problem_id][sample_id]=="": is_correct=False
        log[str(problem_id)]["corrects"][str(sample_id)] = is_correct
        if not is_correct:
            try:  # normal case: the result carries structured error information
                error = outcome["result"]["error"]
                traceback_ = outcome["result"]["traceback"]
            except (KeyError, TypeError):
                # No structured error available (seen when output_code == "").
                # Only these two exception types are expected here; anything else is a real bug and should surface.
                error = no_code_message
                traceback_ = no_code_message
            log[str(problem_id)]["errors"][str(sample_id)] = error
            log[str(problem_id)]["tracebacks"][str(sample_id)] = traceback_
        if is_correct: num_correct += 1
    log[str(problem_id)]["num_correct"] = num_correct
    num_correct_dict[str(problem_id)] = num_correct
    total_num_correct+=num_correct
    total_num_problem+=len(results[problem_id])

log["num_correct_dict"] = num_correct_dict
# Guard against an empty evaluation so the log is still written.
log["pass1"] = total_num_correct/total_num_problem if total_num_problem else 0.0
with open(save_path, "w") as f:
    json.dump(log, f)

print()
print("-- ALL FINISHED --")


In [None]:
# Re-score the last evaluation without re-running it (uses `results` and `candidates` from the cell above).
total_num_correct = 0
total_num_problem = 0  # counts evaluated SAMPLES (name kept for compatibility)
num_correct_dict = {}
no_code_message = "failed: there is no code included in the answer"
for problem_id in range(len(results)):
    num_correct = 0
    for sample_id in range(len(results[problem_id])):
        outcome = results[problem_id][sample_id][1]
        is_correct = outcome["passed"]
        # The evaluator reports "passed" for an empty program; an answer without code is incorrect.
        if candidates[problem_id][sample_id]=="": is_correct=False
        log[str(problem_id)]["corrects"][str(sample_id)] = is_correct
        if not is_correct:
            # Same fallback as the scoring cell above: samples without structured error
            # information (e.g. empty code) previously crashed this cell.
            try:
                error = outcome["result"]["error"]
                traceback_ = outcome["result"]["traceback"]
            except (KeyError, TypeError):
                error = no_code_message
                traceback_ = no_code_message
            log[str(problem_id)]["errors"][str(sample_id)] = error
            log[str(problem_id)]["tracebacks"][str(sample_id)] = traceback_

        if is_correct: num_correct += 1
    log[str(problem_id)]["num_correct"] = num_correct
    num_correct_dict[str(problem_id)] = num_correct
    total_num_correct+=num_correct
    total_num_problem+=len(results[problem_id])

log["num_correct_dict"] = num_correct_dict
# Guard against an empty evaluation so the log is still written.
log["pass1"] = total_num_correct/total_num_problem if total_num_problem else 0.0
with open(save_path, "w") as f:
    json.dump(log, f)

print()
print("-- ALL FINISHED --")


# New Section

# Correction 3 (with many iterations)

## bigcodebench

In [None]:
# Run configuration for the iterative correction experiment (BigCodeBench).
num_iteration = 10  # number of advise-then-retry rounds
max_request = 50  # max_request to AsyncEngine
load_file = "code-log5.json"  # log of the initial solve run that gets corrected
save_file = "code-log5-corr1.json"  # log extended with the correction tree
save_dir_base = "progless_log9"  # parent directory for the per-round progress logs

In [None]:
import os, json, re, traceback
from transformers import AutoTokenizer

# Load the log of the initial run; the correction tree is grown inside this object.
with open(load_file) as json_file:
    log = json.load(json_file)

total_num_problem = log["num_problems"]
total_num_sample = log["num_sample"]*log["num_problems"]

if not os.path.exists(save_dir_base):
    os.makedirs(save_dir_base)

# advice_result_log: {str(iteration): {"num_problem": {str(problem_id): count}, "num_correct": {str(problem_id): count}, "pass@1": float, "passAll": float}}
advice_result_log = {}

def add_numbers_to_lines(text):
    """Number the non-blank paragraphs of `text`, starting at 1.

    Paragraphs are the pieces obtained by splitting on a blank line ('\\n\\n');
    pieces that contain only whitespace are dropped and do not consume a number.

    Returns:
        (numbered_lines, numbered_text) where numbered_lines is a list of
        (number, paragraph) tuples and numbered_text is the paragraphs
        re-joined with '\\n\\n', each prefixed by "<number>. ".
    """
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]
    numbered_lines = list(enumerate(paragraphs, start=1))
    numbered_text = '\n\n'.join(f"{idx}. {chunk}" for idx, chunk in numbered_lines)
    return numbered_lines, numbered_text


def get_text_before_number(numbered_lines, number):
    """Return the text of all entries that precede the entry numbered `number`.

    Args:
        numbered_lines: list of (number, text) tuples.
        number: number of the entry to stop at (exclusive).

    Returns:
        The preceding texts joined with '\\n\\n', or "" when `number` is the
        first entry or is not present at all.
    """
    preceding = []
    for num, line in numbered_lines:
        if num == number:
            return '\n\n'.join(preceding)
        preceding.append(line)

    # Number not found.
    return ""


def extract_text_inside_backticks(text, arbitrary_text):
    """Extract the content of the first ``` fence that starts with `arbitrary_text`.

    `arbitrary_text` is matched literally right after the opening backticks.
    Returns the stripped content, or None if `text` contains no such fence.
    An empty `arbitrary_text` matches any fence; a language tag on that fence
    then ends up inside the returned content.
    """
    matcher = re.compile("```" + re.escape(arbitrary_text) + r"\s*([\s\S]*?)\s*```")
    hit = matcher.search(text)
    if hit is None:
        return None
    return hit.group(1).strip()


# Define get_result (optional): if you want to evaluate each output while the requests are still being processed, define a get_result function and pass it to the processing call. It keeps a running score file in the progress-log directory, so partial results are easy to read during a run.
def get_result(request_dict, save_dir):
    """Score one finished request by its boxed integer answer and update the tally file.

    Args:
        request_dict: dict with "log_ids" (its first element is the problem id)
            and "output".
        save_dir: progress-log directory; the tally lives in
            `{save_dir}/score.json` as {"num_answered": {...}, "num_correct": {...}},
            both keyed by str(problem_id).

    Returns:
        The same `request_dict`, with "final_answer" (int or None) and
        "is_correct" (bool) added.

    NOTE(review): this scores `\\boxed{<digits>}` answers against the
    module-level `correct_answers`, which looks carried over from the math
    benchmark; the visible code in this section does not pass it to any
    processing call — confirm whether it is still needed here.
    """
    problem_id = int(request_dict["log_ids"][0])
    output = request_dict["output"]
    eval_file_path = f"{save_dir}/score.json"
    tally_key = str(problem_id)

    # Start a new tally on the first call, otherwise continue the stored one.
    if not os.path.exists(eval_file_path):
        tallies = {"num_answered":{}, "num_correct":{}}
    else:
        with open(eval_file_path) as f:
            tallies = json.load(f)

    num_answered, num_correct = 0, 0
    if tally_key in tallies["num_answered"]:
        num_answered = tallies["num_answered"][tally_key]
        num_correct = tallies["num_correct"][tally_key]

    # The first boxed non-negative integer is taken as the answer.
    final_answer, is_correct = None, False
    matches = re.findall(r'\\boxed{(\d+)}', output)
    if matches:
        final_answer = int(matches[0])
        if correct_answers[problem_id] == final_answer:
            is_correct = True
            num_correct += 1

    request_dict["final_answer"] = final_answer
    request_dict["is_correct"] = is_correct

    tallies["num_answered"][tally_key] = num_answered + 1
    tallies["num_correct"][tally_key] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(tallies, f)

    return request_dict




def get_log_dict(log, log_ids):
    """Return the node of the nested `log` tree addressed by `log_ids`.

    Args:
        log: top-level log dict, keyed by str(problem_id).
        log_ids: path into the tree; the first element selects the problem and
            every further element selects an entry of the current node's
            "children" dict. Elements are converted with str() before lookup.

    Returns:
        `log` itself for an empty path, otherwise the addressed node (the
        original object, so edits to it are visible in `log`).

    Raises:
        KeyError: if any step of the path does not exist.

    The path is only read: the previous implementation popped the first
    element off the caller's list, silently shortening it.
    """
    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict


def get_edit_log_dict(log_ids):
    """Return the node of the module-level `log` tree addressed by `log_ids`, for in-place editing.

    Args:
        log_ids: path into the tree; the first element selects the problem and
            every further element selects an entry of the current node's
            "children" dict. Elements are converted with str() before lookup.

    Returns:
        The global `log` for an empty path, otherwise the addressed node (the
        original object, so edits to it are visible in `log`).

    Raises:
        KeyError: if any step of the path does not exist.

    The path is only read: the previous implementation popped the first
    element off the caller's list. `log` is only read here as well, so no
    `global` declaration is needed.
    """
    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict



# One correction round per pass: an advice stage followed by a retry (solve) stage.
# NOTE(review): `iter` shadows the builtin of the same name for the rest of the kernel session.
for iter in range(num_iteration):

    # Requests for the advice stage of this round.
    all_requests1 = AllRequests(max_request)
    
    # Make all_request for advice by searching log recursively
    def search_log(log_dict, log_ids):  # node_id: str
        global all_requests, request_id
        node_id = log_ids[-1]

        if type(log_dict[node_id]) != dict:
            return None
            
        if "children" in log_dict[node_id]:
            for next_node_id in log_dict[node_id]["children"]:
                search_log(log_dict[node_id]["children"], log_ids + [next_node_id])
        elif "corrects" in log_dict[node_id]:
            if iter!=0:
                all_false = True
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if log_dict[node_id]["corrects"][sample_id_str]:
                        all_false = False
                        break
                    
                if all_false:
                    problem_id_str = log_ids[0]
                    problem_id = int(problem_id_str)
                    pre_prompt = log_dict[node_id]["prompts"]["0"]
                    pre_output = log_dict[node_id]["outputs"]["0"]
                    error = log_dict[node_id]["errors"]["0"]
                    traceback_ = log_dict[node_id]["tracebacks"]["0"]
                    #student_answer = prompt.split("<｜Assistant｜>")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                    #numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)

                    messages = [
                        {"role": "user", "content": f"""### Problem:
'''
{instruct_prompt[problem_id]}
'''


### Correct Solution:
'''
{canonical_solution[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


### Test Code and Its Error
'''
```
{test[problem_id]}
```

{traceback_}
'''


You are an advanced language model tasked with analyzing a student’s answer to a coding problems and make some instructions to lead him to the correct solution. You are given the coding problem, the correct solution of it, a student’s incorrect answer, test code of the answer code and error cause of the student's incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instruction; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    "Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                    ]
                    
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    
                    new_log_ids = log_ids+["0"]
                    request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                    all_requests1.add(request_dict)
                    
            else:
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if not log_dict[node_id]["corrects"][sample_id_str]:
        
                        problem_id_str = log_ids[0]
                        problem_id = int(problem_id_str)
                        pre_prompt = log_dict[node_id]["prompts"][sample_id_str]
                        pre_output = log_dict[node_id]["outputs"][sample_id_str]
                        error = log_dict[node_id]["errors"][sample_id_str]
                        traceback_ = log_dict[node_id]["tracebacks"][sample_id_str]
                        #student_answer = prompt.split("<｜Assistant｜>")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                        #numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)
    
                        messages = [
                            {"role": "user", "content": f"""### Problem:
'''
{instruct_prompt[problem_id]}
'''


### Correct Solution:
'''
{canonical_solution[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


### Test Code and Its Error
'''
```
{test[problem_id]}
```

{traceback_}
'''


You are an advanced language model tasked with analyzing a student’s answer to a coding problems and make some instructions to lead him to the correct solution. You are given the coding problem, the correct solution of it, a student’s incorrect answer, test code of the answer code and error cause of the student's incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instruction; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    "Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                        ]
                        
                        prompt = tokenizer.apply_chat_template(
                            messages,
                            tokenize=False,
                            add_generation_prompt=True
                        )
                        
                        new_log_ids = log_ids+[sample_id_str]
                        request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                        all_requests1.add(request_dict)
    
    # Start one search per top-level key of `log`; keys whose value is not a dict are ignored inside search_log.
    for problem_id_str in log:
        search_log(log, [problem_id_str])


    # Process all_requests
    # Set restart = True to resume a previous run that was interrupted, reusing what is already in save_dir.
    all_results1 = await all_requests1.process(max_tokens = 15000, restart = True, save_dir=f"{save_dir_base}/inst-{iter}")
    # Requests for the retry (solve) stage of this round.
    all_requests2 = AllRequests(max_request)

    error_num = 0
    num_results = len(all_results1)
    for result_dict in all_results1:
        log_ids = result_dict["log_ids"]
        prompt = result_dict["prompt"]
        output = result_dict["output"]
        pre_prompt = result_dict["pre_prompt"]
        pre_output = result_dict["pre_output"]
        instruction_log = prompt + output

        missing_idea = extract_text_inside_backticks(output, "idea")
        instruction_list_text = extract_text_inside_backticks(output, "instructions")
        
        if missing_idea and instruction_list_text:
            try:
                instruction_list_text = instruction_list_text.replace("\n","")
                instruction_list_text = instruction_list_text.replace("\\","")
                pattern = r"'((?:[^']|'(?!\s*[,\]]))*)'"
                replacement = r'"\1"'
                #instruction_list_text = re.sub(pattern, replacement, instruction_list_text)  # convert ['I'm a cat', 'This is the student's car',] into ["I'm a cat", "This is the student's car",]
                instruction_list = json.loads(instruction_list_text)
            except Exception as e:
                traceback.print_exc()
                error_num += 1
                print()
                print("An error occurred:", e)
                print("instruction_list_text: ", instruction_list_text)
                print("error_num: ", error_num)
                continue
    
            problem_id = int(log_ids[0])
            problem = instruct_prompt[problem_id]

            prompts_dict = {}
            insert_ids=[]
            instruction_ids=[]
            prompt_id = 0
            for instruction_id, instruction in enumerate(instruction_list):
                modified_insts = instruction_list[:instruction_id] + instruction_list[(instruction_id+1):]
                advices_text = ""
                for i, inst in enumerate(modified_insts):
                    advices_text += f"\nAdvice{i+1}: {inst}"
                    
                messages = [
                    {"role": "user", "content": pre_prompt[29:][:-13]},
                    {"role": "assistant", "content": pre_output},
                    {"role": "user", "content": f"""Your answer might contain some errors. Please revise your answer following the advice below;
{advices_text}"""}
                ]
                    
                new_prompt = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                prompts_dict[str(prompt_id)] = new_prompt

                next_request = {"log_ids":log_ids+[str(instruction_id)], "prompt":new_prompt, "instruction_log":[instruction_log]}
                all_requests2.add(next_request)
                
            edit_log_dict = get_edit_log_dict(log_ids[:-1])
            if "children" in edit_log_dict:
                edit_log_dict["children"][str(log_ids[-1])] = {"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}
            else:
                edit_log_dict["children"] = {str(log_ids[-1]):{"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}}


    # Checkpoint the log (now including this round's advice) before the long retry stage.
    with open(save_file, "w") as f:
        json.dump(log, f)

    print("log saved")

    # Process all_requests
    # Set restart = True to resume a previous run that was interrupted, reusing what is already in save_dir.
    all_results2 = await all_requests2.process(max_tokens = 15000, restart = True, save_dir=f"{save_dir_base}/solve-{iter}")

    # Fresh per-round counters, both keyed by str(problem_id).
    advice_result_log[str(iter)] = {"num_problem":{}, "num_correct":{},}
    # Parallel lists: position k in each of them describes the k-th retry answer.
    test_cases, candidates = [], []
    problem_ids, sample_ids, log_ids_list = [], [], []
    # File every retry answer into the log tree and line it up for evaluation.
    for results_dict in all_results2:
        log_ids = results_dict["log_ids"]
        prompt = results_dict["prompt"]
        output = results_dict["output"]
        problem_id = int(log_ids[0])
        sample_id = int(log_ids[-1])
        test_code = test[problem_id]

        # Prefer a ```python fence, then any fence, and finally "" when no code was found.
        output_code = (
            extract_text_inside_backticks(output, "python")
            or extract_text_inside_backticks(output, "")
            or ""
        )

        # The answer is stored on the parent node, under the retry's own id.
        edit_log_dict = get_edit_log_dict(log_ids[:-1])
        for field, value in (("prompts", prompt), ("outputs", output), ("output_codes", output_code)):
            edit_log_dict.setdefault(field, {})[str(sample_id)] = value

        # One evaluation task per answer: a single candidate against its problem's tests.
        test_cases.append(test_code)
        candidates.append([output_code])
        problem_ids.append(problem_id)
        sample_ids.append(sample_id)
        log_ids_list.append(log_ids)
    
    import os, time
    # Opt-in flag required by the Hugging Face code_eval metric before it will execute code.
    # NOTE(review): this runs model-generated code on this machine — use a sandboxed environment.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    # Locally modified code_eval; the scoring below reads "error" and "traceback" from its results.
    from code_eval.code_eval import CodeEval
    code_eval_metric = CodeEval()
    # Compute pass@k
    k_values = [1]
    print("Evaluating generated code...")
    start = time.time()
    # Task k pairs test_cases[k] with the single candidate in candidates[k].
    pass_at_k, results = code_eval_metric._compute(
        references=test_cases,
        predictions=candidates,
        k=k_values,
        num_workers=10,  # Adjust based on your system
        timeout=150.0,   # Adjust the timeout as needed
    )
    end = time.time()
    print("calculation time(s): ", end-start)
    
    # Record the verdict of every retry answer in the log tree and in this round's counters.
    no_code_message = "failed: there is no code included in the answer"
    for i in range(len(log_ids_list)):
        problem_id = problem_ids[i]
        sample_id = sample_ids[i]
        log_ids = log_ids_list[i]

        # Evaluation results are keyed by TASK index (the position in `candidates`),
        # not by problem id. Indexing with problem_id read another task's outcome,
        # or an empty list when that index did not exist.
        try:
            task_results = results[i]
        except (KeyError, IndexError):
            task_results = []

        error = ""
        traceback_ = ""
        if candidates[i] == [""]:
            # The evaluator reports "passed" for an empty program (see the scoring of the
            # initial run); an answer without code is incorrect.
            is_correct = False
            error = no_code_message
            traceback_ = no_code_message
        elif not task_results:
            # No outcome was returned for this task; count it as incorrect with empty error text.
            is_correct = False
        else:
            outcome = task_results[0][1]
            is_correct = outcome["passed"]
            if not is_correct:
                error = outcome["result"]["error"]
                traceback_ = outcome["result"]["traceback"]

        edit_log_dict = get_edit_log_dict(log_ids[:-1])
        edit_log_dict.setdefault("corrects", {})[str(sample_id)] = is_correct

        if not is_correct:
            edit_log_dict.setdefault("errors", {})[str(sample_id)] = error
            edit_log_dict.setdefault("tracebacks", {})[str(sample_id)] = traceback_

        # Per-problem counters for this round: correct answers and answers in total.
        problem_key = str(log_ids[0])
        round_log = advice_result_log[str(iter)]
        round_log["num_correct"][problem_key] = round_log["num_correct"].get(problem_key, 0) + (1 if is_correct else 0)
        round_log["num_problem"][problem_key] = round_log["num_problem"].get(problem_key, 0) + 1


    # Summarise this round: how many problems were retried and how many retries passed.
    round_log = advice_result_log[str(iter)]
    num_problem = len(round_log["num_problem"])             # problems retried in this round
    num_sample = sum(round_log["num_problem"].values())     # retry answers evaluated in this round
    pass1_count = sum(round_log["num_correct"].values())    # retry answers that passed
    passAll_count = sum(1 for count in round_log["num_correct"].values() if count > 0)  # problems with >= 1 passing retry

    # NOTE(review): len(all_results1) counts advice requests, which is one per failed SAMPLE
    # in the first round, so this equals "problems already correct" only when each failing
    # problem produced exactly one request — confirm the intended meaning.
    num_already_correct_problem = total_num_problem - len(all_results1)

    print("total_num_problem: ", total_num_problem)
    print("total_num_sample: ", total_num_sample)
    print("num_already_correct_problem: ", num_already_correct_problem)
    print(f"{passAll_count}/{num_problem} problems have got at least 1 correct sample in this iteration")
    print(f"{pass1_count}/{num_sample} samples were correct in total")

    # A round without any retry (nothing left to fix, or no parsable advice) must not
    # abort the run with a ZeroDivisionError before the log is saved.
    pass1 = pass1_count/num_sample if num_sample else 0.0
    passAll = passAll_count/num_problem if num_problem else 0.0

    advice_result_log[str(iter)]["pass@1"] = pass1
    advice_result_log[str(iter)]["passAll"] = passAll
    print("pass1 in this iteration: ", pass1)
    print("passAll in this iteration: ", passAll)


    log["advice_result_log"] = advice_result_log
    with open(save_file, "w") as json_file:
        json.dump(log, json_file)

print()
print("-- ALL FINISHED --")


## AIME

In [None]:
# Run configuration for the iterative correction experiment (AIME).
num_iteration = 10  # number of correction rounds
max_request = 40  # max_request to AsyncEngine
load_file = "aime_log1.json"  # log of the initial solve run that gets corrected
save_file = "aime_log1_advice1.json"  # log extended with the correction results
save_dir_base = f"{save_file[:-5]}-progless_log"  # save_file without its ".json" suffix, plus a fixed suffix

In [None]:
import os, json, re, traceback
from transformers import AutoTokenizer

# Load the log of the previous run; it is edited in place by the advice loop.
with open(load_file) as json_file:
    log = json.load(json_file)

# Totals taken from the loaded log, used only for the per-iteration report.
total_num_problem = log["num_problems"]
total_num_sample = log["num_sample"]*log["num_problems"]

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(save_dir_base, exist_ok=True)

# advice_result_log:
#   {str(iteration): {"num_problem": {str(problem_id): int},
#                     "num_correct": {str(problem_id): int},
#                     "pass@1": float, "passAll": float}}
advice_result_log = {}

def add_numbers_to_lines(text):
    """Number the non-blank paragraphs of ``text``.

    Paragraphs are the pieces obtained by splitting on a blank line
    (``'\\n\\n'``); pieces that are empty or whitespace-only are dropped and
    the remaining ones are numbered from 1.

    Returns:
        (numbered_lines, numbered_text) where ``numbered_lines`` is a list of
        ``(number, paragraph)`` tuples and ``numbered_text`` is the paragraphs
        rendered as ``"<number>. <paragraph>"`` joined by blank lines.
    """
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]
    numbered_lines = list(enumerate(paragraphs, start=1))
    numbered_text = '\n\n'.join(f"{index}. {chunk}" for index, chunk in numbered_lines)
    return numbered_lines, numbered_text


def get_text_before_number(numbered_lines, number):
    """Return the paragraphs that precede paragraph ``number``.

    ``numbered_lines`` is a list of ``(number, paragraph)`` tuples. The
    paragraphs located before the first entry whose number equals ``number``
    are joined with blank lines. An empty string is returned when ``number``
    does not occur.
    """
    position = next(
        (index for index, (num, _) in enumerate(numbered_lines) if num == number),
        None,
    )
    if position is None:
        return ""
    return '\n\n'.join(paragraph for _, paragraph in numbered_lines[:position])


def extract_text_inside_backticks(text, arbitrary_text):
    """Return the body of the first fenced block tagged ``arbitrary_text``.

    Looks for three backticks immediately followed by ``arbitrary_text`` and
    returns the stripped text up to the next three backticks, or ``None`` when
    no such block exists.
    """
    fence = re.compile(r'```{}\s*([\s\S]*?)\s*```'.format(re.escape(arbitrary_text)))
    found = fence.search(text)
    return found.group(1).strip() if found else None


# Define get_result (Optional): If you want to evaluate the output somehow while running process_requests, you can define get_result function and pass it to process_requests method. This will let you read log files easily with some evaluation.
def get_result(request_dict, save_dir):
    """Grade one finished request and update the running score file.

    Args:
        request_dict: dict holding at least ``"log_ids"`` (its first element is
            the problem id) and ``"output"`` (the generated text).
        save_dir: directory of the progress log; ``score.json`` is read from
            and written back to this directory.

    Returns:
        The same ``request_dict`` with ``"final_answer"`` (first integer found
        in a ``\\boxed{...}``, or ``None``) and ``"is_correct"`` added.
    """
    problem_id = int(request_dict["log_ids"][0])
    problem_key = str(problem_id)
    output = request_dict["output"]
    eval_file_path = f"{save_dir}/score.json"

    # Start from the stored tallies when a score file already exists.
    if os.path.exists(eval_file_path):
        with open(eval_file_path) as f:
            log = json.load(f)
    else:
        log = {"num_answered":{}, "num_correct":{}}

    if problem_key in log["num_answered"]:
        num_answered = log["num_answered"][problem_key]
        num_correct = log["num_correct"][problem_key]
    else:
        num_answered, num_correct = 0, 0

    # Only the first boxed integer in the output is graded.
    boxed = re.findall(r'\\boxed{(\d+)}', output)
    final_answer = int(boxed[0]) if boxed else None
    is_correct = False
    if final_answer is not None:
        if correct_answers[problem_id] == final_answer:
            is_correct = True
            num_correct += 1

    request_dict["final_answer"] = final_answer
    request_dict["is_correct"] = is_correct

    log["num_answered"][problem_key] = num_answered + 1
    log["num_correct"][problem_key] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(log, f)

    return request_dict




def get_log_dict(log, log_ids):
    """Return the node of ``log`` addressed by ``log_ids``.

    Args:
        log: nested log dict; top-level keys are problem ids (as strings) and
            deeper levels are reached through each node's ``"children"`` dict.
        log_ids: sequence ``[problem_id, child_id, ...]``. Ids are converted
            with ``str`` before lookup. The sequence is NOT modified (the
            previous implementation popped its first element, silently
            shortening the caller's list).

    Returns:
        ``log`` itself when ``log_ids`` is empty, otherwise the addressed node.

    Raises:
        KeyError: if an id or a ``"children"`` entry is missing on the path.
    """
    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict


def get_edit_log_dict(log_ids):
    """Return the node of the global ``log`` addressed by ``log_ids``.

    The returned dict is the live node (not a copy), so callers edit the
    global log by mutating it.

    Args:
        log_ids: sequence ``[problem_id, child_id, ...]``; ids are converted
            with ``str`` before lookup. The sequence is NOT modified (the
            previous implementation popped its first element, silently
            shortening the caller's list).

    Returns:
        The global ``log`` when ``log_ids`` is empty, otherwise the node.

    Raises:
        KeyError: if an id or a ``"children"`` entry is missing on the path.
    """
    global log

    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict



for iter in range(num_iteration):

    all_requests1 = AllRequests(max_request)
    
    # Make all_request for advice by searching log recursively
    def search_log(log_dict, log_ids):  # node_id: str
        global all_requests, request_id
        node_id = log_ids[-1]

        if type(log_dict[node_id]) != dict:
            return None
            
        if "children" in log_dict[node_id]:
            for next_node_id in log_dict[node_id]["children"]:
                search_log(log_dict[node_id]["children"], log_ids + [next_node_id])
        elif "corrects" in log_dict[node_id]:
            if iter>=0:
                all_false = True
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if log_dict[node_id]["corrects"][sample_id_str]:
                        all_false = False
                        break
                    
                if all_false:
                    problem_id_str = log_ids[0]
                    problem_id = int(problem_id_str)
                    pre_prompt = log_dict[node_id]["prompts"]["0"]
                    pre_output = log_dict[node_id]["outputs"]["0"]

                    messages = [
                        {"role": "user", "content": f"""### Problem:
'''
{problems[problem_id]}
'''


### Correct Solution:
'''
{solutions[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


You are an advanced language model tasked with analyzing a student’s answer to a mathematical problem and make some instructions to lead him to a correct solution. You are given the math problem, the correct solution of it, and a student’s incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instructions; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    “Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                    ]
                    
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    
                    new_log_ids = log_ids+["0"]
                    request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                    all_requests1.add(request_dict)
                    
            else:
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if not log_dict[node_id]["corrects"][sample_id_str]:
        
                        problem_id_str = log_ids[0]
                        problem_id = int(problem_id_str)
                        pre_prompt = log_dict[node_id]["prompts"][sample_id_str]
                        pre_output = log_dict[node_id]["outputs"][sample_id_str]
                        
                        messages = [
                            {"role": "user", "content": f"""### Problem:
'''
{problems[problem_id]}
'''


### Correct Solution:
'''
{solutions[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


You are an advanced language model tasked with analyzing a student’s answer to a mathematical problem and make some instructions to lead him to a correct solution. You are given the math problem, the correct solution of it, and a student’s incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instructions; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    “Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                        ]
                        
                        prompt = tokenizer.apply_chat_template(
                            messages,
                            tokenize=False,
                            add_generation_prompt=True
                        )
                        
                        new_log_ids = log_ids+[sample_id_str]
                        request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                        all_requests1.add(request_dict)
    
    for problem_id_str in log:
        search_log(log, [problem_id_str])
    
    
    # Process all_requests
    # If you had some error in last process and want to continue to get the output, set restart = True
    all_results1 = await all_requests1.process(max_tokens = 10000, restart = True, save_dir=f"{save_dir_base}/inst-{iter}")
    all_requests2 = AllRequests(max_request)

    error_num = 0
    num_results = len(all_results1)
    for result_dict in all_results1:
        log_ids = result_dict["log_ids"]
        prompt = result_dict["prompt"]
        output = result_dict["output"]
        pre_prompt = result_dict["pre_prompt"]
        pre_output = result_dict["pre_output"]
        instruction_log = prompt + output

        missing_idea = extract_text_inside_backticks(output, "idea")
        instruction_list_text = extract_text_inside_backticks(output, "instructions")
        
        if missing_idea and instruction_list_text:
            try:
                instruction_list_text = instruction_list_text.replace("\n","")
                instruction_list_text = instruction_list_text.replace("\\","")
                pattern = r"'((?:[^']|'(?!\s*[,\]]))*)'"
                replacement = r'"\1"'
                #instruction_list_text = re.sub(pattern, replacement, instruction_list_text)  # convert ['I'm a cat', 'This is the student's car',] into ["I'm a cat", "This is the student's car",]
                instruction_list = json.loads(instruction_list_text)
            except Exception as e:
                traceback.print_exc()
                error_num += 1
                print()
                print("An error occurred:", e)
                print("instruction_list_text: ", instruction_list_text)
                print("error_num: ", error_num)
                continue
    
            problem_id = int(log_ids[0])
            problem = problems[problem_id]

            prompts_dict = {}
            insert_ids=[]
            instruction_ids=[]
            prompt_id = 0
            for instruction_id, instruction in enumerate(instruction_list):
                modified_insts = instruction_list[:instruction_id] + instruction_list[(instruction_id+1):]
                advices_text = ""
                for i, inst in enumerate(modified_insts):
                    advices_text += f"\nAdvice{i+1}: {inst}"
                    
                messages = [
                    {"role": "user", "content": pre_prompt[29:][:-13]},
                    {"role": "assistant", "content": pre_output},
                    {"role": "user", "content": f"""Your answer might contain some errors. Please revise your answer following the advice below;
{advices_text}

Please put your final answer within \\boxed{{}}. If final answer is a number larger than 1000, take module 1000."""}
                ]
                    
                new_prompt = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                prompts_dict[str(prompt_id)] = new_prompt

                next_request = {"log_ids":log_ids+[str(instruction_id)], "prompt":new_prompt}
                all_requests2.add(next_request)
                
            edit_log_dict = get_edit_log_dict(log_ids[:-1])
            if "children" in edit_log_dict:
                edit_log_dict["children"][str(log_ids[-1])] = {"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}
            else:
                edit_log_dict["children"] = {str(log_ids[-1]):{"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}}


    with open(save_file, "w") as f:
        json.dump(log, f)

    print("log saved")
    
    # Process all_requests
    # If you had trouble in last process and want to continue to get the output, set restart = True
    all_results2 = await all_requests2.process(max_tokens = 15000, restart = True, save_dir=f"{save_dir_base}/solve-{iter}")

    advice_result_log[str(iter)] = {"num_problem":{}, "num_correct":{},}
    test_cases = []
    candidates = []
    problem_ids = []
    sample_ids = []
    log_ids_list = []
    # Check if the outputs are correct
    for results_dict in all_results2:
        log_ids = results_dict["log_ids"]
        prompt = results_dict["prompt"]
        output = results_dict["output"]
        problem_id = int(log_ids[0])
        sample_id = int(log_ids[-1])

        is_correct = False
        pattern = r'\\boxed{(\d+)}'
        matches = re.findall(pattern, output)
        if matches == []:
            final_answer = None
        else:
            final_answer = int(matches[0])
            if correct_answers[problem_id] == final_answer:
                is_correct = True

        edit_log_dict = get_edit_log_dict(log_ids[:-1])

        if "prompts" in edit_log_dict:
            edit_log_dict["prompts"][str(sample_id)] = prompt
        else:
            edit_log_dict["prompts"] = {str(sample_id):prompt}

        if "outputs" in edit_log_dict:
            edit_log_dict["outputs"][str(sample_id)] = output
        else:
            edit_log_dict["outputs"] = {str(sample_id):output}

        if "corrects" in edit_log_dict:
            edit_log_dict["corrects"][str(sample_id)] = is_correct
        else:
            edit_log_dict["corrects"] = {str(sample_id):is_correct}
    
        if str(log_ids[0]) in advice_result_log[str(iter)]["num_correct"]:
            if is_correct:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] += 1
        else:
            if is_correct:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] = 1
            else:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] = 0

        if str(log_ids[0]) in advice_result_log[str(iter)]["num_problem"]:
            advice_result_log[str(iter)]["num_problem"][str(log_ids[0])] += 1
        else:
            advice_result_log[str(iter)]["num_problem"][str(log_ids[0])] = 1


    num_problem = 0
    num_sample = 0
    pass1_count = 0
    passAll_count = 0
    for problem_id_str in advice_result_log[str(iter)]["num_problem"]:
        num_problem += 1
        num_sample += advice_result_log[str(iter)]["num_problem"][problem_id_str]
    for problem_id_str in advice_result_log[str(iter)]["num_correct"]:
        pass1_count += advice_result_log[str(iter)]["num_correct"][problem_id_str]
        if advice_result_log[str(iter)]["num_correct"][problem_id_str] > 0:
            passAll_count += 1

    num_already_correct_problem = total_num_problem - len(all_results1)

    print("total_num_problem: ", total_num_problem)
    print("total_num_sample: ", total_num_sample)
    print("num_already_correct_problem: ", num_already_correct_problem)
    print(f"{passAll_count}/{num_problem} problems have got at least 1 correct sample in this iteration")
    print(f"{pass1_count}/{num_sample} samples were correct in total")
    
    pass1 = pass1_count/num_sample
    passAll = passAll_count/num_problem

    advice_result_log[str(iter)]["pass@1"] = pass1
    advice_result_log[str(iter)]["passAll"] = passAll
    print("pass1 in this iteration: ", pass1)
    print("passAll in this iteration: ", passAll)
    
    
    log["advice_result_log"] = advice_result_log
    with open(save_file, "w") as json_file:
        json.dump(log, json_file)

print()
print("-- ALL FINISHED --")


# Destruction Check

In [None]:
# Settings for the destruction check (answering with mismatched advice).
advice_path = "advice6-1.json"
save_path = "log6-1-3-fake-advice.json"
num_sample = 5
ids = list(range(10))              # problems to answer
fake_advice_ids = list(range(10))  # pool the mismatched advice is drawn from

In [None]:
# with a piece of advice

import json, re, os, random

async def answer(ids, advices):
    """Answer every problem in ``ids`` while showing advice meant for another problem.

    For each problem id, an advice id different from it is drawn at random from
    the global ``fake_advice_ids``; ``advices[str(that_id)]`` is placed in the
    system message and ``problems[id]`` in the user message. All prompts are
    sent concurrently through ``add_request``.

    Returns:
        (results_, boxed_answers): both dicts keyed by ``str(id)``. The first
        holds the raw generations, the second the first integer found inside
        ``\\boxed{...}`` (``None`` when there is none).
    """

    def build_prompt(problem_id):
        # Pick advice that belongs to any problem except this one.
        decoy_pool = [candidate for candidate in fake_advice_ids if candidate != problem_id]
        advice = advices[str(random.choice(decoy_pool))]
        problem = problems[problem_id]

        messages = [
            {"role": "system", "content": f"""Please answer to user's problem referring to the advice below. If the final answer is a number larger than 1000, take modulo 1000.

Advice:
{advice}"""},
            {"role": "user", "content": f"""Problem: {problem}"""}
        ]

        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

    texts = [build_prompt(problem_id) for problem_id in ids]

    start = time.time()
    results = await asyncio.gather(
        *(add_request(text, index, start, max_tokens = 10000) for index, text in enumerate(texts))
    )

    # Matches the number enclosed in \boxed{number}; only the first hit is kept.
    boxed_pattern = r'\\boxed{(\d+)}'
    results_ = {}
    boxed_answers = {}
    for problem_id, result in zip(ids, results):
        key = str(problem_id)
        results_[key] = result
        found = re.findall(boxed_pattern, result)
        boxed_answers[key] = int(found[0]) if found else None

    return results_, boxed_answers


with open(advice_path) as json_file:
    advices = json.load(json_file)

all_boxed_answers = {}
all_results = {}
num_correct_answer = {}
mv_score = {}
num_problems = len(ids)

for i in range(num_sample):
    results, boxed_answers = await answer(ids, advices)
    
    for id in ids:

        if str(id) in num_correct_answer:
            score = num_correct_answer[str(id)]
        else:
            score = 0

        if correct_answers[id] == boxed_answers[str(id)]:
            score += 1
        
        num_correct_answer[str(id)] = score
            
        if str(id) in all_results:
            all_results[str(id)].append(results[str(id)])
            all_boxed_answers[str(id)].append(boxed_answers[str(id)])
        else:
            all_results[str(id)] = [results[str(id)]]
            all_boxed_answers[str(id)] = [boxed_answers[str(id)]]
    
    log = {"info":"Answer to the problems with fake advice to check the destruction. Temparature = 0.4", "all_results":all_results, "all_boxed_answers":all_boxed_answers, "num_correct_answer":num_correct_answer, "num_sample":num_sample, "num_problems":num_problems}
    with open(save_path, "w") as json_file:
        json.dump(log, json_file)


from collections import Counter

# Majority vote: a problem scores when its most frequent boxed answer
# (ignoring samples without one) equals the reference answer.
mv_score = 0
for id in ids:
    votes = Counter(num for num in all_boxed_answers[str(id)] if num is not None)
    if not votes:
        continue
    most_common_element, count = votes.most_common(1)[0]
    if correct_answers[id] == most_common_element:
        mv_score += 1

log["mv_score"] = mv_score
with open(save_path, "w") as json_file:
    json.dump(log, json_file)

# Solve Problems with Manual Advice

In [None]:
# Hand-written advice per problem, keyed by str(problem id).
advices = {
    "0":"When you have to search all possibilities, try to find a regularity and make a generalized method to explore all the possibilities from it.",
    "5":"When you have a geometry problem, you should check if there are enough conditions to solve the problem. If there are not, you should imagine what condition you should prove to solve the problem.",
    "7":"When solving an integer problem with countless possibilities, find some regularities and prove a statement which narrows down the options",
    "4":"When solving an integer problem with countless possibilities, find some regularities and prove a statement which narrows down the options",
    # Raw string: the text contains LaTeX. In a normal string "\b" (\begin),
    # "\f" (\frac), "\v" (\vdots) and "\n" (\neq) are control-character escapes
    # and "\\" collapses to one backslash, which corrupts the formulas.
    "1":r"""In this problem, you may be able to use Chinese Reminder Theorem;

The Chinese Remainder Theorem (CRT) is a fundamental result in number theory that provides a way to solve systems of simultaneous congruences with different moduli. Here's a summary of the theorem:

1. **Statement of the Theorem**:
   - Let \( n_1, n_2, \ldots, n_k \) be positive integers that are pairwise coprime (i.e., \( \gcd(n_i, n_j) = 1 \) for all \( i \neq j \)).
   - Let \( a_1, a_2, \ldots, a_k \) be any integers.
   - The Chinese Remainder Theorem states that there exists an integer \( x \) that satisfies the system of congruences:
     \[
     \begin{cases}
     x \equiv a_1 \pmod{n_1} \\
     x \equiv a_2 \pmod{n_2} \\
     \vdots \\
     x \equiv a_k \pmod{n_k}
     \end{cases}
     \]
   - Moreover, the solution \( x \) is unique modulo \( N \), where \( N = n_1 n_2 \cdots n_k \).

2. **Existence and Uniqueness**:
   - The theorem guarantees the existence of a solution \( x \) that satisfies all the given congruences.
   - The solution is unique modulo \( N \), meaning that if \( x \) and \( y \) are both solutions, then \( x \equiv y \pmod{N} \).

3. **Constructing the Solution**:
   - One method to find the solution \( x \) is to use the formula:
     \[
     x = \sum_{i=1}^{k} a_i N_i M_i \pmod{N}
     \]
     where \( N_i = \frac{N}{n_i} \) and \( M_i \) is the modular inverse of \( N_i \) modulo \( n_i \) (i.e., \( N_i M_i \equiv 1 \pmod{n_i} \)).

4. **Applications**:
   - The Chinese Remainder Theorem has numerous applications in number theory, cryptography, and computer science.
   - It is used in designing algorithms for parallel computations, in cryptographic protocols like secret sharing, and in solving Diophantine equations.

In summary, the Chinese Remainder Theorem provides a powerful tool for solving systems of congruences with coprime moduli, ensuring the existence and uniqueness of solutions modulo the product of the moduli.""",
    "6":"If your answer became too long, summarize your answer and check if it follows the problem to prevent mistakes.",
}

# Settings for the manual-advice experiment.
ids = []  # NOTE(review): left empty here; where it is filled or read is not visible in this cell
advice_model = "./mistral8b_model"  # presumably the local path of the advice-generating model — TODO confirm
solve_model = "./qwq_awq_model"  # presumably the local path of the solving model — TODO confirm

num_advice = 1
num_sample_for_improved = 3
num_iteration = 1  # number of passes made by the iteration loop further down
load_file = "log9-5.json"  # log of the previous run, loaded by the next cell
save_file = "log9-5-2-advice.json"
max_request_advice = 1  # Mistral API
max_request_improve = 1  # QwQ vLLM.AsyncEngine

#with open("log9-5.json") as f:
#    log1 = json.load(f)

In [None]:
import os, json, re
from transformers import AutoTokenizer

# Load the log of the previous run (a JSON document) into the global `log`.
with open(load_file) as json_file:
    log = json.load(json_file)

# advice_result_log: per-iteration statistics keyed by str(iteration).
# NOTE(review): the exact inner layout is filled in by the loop below — confirm there.
advice_result_log = {}


# Tokenizer loaded from the local QwQ AWQ model directory, padding on the left.
qwq_tokenizer = AutoTokenizer.from_pretrained("./qwq_awq_model", padding_side='left')
# Alternative source kept for reference: the same tokenizer pulled by hub id.
#qwq_tokenizer = AutoTokenizer.from_pretrained("MBMMurad/QwQ-32B-preview-AWQ-AIMO-earlysharing", padding_side='left')

def add_numbers_to_lines(text):
    """Split ``text`` on blank lines and number the non-empty paragraphs from 1.

    Returns:
        (numbered_lines, numbered_text): a list of ``(number, paragraph)``
        tuples, and the paragraphs formatted as ``"<number>. <paragraph>"``
        joined by blank lines. Whitespace-only paragraphs are skipped and do
        not consume a number.
    """
    numbered_lines = []
    numbered_texts = []

    for paragraph in text.split('\n\n'):
        if not paragraph.strip():
            continue
        # The next number is one more than the count of paragraphs kept so far.
        position = len(numbered_lines) + 1
        numbered_lines.append((position, paragraph))
        numbered_texts.append(f"{position}. {paragraph}")

    return numbered_lines, '\n\n'.join(numbered_texts)


def get_text_before_number(numbered_lines, number):
    """Join the paragraphs that come before the paragraph numbered ``number``.

    ``numbered_lines`` is a list of ``(number, paragraph)`` tuples. Returns the
    preceding paragraphs joined by blank lines, or an empty string when
    ``number`` is not present.
    """
    preceding = []
    for num, paragraph in numbered_lines:
        if num == number:
            return '\n\n'.join(preceding)
        preceding.append(paragraph)

    # The requested number never appeared.
    return ""


def extract_text_inside_backticks(text, arbitrary_text):
    """Extract the first fenced block whose opening fence is followed by ``arbitrary_text``.

    Returns the stripped content between that opening fence and the next
    closing triple backticks, or ``None`` if ``text`` has no such block.
    """
    found = re.search(r'```{}\s*([\s\S]*?)\s*```'.format(re.escape(arbitrary_text)), text)
    if found is None:
        return None
    return found.group(1).strip()


# Define get_result (Optional): If you want to evaluate the output somehow while running process_requests, you can define get_result function and pass it to process_requests method. This will let you read log files easily with some evaluation.
def get_result(request_dict, save_dir):
    """Grade one finished request and update that iteration's score file.

    Args:
        request_dict: dict holding at least ``"log_ids"`` (its first element is
            the problem id), ``"output"`` (the generated text) and ``"iter"``.
        save_dir: directory of the progress log; ``score-<iter>.json`` is read
            from and written back to this directory.

    Returns:
        The same ``request_dict`` with ``"final_answer"`` (first integer found
        in a ``\\boxed{...}``, or ``None``) and ``"is_correct"`` added.
    """
    problem_id = int(request_dict["log_ids"][0])
    problem_key = str(problem_id)
    output = request_dict["output"]
    iter = request_dict["iter"]
    eval_file_path = f"{save_dir}/score-{iter}.json"

    # Start from the stored tallies when a score file already exists.
    if os.path.exists(eval_file_path):
        with open(eval_file_path) as f:
            log = json.load(f)
    else:
        log = {"num_answered":{}, "num_correct":{}}

    if problem_key in log["num_answered"]:
        num_answered = log["num_answered"][problem_key]
        num_correct = log["num_correct"][problem_key]
    else:
        num_answered, num_correct = 0, 0

    # Only the first boxed integer in the output is graded.
    boxed = re.findall(r'\\boxed{(\d+)}', output)
    final_answer = int(boxed[0]) if boxed else None
    is_correct = False
    if final_answer is not None:
        if correct_answers[problem_id] == final_answer:
            is_correct = True
            num_correct += 1

    request_dict["final_answer"] = final_answer
    request_dict["is_correct"] = is_correct

    log["num_answered"][problem_key] = num_answered + 1
    log["num_correct"][problem_key] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(log, f)

    return request_dict




def get_log_dict(log, log_ids):
    """Look up the node of ``log`` addressed by ``log_ids``.

    Args:
        log: nested log dict; top-level keys are problem ids (as strings) and
            deeper levels are reached through each node's ``"children"`` dict.
        log_ids: sequence ``[problem_id, child_id, ...]``; every id is converted
            with ``str`` before lookup. The sequence is left untouched (the
            earlier version called ``pop(0)`` on it, which shortened the
            caller's list as a side effect).

    Returns:
        ``log`` itself for an empty ``log_ids``, otherwise the addressed node.

    Raises:
        KeyError: if an id or a ``"children"`` entry is missing on the path.
    """
    if len(log_ids) == 0:
        return log

    log_dict = log[str(log_ids[0])]

    # Remaining ids walk down through the "children" dicts.
    for log_id in log_ids[1:]:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict


def get_edit_log_dict(log_ids):
    """Look up the node of the global ``log`` addressed by ``log_ids``.

    The live node is returned (not a copy), so mutating it edits the global
    log.

    Args:
        log_ids: sequence ``[problem_id, child_id, ...]``; every id is converted
            with ``str`` before lookup. The sequence is left untouched (the
            earlier version called ``pop(0)`` on it, which shortened the
            caller's list as a side effect).

    Returns:
        The global ``log`` for an empty ``log_ids``, otherwise the node.

    Raises:
        KeyError: if an id or a ``"children"`` entry is missing on the path.
    """
    global log

    if len(log_ids) == 0:
        return log

    log_dict = log[str(log_ids[0])]

    # Remaining ids walk down through the "children" dicts.
    for log_id in log_ids[1:]:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict


def set_log_element(log, log_ids, key, element):
    """Store ``element`` under ``key`` in the node of ``log`` addressed by ``log_ids``.

    Missing ``"children"`` dicts and child nodes along the path are created.

    Args:
        log: nested log dict, modified in place.
        log_ids: non-empty sequence; the first element is the top-level key and
            the rest are child ids. Ids are used as given (no ``str``
            conversion, unlike ``get_log_dict``). The sequence is NOT modified
            (the previous implementation popped its first element).
        key: key to set on the addressed node.
        element: value to store.

    Returns:
        The same ``log`` object.

    Raises:
        ValueError: if ``log_ids`` is empty.
        KeyError: if the top-level id is not present in ``log``.
    """
    if len(log_ids) == 0:
        raise ValueError("log_ids should have at least one element.")

    problem_id, *child_ids = log_ids
    node = log[problem_id]

    for child_id in child_ids:
        # Descend, creating the "children" container and the child on demand.
        node = node.setdefault("children", {}).setdefault(child_id, {})

    node[key] = element

    return log



# Main refinement loop. Each iteration:
#   1. walks `log` and builds one "where should the advice go?" request for every
#      sample currently marked incorrect,
#   2. runs those requests through `advice_model`,
#   3. turns each returned line id into a new solve prompt (answer prefix + advice)
#      and records it as a child node in `log`,
#   4. runs the solve prompts through `solve_model`, grades them and stores the
#      outputs / final answers / correctness flags back into `log`.
# NOTE: `iter` shadows the builtin of the same name; it is kept because it is a
# notebook-level variable.
for iter in range(num_iteration):

    # Round-robin buckets for the advice-placement requests.
    all_requests = {str(i):[] for i in range(max_request_advice)}
    request_id = 0

    def search_log(log_dict, log_ids):  # node_id: str
        """Recursively visit `log_dict[log_ids[-1]]` and queue a request per incorrect leaf sample.

        `log_ids` is the path of string ids from the problem id down to the
        current node. Nodes with "children" are descended into; nodes with
        "corrects" contribute one request for every sample whose flag is falsy.
        """
        global all_requests, request_id
        node_id = log_ids[-1]

        # Non-dict entries (e.g. scalar metadata stored in the log) are skipped.
        if not isinstance(log_dict[node_id], dict):
            return None

        if "children" in log_dict[node_id]:
            for next_node_id in log_dict[node_id]["children"]:
                search_log(log_dict[node_id]["children"], log_ids + [next_node_id])
        elif "corrects" in log_dict[node_id]:
            for sample_id_str in log_dict[node_id]["corrects"]:
                if not log_dict[node_id]["corrects"][sample_id_str]:

                    problem_id_str = log_ids[0]
                    problem_id = int(problem_id_str)
                    prompt = log_dict[node_id]["prompts"][sample_id_str]
                    output = log_dict[node_id]["outputs"][sample_id_str]
                    # The stored prompt may already contain a partial assistant answer
                    # (from an earlier iteration); prepend it to the new output.
                    student_answer = prompt.split("<|im_start|>assistant")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                    numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)

                    advice = advices[problem_id_str]

                    prompt = f"""<s>[INST]### Problem:
'''
{problems[problem_id]}
'''


### Correct Solution:
'''
{solutions[problem_id]}
'''


### Incorrect Student’s Answer:
'''
{numbered_answer}
'''


### Advice:
'''
{advice}
'''


You are an advanced language model tasked with analyzing a student’s answer to a mathematical problems and insert an advice in the student’s answer to lead him to the correct answer. You are given the math problem, the correct solution of it, a student’s incorrect answer and a piece of advice for it. Please think where to think about the advice in student’s answer and return the id of sentence where the advice should be thought before following the instructions below.


### Instructions:
1. **Analyze the Problem, Student’s Answer, Correct Solution and Advice**: Carefully understand and analyze them. 
2. **Summarize Answer and Solution**: Based on your analysis, summarize student’s answer and correct solution respectively.
3. **Think How Advice Leads to the Correct Solution**: Think about how the given advice contributes him to get a correct solution and how he rewrites the answer considering the advice.
4. **Think Where the Advice should be Considered**: Based on the advice you created and analysis so far, think where in the student’s answer the advice should be considered. There are ids at the beginning of all sentences in student’s answer. Please provide the id of line in the answer which should be modified by considering the advice.
5. **Generate Output**: Based on your analysis so far, return the id in backticks like

```id
(The id of line where the advice should be considered.)
```


Let’s think step by step following each step of the instructions.[/INST]"""

                    new_log_ids = log_ids+[sample_id_str]
                    request_dict = {"iter":iter, "log_ids":new_log_ids, "prompt":prompt, "numbered_lines":numbered_lines, "advice":advice}
                    all_requests[str(request_id)].append(request_dict)
                    # Advance the round-robin bucket index.
                    request_id+=1
                    if request_id == max_request_advice:
                        request_id = 0

    # Only problems that have a piece of advice are searched.
    for problem_id_str in log:
        if problem_id_str in advices:
            search_log(log, [problem_id_str])


    # Process all_requests
    # If you had trouble in last process and want to continue to get the output, set restart = True

    #all_results = await asyncio.gather(
    #    *[process_api_requests(all_requests[request_id_str], int(request_id_str), max_tokens = 3000, restart = True) for request_id_str in all_requests]
    #)

    # *Note this part should be modified so that batch inference can be run
    # NOTE(review): only bucket "0" is submitted; requests that the round-robin above
    # placed in buckets "1".."max_request_advice-1" are never processed. Confirm
    # whether that is intended before relying on the results.

    # For using llm.generate
    results1 = process_requests(all_requests[str(0)], int(iter), model=advice_model, max_tokens = 3000, restart = True, save_dir="progless_log1")

    # Update log and create next requests (all_requests2)
    all_requests2 = {str(i):[] for i in range(max_request_improve)}
    request_id2 = 0

    for result_dict in results1:
        print("log_ids: ", result_dict["log_ids"])
        log_ids = result_dict["log_ids"]
        prompt = result_dict["prompt"]
        output = result_dict["output"]
        numbered_lines = result_dict["numbered_lines"]
        advice = result_dict["advice"]

        #advice = extract_text_inside_backticks(output, "advice")
        insert_id_text = extract_text_inside_backticks(output, "id")

        if insert_id_text:
            try:
                # Keep only the digits of the returned id (the model may add punctuation).
                insert_id_text_ = re.sub(r'[^0-9]', '', insert_id_text)
                insert_id = int(insert_id_text_)
                # The improved answer restarts from everything before the chosen line.
                new_answer_base = get_text_before_number(numbered_lines, insert_id) + "\n\nNow let's follow the user's advice."

                problem_id = int(log_ids[0])
                problem = problems[problem_id]
                messages = [
                    {"role": "system", "content": f"""Please answer to user's problem following the advice below. If the final answer is a number larger than 1000, take modulo 1000."""},
                    {"role": "user", "content": f"""Problem: {problem}

Advice: {advice}"""}
                ]

                new_prompt = qwq_tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                # Pre-fill the assistant turn with the kept part of the old answer.
                new_prompt += new_answer_base
                prompts_dict = {str(i):new_prompt for i in range(num_sample_for_improved)}
                # log_ids[:-1] addresses the parent node; the failed sample id becomes a child of it.
                edit_log_dict = get_edit_log_dict(log_ids[:-1])
                edit_log_dict.setdefault("children", {})[str(log_ids[-1])] = {"advice":advice, "insert_id":insert_id, "prompts":prompts_dict}
                #log = set_log_element(log, log_ids[:-1], log_ids[-1], {"advice":advice, "insert_id":insert_id, "prompts":prompts_dict})

                for i in range(num_sample_for_improved):
                    next_request = {"log_ids":log_ids+[str(i)], "prompt":new_prompt, "iter":iter}
                    all_requests2[str(request_id2)].append(next_request)

                request_id2 += 1
                if request_id2 == max_request_improve:
                    request_id2 = 0
            except Exception as exc:
                # Best effort: skip this sample but say why. A bare `except:` here used to
                # hide every failure and also swallowed KeyboardInterrupt.
                print(f"skipped advice insertion for log_ids {log_ids}: {exc!r}")


    with open(save_file, "w") as f:
        json.dump(log, f)

    print("log saved")


    # Process all_requests
    # If you had trouble in last process and want to continue to get the output, set restart = True
    # all_results2 = [[{"output":}, ... ], ...]
    #all_results2 = await asyncio.gather(
    #    *[process_requests(all_requests2[request_id_str], int(request_id_str), max_tokens = 10000, restart = False, get_result = get_result) for request_id_str in all_requests2]
    #)

    # NOTE(review): as above, only bucket "0" of all_requests2 is submitted.
    results2 = process_requests(all_requests2[str(0)], int(iter), model=solve_model, max_tokens = 10000, restart = True, get_result = get_result, save_dir="progless_log2")

    advice_result_log[str(iter)] = {}
    # Check if the outputs are correct
    for results_dict in results2:
        log_ids = results_dict["log_ids"]
        prompt = results_dict["prompt"]
        output = results_dict["output"]

        problem_id = int(log_ids[0])

        # The first \boxed{<digits>} in the output is taken as the final answer.
        is_correct = False
        pattern = r'\\boxed{(\d+)}'
        matches = re.findall(pattern, output)
        if matches == []:
            final_answer = None
        else:
            final_answer = int(matches[0])
            if correct_answers[problem_id] == final_answer:
                is_correct = True

        # Results are stored on the parent node, keyed by the sample id.
        edit_log_dict = get_edit_log_dict(log_ids[:-1])
        sample_id = log_ids[-1]

        edit_log_dict.setdefault("outputs", {})[str(sample_id)] = output
        edit_log_dict.setdefault("final_answers", {})[str(sample_id)] = final_answer
        edit_log_dict.setdefault("corrects", {})[str(sample_id)] = is_correct

        # Per-problem tally for this iteration.
        problem_tally = advice_result_log[str(iter)].setdefault(str(log_ids[0]), {"num_problem":0, "num_correct":0})
        problem_tally["num_problem"] += 1
        if is_correct:
            problem_tally["num_correct"] += 1


    log["advice_result_log"] = advice_result_log
    with open(save_file, "w") as json_file:
        json.dump(log, json_file)

print()
print("-- ALL FINISHED --")


In [None]:
# Advice-placement requests; filled by the `for id in ids` loop later in this cell.
requests = []  # requests: [{"prompt":, ...} ... ]

# Tokenizer loaded from the local "./qwq_awq_model" directory with left-side padding.
# It is used further down only to render chat-template prompts as text.
qwq_tokenizer = AutoTokenizer.from_pretrained("./qwq_awq_model", padding_side='left')

def extract_text_inside_backticks(text, arbitrary_text):
    """Return the body of the first fenced block in ``text`` labelled ``arbitrary_text``.

    A fenced block is three backticks immediately followed by the label, then
    the body, then three closing backticks. The label is matched literally
    (it is regex-escaped). The body is returned stripped of surrounding
    whitespace; ``None`` is returned when no such block exists.
    """
    fence_regex = r'```{}\s*([\s\S]*?)\s*```'.format(re.escape(arbitrary_text))
    found = re.search(fence_regex, text)

    if found is None:
        return None
    return found.group(1).strip()


def add_numbers_to_lines(text):
    """Number the blank-line-separated paragraphs of ``text`` starting from 1.

    ``text`` is split on double newlines; chunks that are empty or
    whitespace-only are dropped and do not consume a number.

    Returns:
        A pair ``(numbered_lines, numbered_text)`` where ``numbered_lines`` is
        a list of ``(number, paragraph)`` tuples and ``numbered_text`` is the
        paragraphs rendered as ``"<number>. <paragraph>"`` and joined with
        double newlines.
    """
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]

    numbered_lines = list(enumerate(paragraphs, start=1))
    numbered_text = '\n\n'.join(f"{index}. {chunk}" for index, chunk in numbered_lines)

    return numbered_lines, numbered_text


def get_text_before_number(numbered_lines, number):
    """Rebuild the text that precedes the paragraph numbered ``number``.

    Args:
        numbered_lines: list of ``(number, paragraph)`` tuples, as produced by
            ``add_numbers_to_lines``.
        number: the paragraph number to stop before.

    Returns:
        The paragraphs before the first entry whose number equals ``number``,
        joined with double newlines. An empty string is returned when the
        number is not present (and also when it is the first entry).
    """
    cut_position = next(
        (position for position, (num, _) in enumerate(numbered_lines) if num == number),
        None,
    )

    if cut_position is None:
        # Unknown number: nothing is kept.
        return ""

    return '\n\n'.join(paragraph for _, paragraph in numbered_lines[:cut_position])



for id in ids:
    advice = advices[str(id)]

    for i in range(len(log1[str(id)]["outputs"])):
        numbered_lines, numbered_text = add_numbers_to_lines(log1[str(id)]["outputs"][str(i)])
    
        prompt = f"""<s>[INST]### Problem:
'''
{problems[id]}
'''


### Correct Solution:
'''
{solutions[id]}
'''


### Incorrect Student’s Answer:
'''
{numbered_text}
'''


### Advice:
'''
{advice}
'''


You are an advanced language model tasked with analyzing a student’s answer to a mathematical problems and insert an advice in the student’s answer to lead him to the correct answer. You are given the math problem, the correct solution of it, a student’s incorrect answer and a piece of advice for it. Please think where to think about the advice in student’s answer and return the id of sentence where the advice should be thought before following the instructions below.


### Instructions:
1. **Analyze the Problem, Student’s Answer, Correct Solution and Advice**: Carefully understand and analyze them. 
2. **Summarize Answer and Solution**: Based on your analysis, summarize student’s answer and correct solution respectively.
3. **Think How Advice Leads to the Correct Solution**: Think about how the given advice contributes him to get a correct solution and how he rewrites the answer considering the advice.
4. **Think Where the Advice should be Considered**: Based on the advice you created and analysis so far, think where in the student’s answer the advice should be considered. There are ids at the beginning of all sentences in student’s answer. Please provide the id of line in the answer which should be modified by considering the advice.
5. **Generate Output**: Based on your analysis so far, return the id in backticks like

```id
(The id of line where the advice should be considered.)
```


Let’s think step by step following each step of the instructions.[/INST]"""

        requests.append({"prompt":prompt, "id":id, "numbered_lines":numbered_lines})

results = await process_api_requests(requests, 0, max_tokens = 3000, temperature=0.4, save_dir = "api_progress_log", restart = False, get_result = None, delete_save_file = False)

all_requests2 = []
for result in results:
    id = result["id"]
    output = result["output"]
    numbered_lines = result["numbered_lines"]

    id_text = extract_text_inside_backticks(output, "id")
    
    if id_text:
        try:
            insert_id_text_ = re.sub(r'[^0-9]', '', id_text)
            insert_id = int(insert_id_text_)
            new_answer_base = get_text_before_number(numbered_lines, insert_id) + "\n\nNow let's follow the user's advice."

            problem_id = int(log_ids[0])
            problem = problems[problem_id]
            messages = [
                {"role": "system", "content": f"""Please answer to user's problem following the advice below. If the final answer is a number larger than 1000, take modulo 1000."""},
                {"role": "user", "content": f"""Problem: {problem}

Advice: {advice}"""}
            ]
            
            new_prompt = qwq_tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            new_prompt += new_answer_base

            for i in range(num_sample_for_improved):
                next_request = {"prompt":new_prompt, "insert_id":insert_id, "id":id}
                all_requests2[str(0)].append(next_request)

        except:
            pass

results2 = process_requests(all_requests2[str(0)], int(iter), model=solve_model, max_tokens = 10000, restart = True, get_result = get_result, save_dir="progless_log2")




# Advice Prompt Test

In [None]:
#mistral_model = "./mistral8b_model"
#solve_model = "./mistral8b_model"

# --- Configuration for the "Advice Prompt Test" cell ---
num_advice = 1
num_sample_for_improved = 1  # number of improved-answer prompts generated per advice
num_iteration = 10  # number of passes of the loop below
load_file = "code-log5.json"  # existing log that is read as the starting state
save_file = "code-log5-advice1.json"  # where the updated log is written
save_dir_base = "progress_log_test"  # parent directory for per-iteration progress logs
max_request_advice = 15  # max number of requests to generate advice by AsyncEngine
max_request_improve = 15  # max number of requests to generate improved answer by AsyncEngine


import os, json, re
from transformers import AutoTokenizer

# `log` is the notebook-level log tree that get_edit_log_dict reads and edits in place.
with open(load_file) as json_file:
    log = json.load(json_file)

# Product of the two counts stored in the loaded log.
# presumably the total number of (problem, sample) pairs — TODO confirm
total_num_problem = log["num_problems"] * log["num_sample"]

# advice_result_log: {str(iteration):{str(problem_id):{"num_problem":, "num_correct":}}}
advice_result_log = {}

def extract_text_inside_backticks(text, arbitrary_text):
    """Extract the first fenced block of ``text`` whose label is ``arbitrary_text``.

    The label (regex-escaped, so matched literally) must directly follow the
    three opening backticks. Returns the block body with leading and trailing
    whitespace removed, or ``None`` if the text has no such block.
    """
    labelled_fence = re.compile(r'```{}\s*([\s\S]*?)\s*```'.format(re.escape(arbitrary_text)))
    hit = labelled_fence.search(text)
    return hit.group(1).strip() if hit else None


def get_log_dict(log, log_ids):
    """Return the node of ``log`` addressed by ``log_ids``.

    Args:
        log: nested log dict; top-level keys are problem ids (as strings) and
            deeper levels are reached through each node's ``"children"`` dict.
        log_ids: sequence whose first element is the problem id and whose
            remaining elements are child ids. Elements are converted with
            ``str`` before lookup. The sequence is NOT modified (the previous
            implementation popped its first element, silently corrupting the
            caller's list).

    Returns:
        ``log`` itself when ``log_ids`` is empty, otherwise the addressed node.

    Raises:
        KeyError: if any id (or a ``"children"`` entry) is missing on the path.
    """
    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    node = log[str(problem_id)]

    for child_id in child_ids:
        node = node["children"][str(child_id)]

    return node

def get_edit_log_dict(log_ids):
    """Return the node of the notebook-level ``log`` addressed by ``log_ids``.

    The returned dict is the live node inside ``log``, so writing to it edits
    the log in place.

    Args:
        log_ids: sequence whose first element is the problem id and whose
            remaining elements are child ids (converted with ``str``). The
            sequence is NOT modified (the previous implementation popped its
            first element, so callers had to remember to pass a copy).

    Returns:
        ``log`` itself when ``log_ids`` is empty, otherwise the addressed node.

    Raises:
        KeyError: if any id (or a ``"children"`` entry) is missing on the path.
    """
    global log

    if len(log_ids) == 0:
        return log

    problem_id, *child_ids = log_ids
    node = log[str(problem_id)]

    for child_id in child_ids:
        node = node["children"][str(child_id)]

    return node


# "Advice Prompt Test" loop: for every sample marked incorrect in `log`, build an
# advice-generation prompt, run the requests, and print the extracted instructions
# (or the raw output when no ```instructions block is found).
# NOTE: `iter` shadows the builtin of the same name.
for iter in range(num_iteration):

    # Collect the advice-generation requests for this iteration.
    #all_requests = {str(i):[] for i in range(max_request_advice)}
    #request_id = 0
    all_requests1 = AllRequests(max_request_advice)
    # Make all_request for advice by searching log recursively
    def search_log(log_dict, log_ids):  # node_id: str
        """Recursively visit `log_dict[log_ids[-1]]` and add one request per incorrect sample.

        `log_ids` is the path of string ids from the top-level key down to the
        current node. Nodes with "children" are descended into; nodes with
        "corrects" contribute a request for each sample whose flag is falsy.
        """
        # NOTE(review): neither name declared global here is assigned or read in this
        # function (requests go to `all_requests1`); looks like a leftover.
        global all_requests, request_id
        node_id = log_ids[-1]

        # Non-dict entries of the log are skipped.
        if type(log_dict[node_id]) != dict:
            return None

        if "children" in log_dict[node_id]:
            for next_node_id in log_dict[node_id]["children"]:
                search_log(log_dict[node_id]["children"], log_ids + [next_node_id])
        elif "corrects" in log_dict[node_id]:
            for sample_id_str in log_dict[node_id]["corrects"]:
                if not log_dict[node_id]["corrects"][sample_id_str]:

                    problem_id_str = log_ids[0]
                    problem_id = int(problem_id_str)
                    # `prompt` is overwritten below and `traceback_` is not used afterwards;
                    # both lookups still raise KeyError if the entry is missing.
                    prompt = log_dict[node_id]["prompts"][sample_id_str]
                    output = log_dict[node_id]["outputs"][sample_id_str]
                    error = log_dict[node_id]["errors"][sample_id_str]
                    traceback_ = log_dict[node_id]["tracebacks"][sample_id_str]
                    #student_answer = prompt.split("<|im_start|>assistant")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                    #numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)

                    # NOTE(review): the prompt below tells the model that the student's
                    # answer has IDs at the start of each sentence, but `output` is
                    # inserted unnumbered (the numbering lines above are commented out).
                    messages = [
                        {"role": "user", "content": f"""### Problem:
'''
{instruct_prompt[problem_id]}
'''


### Correct Solution:
'''
{canonical_solution[problem_id]}
'''


### Student's Incorrect Answer:
'''
{output}
'''


### Test Code and Its Error
'''
```
{test[problem_id]}
```

{error}
'''


You are an advanced language model tasked with analyzing a student’s answer to a coding problems and make some instructions to lead him to the correct solution. You are given the coding problem, the correct solution of it, a student’s incorrect answer, test code of the answer code and error cause of the student's incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Think From Where the Student’s Answer Should be Modified**: Based on the missing idea, think from where the student’s answer should be modified. There are IDs at the beginning of all sentences in the student’s answer. Please provide the ID of line in the answer from which the answer should be rewritten.
4. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
5. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instruction; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
6. **Generate Output**: Based on the result so far, return the missing idea, ID, and pieces of advice in backticks like

```idea
(The missing idea in the student’s answer.)
```

```ID
(The ID of line from where the answer should be rewritten.)
```

```instructions
[
    "(Instruction 1)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                    ]

                    # Render the chat messages as a single prompt string.
                    # NOTE(review): `tokenizer` is not defined in this cell (other cells
                    # use `qwq_tokenizer`) — confirm which tokenizer is intended.
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )

                    new_log_ids = log_ids+[sample_id_str]
                    request_dict = {"log_ids":new_log_ids, "prompt":prompt}
                    all_requests1.add(request_dict)


    # Every top-level key of the log is visited; non-dict values are skipped inside search_log.
    for problem_id_str in log:
        search_log(log, [problem_id_str])


    all_results1 = await all_requests1.process(max_tokens=3000, restart = True, save_dir=f"{save_dir_base}/advice-iter{iter}", delete_save_file = True, test_num_request = 1)
    # Created here but only used by the disabled code that follows this loop.
    all_requests2 = AllRequests(max_request_improve)


    for result_dict in all_results1:
        log_ids = result_dict["log_ids"]
        prompt = result_dict["prompt"]
        output = result_dict["output"]

        # Only the ```instructions block is extracted; the ```idea and ```ID blocks
        # requested by the prompt are not read here.
        instructions = extract_text_inside_backticks(output, "instructions")


        if instructions:
            problem_id = int(log_ids[0])
            problem = instruct_prompt[problem_id]

            print()
            print()
            print(f"Problem: {problem}")
            print()
            print(f"Advice: {instructions}")

        else:
            # No ```instructions block found: show the raw output for inspection.
            print()
            print()
            print(f"Error")
            print()
            print(f"Output: {output}")
            
'''

            messages = [
                {"role": "user", "content": f"""Please answer to user's problem following the advice below.

Problem:
{problem}

Advice: {advice}"""}
            ]
            
            new_prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            prompts_dict = {str(i):new_prompt for i in range(num_sample_for_improved)}
            edit_log_dict = get_edit_log_dict(log_ids[:-1])
            if "children" in edit_log_dict:
                edit_log_dict["children"][str(log_ids[-1])] = {"advice":advice, "prompts":prompts_dict}
            else:
                edit_log_dict["children"] = {str(log_ids[-1]):{"advice":advice, "prompts":prompts_dict}}
            #log = set_log_element(log, log_ids[:-1], log_ids[-1], {"advice":advice, "insert_id":insert_id, "prompts":prompts_dict})

            for i in range(num_sample_for_improved):
                next_request = {"log_ids":log_ids+[str(i)], "prompt":new_prompt}
                all_requests2.add(next_request)
    
    with open(save_file, "w") as f:
        json.dump(log, f)

    print("log saved")
    
    
    # Process all_requests
    # If you had trouble in last process and want to continue to get the output, set restart = True

    #results2 = process_requests(all_requests2[str(0)], int(iter), model=solve_model, max_tokens = 10000, restart = True, get_result = get_result, save_dir="progless_log2")
    all_results2 = await all_requests2.process(max_tokens=3000, restart = True, save_dir=f"{save_dir_base}/answer-iter{iter}")
    
    advice_result_log[str(iter)] = {"num_problem":{}, "num_correct":{},}
    test_cases = []
    candidates = []
    problem_ids = []
    sample_ids = []
    log_ids_list = []
    # Check if the outputs are correct
    for results_dict in all_results2:
        log_ids = results_dict["log_ids"]
        prompt = results_dict["prompt"]
        output = results_dict["output"]
        problem_id = int(log_ids[0])
        sample_id = int(log_ids[-1])
        test_code = test[problem_id]

        output_code = extract_text_inside_backticks(output, "python")
        if not output_code: output_code = extract_text_inside_backticks(output, "")
        if not output_code: output_code = ""

        edit_log_dict = get_edit_log_dict(log_ids[:-1])

        if "prompts" in edit_log_dict:
            edit_log_dict["prompts"][str(sample_id)] = prompt
        else:
            edit_log_dict["prompts"] = {str(sample_id):prompt}

        if "outputs" in edit_log_dict:
            edit_log_dict["outputs"][str(sample_id)] = output
        else:
            edit_log_dict["outputs"] = {str(sample_id):output}

        if "output_codes" in edit_log_dict:
            edit_log_dict["output_codes"][str(sample_id)] = output_code
        else:
            edit_log_dict["output_codes"] = {str(sample_id):output_code}

        test_cases.append(test_code)
        candidates.append([output_code])
        problem_ids.append(problem_id)
        sample_ids.append(sample_id)
        log_ids_list.append(log_ids)

    
    import os, time
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    from code_eval.code_eval import CodeEval
    code_eval_metric = CodeEval()
    # Compute pass@k
    k_values = [1]
    print("Evaluating generated code...")
    start = time.time()
    pass_at_k, results = code_eval_metric._compute(
        references=test_cases,
        predictions=candidates,
        k=k_values,
        num_workers=10,  # Adjust based on your system
        timeout=150.0,   # Adjust the timeout as needed
    )
    end = time.time()
    print("calculation time(s): ", end-start)
    
    # For unknown reason, this shows weird value. 
    #for k in k_values:
    #    print(f"Pass@{k}: {pass_at_k[f'pass@{k}'] * 100:.2f}%")
    #    advice_result_log[str(iter)][f"Pass@{k}"] = pass_at_k[f'pass@{k}']
    
    for i in range(len(results)):
        problem_id = problem_ids[i]
        sample_id = sample_ids[i]
        unexpected_error = False
        if results[problem_id] == []:
            is_correct = False  # [] appeared sometimes for unknown reason. I define it as incorrect for now, but it should be fixed.
            unexpected_error = True
        else: is_correct = results[problem_id][0][1]["passed"]
        
        log_ids = log_ids_list[i]
        edit_log_dict = get_edit_log_dict(log_ids[:-1])

        if "corrects" in edit_log_dict:
            edit_log_dict["corrects"][str(sample_id)] = is_correct
        else:
            edit_log_dict["corrects"] = {str(sample_id):is_correct}

        if not is_correct:
            if not unexpected_error:
                error = results[problem_id][0][1]["result"]["error"]
                traceback_ = results[problem_id][0][1]["result"]["traceback"]
            else:
                error = ""
                traceback_ = ""
    
            if "errors" in edit_log_dict:
                edit_log_dict["errors"][str(sample_id)] = error
            else:
                edit_log_dict["errors"] = {str(sample_id):error}
    
            if "tracebacks" in edit_log_dict:
                edit_log_dict["tracebacks"][str(sample_id)] = traceback_
            else:
                edit_log_dict["tracebacks"] = {str(sample_id):traceback_}
    
        if str(log_ids[0]) in advice_result_log[str(iter)]["num_correct"]:
            if is_correct:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] += 1
        else:
            if is_correct:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] = 1
            else:
                advice_result_log[str(iter)]["num_correct"][str(log_ids[0])] = 0

        if str(log_ids[0]) in advice_result_log[str(iter)]["num_problem"]:
            advice_result_log[str(iter)]["num_problem"][str(log_ids[0])] += 1
        else:
            advice_result_log[str(iter)]["num_problem"][str(log_ids[0])] = 1

    
    num_problem = 0
    num_correct = 0
    for problem_id_str in advice_result_log[str(iter)]["num_problem"]:
        num_problem += advice_result_log[str(iter)]["num_problem"][problem_id_str]
    for problem_id_str in advice_result_log[str(iter)]["num_correct"]:
        num_correct += advice_result_log[str(iter)]["num_correct"][problem_id_str]

    num_already_correct_problem = total_num_problem - len(all_results1)

    print("total_num_problem: ", total_num_problem)
    print("num_already_correct_problem: ", num_already_correct_problem)
    print("num_problem: ", num_problem)
    print("num_correct: ", num_correct)
    
    pass1 = num_correct/num_problem
    total_pass1 = (num_already_correct_problem+num_correct)/total_num_problem

    advice_result_log[str(iter)]["pass@1"] = pass1
    advice_result_log[str(iter)]["total_pass@1"] = total_pass1
    print("pass1: ", pass1)
    print("total_pass1: ", total_pass1)
    
    
    log["advice_result_log"] = advice_result_log
    with open(save_file, "w") as json_file:
        json.dump(log, json_file)

print()
print("-- ALL FINISHED --")
'''
