In [None]:
# This example is for bigcodebench agent reflection improving

!pip install ..

# Step 1 : Inference

## Process Class

In [None]:
# Use Model from local environment

import asyncio
from transformers import AutoTokenizer
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
import time, warnings

# Model identifier handed to both the vLLM engine and the Hugging Face tokenizer.
model_name = "deepseek_r1_qwen14b"
# Forwarded to AsyncEngineArgs; presumably the number of GPUs the model is
# sharded across -- TODO confirm against the vLLM documentation.
tensor_parallel_size = 2

# Engine configuration (model, parallelism and GPU memory utilisation setting).
engine_args = AsyncEngineArgs(
    model = model_name,
    tensor_parallel_size = tensor_parallel_size,
    gpu_memory_utilization=0.95,
)
# Shared asynchronous engine; AllRequests.process_requests calls engine.generate on it.
engine = AsyncLLMEngine.from_engine_args(engine_args)
# Tokenizer loaded for the same model, configured with left-side padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')


class AllRequests:
    """Work queue that feeds prompts to the module-level vLLM ``engine``.

    A request is a dict that must contain a ``"prompt"`` key. Any other keys
    are carried through untouched and an ``"output"`` key is added once the
    generation has finished. After every finished request the progress is
    checkpointed to ``results.json`` and ``finished_ids.json`` inside
    ``save_dir`` so that an interrupted run can be resumed with
    ``restart=True``.
    """

    def __init__(self, max_request):
        """Create an empty queue.

        Args:
            max_request: Number of worker coroutines started by ``process``,
                i.e. the maximum number of requests in flight at once.
        """
        self.max_request = max_request
        self.requests = []      # pending request dicts, consumed from the front
        self.request_ids = []   # ids parallel to ``self.requests``
        self.request_id = 0     # next id handed out by ``add``
        self.results = []       # finished request dicts (each with "output")
        self.finished_ids = []  # ids of the requests stored in ``self.results``

    def add(self, request):
        """Append ``request`` to the queue and assign it the next sequential id."""
        self.requests.append(request)
        self.request_ids.append(self.request_id)
        self.request_id += 1

    async def process(self, model=model_name, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False):
        """Run every queued request and return the list of finished requests.

        Args:
            model: Unused; kept for backward compatibility with existing callers.
            max_tokens: Generation length limit passed to ``SamplingParams``.
            temperature: Sampling temperature passed to ``SamplingParams``.
            save_dir: Directory holding the checkpoint files (created if missing).
            restart: When true and both checkpoint files exist in ``save_dir``,
                previously finished requests are restored and skipped.

        Returns:
            ``self.results``: the finished request dicts, in completion order
            (restored checkpoint entries first).
        """
        # Imported locally so the class does not rely on another notebook cell
        # having imported these modules first.
        import json
        import os

        os.makedirs(save_dir, exist_ok=True)

        finished_path = f"{save_dir}/finished_ids.json"
        results_path = f"{save_dir}/results.json"
        if restart and os.path.exists(finished_path) and os.path.exists(results_path):
            with open(finished_path) as f:
                finished_ids = json.load(f)
            with open(results_path) as f:
                self.results = json.load(f)
            self.finished_ids.extend(finished_ids)

            # Drop every already-finished request in one pass. Ids found in the
            # checkpoint but not in the queue are ignored instead of raising
            # ValueError (the queue may differ from the checkpointed run).
            already_done = set(finished_ids)
            pending = [
                (queued_id, queued_request)
                for queued_id, queued_request in zip(self.request_ids, self.requests)
                if queued_id not in already_done
            ]
            self.request_ids = [queued_id for queued_id, _ in pending]
            self.requests = [queued_request for _, queued_request in pending]

        # Start max_request workers that all drain the same shared queue.
        await asyncio.gather(
            *[self.process_requests(temperature = temperature, max_tokens = max_tokens, restart = restart, save_dir=save_dir) for _ in range(self.max_request)]
        )

        return self.results


    async def process_requests(self, max_tokens = 3000, temperature=0.4, save_dir = "progress_log", restart = False):
        """Worker loop: take requests off the queue until it is empty.

        Args:
            max_tokens: Generation length limit passed to ``SamplingParams``.
            temperature: Sampling temperature passed to ``SamplingParams``.
            save_dir: Existing directory that receives the checkpoint files.
            restart: Unused here; accepted so ``process`` can forward it.
        """
        while self.requests:
            request_dict = self.requests.pop(0)
            request_id = self.request_ids.pop(0)

            prompt = request_dict["prompt"]
            sampling_params = SamplingParams(temperature=temperature, max_tokens=max_tokens)

            final_output = None
            # vLLM identifies requests by string ids, hence the str() conversion.
            results_generator = engine.generate(prompt, sampling_params, str(request_id))
            async for request_output in results_generator:
                # Each yielded item is a partial result (usable for streaming);
                # only the last one is kept.
                final_output = request_output

            request_dict["output"] = final_output.outputs[0].text
            self.results.append(request_dict)
            self.finished_ids.append(request_id)

            self._save_progress(save_dir)

    def _save_progress(self, save_dir):
        """Checkpoint ``results`` and ``finished_ids`` as JSON files in ``save_dir``.

        Each file is written to a temporary sibling first and then moved into
        place, so an interruption during the write does not leave a truncated
        checkpoint behind.
        """
        import json
        import os

        checkpoints = (
            ("results.json", self.results),
            ("finished_ids.json", self.finished_ids),
        )
        for file_name, payload in checkpoints:
            final_path = f"{save_dir}/{file_name}"
            tmp_path = f"{final_path}.tmp"
            with open(tmp_path, "w") as f:
                json.dump(payload, f)
            os.replace(tmp_path, final_path)
    
    


## Download Dataset

In [None]:
import json
import random

import pandas as pd
from datasets import load_dataset

num_sample = 100
dataset_name = 'bigcode/bigcodebench'
save_path = "bigcodebench3.json"

# Load the dataset from Hugging Face and turn the 'v0.1.0_hf' split into a
# list of per-problem dictionaries.
dataset = load_dataset(dataset_name)
df = pd.DataFrame(dataset['v0.1.0_hf'])

# Keep a random subset of num_sample problems.
data_list = random.sample(df.to_dict(orient='records'), num_sample)

# Transpose the records into one list per field; the tuple order is the key
# order of the exported JSON object.
field_names = (
    "task_id",
    "complete_prompt",
    "instruct_prompt",
    "canonical_solution",
    "code_prompt",
    "test",
    "doc_struct",
)
columns = {field: [record[field] for record in data_list] for field in field_names}

task_id = columns["task_id"]
complete_prompt = columns["complete_prompt"]
instruct_prompt = columns["instruct_prompt"]
canonical_solution = columns["canonical_solution"]
code_prompt = columns["code_prompt"]
test = columns["test"]
doc_struct = columns["doc_struct"]

with open(save_path, "w") as f:
    json.dump(columns, f)


In [None]:
import json
import random

import pandas as pd
from datasets import load_dataset

dataset_name = 'bigcode/bigcodebench'
save_path = "bigcodebench3.json"

# Load the dataset from Hugging Face; this cell exports the whole 'v0.1.3'
# split (no truncation, no random sub-sampling).
dataset = load_dataset(dataset_name)
df = pd.DataFrame(dataset['v0.1.3'])
data_list = df.to_dict(orient='records')

# One list per field, all in the same record order.
task_id = [record["task_id"] for record in data_list]
complete_prompt = [record["complete_prompt"] for record in data_list]
instruct_prompt = [record["instruct_prompt"] for record in data_list]
canonical_solution = [record["canonical_solution"] for record in data_list]
code_prompt = [record["code_prompt"] for record in data_list]
test = [record["test"] for record in data_list]
doc_struct = [record["doc_struct"] for record in data_list]

export = {
    "task_id": task_id,
    "complete_prompt": complete_prompt,
    "instruct_prompt": instruct_prompt,
    "canonical_solution": canonical_solution,
    "code_prompt": code_prompt,
    "test": test,
    "doc_struct": doc_struct,
}
with open(save_path, "w") as f:
    json.dump(export, f)


In [None]:
import json

# Read back the exported dataset: a JSON object mapping each field name to a
# list with one entry per problem.
save_path = "bigcodebench3.json"
with open(save_path) as f:
    data_dict = json.load(f)

(
    task_id,
    complete_prompt,
    instruct_prompt,
    canonical_solution,
    code_prompt,
    test,
    doc_struct,
) = (
    data_dict[field]
    for field in (
        "task_id",
        "complete_prompt",
        "instruct_prompt",
        "canonical_solution",
        "code_prompt",
        "test",
        "doc_struct",
    )
)

num_problems = len(task_id)

## Solve Problems

In [None]:
# Run configuration for the solve step.
# To try a subset, set e.g. num_problems = 100 or ids = list(range(10)) instead.
ids = list(range(num_problems))              # problem indices to solve
num_sample = 1                               # answers generated per problem
max_request = 15                             # worker count given to AllRequests
progress_save_dir = "progress_log_solve4"    # checkpoint directory for generation
save_path = "code-log7.json"                 # final evaluation log
explanation = "deepseek_r1_qwen14b/bigcodebench2 for code test with small number of samples"

In [None]:
import json, re, os

# Prepare the result log.
# Layout: { str(problem_id): { "prompts": {str(sample_id): prompt, ...},
#                              "outputs": {...}, "final_answers": {...},
#                              "corrects": {...}, "output_code": {...},
#                              "errors": {...}, "tracebacks": {...} }, ... }
# plus the run-level keys "num_sample", "num_problems" and "info".
log_fields = ("prompts", "outputs", "final_answers", "corrects", "output_code", "errors", "tracebacks")
log = {
    str(problem_index): {field: {} for field in log_fields}
    for problem_index in ids
}
log.update(num_sample=num_sample, num_problems=num_problems, info=explanation)


def extract_text_inside_backticks(text, arbitrary_text):
    """Return the body of the first ``` fence in ``text`` opened with ``arbitrary_text``.

    The opening fence must be three backticks immediately followed by
    ``arbitrary_text`` (matched literally). The captured body is stripped of
    surrounding whitespace. Returns ``None`` when no such fence exists.
    """
    fence_regex = r'```' + re.escape(arbitrary_text) + r'\s*([\s\S]*?)\s*```'
    found = re.search(fence_regex, text)
    return found.group(1).strip() if found else None

'''
def check_output(output, test_code): # this didn't work because exec behaves differently from running it in jupyter notebook
    run_code = """
import inspect
def check_code():
    try:
        sub_obj = TestCases()
        for name, attribute in TestCases.__dict__.items():
            if not name.startswith('__') and not name.startswith('_') and callable(attribute):
                attribute(sub_obj)
    except Exception as e:
        return False
    return True

is_code_correct__ = check_code()"""

    output_code = extract_text_inside_backticks(output, "python")
    if not output_code: output_code = extract_text_inside_backticks(output, "")

    if not output_code: return False, test_code + run_code

    code = output_code + "\n\n\n" + test_code + run_code

    local_vars = {}
    global_vars = {}

    try:
        exec(test_code, global_vars, local_vars)
        is_code_correct = local_vars['is_code_correct__']
    except Exception as e:
        return False, code

    return is_code_correct, code


# Define get_result (Optional): If you want to evaluate the output somehow while running process_requests, you can define get_result function and pass it to process_requests method. This will let you read log files easily with some evaluation.
def get_result(request_dict, save_dir):
    # request_dict: {"prompt":, "output":}
    # save_dir: this is a directory path for progress log. put your evaluation file in here

    problem_id = request_dict["problem_id"]
    output = request_dict["output"]
    test_code = request_dict["test_code"]
    
    eval_file_path = f"{save_dir}/score.json"

    if os.path.exists(eval_file_path):
        with open(eval_file_path) as f:
            log = json.load(f)
        if str(problem_id) in log["num_answered"]:
            num_answered = log["num_answered"][str(problem_id)]
            num_correct = log["num_correct"][str(problem_id)]
        else:
            num_answered = 0
            num_correct = 0
    else:
        log = {"num_answered":{}, "num_correct":{}}
        num_answered = 0
        num_correct = 0

    is_code_correct, code = check_output(output, test_code)
    if is_code_correct: num_correct += 1

    request_dict["is_code_correct"] = is_code_correct

    num_answered += 1
    log["num_answered"][str(problem_id)] = num_answered
    log["num_correct"][str(problem_id)] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(log, f)

    return request_dict
'''

# all_requests = {str(request_id): requests}
# requests = [{"prompt": , ... }, ... ]
# Ex. all_requests = {"0":[{"prompt":, "problem_id":,}, ...] }
#all_requests = {str(i):[] for i in range(max_request)}
all_requests = AllRequests(max_request)

# Prepare all_requests
#request_id = 0
for i in range(num_sample):  # To see the rough result quickly, it'd better process problems with different ids first. That's why loop for sample comes before one for ids.
    for id in ids:
        messages = [
            {"role": "user", "content": f"""{instruct_prompt[id]}"""}
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        test_code = test[id]
        
        request_dict = {"problem_id":id, "sample_id":i, "prompt":prompt, "test_code":test_code}
        all_requests.add(request_dict)
        
all_results = await all_requests.process(max_tokens = 10000, save_dir = progress_save_dir, restart = True)

test_cases = []
candidates = [[] for _ in range(num_problems)]
test_cases_dict = {}
# Check if the outputs are correct
for results_dict in all_results:
    problem_id = results_dict["problem_id"]
    sample_id = results_dict["sample_id"]
    prompt = results_dict["prompt"]
    output = results_dict["output"]
    test_code = results_dict["test_code"]
    #is_code_correct, code = check_output(output, test_code)

    log[str(problem_id)]["prompts"][str(sample_id)] = prompt
    log[str(problem_id)]["outputs"][str(sample_id)] = output
    #log[str(problem_id)]["corrects"][str(sample_id)] = is_code_correct

    output_code = extract_text_inside_backticks(output, "python")
    if not output_code: output_code = extract_text_inside_backticks(output, "")
    if not output_code: output_code = ""

    log[str(problem_id)]["output_code"][str(sample_id)] = output_code

    if not problem_id in test_cases_dict:
        test_cases_dict[problem_id] = test_code
    candidates[problem_id].append(output_code)

for id in test_cases_dict:
    test_cases.append(test_cases_dict[id])

import os, time
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
#from evaluate import load
# Load code evaluation metric
#code_eval_metric = load("code_eval")

# Modified code_eval which has returns traceback of test error 
from code_eval.code_eval import CodeEval
code_eval_metric = CodeEval()
# Compute pass@k
k_values = [1]
print("Evaluating generated code...")
start = time.time()
pass_at_k, results = code_eval_metric._compute(
    references=test_cases,
    predictions=candidates,
    k=k_values,
    num_workers=18,  # Adjust based on your system
    timeout=100.0,   # Adjust the timeout as needed
)
end = time.time()
print("calculation time(s): ", end-start)

# Print the results
#for k in k_values:
#    print(f"Pass@{k}: {pass_at_k[f'pass@{k}'] * 100:.2f}%")
    #log[f"Pass@{k}"] = pass_at_k[f'pass@{k}']

total_num_correct = 0
total_num_problem = 0
num_correct_dict = {}
for problem_id in range(len(results)):
    num_correct = 0
    for sample_id in range(len(results[problem_id])):
        is_correct = results[problem_id][sample_id][1]["passed"]
        if candidates[problem_id][sample_id]=="": is_correct=False  # passed become true when output_code == "" for some reason. This should be incorrect
        log[str(problem_id)]["corrects"][str(sample_id)] = is_correct
        if not is_correct:
            try: # for normal case
                log[str(problem_id)]["errors"][str(sample_id)] = results[problem_id][sample_id][1]["result"]["error"]
                log[str(problem_id)]["tracebacks"][str(sample_id)] = results[problem_id][sample_id][1]["result"]["traceback"]
            except: # for canse output_code == ""
                log[str(problem_id)]["errors"][str(sample_id)] = "failed: there is no code included in the answer"
                log[str(problem_id)]["tracebacks"][str(sample_id)] = "failed: there is no code included in the answer"
        if is_correct: num_correct += 1
    log[str(problem_id)]["num_correct"] = num_correct
    num_correct_dict[str(problem_id)] = num_correct
    total_num_correct+=num_correct
    total_num_problem+=len(results[problem_id])
    
log["num_correct_dict"] = num_correct_dict
log["pass1"] = total_num_correct/total_num_problem
with open(save_path, "w") as f:
    json.dump(log, f)

print()
print("-- ALL FINISHED --")


## Make Agents

In [None]:
# Configuration for the reflection ("Make Agents") stage.
num_iteration = 10  # number of passes of the `for iter in range(num_iteration)` loop below
max_request = 50  # max_request to AsyncEngine
load_file = "code-log5.json"  # evaluation log read as the starting point
save_file = "code-log5-corr1.json"  # file the updated log is written to
# Base directory for per-iteration progress checkpoints.
# NOTE(review): "progless" looks like a typo for "progress"; left unchanged because it is a path on disk.
save_dir_base = "progless_log9"

In [None]:
import os, json, re, traceback
from transformers import AutoTokenizer

# Read the evaluation log that the reflection loop extends.
with open(load_file) as f:
    log = json.load(f)

total_num_problem = log["num_problems"]
total_num_sample = log["num_sample"] * log["num_problems"]

# Base directory for the per-iteration progress checkpoints.
if not os.path.exists(save_dir_base):
    os.makedirs(save_dir_base)

# Per-iteration summary:
# advice_result_log: {str(iteration): {"num_problem": {...}, "num_correct": {...}}}
advice_result_log = dict()

def add_numbers_to_lines(text):
    """Number the non-blank paragraphs of ``text``.

    ``text`` is split on blank lines (``'\\n\\n'``); chunks that are empty or
    whitespace-only are skipped, the rest are numbered from 1.

    Returns:
        A pair ``(numbered_lines, numbered_text)`` where ``numbered_lines`` is
        a list of ``(number, chunk)`` tuples and ``numbered_text`` joins
        ``"<number>. <chunk>"`` entries with blank lines.
    """
    kept_chunks = [chunk for chunk in text.split('\n\n') if chunk.strip()]
    numbered_lines = list(enumerate(kept_chunks, start=1))
    numbered_text = '\n\n'.join(f"{number}. {chunk}" for number, chunk in numbered_lines)
    return numbered_lines, numbered_text


def get_text_before_number(numbered_lines, number):
    """Join the text of every entry that precedes the entry numbered ``number``.

    ``numbered_lines`` is a list of ``(number, text)`` tuples. The texts before
    the first tuple whose number equals ``number`` are joined with blank lines.
    An empty string is returned when ``number`` does not occur.
    """
    position = next(
        (index for index, (num, _) in enumerate(numbered_lines) if num == number),
        None,
    )
    if position is None:
        return ""
    return '\n\n'.join(text for _, text in numbered_lines[:position])


def extract_text_inside_backticks(text, arbitrary_text):
    """Extract the content of the first ``` block tagged with ``arbitrary_text``.

    Looks for three backticks directly followed by ``arbitrary_text`` (taken
    literally), and returns everything up to the next three backticks with
    surrounding whitespace removed. Returns ``None`` if there is no match.
    """
    tag = re.escape(arbitrary_text)
    block = re.search(r'```' + tag + r'\s*([\s\S]*?)\s*```', text)
    if block is None:
        return None
    return block.group(1).strip()


# get_result (optional): an evaluation hook intended to be passed to
# process_requests so that outputs are scored while generation is running,
# which keeps the progress files easy to inspect.
def get_result(request_dict, save_dir):
    """Score one finished request and update ``<save_dir>/score.json``.

    Args:
        request_dict: Finished request; ``"log_ids"`` (first element is the
            problem id) and ``"output"`` are read from it.
        save_dir: Progress-log directory that holds ``score.json``.

    Returns:
        ``request_dict`` with ``"final_answer"`` and ``"is_correct"`` added.
    """
    problem_id = int(request_dict["log_ids"][0])
    problem_key = str(problem_id)
    output = request_dict["output"]
    eval_file_path = f"{save_dir}/score.json"

    # Load the running tallies, or start fresh when no score file exists yet.
    if os.path.exists(eval_file_path):
        with open(eval_file_path) as f:
            log = json.load(f)
    else:
        log = {"num_answered":{}, "num_correct":{}}

    if problem_key in log["num_answered"]:
        num_answered = log["num_answered"][problem_key]
        num_correct = log["num_correct"][problem_key]
    else:
        num_answered = 0
        num_correct = 0

    # The answer is the first integer written as \boxed{...} in the output.
    boxed_numbers = re.findall(r'\\boxed{(\d+)}', output)
    final_answer = None
    is_correct = False
    if boxed_numbers != []:
        final_answer = int(boxed_numbers[0])
        # NOTE(review): `correct_answers` is not defined anywhere in the visible
        # part of this notebook -- confirm where it is expected to come from.
        if correct_answers[problem_id] == final_answer:
            is_correct = True
            num_correct += 1

    request_dict["final_answer"] = final_answer
    request_dict["is_correct"] = is_correct

    log["num_answered"][problem_key] = num_answered + 1
    log["num_correct"][problem_key] = num_correct

    with open(eval_file_path, "w") as f:
        json.dump(log, f)

    return request_dict




def get_log_dict(log, log_ids):
    """Return the node of ``log`` addressed by the path ``log_ids``.

    Args:
        log: Nested log dictionary. Top-level keys are problem ids as strings;
            deeper levels are reached through each node's ``"children"`` dict.
        log_ids: Path to the node. The first element is the problem id, every
            following element is a key inside the current node's
            ``"children"``. Elements are converted with ``str()``. The list is
            not modified.

    Returns:
        ``log`` itself for an empty path, otherwise the addressed node.

    Raises:
        KeyError: If any element of the path does not exist.
    """
    if len(log_ids) == 0:
        return log

    # Unpack instead of pop(0) so the caller's list is left untouched.
    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict


def get_edit_log_dict(log_ids):
    """Return the node of the module-level ``log`` addressed by ``log_ids``.

    The returned object is the node inside ``log`` itself (not a copy), so
    changes made to it are changes to ``log``.

    Args:
        log_ids: Path to the node. The first element is the problem id, every
            following element is a key inside the current node's
            ``"children"``. Elements are converted with ``str()``. The list is
            not modified.

    Returns:
        ``log`` itself for an empty path, otherwise the addressed node.

    Raises:
        KeyError: If any element of the path does not exist.
    """
    global log

    if len(log_ids) == 0:
        return log

    # Unpack instead of pop(0) so the caller's list is left untouched.
    problem_id, *child_ids = log_ids
    log_dict = log[str(problem_id)]

    for log_id in child_ids:
        log_dict = log_dict["children"][str(log_id)]

    return log_dict



for iter in range(num_iteration):

    all_requests1 = AllRequests(max_request)
    
    # Make all_request for advice by searching log recursively
    def search_log(log_dict, log_ids):  # node_id: str
        global all_requests, request_id
        node_id = log_ids[-1]

        if type(log_dict[node_id]) != dict:
            return None
            
        if "children" in log_dict[node_id]:
            for next_node_id in log_dict[node_id]["children"]:
                search_log(log_dict[node_id]["children"], log_ids + [next_node_id])
        elif "corrects" in log_dict[node_id]:
            if iter!=0:
                all_false = True
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if log_dict[node_id]["corrects"][sample_id_str]:
                        all_false = False
                        break
                    
                if all_false:
                    problem_id_str = log_ids[0]
                    problem_id = int(problem_id_str)
                    pre_prompt = log_dict[node_id]["prompts"]["0"]
                    pre_output = log_dict[node_id]["outputs"]["0"]
                    error = log_dict[node_id]["errors"]["0"]
                    traceback_ = log_dict[node_id]["tracebacks"]["0"]
                    #student_answer = prompt.split("<｜Assistant｜>")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                    #numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)

                    messages = [
                        {"role": "user", "content": f"""### Problem:
'''
{instruct_prompt[problem_id]}
'''


### Correct Solution:
'''
{canonical_solution[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


### Test Code and Its Error
'''
```
{test[problem_id]}
```

{traceback_}
'''


You are an advanced language model tasked with analyzing a student’s answer to a coding problems and make some instructions to lead him to the correct solution. You are given the coding problem, the correct solution of it, a student’s incorrect answer, test code of the answer code and error cause of the student's incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instruction; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    "Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                    ]
                    
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    
                    new_log_ids = log_ids+["0"]
                    request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                    all_requests1.add(request_dict)
                    
            else:
                for sample_id_str in log_dict[node_id]["corrects"]:
                    if not log_dict[node_id]["corrects"][sample_id_str]:
        
                        problem_id_str = log_ids[0]
                        problem_id = int(problem_id_str)
                        pre_prompt = log_dict[node_id]["prompts"][sample_id_str]
                        pre_output = log_dict[node_id]["outputs"][sample_id_str]
                        error = log_dict[node_id]["errors"][sample_id_str]
                        traceback_ = log_dict[node_id]["tracebacks"][sample_id_str]
                        #student_answer = prompt.split("<｜Assistant｜>")[1] + output  #log_dict[node_id]["outputs"][sample_id_str]
                        #numbered_lines, numbered_answer = add_numbers_to_lines(student_answer)
    
                        messages = [
                            {"role": "user", "content": f"""### Problem:
'''
{instruct_prompt[problem_id]}
'''


### Correct Solution:
'''
{canonical_solution[problem_id]}
'''


### Student's Incorrect Answer:
'''
{pre_output}
'''


### Test Code and Its Error
'''
```
{test[problem_id]}
```

{traceback_}
'''


You are an advanced language model tasked with analyzing a student’s answer to a coding problems and make some instructions to lead him to the correct solution. You are given the coding problem, the correct solution of it, a student’s incorrect answer, test code of the answer code and error cause of the student's incorrect answer. Please make some instructions and let him answer correctly following the instructions below.


### Instructions:
1. **Think Why Student’s Answer was Wrong**: Compare the correct solution and student’s incorrect answer, and analyze why the student’s answer was wrong and think about where it went in a different direction from the correct solution.
2. **Think What was the Idea Missing in Student’s Answer**: Think what idea was included in the correct solution but was missing from the students' answer.
3. **Imagine Many Thinking Processes Which May Lead to the Idea**: Imagine as many thinking processes as possible which may lead him to think of the missing idea.
4. **Give Short and Abstract Instructions**: Expanding your imagination, make as many instructions as possible which may lead him to the missing idea which was not included in the student’s answer. All the instructions should be abstract and general so that it can be applied to other problems too. These are the examples of the instruction; “Explore all the possibilities of it”, “Check if there are enough conditions to solve the problem”, “Imagine what condition will lead you to solve the problem”, “Find some regularities and prove a statement which narrows down the options”, “Summarize your thought and check if it really follows the problem”.
5. **Generate Output**: Based on the result so far, return the missing idea and Instructions in backticks like

```idea
(The missing idea in the student’s answer)
```

```instructions
[
    "Instruction1 (An instruction which leads him to the missing idea)",
    ...
]
```


Let’s think step by step following each step of the instructions."""}
                        ]
                        
                        prompt = tokenizer.apply_chat_template(
                            messages,
                            tokenize=False,
                            add_generation_prompt=True
                        )
                        
                        new_log_ids = log_ids+[sample_id_str]
                        request_dict = {"log_ids":new_log_ids, "prompt":prompt, "pre_prompt":pre_prompt, "pre_output":pre_output}
                        all_requests1.add(request_dict)
    
    for problem_id_str in log:
        search_log(log, [problem_id_str])
    
    
    # Process all_requests
    # If you had some error in last process and want to continue to get the output, set restart = True
    all_results1 = await all_requests1.process(max_tokens = 15000, restart = True, save_dir=f"{save_dir_base}/inst-{iter}")
    all_requests2 = AllRequests(max_request)

    error_num = 0
    num_results = len(all_results1)
    for result_dict in all_results1:
        log_ids = result_dict["log_ids"]
        prompt = result_dict["prompt"]
        output = result_dict["output"]
        pre_prompt = result_dict["pre_prompt"]
        pre_output = result_dict["pre_output"]
        instruction_log = prompt + output

        missing_idea = extract_text_inside_backticks(output, "idea")
        instruction_list_text = extract_text_inside_backticks(output, "instructions")
        
        if missing_idea and instruction_list_text:
            try:
                instruction_list_text = instruction_list_text.replace("\n","")
                instruction_list_text = instruction_list_text.replace("\\","")
                pattern = r"'((?:[^']|'(?!\s*[,\]]))*)'"
                replacement = r'"\1"'
                #instruction_list_text = re.sub(pattern, replacement, instruction_list_text)  # convert ['I'm a cat', 'This is the student's car',] into ["I'm a cat", "This is the student's car",]
                instruction_list = json.loads(instruction_list_text)
            except Exception as e:
                traceback.print_exc()
                error_num += 1
                print()
                print("An error occurred:", e)
                print("instruction_list_text: ", instruction_list_text)
                print("error_num: ", error_num)
                continue
    
            problem_id = int(log_ids[0])
            problem = instruct_prompt[problem_id]

            prompts_dict = {}
            insert_ids=[]
            instruction_ids=[]
            prompt_id = 0
            for instruction_id, instruction in enumerate(instruction_list):
                modified_insts = instruction_list[:instruction_id] + instruction_list[(instruction_id+1):]
                advices_text = ""
                for i, inst in enumerate(modified_insts):
                    advices_text += f"\nAdvice{i+1}: {inst}"
                    
                messages = [
                    {"role": "user", "content": pre_prompt[29:][:-13]},
                    {"role": "assistant", "content": pre_output},
                    {"role": "user", "content": f"""Your answer might contain some errors. Please revise your answer following the advice below;
{advices_text}"""}
                ]
                    
                new_prompt = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                prompts_dict[str(prompt_id)] = new_prompt

                next_request = {"log_ids":log_ids+[str(instruction_id)], "prompt":new_prompt, "instruction_log":[instruction_log]}
                all_requests2.add(next_request)
                
            edit_log_dict = get_edit_log_dict(log_ids[:-1])
            if "children" in edit_log_dict:
                edit_log_dict["children"][str(log_ids[-1])] = {"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}
            else:
                edit_log_dict["children"] = {str(log_ids[-1]):{"instruction_list":instruction_list, "prompts":prompts_dict, "instruction_log":[instruction_log]}}


    with open(save_file, "w") as f:
        json.dump(log, f)

    print("log saved")
    
    # Process all_requests
    # If you had trouble in last process and want to continue to get the output, set restart = True
    all_results2 = await all_requests2.process(max_tokens = 15000, restart = True, save_dir=f"{save_dir_base}/solve-{iter}")

    # Fresh per-iteration tallies: number of samples and number of correct samples per problem id.
    advice_result_log[str(iter)] = {"num_problem":{}, "num_correct":{},}
    # Parallel lists: position k in every list describes the same generated sample.
    test_cases, candidates, problem_ids, sample_ids, log_ids_list = [], [], [], [], []

    # Store every model output in the edit log and queue its code for evaluation
    for results_dict in all_results2:
        log_ids = results_dict["log_ids"]
        prompt = results_dict["prompt"]
        output = results_dict["output"]
        # The first id is used as the problem index, the last one as the sample key.
        problem_id, sample_id = int(log_ids[0]), int(log_ids[-1])
        test_code = test[problem_id]

        # Try the "python" tag first, then an empty tag; fall back to an empty string.
        output_code = (
            extract_text_inside_backticks(output, "python")
            or extract_text_inside_backticks(output, "")
            or ""
        )

        # Log entry addressed by every id except the trailing sample id.
        edit_log_dict = get_edit_log_dict(log_ids[:-1])

        sample_key = str(sample_id)
        for log_field, log_value in (
            ("prompts", prompt),
            ("outputs", output),
            ("output_codes", output_code),
        ):
            # Create the per-sample mapping on first use, then file the value under this sample.
            edit_log_dict.setdefault(log_field, {})[sample_key] = log_value

        test_cases.append(test_code)
        candidates.append([output_code])  # one candidate per evaluated task
        problem_ids.append(problem_id)
        sample_ids.append(sample_id)
        log_ids_list.append(log_ids)
    
    # Run every queued candidate against its test code and time the evaluation.
    import os, time
    # NOTE(review): presumably CodeEval refuses to execute generated code unless this
    # variable is set (Hugging Face code_eval convention) — it is set before the import
    # on purpose; confirm against code_eval/code_eval.py.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
    from code_eval.code_eval import CodeEval
    code_eval_metric = CodeEval()
    # Compute pass@k
    k_values = [1]
    print("Evaluating generated code...")
    start = time.time()
    # `test_cases` and `candidates` are parallel lists built above (one entry per sample).
    # `results` holds the per-sample verdicts that the following loop reads.
    pass_at_k, results = code_eval_metric._compute(
        references=test_cases,
        predictions=candidates,
        k=k_values,
        num_workers=10,  # Adjust based on your system
        timeout=150.0,   # Adjust the timeout as needed
    )
    end = time.time()
    print("calculation time(s): ", end-start)
    
    # Record each sample's verdict in the edit log and tally per-problem counts.
    for i in range(len(results)):
        problem_id = problem_ids[i]
        sample_id = sample_ids[i]

        # `candidates`/`test_cases` were handed to `_compute` positionally, one entry per
        # sample, so the verdict of sample i is looked up by its position i. Indexing by
        # `problem_id` (the benchmark index) reads another sample's verdict, or a missing
        # entry, whenever the two numberings differ.
        # NOTE(review): assumes CodeEval keeps the Hugging Face code_eval result layout
        # (results keyed by prediction position, each a list of (completion_id, result)) — confirm.
        sample_results = results[i]

        # An empty entry means no verdict was produced; count the sample as incorrect.
        unexpected_error = not sample_results
        if unexpected_error:
            is_correct = False
        else:
            is_correct = sample_results[0][1]["passed"]

        log_ids = log_ids_list[i]
        edit_log_dict = get_edit_log_dict(log_ids[:-1])
        sample_key = str(sample_id)

        edit_log_dict.setdefault("corrects", {})[sample_key] = is_correct

        if not is_correct:
            if unexpected_error:
                error = ""
                traceback_ = ""
            else:
                failure = sample_results[0][1]["result"]
                error = failure["error"]
                traceback_ = failure["traceback"]

            # Keep the failure details next to the sample so later steps can read them.
            edit_log_dict.setdefault("errors", {})[sample_key] = error
            edit_log_dict.setdefault("tracebacks", {})[sample_key] = traceback_

        # Per-problem tallies for this iteration: correct samples and total samples.
        problem_key = str(log_ids[0])
        iter_counts = advice_result_log[str(iter)]
        iter_counts["num_correct"][problem_key] = (
            iter_counts["num_correct"].get(problem_key, 0) + (1 if is_correct else 0)
        )
        iter_counts["num_problem"][problem_key] = iter_counts["num_problem"].get(problem_key, 0) + 1


    # Aggregate this iteration's per-problem tallies into summary counts.
    iter_counts = advice_result_log[str(iter)]
    num_problem = len(iter_counts["num_problem"])
    num_sample = sum(iter_counts["num_problem"].values())
    pass1_count = sum(iter_counts["num_correct"].values())
    # Number of problems with at least one correct sample (stored below as "passAll").
    passAll_count = sum(1 for n_correct in iter_counts["num_correct"].values() if n_correct > 0)

    num_already_correct_problem = total_num_problem - len(all_results1)

    print("total_num_problem: ", total_num_problem)
    print("total_num_sample: ", total_num_sample)
    print("num_already_correct_problem: ", num_already_correct_problem)
    print(f"{passAll_count}/{num_problem} problems have got at least 1 correct sample in this iteration")
    print(f"{pass1_count}/{num_sample} samples were correct in total")

    # Nothing may have been evaluated in this iteration; report 0.0 instead of
    # raising ZeroDivisionError.
    pass1 = pass1_count/num_sample if num_sample else 0.0
    passAll = passAll_count/num_problem if num_problem else 0.0

    advice_result_log[str(iter)]["pass@1"] = pass1
    advice_result_log[str(iter)]["passAll"] = passAll
    print("pass1 in this iteration: ", pass1)
    print("passAll in this iteration: ", passAll)

    # Persist the whole log (including the per-iteration summary) after every iteration.
    log["advice_result_log"] = advice_result_log
    with open(save_file, "w") as json_file:
        json.dump(log, json_file)

print()
print("-- ALL FINISHED --")


# Step 2 : Make dataset_list

In [None]:
import json, os

# Paths and options for building the reward-model dataset (Step 2) and training (Step 3).
exp_dir = "/workspace/exp12"                               # all artifacts of this experiment
model_save_dir = f"{exp_dir}/model1"                       # output_dir of the reward-model training
model_name = "/workspace/llama3b-rm"                       # base model passed to RMTrainer
data_save_path = "/workspace/bigcodebench3.json"           # benchmark problems
advice_save_path = "/workspace/code-log5-corr1.json"       # edit/advice log read when building data_dict
data_dict_save_path = f"{exp_dir}/data_dict.json"          # cache of the flattened advice table
dataset_list_save_path = f"{exp_dir}/dataset_list.json"    # cache of the chosen/rejected pairs
#num_advice_per_batch = 13  # total number of advice including both chosen and rejected per 1 batch
avoid_topk = 10 # avoid topk similar advices to each advice list being included in rejected advice

# exist_ok avoids the check-then-create race and is a no-op when the directory is already there.
os.makedirs(exp_dir, exist_ok=True)
    

In [None]:
# Reference
# https://medium.com/towards-generative-ai/reward-model-training-2209d1befb5f

# Both cache files are written together and read together, so both must exist
# before the cached branch is taken.
problem_ids_with_advice_save_path = f"{exp_dir}/problem_ids_with_advice.json"

if not (os.path.exists(data_dict_save_path) and os.path.exists(problem_ids_with_advice_save_path)):

    # Benchmark data; only `instruct_prompt` is read while building the table below.
    with open(data_save_path) as f:
        benchmark = json.load(f)

    # The remaining fields are kept as notebook globals, as in the original cell.
    task_id = benchmark["task_id"]
    complete_prompt = benchmark["complete_prompt"]
    instruct_prompt = benchmark["instruct_prompt"]
    canonical_solution = benchmark["canonical_solution"]
    code_prompt = benchmark["code_prompt"]
    test = benchmark["test"]
    doc_struct = benchmark["doc_struct"]
    num_problems = len(task_id)

    with open(advice_save_path) as f:
        log = json.load(f)

    # Flattened advice table: one row per (advice, outcome), stored column-wise.
    data_dict = {"problem_id":[], "problem":[], "advice":[], "advice_id":[], "output":[], "correct":[], "log_ids":[], "node_state":[]}

    def search_log(log_dict, log_ids):
        """Walk the advice tree below one node and append its scored advice to `data_dict`.

        Args:
            log_dict: mapping that contains the current node under `log_ids[-1]`.
            log_ids: list of string ids from the problem id (first) to the current node (last).

        Returns:
            None. Rows are appended to the module-level `data_dict`.

        Children without a "corrects" entry are skipped. A child is searched
        recursively only when none of its advice led to a correct answer.
        """
        node_id = log_ids[-1]
        node = log_dict[node_id]

        if not isinstance(node, dict) or "children" not in node:
            return None

        problem_id = int(log_ids[0])
        problem = instruct_prompt[problem_id]

        for next_node_id, child in node["children"].items():
            advice_list = child["instruction_list"]

            # No "corrects" when the inference was stopped right after the advice was created.
            if "corrects" not in child:
                continue

            corrects = child["corrects"]
            flags = [corrects[str(advice_id)] for advice_id in range(len(advice_list))]
            if not flags:
                continue

            child_num_correct = sum(1 for flag in flags if flag)
            all_correct = all(flags)
            all_incorrect = not any(flags)

            # NOTE(review): every advice of this child is paired with the parent's output
            # stored under key "0" — confirm this is the intended output at deeper levels.
            output = node["outputs"]["0"]

            for advice_id, advice in enumerate(advice_list):
                data_dict["problem_id"].append(problem_id)
                data_dict["problem"].append(problem)
                data_dict["output"].append(output)
                data_dict["advice"].append(advice)
                data_dict["advice_id"].append(advice_id)
                data_dict["correct"].append(flags[advice_id])
                data_dict["log_ids"].append(log_ids + [next_node_id])
                data_dict["node_state"].append({"num_problem":len(advice_list), "child_num_correct":child_num_correct, "all_correct":all_correct, "all_incorrect":all_incorrect})

            if all_incorrect:
                search_log(node["children"], log_ids + [next_node_id])

    # Top-level entries of the log that are dicts with "children" are treated as problems.
    problem_ids_with_advice = []
    for problem_id_str in log:
        entry = log[problem_id_str]
        if isinstance(entry, dict) and "children" in entry:
            problem_ids_with_advice.append(int(problem_id_str))
            search_log(log, [problem_id_str])

    with open(data_dict_save_path, "w") as f:
        json.dump(data_dict, f)
    with open(problem_ids_with_advice_save_path, "w") as f:
        json.dump(problem_ids_with_advice, f)

else:
    with open(data_dict_save_path) as f:
        data_dict = json.load(f)
    with open(problem_ids_with_advice_save_path) as f:
        problem_ids_with_advice = json.load(f)



In [None]:
import torch

# Pairwise advice similarity, computed once and cached on disk.
advice_similarities_path = f"{exp_dir}/advice_similarities.pt"

if not os.path.exists(advice_similarities_path):
    # Similarities between advice texts are used later to keep advice that is close to a
    # problem's own advice out of that problem's rejected set.
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.util import cos_sim
    from sentence_transformers.quantization import quantize_embeddings

    # 1. Specify preferred dimensions
    dimensions = 512

    # 2. Load model
    model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions).to("cuda")
    advices = data_dict["advice"]
    # The prompt used for query retrieval tasks:
    # query_prompt = 'Represent this sentence for searching relevant passages: '

    # 3. Encode every advice once as a query and once as a document
    query_embedding = model.encode(advices, prompt_name="query")
    # Equivalent Alternatives:
    # query_embedding = model.encode(query_prompt + query)
    # query_embedding = model.encode(query, prompt=query_prompt)

    docs_embeddings = model.encode(advices)

    # Optional: Quantize the embeddings
    # (the binary embeddings are not used by the similarity computed below)
    binary_query_embedding = quantize_embeddings(query_embedding, precision="ubinary")
    binary_query_embedding = torch.tensor(binary_query_embedding).to("cuda")
    binary_docs_embeddings = quantize_embeddings(docs_embeddings, precision="ubinary")
    binary_docs_embeddings = torch.tensor(binary_docs_embeddings).to("cuda")

    # Row i holds the similarity of advice i (as query) to every advice (as document).
    advice_similarities = cos_sim(query_embedding, docs_embeddings)
    torch.save(advice_similarities, advice_similarities_path)

else:
    # The cache holds a plain tensor, so restrict unpickling to tensor data: this is the
    # safe loading mode and avoids the warning torch emits for an unrestricted load.
    advice_similarities = torch.load(advice_similarities_path, weights_only=True)


  advice_similarities = torch.load(f"{exp_dir}/advice_similarities.pt")


In [None]:
import json
import pandas as pd

# Build (or load from cache) the chosen/rejected advice pairs for reward-model training.
if not os.path.exists(dataset_list_save_path):
    df = pd.DataFrame(data_dict)
    # Row position doubles as the advice id used to index `advice_similarities`.
    df['id'] = range(len(df))
    dataset_list = []

    for problem_id in problem_ids_with_advice:
        # Chosen = every advice recorded for this problem.
        chosen_df = df[df['problem_id'] == problem_id]
        if chosen_df.empty:
            # A problem can have advice nodes but no scored advice rows; checking before
            # `.values[0]` avoids an IndexError for such problems.
            #print(f"problem {problem_id} doesn't have any advice")
            continue

        problem = chosen_df['problem'].values[0]
        # NOTE(review): every pair of this problem uses the first row's output although
        # `data_dict` stores an output per row — confirm this is intended.
        output = chosen_df['output'].values[0]
        chosen_ids = chosen_df["id"].tolist()

        # Advice most similar to any chosen advice must not be sampled as rejected.
        all_avoid_ids = set()
        for chosen_id in chosen_ids:
            similarity_row = advice_similarities[chosen_id]
            # topk raises when k exceeds the number of entries, so clamp it.
            top_k = min(avoid_topk, similarity_row.shape[-1])
            _, avoid_ids = torch.topk(similarity_row, k=top_k)
            all_avoid_ids.update(avoid_ids.tolist())

        # Build the mask from the already-reduced frame so that mask and frame share an index.
        candidate_df = df.drop(index=sorted(all_avoid_ids))
        rejected_df = candidate_df[candidate_df['problem_id'] != problem_id]
        if rejected_df.empty:
            continue

        num_chosen = len(chosen_df)
        num_rejected = num_chosen

        chosen_rows = chosen_df.sample(n=num_chosen)
        # Sample with replacement only when there are fewer candidates than needed;
        # otherwise `sample` raises ValueError.
        rejected_rows = rejected_df.sample(n=num_rejected, replace=len(rejected_df) < num_rejected)

        pairs = zip(
            chosen_rows['advice'].tolist(),
            chosen_rows["id"].tolist(),
            rejected_rows['advice'].tolist(),
            rejected_rows["id"].tolist(),
        )
        for chosen_advice, chosen_advice_id, rejected_advice, rejected_advice_id in pairs:
            dataset_list.append({
                "query":[{'role': 'user', 'content':f"Give me an advice to the problem and answer below;\n\nProblem:{problem}\n\nAnswer:{output}"}],
                "chosen_key":[{'role': 'assistant', 'content': f"{chosen_advice}"}],
                "rejected_key":[{'role': 'assistant', 'content': f"{rejected_advice}"}],
                "problem_id":problem_id,
                "problem":problem,
                "output":output,
                "chosen_advice":chosen_advice,
                "rejected_advice":rejected_advice,
                "chosen_advice_id":chosen_advice_id,
                "rejected_advice_id":rejected_advice_id,
            })

    #if num_gpu*batch_size_per_device != 1:
    #    num_trash = len(dataset_list)%(num_gpu*batch_size_per_device)
    #    dataset_list = dataset_list[:-num_trash]

    with open(dataset_list_save_path, "w") as f:
        json.dump(dataset_list, f)

else:
    with open(dataset_list_save_path) as f:
        dataset_list = json.load(f)

#dataset1 = Dataset.from_list(dataset_list)
#dataset1.to_pandas()

# Step 3 : Train Reward Model

In [None]:
from rmsearch import RMTrainer

# Base model and GPU count handed to the reward-model trainer.
model_name = "/workspace/llama3b-rm"
num_gpus = 1

rmtrainer = RMTrainer(
    model_name=model_name,
    num_gpus=num_gpus,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Output locations and test-split size passed to RMTrainer.prepare_dataset.
dataset_save_path = f"{exp_dir}/dataset"
train_ids_save_path = f"{exp_dir}/train_ids.json"
test_ids_save_path = f"{exp_dir}/test_ids.json"
test_size = 48

# Arguments are positional, in the same order as the original call.
formatted_dataset = rmtrainer.prepare_dataset(
    dataset_list,
    dataset_save_path,
    test_size,
    train_ids_save_path,
    test_ids_save_path,
)

In [None]:
from trl import RewardConfig
from peft import LoraConfig, TaskType

# Per-device batch sizes for training and evaluation.
batch_size_per_device = 4
eval_batch_size_per_device = 4

training_args = RewardConfig(
    output_dir=model_save_dir,
    per_device_train_batch_size=batch_size_per_device,
    per_device_eval_batch_size=eval_batch_size_per_device,
    # `eval_steps` only takes effect with a step-based evaluation strategy; without it
    # the only evaluation is the one triggered by `eval_on_start`. `eval_strategy` is
    # the current name of the former `evaluation_strategy` argument.
    eval_strategy="steps",
    eval_steps=20,
    eval_on_start=True,
    save_steps=20,
    logging_steps=1,
    num_train_epochs=3,
    # NOTE(review): in Transformers 4.x `report_to=None` falls back to the library
    # default instead of disabling reporting; use "none" if no logging integration
    # is wanted — confirm against the installed version.
    report_to=None,
    remove_unused_columns=False,
)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=["k_proj","q_proj","o_proj", "v_proj","down_proj","gate_proj","up_proj",],
    # Adapters are restricted to layer indices 25-27.
    layers_to_transform=[25,26,27],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
)

rmtrainer.train(formatted_dataset, training_args = training_args, peft_config = peft_config)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
0,No log,0.339076,0.812500
1,0.382000,No Log,No Log
2,0.230500,No Log,No Log
3,0.418800,No Log,No Log
4,0.627400,No Log,No Log
5,0.531700,No Log,No Log
6,0.303600,No Log,No Log
7,0.060400,No Log,No Log
8,0.028500,No Log,No Log
9,0.242800,No Log,No Log
